# STA 414, 2011 ASSIGNMENT #3. # # SCRIPT FOR APPLYING METHODS TO THE DATA PROVIDED. source("a3-mix.txt") # Read training data. data.trn <- as.matrix(read.table("a3-train-data.txt",head=FALSE)) x.trn <- data.trn[,1:3] t.trn <- data.trn[,4] # Fit 3-componet and 5-component, with three random initializations. set.seed(1); f3.1 <- mix.na(x.trn,3,100) set.seed(2); f3.2 <- mix.na(x.trn,3,100) set.seed(3); f3.3 <- mix.na(x.trn,3,100) cat("Log likelihoods for 3-component model fits:", c (f3.1$ll[100], f3.2$ll[100], f3.3$ll[100]), "\n") set.seed(1); f5.1 <- mix.na(x.trn,5,100) set.seed(2); f5.2 <- mix.na(x.trn,5,100) set.seed(3); f5.3 <- mix.na(x.trn,5,100) cat("Log likelihoods for 5-component model fits:", c (f5.1$ll[100], f5.2$ll[100], f5.3$ll[100]), "\n") # Plot log likelihood versus iteration for the six fits above, to confirm # that it always increases, and to see how fast EM is converging. pdf("a3-plots.pdf",pointsize=9) par(mfrow=c(1,2)) plot(f3.1$ll,xlab="iteration",ylab="log likelihood",type="l",col="red") lines(f3.2$ll,col="green") lines(f3.3$ll,col="blue") title("Three runs of EM for 3-component model") plot(f5.1$ll,xlab="iteration",ylab="log likelihood",type="l",col="red") lines(f5.2$ll,col="green") lines(f5.3$ll,col="blue") title("Three runs of EM for 5-component model") dev.off() # Use best models from above (manually selected) to fill in missing values. x.trn.fill3 <- mix.fill(x.trn,f3.1) x.trn.fill5 <- mix.fill(x.trn,f5.2) # Fit 1-component model and use it to fill in missing values, which has # the effect of just filling in with the sample mean values. f1 <- mix.na(x.trn,1,100) x.trn.fill1 <- mix.fill(x.trn,f1) # Fit linear models with original data, and data as filled in three ways. print(summary(m0<-lm(t.trn~x.trn))) print(summary(m1<-lm(t.trn~x.trn.fill1))) print(summary(m3<-lm(t.trn~x.trn.fill3))) print(summary(m5<-lm(t.trn~x.trn.fill5))) # Read test data. data.tst <- as.matrix(read.table("a3-test-data.txt",head=FALSE)) x.tst <- data.tst[,1:3] t.tst <- data.tst[,4] # Compute squared error on test data for each of the linear models found above. test.err <- function (xtst,ttst,beta) { p <- beta[1] + xtst%*%beta[-1] mean((ttst-p)^2) } print(test.err(x.tst,t.tst,coef(m0))) print(test.err(x.tst,t.tst,coef(m1))) print(test.err(x.tst,t.tst,coef(m3))) print(test.err(x.tst,t.tst,coef(m5)))