# STA 414, 2011 ASSIGNMENT #3.
#
# SCRIPT FOR APPLYING METHODS TO THE DATA PROVIDED.

source("a3-mix.txt")

# Read training data.

data.trn <- as.matrix(read.table("a3-train-data.txt",head=FALSE))

x.trn <- data.trn[,1:3]
t.trn <- data.trn[,4]

# Fit 3-componet and 5-component, with three random initializations.

set.seed(1); f3.1 <- mix.na(x.trn,3,100)
set.seed(2); f3.2 <- mix.na(x.trn,3,100)
set.seed(3); f3.3 <- mix.na(x.trn,3,100)

cat("Log likelihoods for 3-component model fits:",
     c (f3.1$ll[100], f3.2$ll[100], f3.3$ll[100]), "\n")

set.seed(1); f5.1 <- mix.na(x.trn,5,100)
set.seed(2); f5.2 <- mix.na(x.trn,5,100)
set.seed(3); f5.3 <- mix.na(x.trn,5,100)

cat("Log likelihoods for 5-component model fits:",
     c (f5.1$ll[100], f5.2$ll[100], f5.3$ll[100]), "\n")

# Plot log likelihood versus iteration for the six fits above, to confirm
# that it always increases, and to see how fast EM is converging.

pdf("a3-plots.pdf",pointsize=9)
par(mfrow=c(1,2))

plot(f3.1$ll,xlab="iteration",ylab="log likelihood",type="l",col="red")
lines(f3.2$ll,col="green")
lines(f3.3$ll,col="blue")
title("Three runs of EM for 3-component model")

plot(f5.1$ll,xlab="iteration",ylab="log likelihood",type="l",col="red")
lines(f5.2$ll,col="green")
lines(f5.3$ll,col="blue")
title("Three runs of EM for 5-component model")

dev.off()

# Use best models from above (manually selected) to fill in missing values.

x.trn.fill3 <- mix.fill(x.trn,f3.1)
x.trn.fill5 <- mix.fill(x.trn,f5.2)

# Fit 1-component model and use it to fill in missing values, which has
# the effect of just filling in with the sample mean values.

f1 <- mix.na(x.trn,1,100)
x.trn.fill1 <- mix.fill(x.trn,f1)

# Fit linear models with original data, and data as filled in three ways.

print(summary(m0<-lm(t.trn~x.trn)))
print(summary(m1<-lm(t.trn~x.trn.fill1)))
print(summary(m3<-lm(t.trn~x.trn.fill3)))
print(summary(m5<-lm(t.trn~x.trn.fill5)))

# Read test data.

data.tst <- as.matrix(read.table("a3-test-data.txt",head=FALSE))

x.tst <- data.tst[,1:3]
t.tst <- data.tst[,4]

# Compute squared error on test data for each of the linear models found above.

test.err <- function (xtst,ttst,beta)
{ p <- beta[1] + xtst%*%beta[-1]
  mean((ttst-p)^2)
}

print(test.err(x.tst,t.tst,coef(m0)))
print(test.err(x.tst,t.tst,coef(m1)))
print(test.err(x.tst,t.tst,coef(m3)))
print(test.err(x.tst,t.tst,coef(m5)))