# STA 414/2104 - SCRIPT FOR ASSIGNMENT 2. source("disc.r") # TESTS ON DATASET A. # Read the training and test data. x.trna = as.matrix (read.table("a2a-x-train",head=F)) t.trna = scan("a2a-t-train") x.tsta = as.matrix (read.table("a2a-x-test",head=F)) t.tsta = scan("a2a-t-test") # Find linear discriminant and apply it to the test data. dla = lin.disc (x.trna, t.trna) yla = x.tsta %*% dla$w + dla$w0 # Find quadratic discriminant and apply it to the test data. dqa = quad.disc (x.trna, t.trna) yqa = apply (x.tsta, 1, function (x) x %*% dqa$v %*% x + dqa$w %*% x + dqa$w0) # Find discriminant from logistic regresson with original inputs only and apply # it to the test data. dlrla = coef (glm (t.trna ~ x.trna, family="binomial", maxit=1000)) ylrla = cbind (1, x.tsta) %*% dlrla # Find discrimant from logistic regression with quadratic terms and apply it # to the test data. qp1 = c(1,2,3,4,5,6,1,1,1,1,1,2,2,2,2,3,3,3,4,4,5) # Indexes of elements qp2 = c(1,2,3,4,5,6,2,3,4,5,6,3,4,5,6,4,5,6,5,6,6) # in pair products dlrqa = coef (glm (t.trna ~ cbind (x.trna, x.trna[,qp1]*x.trna[,qp2]), family="binomial", maxit=1000)) ylrqa = cbind (1, x.tsta, x.tsta[,qp1]*x.tsta[,qp2]) %*% dlrqa # Report error rates on test data. cat ("\nTEST ERROR RATES ON DATASET A:\n\n") cat (" linear discriminant: ", round(mean((yla>0)!=t.tsta),4), "\n") cat (" quadratic discriminant: ", round(mean((yqa>0)!=t.tsta),4), "\n") cat (" linear logistic regression: ", round(mean((ylrla>0)!=t.tsta),4), "\n") cat (" quadratic logistic regression:", round(mean((ylrqa>0)!=t.tsta),4), "\n") cat ("\n\n") # Plots to investigate what's going on. postscript ("a2-plts-a.ps", horiz=F) # Compare coefficients found by linear discriminant and logistic regression # methods. plot (dla$w, dlrla[-1], xlab="Coefficients from linear discriminant", ylab="Coefficients from logistic regresson") # Look at histograms of each input variable, by class. par(mfrow=c(6,2)) for (i in 1:6) { hist (x.trna[t.trna==0,i], nclass=15, xlab=paste("input variable",i), main="class 0"); hist (x.trna[t.trna==1,i], nclass=15, xlab=paste("input variable",i), main="class 1"); } # Plot variables 1 and 2 with class marked by colour. par(mfrow=c(1,1)) plot(x.trna[,1],x.trna[,2],col=c("blue","red")[t.trna+1], pch=20, xlab="input variable 1", ylab="input variable 2") dev.off() # TESTS ON DATASET B. # Read the training and test data. x.trnb = as.matrix (read.table("a2b-x-train",head=F)) t.trnb = scan("a2b-t-train") x.tstb = as.matrix (read.table("a2b-x-test",head=F)) t.tstb = scan("a2b-t-test") # Find linear discriminant and apply it to the test data. dlb = lin.disc (x.trnb, t.trnb) ylb = x.tstb %*% dlb$w + dlb$w0 # Find quadratic discriminant and apply it to the test data. dqb = quad.disc (x.trnb, t.trnb) yqb = apply (x.tstb, 1, function (x) x %*% dqb$v %*% x + dqb$w %*% x + dqb$w0) # Find discriminant from logistic regresson with original inputs only and apply # it to the test data. dlrlb = coef (glm (t.trnb ~ x.trnb, family="binomial", maxit=1000)) ylrlb = cbind (1, x.tstb) %*% dlrlb # Find discrimant from logistic regression with quadratic terms and apply it # to the test data. qp1 = c(1,2,3,4,5,6,1,1,1,1,1,2,2,2,2,3,3,3,4,4,5) # Indexes of elements qp2 = c(1,2,3,4,5,6,2,3,4,5,6,3,4,5,6,4,5,6,5,6,6) # in pair products dlrqb = coef (glm (t.trnb ~ cbind (x.trnb, x.trnb[,qp1]*x.trnb[,qp2]), family="binomial", maxit=1000)) ylrqb = cbind (1, x.tstb, x.tstb[,qp1]*x.tstb[,qp2]) %*% dlrqb # Report error rates on test data. cat ("\nTEST ERROR RATES ON DATASET B:\n\n") cat (" linear discriminant: ", round(mean((ylb>0)!=t.tstb),4), "\n") cat (" quadratic discriminant: ", round(mean((yqb>0)!=t.tstb),4), "\n") cat (" linear logistic regression: ", round(mean((ylrlb>0)!=t.tstb),4), "\n") cat (" quadratic logistic regression:", round(mean((ylrqb>0)!=t.tstb),4), "\n") # Plots to investigate what's going on. postscript ("a2-plts-b.ps", horiz=F) # Compare coefficients found by linear discriminant and logistic regression # methods. plot (dlb$w, dlrlb[-1], xlab="Coefficients from linear discriminant", ylab="Coefficients from logistic regresson") # Look at histograms of each input variable, by class. par(mfrow=c(6,2)) for (i in 1:6) { hist (x.trnb[t.trnb==0,i], nclass=50, xlab=paste("input variable",i), main="class 0"); hist (x.trnb[t.trnb==1,i], nclass=50, xlab=paste("input variable",i), main="class 1"); } # Plot variables 1 and 3 with class marked by colour. par(mfrow=c(1,1)) plot(x.trnb[,1],x.trnb[,3],col=c("blue","red")[t.trnb+1], pch=20, xlab="input variable 1", ylab="input variable 3") dev.off()