# STA 437/1005 Fall 2010, Assignment 1, Part II. # Read the gene expression data, storing as matrix gexp, and its log as # lgexp. Also read other patient information, as data frame pinfo, and # set recur to the recurrence flag. gexp <- as.matrix(read.table("ass1-gene-exp.txt",head=TRUE)) lgexp <- log(gexp) pinfo <- read.table("ass1-patients.txt",head=TRUE) recur <- pinfo$recur pdf("a1-gene-p1.pdf") # Plot histogram of all gene expression data, and the log of the data. par(mfrow=c(2,1)) hist(gexp,nclass=100) hist(lgexp,nclass=100) # Plot histograms of log expression for some arbitrarily selected genes. par(mfrow=c(3,2)) hist(lgexp[,100],nclass=6) hist(lgexp[,200],nclass=6) hist(lgexp[,300],nclass=6) hist(lgexp[,400],nclass=6) hist(lgexp[,500],nclass=6) hist(lgexp[,600],nclass=6) dev.off() pdf("a1-gene-p2.pdf") # Plot means and standard deviations of the log expression level for all genes. par(mfrow=c(2,1)) plot(mean(as.data.frame(lgexp)), xlab="means of log gene expression, by gene",ylab="",pch=20) plot(sd(as.data.frame(lgexp)), xlab="standard deviations of log gene expression, by gene",ylab="",pch=20) # Plot means and standard deviations of the log expression levels for all # subjects. par(mfrow=c(2,1)) plot(mean(as.data.frame(t(lgexp))), xlab="means of log gene expression, by subject",ylab="",pch=20) plot(sd(as.data.frame(t(lgexp))), xlab="standard deviations of log gene expression, by subject",ylab="",pch=20) dev.off() # Find principal components of the gene expression data, with and without log, # and with and without scaling (ie, on both covariance and correlation). # Produce screeplots for each (for first 20 PCs), using a log scale for the # variance. pdf("a1-gene-p3.pdf") par(mfrow=c(2,2)) pc <- prcomp(gexp) plot(pc$sdev[1:20]^2, type="b",pch=20,xlab="pc",ylab="variance",log="y") pc.cor <- prcomp(gexp,scale=TRUE) plot(pc.cor$sdev[1:20]^2, type="b",pch=20,xlab="pc.cor",ylab="variance",log="y") pc.log <- prcomp(lgexp) plot(pc.log$sdev[1:20]^2, type="b",pch=20,xlab="pc.log",ylab="variance",log="y") pc.log.cor <- prcomp(lgexp,scale=TRUE) plot(pc.log.cor$sdev[1:20]^2, type="b",pch=20,xlab="pc.log.cor",ylab="variance",log="y") dev.off() # Print correlations of first 10 principal components with recur, for the # four ways of finding principal components above. for (method in 1:4) { cat("\nCorrelations of recur with", c("pc","pc.cor","pc.log","pc.log.cor")[method],"\n") print (round (cor ( list(pc,pc.cor,pc.log,pc.log.cor)[[method]]$x[,1:10],recur), 2)) } # Show pairwise scatterplots for first 5 components in pc.log, with colours # for recur (1=red). pdf("a1-gene-p4.pdf") par(mfrow=c(1,1)) cols <- c("green","red")[recur+1] plot(as.data.frame(pc.log$x[,1:5]),col=cols,pch=19) dev.off() # Do least squares regression of recur on first 5 components of pc, pc.cor, # pc.log, and pc.log.cor. keep <- 1:5 print(summary(lm(recur~pc$x[,keep]))) print(summary(lm(recur~pc.cor$x[,keep]))) print(summary(lm(recur~pc.log$x[,keep]))) print(summary(lm(recur~pc.log.cor$x[,keep]))) # Check significance of above regression by seeing what adjusted R-squared # is with many permutations of recur. perm.rsq <- numeric(1000) perm.rsq[1] <- summary(lm(recur~pc.log$x[,keep]))$adj.r.squared set.seed(1) for (i in 2:length(perm.rsq)) { perm.rsq[i] <- summary(lm(sample(recur)~pc.log$x[,keep]))$adj.r.squared } cat("permuation p-value:",mean(perm.rsq>=perm.rsq[1]),"\n") # Do least squares regression on just components 1, 2, 3, and 5 of pc.log. print(summary(lm(recur~pc.log$x[,c(1,2,3,5)])))