# STA 437/1005 Fall 2010, Assignment 2. # Read the gene expression data, storing as matrix gexp, and its log as # lgexp. Also read other patient information, as data frame pinfo, and # set recur to the recurrence flag. gexp <- as.matrix(read.table("ass1-gene-exp.txt",head=TRUE)) lgexp <- log(gexp) pinfo <- read.table("ass1-patients.txt",head=TRUE) recur <- pinfo$recur # PART 1. # Find principal components of the gene expression data (logged) without # (ie, on the covariance matrix). pc.log <- prcomp(lgexp) # Do least squares regression and logistic regression of recur on first # five components of pc.log. cat("\nLEAST SQUARES AND LOGISTIC FITS\n") keep <- 1:5 ls.fit <- lm(recur~pc.log$x[,keep]) print(summary(ls.fit)) logistic.fit <- glm(recur~pc.log$x[,keep],family=binomial) print(summary(logistic.fit)) # Look at the regression coefficients and predictions for least squares and # logistic regression. cat("\nCOMPARING LEAST SQUARES AND LOGISTIC FITS\n\n") print(cbind( "least squares"=coef(ls.fit), "logistic"=coef(logistic.fit), "ratio"=coef(ls.fit)/coef(logistic.fit) )) pdf("a2-p1a.pdf") plot(predict(ls.fit,type="response"),predict(logistic.fit,type="response")) dev.off() # Do least squares regression and logistic regression on subsets of the first # five principal components. cat("\n\nFITTING WITH SUBSETS OF COMPONENTS\n") print(summary(lm(recur~pc.log$x[,c(1,2,3,5)]))) print(summary(glm(recur~pc.log$x[,c(1,2,3,5)],family=binomial))) print(summary(lm(recur~pc.log$x[,c(1,3,5)]))) print(summary(glm(recur~pc.log$x[,c(1,3,5)],family=binomial))) # PART 2. # Find p-values from two sample t tests. p.values <- numeric(ncol(lgexp)) for (i in 1:ncol(lgexp)) { p.values[i] <- t.test(lgexp[recur==0,i],lgexp[recur==1,i],var.equal=FALSE) $ p.value } # See how many rejections we get at various levels with no correction and # with the Bonferroni correction. levels <- c(0.1,0.05,0.02,0.01,0.005) t.test.count <- numeric(length(levels)) bonferroni.count <- numeric(length(levels)) for (i in 1:length(levels)) { t.test.count[i] <- sum(p.values<=levels[i]) bonferroni.count[i] <- sum(p.values*length(p.values)<=levels[i]) } cat("Rejection counts from t test without and with Bonferroni correction\n\n") print(cbind ( level=levels, "t-test rej"=t.test.count, "Bonferroni rej"=bonferroni.count )) cat ("\nMinimum Bonferroni adjusted p-value:", min(p.values*length(p.values)),"\n\n") # PART 3. # Estimate number of true null hypotheses (ie, number of genes unrelated to # recurrence of cancer). m <- length(p.values) m0.est <- m * mean(p.values>0.7) / 0.3 cat("Estimate of m0:",m0.est,"\n\n") # Look at numbers of hypotheses rejected at various bounds or estiamtes of # False Discovery Rate. sorted.p.values <- sort(p.values) pdf("a2-p3a.pdf") par(mfrow=c(2,1)) hist(p.values,breaks=seq(0,1,by=0.01),prob=TRUE) hist(p.values[p.values<0.1],breaks=seq(0,0.1,by=0.001)) # prob option won't work dev.off() pdf("a2-p3b.pdf") plot(1:m,m*sorted.p.values/(1:m), type="l",ylim=c(0,1),xlab="",ylab="",yaxs="i",lwd=2) lines(1:m,m0.est*sorted.p.values/(1:m),lwd=2) abline(h=levels) title("Bound and estimate for FDR versus number of rejections") dev.off() FDR.bound.count <- numeric(length(levels)) FDR.est.count <- numeric(length(levels)) for (i in 1:length(levels)) { FDR.bound.count[i] <- max(0,(1:m)[sorted.p.values*m/(1:m)<=levels[i]]) FDR.est.count[i] <- max(0,(1:m)[sorted.p.values*m0.est/(1:m)<=levels[i]]) } cat("FDR results\n\n") print(cbind ( FDR=levels, "bound count"=FDR.bound.count, "est count"=FDR.est.count )) # PART 4. # Find normalized log gene expression data, with means for patients subtracted. lgexp.means <- rowMeans(lgexp) norm.lgexp <- lgexp - lgexp.means # Try logistic models with PCs from normalized data. norm.pc.log <- prcomp(norm.lgexp) print(summary(glm(recur~norm.pc.log$x[,keep],family=binomial))) print(summary(glm(recur~norm.pc.log$x[,c(1,3,4)],family=binomial))) # See how many genes are significant with and without Bonferroni correction. norm.p.values <- numeric(ncol(norm.lgexp)) for (i in 1:ncol(norm.lgexp)) { norm.p.values[i] <- t.test(norm.lgexp[recur==0,i],norm.lgexp[recur==1,i], var.equal=FALSE) $ p.value } norm.t.test.count <- numeric(length(levels)) norm.bonferroni.count <- numeric(length(levels)) for (i in 1:length(levels)) { norm.t.test.count[i] <- sum(norm.p.values<=levels[i]) norm.bonferroni.count[i] <- sum(norm.p.values*length(norm.p.values)<=levels[i]) } cat( "Rejection counts without and with Bonferroni correction, normalized data\n\n" ) print(cbind ( level=levels, "t-test rej"=norm.t.test.count, "Bonferroni rej"=norm.bonferroni.count ) ) # Look at False Discovery Rates with normalized data. norm.m0.est <- m * mean(norm.p.values>0.7) / 0.3 cat("Estimate of m0 with normalized data:",norm.m0.est,"\n\n") sorted.norm.p.values <- sort(norm.p.values) pdf("a2-p4a.pdf") par(mfrow=c(2,1)) hist(norm.p.values,breaks=seq(0,1,by=0.01),prob=TRUE) hist(norm.p.values[norm.p.values<0.1],breaks=seq(0,0.1,by=0.001)) dev.off() pdf("a2-p4b.pdf") plot(1:m,m*sorted.norm.p.values/(1:m), type="l",ylim=c(0,1),xlab="",ylab="",yaxs="i",lwd=2) lines(1:m,norm.m0.est*sorted.norm.p.values/(1:m),lwd=2) abline(h=levels) title("Bound and estimate for FDR versus number of rejections, normalized data") dev.off() norm.FDR.bound.count <- numeric(length(levels)) norm.FDR.est.count <- numeric(length(levels)) for (i in 1:length(levels)) { norm.FDR.bound.count[i] <- max(0,(1:m)[sorted.norm.p.values*m/(1:m)<=levels[i]]) norm.FDR.est.count[i] <- max(0,(1:m)[sorted.norm.p.values*norm.m0.est/(1:m)<=levels[i]]) } cat("FDR results, normalized data\n\n") print(cbind ( FDR=levels, "bound count"=norm.FDR.bound.count, "est count"=norm.FDR.est.count )) cat("\nP-value for testing whether patient means relate to recurrence:", t.test(lgexp.means[recur==0],lgexp.means[recur==1],var.equal=FALSE)$p.value, "\n") # PART 5. print(summary(glm(recur~norm.pc.log$x[,keep]+I(norm.pc.log$x[,keep]^2), family=binomial))) print(summary(glm(recur~norm.pc.log$x[,c(1,3,4)] + I(norm.pc.log$x[,keep]^2), family=binomial))) print(summary(glm(recur~norm.pc.log$x[,c(1,3,4)] + I(norm.pc.log$x[,c(3,5)]^2), family=binomial))) print(summary(glm(recur~norm.pc.log$x[,1] + I(norm.pc.log$x[,c(3,5)]^2), family=binomial)))