# STA 437/1005 Fall 2010, Assignment 2.


# Read the gene expression data, storing as matrix gexp, and its log as
# lgexp.  Also read other patient information, as data frame pinfo, and
# set recur to the recurrence flag.

gexp <- as.matrix(read.table("ass1-gene-exp.txt",head=TRUE))
lgexp <- log(gexp)

pinfo <- read.table("ass1-patients.txt",head=TRUE)
recur <- pinfo$recur


# PART 1.

# Find principal components of the gene expression data (logged) without
# (ie, on the covariance matrix).

pc.log <- prcomp(lgexp)

# Do least squares regression and logistic regression of recur on first 
# five components of pc.log.

cat("\nLEAST SQUARES AND LOGISTIC FITS\n")

keep <- 1:5

ls.fit <- lm(recur~pc.log$x[,keep])
print(summary(ls.fit))

logistic.fit <- glm(recur~pc.log$x[,keep],family=binomial)
print(summary(logistic.fit))

# Look at the regression coefficients and predictions for least squares and 
# logistic regression.

cat("\nCOMPARING LEAST SQUARES AND LOGISTIC FITS\n\n")

print(cbind( "least squares"=coef(ls.fit),
             "logistic"=coef(logistic.fit),
             "ratio"=coef(ls.fit)/coef(logistic.fit)
))

pdf("a2-p1a.pdf")
plot(predict(ls.fit,type="response"),predict(logistic.fit,type="response"))
dev.off()

# Do least squares regression and logistic regression on subsets of the first
# five principal components.

cat("\n\nFITTING WITH SUBSETS OF COMPONENTS\n")

print(summary(lm(recur~pc.log$x[,c(1,2,3,5)])))
print(summary(glm(recur~pc.log$x[,c(1,2,3,5)],family=binomial)))

print(summary(lm(recur~pc.log$x[,c(1,3,5)])))
print(summary(glm(recur~pc.log$x[,c(1,3,5)],family=binomial)))


# PART 2.

# Find p-values from two sample t tests.

p.values <- numeric(ncol(lgexp))
for (i in 1:ncol(lgexp))
{ p.values[i] <- 
    t.test(lgexp[recur==0,i],lgexp[recur==1,i],var.equal=FALSE) $ p.value
}

# See how many rejections we get at various levels with no correction and
# with the Bonferroni correction.

levels <- c(0.1,0.05,0.02,0.01,0.005)

t.test.count <- numeric(length(levels))
bonferroni.count <- numeric(length(levels))
for (i in 1:length(levels)) 
{ t.test.count[i] <- sum(p.values<=levels[i])
  bonferroni.count[i] <- sum(p.values*length(p.values)<=levels[i])
}

cat("Rejection counts from t test without and with Bonferroni correction\n\n")
print(cbind ( level=levels, 
              "t-test rej"=t.test.count, 
              "Bonferroni rej"=bonferroni.count ))

cat ("\nMinimum Bonferroni adjusted p-value:",
     min(p.values*length(p.values)),"\n\n")


# PART 3.

# Estimate number of true null hypotheses (ie, number of genes unrelated to
# recurrence of cancer).

m <- length(p.values)
m0.est <- m * mean(p.values>0.7) / 0.3
cat("Estimate of m0:",m0.est,"\n\n")

# Look at numbers of hypotheses rejected at various bounds or estiamtes of
# False Discovery Rate.

sorted.p.values <- sort(p.values)

pdf("a2-p3a.pdf")
par(mfrow=c(2,1))
hist(p.values,breaks=seq(0,1,by=0.01),prob=TRUE)
hist(p.values[p.values<0.1],breaks=seq(0,0.1,by=0.001)) # prob option won't work
dev.off()

pdf("a2-p3b.pdf")
plot(1:m,m*sorted.p.values/(1:m),
     type="l",ylim=c(0,1),xlab="",ylab="",yaxs="i",lwd=2)
lines(1:m,m0.est*sorted.p.values/(1:m),lwd=2)
abline(h=levels)
title("Bound and estimate for FDR versus number of rejections")
dev.off()

FDR.bound.count <- numeric(length(levels))
FDR.est.count <- numeric(length(levels))
for (i in 1:length(levels)) 
{ FDR.bound.count[i] <- max(0,(1:m)[sorted.p.values*m/(1:m)<=levels[i]])
  FDR.est.count[i] <- max(0,(1:m)[sorted.p.values*m0.est/(1:m)<=levels[i]])
}

cat("FDR results\n\n")

print(cbind ( FDR=levels, 
              "bound count"=FDR.bound.count,
              "est count"=FDR.est.count ))


# PART 4.

# Find normalized log gene expression data, with means for patients subtracted.

lgexp.means <- rowMeans(lgexp)
norm.lgexp <- lgexp - lgexp.means

# Try logistic models with PCs from normalized data.

norm.pc.log <- prcomp(norm.lgexp)

print(summary(glm(recur~norm.pc.log$x[,keep],family=binomial)))
print(summary(glm(recur~norm.pc.log$x[,c(1,3,4)],family=binomial)))

# See how many genes are significant with and without Bonferroni correction.

norm.p.values <- numeric(ncol(norm.lgexp))
for (i in 1:ncol(norm.lgexp))
{ norm.p.values[i] <- 
    t.test(norm.lgexp[recur==0,i],norm.lgexp[recur==1,i],
           var.equal=FALSE) $ p.value
}

norm.t.test.count <- numeric(length(levels))
norm.bonferroni.count <- numeric(length(levels))
for (i in 1:length(levels)) 
{ norm.t.test.count[i] <- sum(norm.p.values<=levels[i])
  norm.bonferroni.count[i] <- 
     sum(norm.p.values*length(norm.p.values)<=levels[i])
}

cat(
"Rejection counts without and with Bonferroni correction, normalized data\n\n"
)

print(cbind ( level=levels, 
              "t-test rej"=norm.t.test.count, 
              "Bonferroni rej"=norm.bonferroni.count )
)

# Look at False Discovery Rates with normalized data.

norm.m0.est <- m * mean(norm.p.values>0.7) / 0.3
cat("Estimate of m0 with normalized data:",norm.m0.est,"\n\n")

sorted.norm.p.values <- sort(norm.p.values)

pdf("a2-p4a.pdf")
par(mfrow=c(2,1))
hist(norm.p.values,breaks=seq(0,1,by=0.01),prob=TRUE)
hist(norm.p.values[norm.p.values<0.1],breaks=seq(0,0.1,by=0.001))
dev.off()

pdf("a2-p4b.pdf")
plot(1:m,m*sorted.norm.p.values/(1:m),
     type="l",ylim=c(0,1),xlab="",ylab="",yaxs="i",lwd=2)
lines(1:m,norm.m0.est*sorted.norm.p.values/(1:m),lwd=2)
abline(h=levels)
title("Bound and estimate for FDR versus number of rejections, normalized data")
dev.off()

norm.FDR.bound.count <- numeric(length(levels))
norm.FDR.est.count <- numeric(length(levels))
for (i in 1:length(levels)) 
{ norm.FDR.bound.count[i] <- 
     max(0,(1:m)[sorted.norm.p.values*m/(1:m)<=levels[i]])
  norm.FDR.est.count[i] <- 
     max(0,(1:m)[sorted.norm.p.values*norm.m0.est/(1:m)<=levels[i]])
}

cat("FDR results, normalized data\n\n")

print(cbind ( FDR=levels, 
              "bound count"=norm.FDR.bound.count,
              "est count"=norm.FDR.est.count ))

cat("\nP-value for testing whether patient means relate to recurrence:",
    t.test(lgexp.means[recur==0],lgexp.means[recur==1],var.equal=FALSE)$p.value,
    "\n")


# PART 5.

print(summary(glm(recur~norm.pc.log$x[,keep]+I(norm.pc.log$x[,keep]^2),
                  family=binomial)))

print(summary(glm(recur~norm.pc.log$x[,c(1,3,4)] + I(norm.pc.log$x[,keep]^2),
                  family=binomial)))

print(summary(glm(recur~norm.pc.log$x[,c(1,3,4)] + I(norm.pc.log$x[,c(3,5)]^2),
                  family=binomial)))

print(summary(glm(recur~norm.pc.log$x[,1] + I(norm.pc.log$x[,c(3,5)]^2),
                  family=binomial)))