# STA 414, 2012, ASSIGNMENT 2 SOLUTION, MODIFIED MLP FUNCTIONS.
#
# Radford M. Neal, 2012

# The networks handled by these functions have one layer of hidden units,
# with tanh as the activation function.  The network parameters (weights)
# are in four groups: w1 for the weights on the input to hidden connections,
# used in computing the summed input into a hidden unit, w1_0 for the constants
# added to this sum, w2 for the weights on the hidden to output connections,
# used to compute the sum that is the output of the network, and w2_0 for the
# constant added to this sum.  These network parameters are sometimes stored as
# a single vector, in the order w2_0, w2, w1_0, w1.
#
# The networks are used for predicting a binary class, with the probability
# of class 1 given by 1/(1+exp(-o)), where o is the output of the network.
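# The small sketch below (added here for illustration; it is not used by the
# functions that follow, and the dimensions are made up) shows how a parameter
# vector of length (m+1) + m*(p+1) splits into the four groups above, for a
# hypothetical network with p = 3 inputs and m = 2 hidden units.

if (FALSE)
{
  p <- 3; m <- 2
  W <- (m+1) + m*(p+1)                    # 11 parameters in total
  params <- seq_len(W)                    # placeholder values 1..11

  params[1]                               # w2_0: output unit bias
  params[2:(m+1)]                         # w2:   hidden-to-output weights
  params[(m+2):(2*m+1)]                   # w1_0: hidden unit biases
  matrix(params[(2*m+2):W],p,m)           # w1:   input-to-hidden weights (p x m)
}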
# FIT MLP NETWORK USING SIMPLE GRADIENT ASCENT.  The arguments are a vector
# of binary (0/1) responses for training cases, a matrix of inputs for
# training cases, two learning rates (for parameters on connections to hidden
# units and to the output unit), two penalty magnitudes (for input-hidden and
# hidden-output weights), a TRUE/FALSE flag saying whether learning rates on
# input-hidden connections are modified according to the apparent relevance of
# each input, and the number of iterations of gradient ascent to do.  The
# penalty is lambda1 times half the sum of the squares of the input-to-hidden
# weights (w1) plus lambda2 times half the sum of the squares of the
# hidden-to-output weights (w2).  Biases are not penalized.
#
# These arguments are followed by either an argument, m, giving the number
# of hidden units, or a vector of initial values for the parameters (from
# which m can be deduced).  If no initial parameter values are supplied, values
# are randomly chosen independently from Gaussian distributions with mean zero
# and standard deviation 0.01, except that the bias for the output unit is set
# to the logit of the fraction of training cases that are in class 1.
#
# The result returned is a list with element ll giving the log likelihood
# for each iteration, element p1 giving the matrix of probabilities for
# class 1 for each iteration (rows) and training case (columns), and element
# params giving the values of the network parameters for each iteration (rows).
# Note that these results are for the states after each iteration; the initial
# values are not included.
#
# Note that an initial value can be supplied via the init argument that is
# taken from the params element of the result of a previous call of mlp.train.

mlp.train <- function (y, X, eta1, eta2, lambda1, lambda2, rel.eta, iters,
                       m=NULL, init=NULL)
{
  n <- nrow(X)
  p <- ncol(X)

  # Deduce the number of hidden units from the length of the initial
  # parameter vector, if m was not given explicitly.

  if (is.null(m))
  { m <- as.integer((length(init)-1) / (p+2))
  }

  W <- (m+1) + m*(p+1)   # total number of parameters

  # Set up random initial values if none were supplied, with the output unit
  # bias set to the logit of the fraction of class-1 training cases.

  if (is.null(init))
  { init <- c (log(mean(y)/(1-mean(y))), rnorm(W-1,0,0.01))
  }

  if (length(init)!=W)
  { stop("Initial parameter vector is the wrong length")
  }

  # Learning rates and penalty magnitudes for each parameter, in the order
  # w2_0, w2, w1_0, w1 (biases are not penalized).

  eta <- c (rep(eta2,m+1), rep(eta1,m*(p+1)))
  lambda <- c (0, rep(lambda2,m), rep(0,m), rep(lambda1,m*p))

  ll <- rep(NA,iters)
  params <- matrix(NA,iters,W)
  p1 <- matrix(NA,iters,n)

  current <- init
  fw <- mlp.forward(X,m,current)

  for (iter in 1:iters)
  {
    bk <- mlp.backward(y,X,m,current,fw)
    gr <- mlp.grad(X,m,fw,bk)

    # Optionally scale the learning rates for the input-to-hidden weights
    # according to the apparent relevance of each input.

    this.eta <- eta
    if (rel.eta)
    { s <- (2*m+2):length(gr)
      gr2 <- rowSums(matrix(gr[s],p,m)^2)
      r <- gr2^2 / max(gr2^2)
      this.eta[s] <- this.eta[s] * rep(r,times=m)
    }

    # Take a gradient ascent step on the penalized log likelihood.

    current <- current + this.eta*gr - this.eta*lambda*current

    params[iter,] <- current
    fw <- mlp.forward(X,m,current)
    p1[iter,] <- 1/(1+exp(-fw$o))
    ll[iter] <- mlp.log.likelihood(y,fw$o)
  }

  list (ll=ll, p1=p1, params=params)
}


# FORWARD NETWORK COMPUTATIONS.  Given the matrix of inputs, the number of
# hidden units, and the network parameters, this function returns a list with
# a matrix of hidden unit inputs (s) for all training cases, a corresponding
# matrix of hidden unit values (h), and a vector of output unit values (o).

mlp.forward <- function (X, m, params)
{
  n <- nrow(X)
  p <- ncol(X)

  if (length(params) != (m+1) + m*(p+1))
  { stop("Parameter vector is the wrong length")
  }

  # Extract the parameter groups from the parameter vector.

  w2_0 <- params[1]
  w2 <- params[2:(m+1)]
  w1_0 <- params[(m+2):(2*m+1)]
  w1_0m <- matrix(w1_0,n,m,byrow=T)
  w1 <- matrix(params[(2*m+2):length(params)],p,m)

  s <- X %*% w1 + w1_0m
  h <- tanh(s)
  o <- h %*% w2 + w2_0

  list (s=s, h=h, o=o)
}


# BACKWARD NETWORK COMPUTATIONS.  Given the training targets and inputs, the
# number of hidden units, the network parameters, and the results of the
# forward computation, this function returns a list of vectors and matrices of
# partial derivatives with respect to unit values, for each training case.

mlp.backward <- function (y, X, m, params, fw)
{
  w2 <- params[2:(m+1)]

  p1 <- 1/(1+exp(-fw$o))

  dl.do <- y - p1
  dl.dh <- dl.do %*% w2
  dl.ds <- (1-fw$h^2) * dl.dh

  list (dl.do=dl.do, dl.dh=dl.dh, dl.ds=dl.ds)
}


# COMPUTE GRADIENT OF LOG LIKELIHOOD WITH RESPECT TO PARAMETERS.  Takes
# as arguments the matrix of inputs, the number of hidden units, and the
# results of the forward and backward computations.  Returns a vector of
# partial derivatives of the log likelihood with respect to the parameters.

mlp.grad <- function (X, m, fw, bk)
{
  p <- ncol(X)

  dl.dw2_0 <- sum (bk$dl.do)
  dl.dw2 <- t(fw$h) %*% bk$dl.do

  dl.dw1_0 <- apply (bk$dl.ds, 2, sum)

  dl.dw1 <- matrix(NA,p,m)
  for (j in 1:p)
  { dl.dw1[j,] <- X[,j] %*% bk$dl.ds
  }

  c (dl.dw2_0=dl.dw2_0, dl.dw2=dl.dw2, dl.dw1_0=dl.dw1_0, dl.dw1=dl.dw1)
}


# COMPUTE THE LOG LIKELIHOOD GIVEN THE NETWORK OUTPUTS.

mlp.log.likelihood <- function (y, o)
{
  sum (-log(1+exp(-(2*y-1)*o)))
}


# MAKE PREDICTIONS FOR TEST CASES.  Takes the matrix of test inputs and the
# network parameters as arguments, and returns the vector of probabilities for
# class 1 in the test cases.

mlp.predict <- function (X, params)
{
  p <- ncol(X)
  m <- as.integer((length(params)-1) / (p+2))

  fw <- mlp.forward(X,m,params)

  1 / (1+exp(-fw$o))
}
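# A minimal usage sketch for the functions above (added for illustration; the
# synthetic data, learning rates, penalties, and iteration count below are
# made-up assumptions, not values from the assignment).

if (FALSE)
{
  set.seed(1)
  n <- 100; p <- 4
  X <- matrix(rnorm(n*p),n,p)
  y <- as.numeric(X[,1] + 0.5*X[,2] + rnorm(n,0,0.5) > 0)

  r <- mlp.train (y, X, eta1=0.01, eta2=0.01, lambda1=0.01, lambda2=0.01,
                  rel.eta=FALSE, iters=500, m=5)

  plot (r$ll, type="l")                              # log likelihood by iteration
  p1 <- mlp.predict (X, r$params[nrow(r$params),])   # class-1 probabilities
}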
# TRAIN NETWORK USING EARLY STOPPING.  Takes as arguments the training targets
# and inputs, two learning rates (for parameters to hidden units and to the
# output), two penalty magnitudes (for input-hidden and hidden-output
# weights), a flag saying whether learning rates are adjusted for relevance,
# the indexes of the validation set, the number of iterations of gradient
# ascent to do, the number of hidden units, and a TRUE/FALSE flag saying
# whether or not to produce a plot (default FALSE).  Returns a list containing
# the parameters from the best iteration according to the validation set
# (best.params) and the average validation log probability at the best
# iteration (best.vpr).

mlp.cv <- function (y, X, eta1, eta2, lambda1, lambda2, rel.eta, val.set,
                    iters, m, cv.plot=FALSE)
{
  # Train on the cases not in the validation set.

  est.set <- setdiff(1:length(y),val.set)

  r <- mlp.train (y[est.set], X[est.set,], eta1, eta2, lambda1, lambda2,
                  rel.eta, iters, m)

  # Compute the validation log probability after each iteration.

  vpr <- rep(NA,length(r$ll))
  for (i in 1:length(r$ll))
  { p <- mlp.predict(X[val.set,],r$params[i,])
    vpr[i] <- sum (log(p[y[val.set]==1])) + sum (log(1-p[y[val.set]==0]))
  }

  best <- order(-vpr)[1]

  if (cv.plot)
  { plot (r$ll/length(est.set), type="l", col="blue", xlim=c(0,iters),
          xlab=paste("iteration, best at ",best,
                     "=",round(max(vpr)/length(val.set),4)),
          ylim=range(c(r$ll/length(est.set),vpr/length(val.set))),
          ylab="average log prob (blue:estimation, red:validation)")
    lines (vpr/length(val.set), col="red")
  }

  best.vpr <- vpr[best] / length(val.set)

  cat("Best validation log prob at iteration",best,":",round(best.vpr,4),"\n")

  list (best.params = r$params[best,], best.vpr = best.vpr)
}


# TRAIN AN EARLY STOPPING ENSEMBLE.  Takes as arguments the training targets
# and inputs, two learning rates (for parameters to hidden units and to the
# output), two penalty magnitudes (for input-hidden and hidden-output
# weights), a flag saying whether learning rates are adjusted for relevance,
# the number of ways to split the training set, the number of iterations of
# gradient ascent to do, the number of hidden units, the matrix of test
# inputs, and a TRUE/FALSE flag saying whether or not plots should be produced.
# Returns a list with a vector of predictions for the test cases (probability
# of class 1) in element tst.pred and the average validation log probability
# in element val.pr.

mlp.ensemble <- function (y, X, eta1, eta2, lambda1, lambda2, rel.eta, S,
                          iters, m, Xtst, cv.plot=FALSE)
{
  n <- length(y)

  # Find points that divide the S subsets (first 0, then end of each subset).

  div <- round((n*(0:S))/S)

  # Sum predictions for all divisions.

  pred <- rep(0,nrow(Xtst))
  vpr <- 0

  for (h in 1:S)
  { cv.res <- mlp.cv (y, X, eta1, eta2, lambda1, lambda2, rel.eta,
                      (div[h]+1):div[h+1], iters, m, cv.plot)
    pred <- pred + mlp.predict (Xtst, cv.res$best.params)
    vpr <- vpr + cv.res$best.vpr
  }

  # Return averaged predictions.

  pred <- pred / S
  vpr <- vpr / S

  cat("Average validation log prob :",round(vpr,4),"\n")

  list (tst.pred=pred, val.pr=vpr)
}
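# An illustrative call of mlp.ensemble (not part of the original solution; the
# test inputs, number of splits, and other settings are made-up assumptions,
# reusing the y, X, and settings from the sketch after mlp.predict).

if (FALSE)
{
  Xtst <- matrix(rnorm(50*ncol(X)),50,ncol(X))   # hypothetical test inputs

  ens <- mlp.ensemble (y, X, eta1=0.01, eta2=0.01, lambda1=0.01, lambda2=0.01,
                       rel.eta=FALSE, S=4, iters=500, m=5, Xtst=Xtst)

  ens$tst.pred   # averaged class-1 probabilities for the test cases
  ens$val.pr     # average validation log probability over the S splits
}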
# FIND BEST PENALTY PARAMETERS.  Considers all combinations of values for
# lambda1 and lambda2 given by the arguments try1 and try2.  Returns a list
# with the best lambda1 & lambda2 found, the average validation log probability
# for the best values, the test predictions for the best values, and tables
# val.res and tst.res giving the average log probabilities of validation and
# test cases for all combinations of lambda1 & lambda2 (test results are
# omitted if test responses are not supplied).

mlp.cross.val <- function (y, X, eta1, eta2, try1, try2, rel.eta, S, iters, m,
                           Xtst, ytst=NULL, cv.plot=FALSE)
{
  val.res <- matrix(NA,length(try1),length(try2))
  rownames(val.res) <- paste("lambda1=",try1,sep="")
  colnames(val.res) <- paste("lambda2=",try2,sep="")

  tst.res <- if (is.null(ytst)) NULL else val.res

  best.val.pr <- -Inf

  for (i in 1:length(try1))
  { for (j in 1:length(try2))
    {
      cat("--- Lambda parameters:",try1[i],try2[j],"---\n")

      res <- mlp.ensemble (y, X, eta1, eta2, try1[i], try2[j], rel.eta,
                           S, iters, m, Xtst, cv.plot)

      if (!is.null(tst.res))
      { tst.res[i,j] <- mean (log (ifelse(ytst==1,res$tst.pred,1-res$tst.pred)))
      }

      val.res[i,j] <- res$val.pr

      if (res$val.pr > best.val.pr)
      { best.lambda1 <- try1[i]
        best.lambda2 <- try2[j]
        best.val.pr <- res$val.pr
        best.tst.pred <- res$tst.pred
      }
    }
  }

  list (lambda1 = best.lambda1, lambda2 = best.lambda2,
        best.val.pr = best.val.pr, best.tst.pred = best.tst.pred,
        val.res = val.res, tst.res = tst.res)
}
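# An illustrative call of mlp.cross.val (added for illustration; the penalty
# grid and other settings are made-up assumptions, reusing y, X, and Xtst from
# the sketches above, with ytst left NULL since test responses may not be
# available).

if (FALSE)
{
  try1 <- c(0.001, 0.01, 0.1)     # candidate values for lambda1
  try2 <- c(0.001, 0.01, 0.1)     # candidate values for lambda2

  cv <- mlp.cross.val (y, X, eta1=0.01, eta2=0.01, try1, try2, rel.eta=FALSE,
                       S=4, iters=500, m=5, Xtst=Xtst, ytst=NULL)

  cv$lambda1; cv$lambda2          # best penalty magnitudes found
  cv$val.res                      # validation log prob for all combinations
}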