# STA 414, SPRING 2011, ASSIGNMENT 2 SOLUTION, MODIFIED MLP FUNCTIONS.
#
# Radford M. Neal, 2011
#
# The networks handled by these functions have one layer of hidden units,
# with tanh as the activation function. The network parameters (weights)
# are in four groups: w1 for weights on the input to hidden connections,
# used in computing the summed input into a hidden unit, w1_0 for the constants
# added to this sum, w2 for the weights on the hidden to output connections,
# used to compute the sum that is the output of the network, and w2_0 for the
# constant added to this sum. These network parameters are sometimes stored as
# a single vector, in the order w2_0, w2, w1_0, w1.
#
# The networks are used for predicting a binary class, with the probability
# of class 1 given by 1/(1+exp(-o)), where o is the output of the network.


# FIT MLP NETWORK USING SIMPLE GRADIENT ASCENT. The arguments are a vector
# of 0/1 responses for training cases, a matrix of inputs for training cases,
# two learning rates (for parameters on connections to hidden units and the
# output unit), two penalty magnitudes (for input-hidden and hidden-output
# weights), and the number of iterations of gradient ascent to do.
#
# These arguments are followed by either an argument, M, giving the number
# of hidden units, or a vector of initial values for parameters (from which M
# can be deduced). If no initial parameter values are supplied, values are
# randomly chosen independently from Gaussian distributions with mean zero and
# standard deviation 0.01, except that the bias for the output unit is set to
# the logit of the fraction of training cases that are in class 1.
#
# The result returned is a list with element ll giving the log likelihood
# for each iteration, element p1 giving the matrix of probabilities for
# class 1 for each iteration (rows) and training case (columns), and element
# params giving the values of the network parameters for each iteration (rows).
# Note that these results are for states after each iteration; the initial
# values are not included.
#
# Note that an initial value can be supplied via the init argument that is
# from the params element of the result of a previous call of mlp.train.

mlp.train = function (y, X, eta1, eta2, lambda1, lambda2, iters,
                      M=NULL, init=NULL)
{
  n = nrow(X)
  p = ncol(X)

  if (is.null(M))
  { M = as.integer((length(init)-1) / (p+2))
  }

  W = (M+1) + M*(p+1)

  if (is.null(init))
  { init = c (log(mean(y)/(1-mean(y))), rnorm(W-1,0,0.01))
  }

  if (length(init)!=W)
  { stop("Initial parameter vector is the wrong length")
  }

  # Per-parameter learning rates and penalty magnitudes, in the same order
  # as the parameter vector (w2_0, w2, w1_0, w1).  The constant parts
  # (w2_0 and w1_0) are not penalized.

  eta = c (rep(eta2,M+1), rep(eta1,M*(p+1)))
  lambda = c (0, rep(lambda2,M), rep(0,M), rep(lambda1,M*p))

  ll = rep(NA,iters)
  params = matrix(NA,iters,W)
  p1 = matrix(NA,iters,n)

  current = init
  fw = mlp.forward(X,M,current)

  for (iter in 1:iters)
  {
    bk = mlp.backward(y,X,M,current,fw)
    gr = mlp.grad(X,M,fw,bk)

    # Gradient ascent step, with the penalty contributing -eta*lambda*current.

    current = current + eta*gr - eta*lambda*current
    params[iter,] = current

    fw = mlp.forward(X,M,current)
    p1[iter,] = 1/(1+exp(-fw$o))
    ll[iter] = mlp.log.likelihood(y,fw$o)
  }

  list (ll=ll, p1=p1, params=params)
}
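
# EXAMPLE CALL OF mlp.train (illustrative sketch, not part of the original
# solution). The function below is an addition for illustration only: its
# name (mlp.train.demo), the synthetic data, and the learning rates, penalty
# magnitudes, iteration count, and number of hidden units are all arbitrary
# assumptions, chosen just to show how mlp.train is called and what it
# returns. Source this whole file first, then call mlp.train.demo().

mlp.train.demo = function ()
{
  set.seed(1)

  n = 100                                 # number of synthetic training cases
  X = matrix (rnorm(2*n), n, 2)           # two input variables
  y = as.numeric (X[,1] + X[,2] + rnorm(n,0,0.5) > 0)   # 0/1 responses

  r = mlp.train (y, X, eta1=0.01, eta2=0.01, lambda1=0.001, lambda2=0.001,
                 iters=500, M=5)

  plot (r$ll, type="l", xlab="iteration", ylab="log likelihood")

  r$params[nrow(r$params),]               # parameters after the last iteration
}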

# FORWARD NETWORK COMPUTATIONS. Given the matrix of inputs, the number of
# hidden units, and the network parameters, this function returns a list with
# a matrix of hidden unit inputs (s) for all training cases, a corresponding
# matrix of hidden unit values (h), and a vector of output unit values (o).

mlp.forward = function (X, M, params)
{
  n = nrow(X)
  p = ncol(X)

  if (length(params) != (M+1) + M*(p+1))
  { stop("Parameter vector is the wrong length")
  }

  # Unpack the parameter vector (order w2_0, w2, w1_0, w1).

  w2_0 = params[1]
  w2 = params[2:(M+1)]
  w1_0 = params[(M+2):(2*M+1)]
  w1_0m = matrix(w1_0,n,M,byrow=T)
  w1 = matrix(params[(2*M+2):length(params)],p,M)

  s = X %*% w1 + w1_0m
  h = tanh(s)
  o = h %*% w2 + w2_0

  list(s=s, h=h, o=o)
}


# BACKWARD NETWORK COMPUTATIONS. Given the training targets and inputs, the
# number of hidden units, the network parameters, and the results of the
# forward computation, this function returns a list of vectors and matrices of
# partial derivatives with respect to unit values, for each training case.

mlp.backward = function (y, X, M, params, fw)
{
  w2 = params[2:(M+1)]

  p1 = 1/(1+exp(-fw$o))

  dl.do = y-p1
  dl.dh = dl.do %*% w2
  dl.ds = (1-fw$h^2) * dl.dh

  list (dl.do=dl.do, dl.dh=dl.dh, dl.ds=dl.ds)
}


# COMPUTE GRADIENT OF LOG LIKELIHOOD WITH RESPECT TO PARAMETERS. Takes
# as arguments the matrix of inputs, the number of hidden units, and the
# results of the forward and backward computations. Returns a vector of
# partial derivatives of the log likelihood with respect to the parameters.

mlp.grad = function (X, M, fw, bk)
{
  p = ncol(X)

  dl.dw2_0 = sum (bk$dl.do)
  dl.dw2 = t(fw$h) %*% bk$dl.do

  dl.dw1_0 = apply (bk$dl.ds, 2, sum)

  dl.dw1 = matrix(NA,p,M)
  for (j in 1:p)
  { dl.dw1[j,] = X[,j] %*% bk$dl.ds
  }

  c (dl.dw2_0=dl.dw2_0, dl.dw2=dl.dw2, dl.dw1_0=dl.dw1_0, dl.dw1=dl.dw1)
}


# COMPUTE THE LOG LIKELIHOOD GIVEN THE NETWORK OUTPUTS.

mlp.log.likelihood = function (y, o)
{
  sum(-log(1+exp(-(2*y-1)*o)))
}


# MAKE PREDICTIONS FOR TEST CASES. Takes the matrix of test inputs and the
# network parameters as inputs, and returns the vector of probabilities for
# class 1 in the test cases.

mlp.predict = function (X, params)
{
  p = ncol(X)
  M = as.integer( (length(params)-1) / (p+2) )

  fw = mlp.forward(X,M,params)

  1 / (1+exp(-fw$o))
}


# TRAIN NETWORK USING EARLY STOPPING. Takes as arguments the training targets
# and inputs, two learning rates (for parameters to hidden units and to the
# output), two penalty magnitudes (for input-hidden and hidden-output
# weights), the indexes of the validation set, the number of iterations of
# gradient ascent to do, the number of hidden units, and a TRUE/FALSE flag
# saying whether or not to produce a plot (default FALSE). The parameters from
# the iteration with the highest validation log probability are returned.

mlp.cv = function (y, X, eta1, eta2, lambda1, lambda2, val.set, iters, M,
                   cv.plot=FALSE)
{
  est.set = setdiff(1:length(y),val.set)

  r = mlp.train(y[est.set], X[est.set,], eta1, eta2, lambda1, lambda2,
                iters, M)

  # Compute the validation log probability after each iteration.

  vpr = rep(NA,length(r$ll))

  for (i in 1:length(r$ll))
  { p = mlp.predict(X[val.set,],r$params[i,])
    vpr[i] = sum (log(p[y[val.set]==1])) + sum (log(1-p[y[val.set]==0]))
  }

  # Find the iteration with the highest validation log probability.

  best = order(-vpr)[1]

  if (cv.plot)
  { plot (r$ll/length(est.set), type="l", col="blue", xlim=c(0,iters),
          xlab=paste("iteration, best at ",best,
                     "=",round(max(vpr)/length(val.set),4)),
          ylim=range(c(r$ll/length(est.set),vpr/length(val.set))),
          ylab="average log prob (blue:estimation, red:validation)")
    lines (vpr/length(val.set), col="red")
  }

  cat("Best validation log prob at iteration",best,":",
      round(vpr[best]/length(val.set),3),"\n")

  r$params[best,]
}
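
# EXAMPLE OF EARLY STOPPING AND PREDICTION (illustrative sketch, not part of
# the original solution). As with mlp.train.demo above, the name mlp.cv.demo,
# the synthetic data, and all settings below are assumptions made only to show
# how mlp.cv and mlp.predict fit together: train with a held-out validation
# set, take the parameters from the best iteration, then compute class 1
# probabilities for new cases. Source this whole file first, then call
# mlp.cv.demo().

mlp.cv.demo = function ()
{
  set.seed(2)

  n = 200
  X = matrix (rnorm(2*n), n, 2)
  y = as.numeric (X[,1] - X[,2] + rnorm(n,0,0.5) > 0)

  val.set = 151:200                       # last 50 cases held out for validation

  params = mlp.cv (y, X, eta1=0.01, eta2=0.01, lambda1=0, lambda2=0,
                   val.set=val.set, iters=500, M=5, cv.plot=TRUE)

  X.test = matrix (rnorm(20), 10, 2)      # ten new test cases
  mlp.predict (X.test, params)            # probabilities of class 1
}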