# STA 414, Spring 2014, Assignment #3, Neural network functions

# The networks handled by these functions have one layer of hidden
# units, with tanh as the activation function.  The networks are used
# for predicting a binary response, with a logistic link function.
#
# The network parameters (weights) are in four groups: w1 for weights
# on the input to hidden connections, used in computing the summed
# input into a hidden unit, w1_0 for the constants added to this sum,
# w2 for the weights on the hidden to output connections, used to
# compute the sum that is the output of the network, and w2_0 for the
# constant added to this sum.
#
# The network parameters are sometimes stored as a list with named
# elements w1, w1_0, w2, and w2_0, and sometimes as a single vector
# containing all these parameters, in that order.  Conversion between
# these representations is done with R's unlist and relist functions.


# SKELETON FOR LIST OF NETWORK PARAMETERS.  Returns a skeleton suitable for
# use with "relist" for converting a vector of parameters for a network with
# p inputs and m hidden units to a list.

mlp_skeleton <- function (p, m)
{
  list (w1=matrix(0,p,m), w1_0=rep(0,m), w2=rep(0,m), w2_0=0)
}


# FIT MLP NETWORK USING SIMPLE GRADIENT DESCENT.  The arguments are a vector
# of binary (0/1) responses for training cases, a matrix of inputs for
# training cases, the number of hidden units, the learning rate, the number
# of iterations of gradient descent to do, and two penalty magnitudes (for
# input-hidden and hidden-output weights), which default to zero.  The
# penalty is the penalty magnitude times the sum of squares of weights in
# the group.
#
# Note that the input variables should be scaled to have close to mean zero
# and standard deviation one, so that a single learning rate is roughly
# appropriate for all weights.
#
# The initial network weights are randomly chosen independently from Gaussian
# distributions with mean zero and standard deviation 0.01, except that w2_0
# is set to the mean response in the training cases.
#
# The result returned is a list with vector element E giving the average
# minus log likelihood per training case for each iteration and matrix
# element W giving the network parameters for each iteration (rows) in
# vector (not list) form.  Note that these results are for states after
# each iteration, with initial values not included.

mlp_train <- function (y, X, m, eta, iters, lambda1=0, lambda2=0)
{
  n <- nrow(X)
  p <- ncol(X)
  skel <- mlp_skeleton(p,m)

  E <- rep(NA,iters)
  W <- matrix(NA,iters,length(unlist(skel)))

  # Initial parameters: small random values, except w2_0 (the last element
  # of the parameter vector), which is set to the mean response.

  w <- c (rnorm(length(unlist(skel))-1,0,0.01), mean(y))
  wl <- relist(w,skel)
  fw <- mlp_forward(X,wl)

  # Vector of penalty magnitudes matching the parameter vector, with zero
  # penalty on the constants w1_0 and w2_0.

  lambda <- skel
  lambda$w1[,] <- lambda1
  lambda$w2[] <- lambda2
  lambda <- unlist(lambda)

  for (iter in 1:iters)
  {
    bk <- mlp_backward(y,X,wl,fw)
    gr <- mlp_grad(X,skel,fw,bk)

    # Gradient descent step on minus log likelihood plus quadratic penalty.

    w <- w - eta*unlist(gr) - eta*2*lambda*w
    wl <- relist(w,skel)
    W[iter,] <- w

    fw <- mlp_forward(X,wl)
    E[iter] <- mean(log(1+exp(-fw$o*(2*y-1))))
  }

  list (E=E, W=W)
}


# FORWARD NETWORK COMPUTATIONS.  Given the matrix of inputs and the network
# parameters in list form, this function returns a list with a matrix of
# hidden unit inputs (s) for all training cases, a matrix of hidden unit
# values (h) for all training cases, and a matrix (with one column) of
# output unit values (o) for all training cases.

mlp_forward <- function (X, wl)
{
  n <- nrow(X)
  p <- ncol(X)

  s <- matrix(wl$w1_0,n,length(wl$w1_0),byrow=TRUE) + X %*% wl$w1
  h <- tanh(s)
  o <- h %*% wl$w2 + wl$w2_0

  list (s=s, h=h, o=o)
}
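
# PREDICTED PROBABILITIES FROM A TRAINED NETWORK.  Illustrative sketch, not
# part of the original assignment code; the name mlp_predict and its
# arguments are assumptions.  Since the network uses a logistic link, the
# output o is a logit, so the predicted probability that the response is 1
# is 1/(1+exp(-o)).  The parameters are passed in vector form, for example
# a row of the W matrix returned by mlp_train.

mlp_predict <- function (X, w, skel)
{
  o <- mlp_forward(X, relist(w,skel))$o
  1 / (1+exp(-o))
}
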
# BACKWARD NETWORK COMPUTATIONS.  Given the training responses, training
# inputs, network parameters in list form, and results of the forward
# computation, this function returns a list of matrices of partial
# derivatives of minus the log likelihood with respect to unit values,
# for each training case.

mlp_backward <- function (y, X, wl, fw)
{
  dE_do <- 1/(1+exp(-fw$o)) - y      # derivatives w.r.t. output unit values
  dE_dh <- dE_do %*% wl$w2           # derivatives w.r.t. hidden unit values
  dE_ds <- (1-fw$h^2) * dE_dh        # derivatives w.r.t. hidden unit inputs

  list (dE_do=dE_do, dE_dh=dE_dh, dE_ds=dE_ds)
}


# COMPUTE GRADIENT OF MINUS LOG LIKELIHOOD WITH RESPECT TO PARAMETERS.  Takes
# as arguments the matrix of inputs, the skeleton for the parameter list, and
# the results of the forward and backward computations.  Returns the gradient
# of minus the log likelihood (summed over training cases) in list form.

mlp_grad <- function (X, skel, fw, bk)
{
  p <- ncol(X)
  gr <- skel

  gr$w2_0 <- sum (bk$dE_do)
  gr$w2 <- t(fw$h) %*% bk$dE_do
  gr$w1_0 <- colSums(bk$dE_ds)

  for (j in 1:p)
  { gr$w1[j,] <- X[,j] %*% bk$dE_ds
  }

  gr
}


# CHECK THAT GRADIENTS ARE CORRECT.  Compares the gradient from mlp_grad with
# a finite difference approximation, obtained by perturbing each parameter by
# plus and minus epsilon.  Returns a list with the finite difference gradient,
# the gradient from mlp_grad, and their difference (which should be close to
# zero).

mlp_check_grad <- function (y, X, skel, w, epsilon=0.001)
{
  grd <- unlist(skel)   # vector of the right length for the approximation

  for (i in 1:length(w))
  {
    w_minus <- w
    w_minus[i] <- w_minus[i] - epsilon
    w_plus <- w
    w_plus[i] <- w_plus[i] + epsilon

    E1 <- sum ( log(1 + exp(-(2*y-1)*mlp_forward(X,relist(w_minus,skel))$o)) )
    E2 <- sum ( log(1 + exp(-(2*y-1)*mlp_forward(X,relist(w_plus,skel))$o)) )

    grd[i] <- (E2-E1) / (2*epsilon)
  }

  wl <- relist(w,skel)
  fw <- mlp_forward (X, wl)
  bk <- mlp_backward (y, X, wl, fw)
  gr <- mlp_grad (X, skel, fw, bk)

  list (grd=unlist(grd), gr=unlist(gr), diff=unlist(gr)-unlist(grd))
}
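
# EXAMPLE USAGE.  Illustrative sketch, not part of the original assignment
# code; the synthetic data and settings below are assumptions chosen only to
# exercise the functions.  It trains a small network on a binary response
# and checks the analytic gradient against finite differences at a random
# parameter vector.  The lines are left commented out so that sourcing this
# file defines only the functions.
#
#   set.seed(1)
#   n <- 100; p <- 2; m <- 4
#   X <- matrix(rnorm(n*p), n, p)
#   y <- as.numeric (X[,1] + 0.5*X[,2]^2 + rnorm(n,0,0.3) > 0)
#
#   r <- mlp_train (y, X, m, eta=0.02, iters=1000, lambda1=0.01, lambda2=0.01)
#   plot (r$E, type="l")              # average minus log likelihood by iteration
#
#   skel <- mlp_skeleton(p,m)
#   w <- rnorm (length(unlist(skel)), 0, 0.1)
#   mlp_check_grad (y, X, skel, w)    # diff element should be near zero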