# NEURAL NETWORK REGRESSION MODEL WITH THREE HIDDEN LAYERS AND MULTIPLE OUTPUTS.
#
# The networks handled by these functions have three layers of hidden
# units, with tanh as the activation function for the first and third
# hidden layers, and the identity as the activation function for the
# second (middle) hidden layer.  The networks are used for predicting
# a vector of real-valued responses, modelled as having independent Gaussian
# noise with the same variance for all responses.  Hence maximum likelihood
# estimates can be found by minimizing the total squared error.  These
# networks can be used as auto-encoders with the middle layer as a bottleneck.
#
# The network parameters (weights) are divided into the following groups:
#
#   w1    inputs to hidden layer 1
#   w1_0  biases (intercepts) for hidden layer 1
#   w2    hidden layer 1 to hidden layer 2
#   w2_0  biases for hidden layer 2
#   w3    hidden layer 2 to hidden layer 3
#   w3_0  biases for hidden layer 3
#   w4    hidden layer 3 to outputs
#   w4_0  biases for outputs
#
# The network parameters are sometimes stored as a list with named
# elements w1, w1_0, w2, etc. and sometimes as a single vector
# containing all these parameters, in the order above.  Conversion between
# these representations is done with R's unlist and relist functions.


# SKELETON FOR LIST OF NETWORK PARAMETERS.  Returns a skeleton suitable for
# use with "relist" for converting a vector of parameters for a network with
# p inputs, q outputs, and hidden layers with numbers of units given by m.

mlp_skeleton <- function (p, m, q)
{
  list (w1 = matrix(0,p,m[1]),    w1_0 = rep(0,m[1]),
        w2 = matrix(0,m[1],m[2]), w2_0 = rep(0,m[2]),
        w3 = matrix(0,m[2],m[3]), w3_0 = rep(0,m[3]),
        w4 = matrix(0,m[3],q),    w4_0 = rep(0,q))
}


# FIT MLP NETWORK USING SIMPLE GRADIENT DESCENT.  The arguments are a matrix
# of real responses for training cases, a matrix of inputs for training cases,
# a vector of the numbers of units in each of the three hidden layers, the
# learning rate, and the number of iterations of gradient descent to do.
#
# Note that the input variables and response variables should be scaled to
# have close to mean zero and standard deviation one, so that a single learning
# rate is roughly appropriate for all weights.
#
# The initial network weights are randomly chosen independently from Gaussian
# distributions with mean zero and standard deviation 0.1, except that w4_0
# is set to the means of the responses in the training cases (so that the
# initial outputs are close to the mean response).
#
# The result returned is a list with element E giving the average squared error
# for each iteration, and element W giving the network parameters for each
# iteration (rows) in vector form.  Note that these results are for states
# after each iteration, with initial values not included.

mlp_train <- function (Y, X, m, eta, iters)
{
  n <- nrow(X)
  p <- ncol(X)
  q <- ncol(Y)

  skel <- mlp_skeleton(p,m,q)

  E <- rep(NA,iters)
  W <- matrix(NA,iters,length(unlist(skel)))

  # Random initial weights, with the output biases (the last q elements of the
  # parameter vector) set to the column means of the responses.

  w <- c (rnorm(length(unlist(skel))-q,0,0.1), colMeans(Y))
  wl <- relist(w,skel)
  fw <- mlp_forward(X,wl)

  for (iter in 1:iters)
  {
    # Compute the gradient by backpropagation, then take a gradient descent step.

    bk <- mlp_backward(Y,X,wl,fw)
    gr <- mlp_grad(X,skel,fw,bk)

    w <- w - eta*unlist(gr)
    wl <- relist(w,skel)
    W[iter,] <- w

    # Record the average squared error at the new parameter values.

    fw <- mlp_forward(X,wl)
    E[iter] <- mean((fw$o-Y)^2)
  }

  list (E=E, W=W)
}
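
# EXAMPLE OF TRAINING (ILLUSTRATIVE SKETCH ONLY).  The synthetic data, network
# sizes, learning rate, and iteration count below are arbitrary choices, not
# part of the model code.  The example is left commented out because mlp_train
# calls mlp_forward, mlp_backward, and mlp_grad, which are defined further down,
# so it should only be run after this whole file has been sourced.
#
#   set.seed(1)
#   n <- 100
#   X <- matrix (rnorm(2*n), n, 2)    # two roughly standardized inputs
#   Y <- cbind (sin(X[,1]) + 0.1*rnorm(n), X[,1]*X[,2] + 0.1*rnorm(n))
#   Y <- scale(Y)                     # scale responses as recommended above
#
#   fit <- mlp_train (Y, X, m=c(8,2,8), eta=0.001, iters=2000)
#
#   plot (fit$E, type="l")            # average squared error over iterations
#   fit$E[length(fit$E)]              # final average squared error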

# FORWARD NETWORK COMPUTATIONS.  Given the matrix of inputs and the network
# parameters in list form, this function returns a list with matrices of hidden
# unit inputs (s1, s2, and s3) for all training cases, matrices of hidden unit
# values (h1, h2, and h3) for all training cases, and a matrix of output unit
# values (o) for all training cases.  (Note that since the activation function
# for the middle hidden layer is the identity, s2 and h2 will be the same.)

mlp_forward <- function (X, wl)
{
  n <- nrow(X)
  p <- ncol(X)

  # The matrix(...,byrow=TRUE) calls replicate each bias vector across the n
  # rows, so the biases are added for every training case.

  s1 <- matrix(wl$w1_0,n,length(wl$w1_0),byrow=TRUE) + X %*% wl$w1
  h1 <- tanh(s1)

  s2 <- matrix(wl$w2_0,n,length(wl$w2_0),byrow=TRUE) + h1 %*% wl$w2
  h2 <- s2

  s3 <- matrix(wl$w3_0,n,length(wl$w3_0),byrow=TRUE) + h2 %*% wl$w3
  h3 <- tanh(s3)

  o <- matrix(wl$w4_0,n,length(wl$w4_0),byrow=TRUE) + h3 %*% wl$w4

  list (s1=s1, h1=h1, s2=s2, h2=h2, s3=s3, h3=h3, o=o)
}


# BACKWARD NETWORK COMPUTATIONS.  Given the training responses, training inputs,
# network parameters in list form, and results of the forward computation, this
# function returns a list of matrices of partial derivatives of the total squared
# error with respect to unit values, for each training case.

mlp_backward <- function (Y, X, wl, fw)
{
  dE_do <- 2*(fw$o-Y)

  dE_dh3 <- dE_do %*% t(wl$w4)
  dE_ds3 <- (1-fw$h3^2) * dE_dh3      # tanh'(s) = 1 - tanh(s)^2

  dE_dh2 <- dE_ds3 %*% t(wl$w3)
  dE_ds2 <- dE_dh2                    # identity activation in middle layer

  dE_dh1 <- dE_ds2 %*% t(wl$w2)
  dE_ds1 <- (1-fw$h1^2) * dE_dh1

  list (dE_do=dE_do, dE_dh3=dE_dh3, dE_ds3=dE_ds3, dE_dh2=dE_dh2,
        dE_ds2=dE_ds2, dE_dh1=dE_dh1, dE_ds1=dE_ds1)
}


# COMPUTE GRADIENT OF SQUARED ERROR WITH RESPECT TO PARAMETERS.  Takes
# as arguments the matrix of inputs, the skeleton for the parameter list, and
# the results of the forward and backward computations.  Returns the gradient
# of the total squared error in list form.  (Minimizing this squared error is
# equivalent to maximizing the log likelihood under the Gaussian noise model.)

mlp_grad <- function (X, skel, fw, bk)
{
  gr <- skel   # the skeleton provides matrices/vectors of the right shapes

  gr$w4_0 <- colSums (bk$dE_do)
  for (j in 1:ncol(fw$h3))
  { gr$w4[j,] <- fw$h3[,j] %*% bk$dE_do
  }

  gr$w3_0 <- colSums (bk$dE_ds3)
  for (j in 1:ncol(fw$h2))
  { gr$w3[j,] <- fw$h2[,j] %*% bk$dE_ds3
  }

  gr$w2_0 <- colSums (bk$dE_ds2)
  for (j in 1:ncol(fw$h1))
  { gr$w2[j,] <- fw$h1[,j] %*% bk$dE_ds2
  }

  gr$w1_0 <- colSums (bk$dE_ds1)
  for (j in 1:ncol(X))
  { gr$w1[j,] <- X[,j] %*% bk$dE_ds1
  }

  gr
}


# CHECK THAT GRADIENTS ARE CORRECT.  Computes the gradient of the total squared
# error by central finite differences, and also by mlp_forward, mlp_backward,
# and mlp_grad.  Returns a list of the finite difference gradient, the analytic
# gradient, and their difference, all in vector form.

mlp_check_grad <- function (Y, X, skel, w, epsilon=0.001)
{
  grd <- unlist(skel)   # zero vector of the right length, to hold the estimates

  for (i in 1:length(w))
  {
    w1 <- w
    w1[i] <- w1[i] - epsilon
    w2 <- w
    w2[i] <- w2[i] + epsilon

    E1 <- sum ( (Y - mlp_forward(X,relist(w1,skel))$o)^2 )
    E2 <- sum ( (Y - mlp_forward(X,relist(w2,skel))$o)^2 )

    grd[i] <- (E2-E1) / (2*epsilon)
  }

  wl <- relist(w,skel)
  fw <- mlp_forward (X, wl)
  bk <- mlp_backward (Y, X, wl, fw)
  gr <- mlp_grad (X, skel, fw, bk)

  list (unlist(grd), unlist(gr), unlist(gr)-unlist(grd))
}
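
# EXAMPLE OF A GRADIENT CHECK (ILLUSTRATIVE SKETCH ONLY).  The sizes and seed
# below are arbitrary.  After sourcing this file, the lines below can be run to
# compare the finite difference and backpropagation gradients for a small random
# network; the third element of the result should be close to zero throughout.
#
#   set.seed(1)
#   skel <- mlp_skeleton (p=3, m=c(4,2,4), q=2)
#   X <- matrix (rnorm(5*3), 5, 3)
#   Y <- matrix (rnorm(5*2), 5, 2)
#   w <- rnorm (length(unlist(skel)), 0, 0.5)
#
#   chk <- mlp_check_grad (Y, X, skel, w)
#   max (abs (chk[[3]]))    # maximum absolute discrepancy, should be near zero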