# MLP-MC.R - Multi-class classification with a multilayer perceptron network. # FIT MLP NETWORK USING SIMPLE GRADIENT ASCENT - MULTI-CLASS VERSION. The # arguments are the number of classes (K), a vector of responses for training # cases (integers 0 to K-1), a matrix of inputs for training cases, # two learning rates (for parameters on connections to hidden units and the # output units), and the number of iterations of gradient ascent to do. # # These arguments are followed by either an argument, q, giving the number # of hidden units, or an argument, init, that is a vector of initial values # for parameters (from which q can be deduced). If no initial parameter values # are supplied, values are randomly chosen independently from Gaussian # distributions with mean zero and standard deviation 0.0001, except that the # biases for the output units are set to the logs of the fractions of training # cases in the corresponding classes. # # The result returned is a list with element "ll" giving the log likelihood # for each iteration, element "params" giving the values of the network # parameters for each iteration (rows), and element "init" giving the initial # parameters used. # # Note that an initial value supplied via the init argument can be the # params element from the result of a previous call of mlp.train. mlp.mc.train = function (K, y, X, eta1, eta2, iters, q=NULL, init=NULL) { n = nrow(X) p = ncol(X) if (is.null(q)) { q = as.integer((length(init)-K) / (p+1+K)) } W = K + q*(p+1+K) if (is.null(init)) { init = rnorm(W,0,0.0001) for (k in 1:K) { init[k] = log(mean(y==k-1)) } } if (length(init)!=W) { stop("Initial parameter vector is the wrong length") } eta = c (rep(eta2,K*q+K), rep(eta1,q*(p+1))) ll = rep(NA,iters) params = matrix(NA,iters,W) current = init fw = mlp.mc.forward(K,X,q,current) for (iter in 1:iters) { bk = mlp.mc.backward(K,y,X,q,current,fw) gr = mlp.mc.grad(K,X,q,fw,bk) current = current + eta*gr params[iter,] = current fw = mlp.mc.forward(K,X,q,current) ll[iter] = mlp.mc.log.likelihood(y,fw$o) } list (ll=ll, params=params, init=init) } # FORWARD NETWORK COMPUTATIONS. Given the number of classes, the matrix of # inputs, the number of hidden units, and the network parameters, this function # returns a list with a matrix of hidden unit inputs (s) for all training # cases, a corresponding matrix of hidden unit values (h), and a matrix of # output unit values (o). mlp.mc.forward = function (K, X, q, params) { n = nrow(X) p = ncol(X) if (length(params) != K + q*(p+K+1)) { stop("Parameter vector is the wrong length") } beta0 = params[1:K] beta0m = matrix(beta0,n,K,byrow=T) beta = matrix(params[(K+1):(K*q+K)],q,K) gamma0 = params[(K*q+K+1):(K*q+K+q)] gamma0m = matrix(gamma0,n,q,byrow=T) gamma = matrix(params[(K*q+K+q+1):length(params)],p,q) s = X %*% gamma + gamma0m h = tanh(s) o = h %*% beta + beta0m list(s=s, h=h, o=o) } # BACKWARD NETWORK COMPUTATIONS. Given the number of classes, training targets # and inputs, the number of hidden units, the network parameters, and the # results of the forward computation, this function returns a list of vectors # and matrices of partial derivatives with respect to unit values, for each # training case. mlp.mc.backward = function (K, y, X, q, params, fw) { n = nrow(X) p = ncol(X) beta0 = params[1:K] beta0m = matrix(beta0,n,K,byrow=T) beta = matrix(params[(K+1):(K*q+K)],q,K) dl.do = matrix(NA,n,K) for (i in 1:nrow(fw$o)) { e = exp(fw$o[i,]) dl.do[i,] = - e/sum(e) dl.do[i,1+y[i]] = 1 + dl.do[i,1+y[i]] } dl.dh = dl.do %*% t(beta) dl.ds = (1-fw$h^2) * dl.dh list (dl.do=dl.do, dl.dh=dl.dh, dl.ds=dl.ds) } # COMPUTE GRADIENT OF LOG LIKELIHOOD WITH RESPECT TO PARAMETERS. Takes as # arguments the number of classes, the matrix of inputs, the number of hidden # units, and the results of the forward and backward computations. Returns a # vector of partial derivatives of the log likelihood with respect to the # parameters. mlp.mc.grad = function (K, X, q, fw, bk) { n = nrow(X) p = ncol(X) dl.dbeta0 = apply (bk$dl.do, 2, sum) dl.dbeta = t(fw$h) %*% bk$dl.do dl.dgamma0 = apply (bk$dl.ds, 2, sum) dl.dgamma = matrix(NA,p,q) for (j in 1:p) { dl.dgamma[j,] = X[,j] %*% bk$dl.ds } c (dl.dbeta0=dl.dbeta0, dl.dbeta=dl.dbeta, dl.dgamma0=dl.dgamma0, dl.dgamma=dl.dgamma) } # COMPUTE GRADIENT BY FINITE DIFFERENCES. Used to check the correctness of # mlp.mc.grad. mlp.mc.grad.check = function (K, y, X, q, params, delta=0.0001) { g = rep(NA,length(params)) for (i in 1:length(params)) { p1 = params p1[i] = p1[i]+delta l1 = mlp.mc.log.likelihood (y, mlp.mc.forward(K,X,q,p1)$o) p2 = params p2[i] = p2[i]-delta l2 = mlp.mc.log.likelihood (y, mlp.mc.forward(K,X,q,p2)$o) g[i] = (l1-l2) / (2*delta) } g } # COMPUTE THE LOG LIKELIHOOD GIVEN THE NETWORK OUTPUTS. mlp.mc.log.likelihood = function (y, o) { ll = 0 for (i in 1:length(y)) { ll = ll + o[i,1+y[i]] - log(sum(exp(o[i,]))) } ll } # MAKE PREDICTIONS FOR TEST CASES. Takes the number of classes, the matrix of # test inputs and the network parameters as inputs, and returns a list with # element 'prob' being the matrix of probabilities for the classes in the test # cases and element 'guess' being the vector of best guesses at the class for # each case. mlp.mc.predict = function (K, X, params) { p = ncol(X) q = as.integer( (length(params)-K) / (p+K+1) ) fw = mlp.mc.forward(K,X,q,params) prob = matrix(NA,nrow(X),K) guess = rep(NA,nrow(X)) for (i in 1:nrow(X)) { prob[i,] = exp(fw$o[i,]) / sum(exp(fw$o[i,])) guess[i] = order(-prob[i,])[1] - 1 } list (prob=prob, guess=guess) } # TRAIN AN MLP NETWORK USING EARLY STOPPING. The arguments are the number of # classes (K), a vector of responses for training cases (integers 0 to K-1), # a matrix of inputs for training cases, two learning rates (for parameters on # connections to hidden units and the output units), the indexes of the cases # in the validation set, the number of iterations of gradient ascent to do, # the number of hidden units, and an optional argument saying whether to # produce a plot of log probabilities for estimation and validation sets. # # The value returned is the vector of parameters from the iteration that was # judged to be best by cross validation. mlp.mc.cv = function (K, y, X, eta1, eta2, val.set, iters, q, cv.plot=F) { est.set = setdiff(1:length(y),val.set) r = mlp.mc.train (K, y[est.set], X[est.set,], eta1, eta2, iters, q) vpr = rep(NA,iters) for (i in 1:iters) { p = mlp.mc.predict(K,X[val.set,],r$params[i,]) $ prob vpr[i] = 0 for (j in 1:length(val.set)) { vpr[i] = vpr[i] + log (p [j, 1+y[val.set[j]]]) } } best = order(-vpr)[1] if (cv.plot) { plot (r$ll/length(est.set), type="l", col="blue", xlim=c(0,iters), xlab=paste("iteration, best at ",best), ylim=range(c(r$ll/length(est.set),vpr/length(val.set))), ylab="average log probability (blue:estimation, red:validation)") lines (vpr/length(val.set), col="red") } r$params[best,] } # FIND ENSEMBLE OF NETWORKS TRAINED BY EARLY STOPPING. Arguments are the # number of classes (K), a vector of responses for training cases (integers 0 # to K-1), a matrix of inputs for training cases, two learning rates (for # parameters on connections to hidden units and the output units), the number # of iterations of gradient ascent to do, the number of hidden units, the # number of folds (ie, members of the ensemble), and an optional argument # saying whether to produce plots of log probabilities for estimation and # validation sets. # # The value returned is a matrix with the rows being the vectors of parameters # for each member of the ensemble, from the iteration for each training run # judged to be best by cross validation. mlp.mc.ensemble = function (K, y, X, eta1, eta2, iters, q, n.folds=4, cv.plot=F) { folds = 1 + round (length(y) * (0:n.folds) / n.folds) params = NULL; if (cv.plot) { par(mfrow=c(ceiling(n.folds/2),2)) } for (i in 1:n.folds) { best = mlp.mc.cv (K, y, X, eta1, eta2, folds[i]:(folds[i+1]-1), iters, q, cv.plot) params = rbind(params,best) } } # MAKE PREDICTIONS USING AN ENSEMBLE OF NETWORKS. Takes the number of classes, # the matrix of test inputs, and a matrix of network parameters (with one # set of parameters in each row) as inputs, and returns a list with # element 'prob' being the matrix of probabilities for the classes in the test # cases and element 'guess' being the vector of best guesses at the class for # each case. mlp.mc.ensemble.predict = function (K, X, params) { prob = matrix(0,nrow(X),K) for (i in 1:nrow(params)) { prob = prob + mlp.mc.predict(K,X,params[i,]) $ prob } prob = prob/nrow(params) guess = rep(NA,nrow(X)) for (i in 1:nrow(X)) { guess[i] = order(-prob[i,])[1] - 1 } list (prob=prob, guess=guess) }