# MLP-MC.R - Multi-class classification with a multilayer perceptron network.


# FIT MLP NETWORK USING SIMPLE GRADIENT ASCENT - MULTI-CLASS VERSION.  The 
# arguments are the number of classes (K), a vector of responses for training 
# cases (integers 0 to K-1), a matrix of inputs for training cases,
# two learning rates (for parameters on connections to hidden units and the
# output units), and the number of iterations of gradient ascent to do.  
#
# These arguments are followed by either an argument, q, giving the number 
# of hidden units, or an argument, init, that is a vector of initial values 
# for parameters (from which q can be deduced).  If no initial parameter values
# are supplied, values are randomly chosen independently from Gaussian 
# distributions with mean zero and standard deviation 0.0001, except that the 
# biases for the output units are set to the logs of the fractions of training 
# cases in the corresponding classes.
#
# The result returned is a list with element "ll" giving the log likelihood
# for each iteration, element "params" giving the values of the network 
# parameters for each iteration (rows), and element "init" giving the initial
# parameters used.
#
# Note that an initial value supplied via the init argument can be the
# params element from the result of a previous call of mlp.train.

mlp.mc.train = function (K, y, X, eta1, eta2, iters, q=NULL, init=NULL)
{
  n = nrow(X)
  p = ncol(X)

  if (is.null(q))
  { q = as.integer((length(init)-K) / (p+1+K))
  }

  W = K + q*(p+1+K)

  if (is.null(init))
  { init = rnorm(W,0,0.0001)
    for (k in 1:K)
    { init[k] = log(mean(y==k-1))
    }
  }

  if (length(init)!=W)
  { stop("Initial parameter vector is the wrong length")
  }

  eta = c (rep(eta2,K*q+K), rep(eta1,q*(p+1)))
  
  ll = rep(NA,iters)
  params = matrix(NA,iters,W)

  current = init

  fw = mlp.mc.forward(K,X,q,current)

  for (iter in 1:iters)
  { 
    bk = mlp.mc.backward(K,y,X,q,current,fw)
    gr = mlp.mc.grad(K,X,q,fw,bk)

    current = current + eta*gr
    params[iter,] = current

    fw = mlp.mc.forward(K,X,q,current)
    ll[iter] = mlp.mc.log.likelihood(y,fw$o)
  }
  
  list (ll=ll, params=params, init=init)
}


# FORWARD NETWORK COMPUTATIONS.  Given the number of classes, the matrix of 
# inputs, the number of hidden units, and the network parameters, this function
# returns a list with a matrix of hidden unit inputs (s) for all training 
# cases, a corresponding matrix of hidden unit values (h), and a matrix of 
# output unit values (o).

mlp.mc.forward = function (K, X, q, params)
{
  n = nrow(X)
  p = ncol(X)

  if (length(params) != K + q*(p+K+1)) 
  { stop("Parameter vector is the wrong length")
  }

  beta0 = params[1:K]
  beta0m = matrix(beta0,n,K,byrow=T)
  beta = matrix(params[(K+1):(K*q+K)],q,K)
  gamma0 = params[(K*q+K+1):(K*q+K+q)]
  gamma0m = matrix(gamma0,n,q,byrow=T)
  gamma = matrix(params[(K*q+K+q+1):length(params)],p,q)

  s = X %*% gamma + gamma0m
  h = tanh(s)
  o = h %*% beta + beta0m

  list(s=s, h=h, o=o)
}


# BACKWARD NETWORK COMPUTATIONS.  Given the number of classes, training targets
# and inputs, the number of hidden units, the network parameters, and the 
# results of the forward computation, this function returns a list of vectors 
# and matrices of partial derivatives with respect to unit values, for each 
# training case.

mlp.mc.backward = function (K, y, X, q, params, fw)
{
  n = nrow(X)
  p = ncol(X)

  beta0 = params[1:K]
  beta0m = matrix(beta0,n,K,byrow=T)
  beta = matrix(params[(K+1):(K*q+K)],q,K)

  dl.do = matrix(NA,n,K)

  for (i in 1:nrow(fw$o))
  { e = exp(fw$o[i,])
    dl.do[i,] = - e/sum(e)
    dl.do[i,1+y[i]] = 1 + dl.do[i,1+y[i]]
  }

  dl.dh = dl.do %*% t(beta)
  dl.ds = (1-fw$h^2) * dl.dh

  list (dl.do=dl.do, dl.dh=dl.dh, dl.ds=dl.ds)
}


# COMPUTE GRADIENT OF LOG LIKELIHOOD WITH RESPECT TO PARAMETERS.  Takes as
# arguments the number of classes, the matrix of inputs, the number of hidden 
# units, and the results of the forward and backward computations.  Returns a 
# vector of partial derivatives of the log likelihood with respect to the 
# parameters.

mlp.mc.grad = function (K, X, q, fw, bk)
{
  n = nrow(X)
  p = ncol(X)

  dl.dbeta0 = apply (bk$dl.do, 2, sum)
  dl.dbeta = t(fw$h) %*% bk$dl.do
  dl.dgamma0 = apply (bk$dl.ds, 2, sum)
  dl.dgamma = matrix(NA,p,q)
  for (j in 1:p)
  { dl.dgamma[j,] = X[,j] %*% bk$dl.ds
  }

  c (dl.dbeta0=dl.dbeta0, dl.dbeta=dl.dbeta, dl.dgamma0=dl.dgamma0, 
     dl.dgamma=dl.dgamma)
}


# COMPUTE GRADIENT BY FINITE DIFFERENCES.  Used to check the correctness of
# mlp.mc.grad.

mlp.mc.grad.check = function (K, y, X, q, params, delta=0.0001)
{ 
  g = rep(NA,length(params))

  for (i in 1:length(params))
  { p1 = params
    p1[i] = p1[i]+delta
    l1 = mlp.mc.log.likelihood (y, mlp.mc.forward(K,X,q,p1)$o)
    p2 = params
    p2[i] = p2[i]-delta
    l2 = mlp.mc.log.likelihood (y, mlp.mc.forward(K,X,q,p2)$o)
    g[i] = (l1-l2) / (2*delta)
  }

  g
}


# COMPUTE THE LOG LIKELIHOOD GIVEN THE NETWORK OUTPUTS.  

mlp.mc.log.likelihood = function (y, o)
{
  ll = 0
  for (i in 1:length(y))
  { ll = ll + o[i,1+y[i]] - log(sum(exp(o[i,])))
  }
  ll
}


# MAKE PREDICTIONS FOR TEST CASES.  Takes the number of classes, the matrix of 
# test inputs and the network parameters as inputs, and returns a list with
# element 'prob' being the matrix of probabilities for the classes in the test 
# cases and element 'guess' being the vector of best guesses at the class for 
# each case.

mlp.mc.predict = function (K, X, params)
{ 
  p = ncol(X)
  q = as.integer( (length(params)-K) / (p+K+1) )
  fw = mlp.mc.forward(K,X,q,params)
  prob = matrix(NA,nrow(X),K)
  guess = rep(NA,nrow(X))
  for (i in 1:nrow(X))
  { prob[i,] = exp(fw$o[i,]) / sum(exp(fw$o[i,]))
    guess[i] = order(-prob[i,])[1] - 1
  }
  list (prob=prob, guess=guess)
}


# TRAIN AN MLP NETWORK USING EARLY STOPPING.  The arguments are the number of 
# classes (K), a vector of responses for training cases (integers 0 to K-1), 
# a matrix of inputs for training cases, two learning rates (for parameters on 
# connections to hidden units and the output units), the indexes of the cases
# in the validation set, the number of iterations of gradient ascent to do,
# the number of hidden units, and an optional argument saying whether to 
# produce a plot of log probabilities for estimation and validation sets.
#
# The value returned is the vector of parameters from the iteration that was
# judged to be best by cross validation.

mlp.mc.cv = function (K, y, X, eta1, eta2, val.set, iters, q, cv.plot=F)
{ 
  est.set = setdiff(1:length(y),val.set)

  r = mlp.mc.train (K, y[est.set], X[est.set,], eta1, eta2, iters, q)

  vpr = rep(NA,iters)
  for (i in 1:iters)
  { p = mlp.mc.predict(K,X[val.set,],r$params[i,]) $ prob
    vpr[i] = 0
    for (j in 1:length(val.set))
    { vpr[i] = vpr[i] + log (p [j, 1+y[val.set[j]]])
    }
  }
  
  best = order(-vpr)[1]

  if (cv.plot) 
  { plot (r$ll/length(est.set), type="l", col="blue", 
          xlim=c(0,iters), xlab=paste("iteration, best at ",best), 
          ylim=range(c(r$ll/length(est.set),vpr/length(val.set))),
          ylab="average log probability (blue:estimation, red:validation)")
    lines (vpr/length(val.set), col="red")
  }

  r$params[best,]
}


# FIND ENSEMBLE OF NETWORKS TRAINED BY EARLY STOPPING.  Arguments are the 
# number of classes (K), a vector of responses for training cases (integers 0 
# to K-1), a matrix of inputs for training cases, two learning rates (for 
# parameters on connections to hidden units and the output units), the number 
# of iterations of gradient ascent to do, the number of hidden units, the
# number of folds (ie, members of the ensemble), and an optional argument 
# saying whether to produce plots of log probabilities for estimation and 
# validation sets.
#
# The value returned is a matrix with the rows being the vectors of parameters 
# for each member of the ensemble, from the iteration for each training run
# judged to be best by cross validation.

mlp.mc.ensemble = function (K, y, X, eta1, eta2, iters, q, n.folds=4, cv.plot=F)
{
  folds = 1 + round (length(y) * (0:n.folds) / n.folds)
  params = NULL;

  if (cv.plot)
  { par(mfrow=c(ceiling(n.folds/2),2))
  }

  for (i in 1:n.folds)
  { best = mlp.mc.cv (K, y, X, eta1, eta2, folds[i]:(folds[i+1]-1), 
                      iters, q, cv.plot)
    params = rbind(params,best)
  }
}


# MAKE PREDICTIONS USING AN ENSEMBLE OF NETWORKS.  Takes the number of classes,
# the matrix of test inputs, and a matrix of network parameters (with one
# set of parameters in each row) as inputs, and returns a list with
# element 'prob' being the matrix of probabilities for the classes in the test 
# cases and element 'guess' being the vector of best guesses at the class for 
# each case.

mlp.mc.ensemble.predict = function (K, X, params)
{
  prob = matrix(0,nrow(X),K)
  for (i in 1:nrow(params))
  { prob = prob + mlp.mc.predict(K,X,params[i,]) $ prob
  }
  prob = prob/nrow(params)

  guess = rep(NA,nrow(X))
  for (i in 1:nrow(X))
  { guess[i] = order(-prob[i,])[1] - 1
  }

  list (prob=prob, guess=guess)
}