# STA 414/2104, Spring 2006, Assignment #1, Function definitions.
#
# R. M. Neal


# THE K NEAREST NEIGHBOR METHOD.  Applies the k-NN method using the values of
# k in the kvec argument (a vector of length K) to predict the response for 
# the test cases with inputs x.test (an m by p matrix), using the training 
# cases with inputs x.train (an n by p matrix) and responses y.train (a vector
# of length n).  The result is an m by K matrix of predictions for the test
# cases, with columns containing predictions found using the various values
# of k in kvec.  
# 
# The x.train and x.test arguments may be data frames, provided these can be 
# converted to numeric matrices (no strings allowed).  The x.test argument 
# can be a vector, in which case it is converted to a matrix with one row.

knn <- function (kvec, x.train, y.train, x.test)
{
  # Convert inputs to matrices if they're data frames.

  if (is.data.frame(x.train))
  { x.train <- as.matrix(x.train)
  }
  if (is.data.frame(x.test))
  { x.test <- as.matrix(x.test)
  }

  # Convert x.test to a matrix with one row if it's just a vector (ie, if
  # there's just one test case).

  if (!is.matrix(x.test))
  { x.test <- matrix(x.test,nrow=1)
  }

  # Check that numbers of cases and inputs are compatibile.

  if (nrow(x.train)!=length(y.train))
  { stop(
     "Number of training cases for inputs doesn't match number for responses")
  }
  if (ncol(x.train)!=ncol(x.test))
  { stop(
     "Number of inputs for training cases doesn't match number for test case")
  }

  # Allocate some variables.

  n.train <- nrow(x.train)
  n.test <- nrow(x.test)

  p.test <- matrix (NA, n.test, length(kvec))  # Holds predictions for each k

  dsq <- numeric(n.train)  # Holds distances to each training case

  # Find predictions for each test case, for each k.

  for (tst in 1:n.test)
  {
    # Find squared distances from this test case to each training case.

    for (trn in 1:n.train)
    { dsq[trn] <- sum ((x.train[trn,]-x.test[tst,])^2)
    }

    # Order indexes of training cases by distance to test case.

    ord <- order(dsq)

    # Make predictions using the various values of k.

    for (i in 1:length(kvec))
    { p.test[tst,i] <- mean(y.train[ord[1:kvec[i]]])
    }
  }

  # Return predictions for test cases.

  p.test
}


# FIND RESULTS OF LEAVE-ONE-OUT CROSS VALIDATION FOR K-NN.  The arguments
# are a vector, kvec, of values of k to try (of length K), an n by p matrix,
# x.train, of training inputs, and a vector, y.train, of training responses 
# (of length n).  The result is a n by K matrix of predictions for each 
# training case found using only the other training case, with the columns 
# giving predictions for the values of k in kvec.
#
# The x.train argument may be a data frame, provided it can be converted to 
# a numeric matrix (no strings allowed).  

knncv <- function (kvec, x.train, y.train)
{
  # Convert inputs to matrices if they're data frames.

  if (is.data.frame(x.train))
  { x.train <- as.matrix(x.train)
  }

  # Check that numbers of cases and inputs are compatibile.

  if (nrow(x.train)!=length(y.train))
  { stop(
     "Number of training cases for inputs doesn't match number for responses")
  }

  n.train <- nrow(x.train)
  p.cv <- matrix (NA, n.train, length(kvec))  # Holds predictions for each k

  for (trn in 1:n.train)
  { 
    p.cv[trn,] <- knn(kvec,x.train[-trn,],y.train[-trn],x.train[trn,])
  }

  p.cv
}


# THE K-NN METHOD WITH K SELECTED BY LEAVE-ONE-OUT CROSS VALIDATION.  The
# arguments are a vector, kvec, of values for k to consider, an n by p matrix,
# x.train, of inputs for the training cases, a vector of length n, y.train,
# containing the responses for the training cases, and an m by p matrix,
# x.test, of inputs for test cases.  
#
# Returns a list with the "results" element being a vector (of length m) of 
# predictions for test cases, found using the value of k in kvec that gives 
# the smallest average squared error in leave-on-out cross validation.  The
# "cv.sq.err" element is a vector of cross-validation squared errors for the
# values of k in kvec.  The "k" element is the value of k selected.
# 
# The x.train and x.test arguments may be data frames, provided these can be 
# converted to numeric matrices (no strings allowed).  The x.test argument 
# can be a vector, in which case it is converted to a matrix with one row.

knnsel <- function (kvec, x.train, y.train, x.test)
{
  # Convert inputs to matrices if they're data frames.

  if (is.data.frame(x.train))
  { x.train <- as.matrix(x.train)
  }
  if (is.data.frame(x.test))
  { x.test <- as.matrix(x.test)
  }

  # Convert x.test to a matrix with one row if it's just a vector (ie, if
  # there's just one test case).

  if (!is.matrix(x.test))
  { x.test <- matrix(x.test,nrow=1)
  }

  # Check that numbers of cases and inputs are compatibile.

  if (nrow(x.train)!=length(y.train))
  { stop(
     "Number of training cases for inputs doesn't match number for responses")
  }
  if (ncol(x.train)!=ncol(x.test))
  { stop(
     "Number of inputs for training cases doesn't match number for test case")
  }

  p.cv <- knncv(kvec,x.train,y.train)
  cv.sq.err <- apply (p.cv, 2, function(p) mean((p-y.train)^2))
  
  k <- kvec [order(cv.sq.err)[1]]

  list (results=as.vector(knn(k,x.train,y.train,x.test)),
        cv.sq.err=cv.sq.err, k=k)
}


# COMBINE K-NN PREDICTIONS BASED ON LEAVE-ONE-OUT CROSS VALIDATION.  The
# arguments are a vector, kvec, of values for k to use, an n by p matrix,
# x.train, of inputs for the training cases, a vector of length n, y.train,
# containing the responses for the training cases, and an m by p matrix,
# x.test, of inputs for test cases.  Returns a vector (of length m) of 
# predictions for test cases, found by linearly combining the k-NN predicitons
# with the valeus of k in kvec, with coefficients found by minimizing the
# average squared error in leave-one-out cross validation.
#
# Returns a list with the "results" element being a vector (of length m) of 
# predictions for test cases, and the "beta" element being the vector of
# regression coefficients (the first being the intercept) for combining the
# predictions with various values of k.
# 
# The x.train and x.test arguments may be data frames, provided these can be 
# converted to numeric matrices (no strings allowed).  The x.test argument 
# can be a vector, in which case it is converted to a matrix with one row.

knncombo <- function (kvec, x.train, y.train, x.test)
{
  # Convert inputs to matrices if they're data frames.

  if (is.data.frame(x.train))
  { x.train <- as.matrix(x.train)
  }
  if (is.data.frame(x.test))
  { x.test <- as.matrix(x.test)
  }

  # Convert x.test to a matrix with one row if it's just a vector (ie, if
  # there's just one test case).

  if (!is.matrix(x.test))
  { x.test <- matrix(x.test,nrow=1)
  }

  # Check that numbers of cases and inputs are compatibile.

  if (nrow(x.train)!=length(y.train))
  { stop(
     "Number of training cases for inputs doesn't match number for responses")
  }
  if (ncol(x.train)!=ncol(x.test))
  { stop(
     "Number of inputs for training cases doesn't match number for test case")
  }

  p.cv <- knncv(kvec,x.train,y.train)

  b <- coef (lm(y.train~p.cv))

  list (
    results=as.vector(b[1] + knn(kvec,x.train,y.train,x.test)%*%b[2:length(b)]),
    beta=b)
}