# STA 414/2104, Spring 2006, Assignment #1, Function definitions. # # R. M. Neal # THE K NEAREST NEIGHBOR METHOD. Applies the k-NN method using the values of # k in the kvec argument (a vector of length K) to predict the response for # the test cases with inputs x.test (an m by p matrix), using the training # cases with inputs x.train (an n by p matrix) and responses y.train (a vector # of length n). The result is an m by K matrix of predictions for the test # cases, with columns containing predictions found using the various values # of k in kvec. # # The x.train and x.test arguments may be data frames, provided these can be # converted to numeric matrices (no strings allowed). The x.test argument # can be a vector, in which case it is converted to a matrix with one row. knn <- function (kvec, x.train, y.train, x.test) { # Convert inputs to matrices if they're data frames. if (is.data.frame(x.train)) { x.train <- as.matrix(x.train) } if (is.data.frame(x.test)) { x.test <- as.matrix(x.test) } # Convert x.test to a matrix with one row if it's just a vector (ie, if # there's just one test case). if (!is.matrix(x.test)) { x.test <- matrix(x.test,nrow=1) } # Check that numbers of cases and inputs are compatibile. if (nrow(x.train)!=length(y.train)) { stop( "Number of training cases for inputs doesn't match number for responses") } if (ncol(x.train)!=ncol(x.test)) { stop( "Number of inputs for training cases doesn't match number for test case") } # Allocate some variables. n.train <- nrow(x.train) n.test <- nrow(x.test) p.test <- matrix (NA, n.test, length(kvec)) # Holds predictions for each k dsq <- numeric(n.train) # Holds distances to each training case # Find predictions for each test case, for each k. for (tst in 1:n.test) { # Find squared distances from this test case to each training case. for (trn in 1:n.train) { dsq[trn] <- sum ((x.train[trn,]-x.test[tst,])^2) } # Order indexes of training cases by distance to test case. ord <- order(dsq) # Make predictions using the various values of k. for (i in 1:length(kvec)) { p.test[tst,i] <- mean(y.train[ord[1:kvec[i]]]) } } # Return predictions for test cases. p.test } # FIND RESULTS OF LEAVE-ONE-OUT CROSS VALIDATION FOR K-NN. The arguments # are a vector, kvec, of values of k to try (of length K), an n by p matrix, # x.train, of training inputs, and a vector, y.train, of training responses # (of length n). The result is a n by K matrix of predictions for each # training case found using only the other training case, with the columns # giving predictions for the values of k in kvec. # # The x.train argument may be a data frame, provided it can be converted to # a numeric matrix (no strings allowed). knncv <- function (kvec, x.train, y.train) { # Convert inputs to matrices if they're data frames. if (is.data.frame(x.train)) { x.train <- as.matrix(x.train) } # Check that numbers of cases and inputs are compatibile. if (nrow(x.train)!=length(y.train)) { stop( "Number of training cases for inputs doesn't match number for responses") } n.train <- nrow(x.train) p.cv <- matrix (NA, n.train, length(kvec)) # Holds predictions for each k for (trn in 1:n.train) { p.cv[trn,] <- knn(kvec,x.train[-trn,],y.train[-trn],x.train[trn,]) } p.cv } # THE K-NN METHOD WITH K SELECTED BY LEAVE-ONE-OUT CROSS VALIDATION. The # arguments are a vector, kvec, of values for k to consider, an n by p matrix, # x.train, of inputs for the training cases, a vector of length n, y.train, # containing the responses for the training cases, and an m by p matrix, # x.test, of inputs for test cases. # # Returns a list with the "results" element being a vector (of length m) of # predictions for test cases, found using the value of k in kvec that gives # the smallest average squared error in leave-on-out cross validation. The # "cv.sq.err" element is a vector of cross-validation squared errors for the # values of k in kvec. The "k" element is the value of k selected. # # The x.train and x.test arguments may be data frames, provided these can be # converted to numeric matrices (no strings allowed). The x.test argument # can be a vector, in which case it is converted to a matrix with one row. knnsel <- function (kvec, x.train, y.train, x.test) { # Convert inputs to matrices if they're data frames. if (is.data.frame(x.train)) { x.train <- as.matrix(x.train) } if (is.data.frame(x.test)) { x.test <- as.matrix(x.test) } # Convert x.test to a matrix with one row if it's just a vector (ie, if # there's just one test case). if (!is.matrix(x.test)) { x.test <- matrix(x.test,nrow=1) } # Check that numbers of cases and inputs are compatibile. if (nrow(x.train)!=length(y.train)) { stop( "Number of training cases for inputs doesn't match number for responses") } if (ncol(x.train)!=ncol(x.test)) { stop( "Number of inputs for training cases doesn't match number for test case") } p.cv <- knncv(kvec,x.train,y.train) cv.sq.err <- apply (p.cv, 2, function(p) mean((p-y.train)^2)) k <- kvec [order(cv.sq.err)[1]] list (results=as.vector(knn(k,x.train,y.train,x.test)), cv.sq.err=cv.sq.err, k=k) } # COMBINE K-NN PREDICTIONS BASED ON LEAVE-ONE-OUT CROSS VALIDATION. The # arguments are a vector, kvec, of values for k to use, an n by p matrix, # x.train, of inputs for the training cases, a vector of length n, y.train, # containing the responses for the training cases, and an m by p matrix, # x.test, of inputs for test cases. Returns a vector (of length m) of # predictions for test cases, found by linearly combining the k-NN predicitons # with the valeus of k in kvec, with coefficients found by minimizing the # average squared error in leave-one-out cross validation. # # Returns a list with the "results" element being a vector (of length m) of # predictions for test cases, and the "beta" element being the vector of # regression coefficients (the first being the intercept) for combining the # predictions with various values of k. # # The x.train and x.test arguments may be data frames, provided these can be # converted to numeric matrices (no strings allowed). The x.test argument # can be a vector, in which case it is converted to a matrix with one row. knncombo <- function (kvec, x.train, y.train, x.test) { # Convert inputs to matrices if they're data frames. if (is.data.frame(x.train)) { x.train <- as.matrix(x.train) } if (is.data.frame(x.test)) { x.test <- as.matrix(x.test) } # Convert x.test to a matrix with one row if it's just a vector (ie, if # there's just one test case). if (!is.matrix(x.test)) { x.test <- matrix(x.test,nrow=1) } # Check that numbers of cases and inputs are compatibile. if (nrow(x.train)!=length(y.train)) { stop( "Number of training cases for inputs doesn't match number for responses") } if (ncol(x.train)!=ncol(x.test)) { stop( "Number of inputs for training cases doesn't match number for test case") } p.cv <- knncv(kvec,x.train,y.train) b <- coef (lm(y.train~p.cv)) list ( results=as.vector(b[1] + knn(kvec,x.train,y.train,x.test)%*%b[2:length(b)]), beta=b) }