R version 2.9.2 (2009-08-24)

#################
##### ridge regression
#################

> library(MASS)
> pr.ridge = lm.ridge(lpsa ~ ., data = train, lambda = seq(0, 100, length=20))
> plot(pr.ridge)
> pr.ridge.sum = summary(pr.ridge)
> attributes(pr.ridge)
$names
[1] "coef"   "scales" "Inter"  "lambda" "ym"     "xm"     "GCV"    "kHKB"  
[9] "kLW"   

$class
[1] "ridgelm"

> select(pr.ridge)
modified HKB estimator is 3.355691 
modified L-W estimator is 3.050708 
smallest value of GCV at 5.263158 
> which.min(pr.ridge$GCV)
5.263158 
       2 

## the best fit can depend on the range and number of values for lambda

> pr.ridge = lm.ridge(lpsa ~ ., data = train, lambda = seq(0, 10, length=50))
> select(pr.ridge)
modified HKB estimator is 3.355691 
modified L-W estimator is 3.050708 
smallest value of GCV at 4.897959 
> pr.ridge = lm.ridge(lpsa ~ ., data = train, lambda = seq(0, 100, length=50))
> select(pr.ridge)
modified HKB estimator is 3.355691 
modified L-W estimator is 3.050708 
smallest value of GCV at 4.081633 
> pr.ridge = lm.ridge(lpsa ~ ., data = train, lambda = seq(0, 60, length=100))
> select(pr.ridge)
modified HKB estimator is 3.355691 
modified L-W estimator is 3.050708 
smallest value of GCV at 4.848485 
> which.min(pr.ridge$GCV)
4.8484848 
        9 
> pr.ridge = lm.ridge(lpsa ~ ., data = train, lambda = seq(4, 6, length=100))
> select(pr.ridge)
modified HKB estimator is 3.355691 
modified L-W estimator is 3.050708 
smallest value of GCV at 4.929293 

## this is close to the book's choice of 5

> which.min(pr.ridge$GCV)
4.929293 
      47 
> names(train)
[1] "lcavol"  "lweight" "age"     "lbph"    "svi"     "lcp"     "gleason"
[8] "pgg45"   "lpsa"   

# now do predictions:
# lm.ridge centers and scales the predictors, so we do the same
# and add back in the mean (beta-hat-0)

mse = function(x,y) { mean((x-y)^2) }
msese = function(x,y) { sqrt(var((x-y)^2)) }

> ytpredg = scale(test[,1:8], center = F, scale = pr.ridge$scales) %*%
    pr.ridge$coef[,47] + mean(test$lpsa)   # predict the new yhats
> mse(ytpredg, test$lpsa)
[1] 0.4940488    # agrees with Table 3.3
> msese(ytpredg, test$lpsa)
          [,1]
[1,] 0.1541701    # doesn't agree

### However, I don't get the same coefficients as in Table 3.3,
### whether I use my best lambda (4.929) or their choice of 5.

> pr.ridge$coef[,47]
     lcavol     lweight         age        lbph         svi         lcp 
 0.60726335  0.28429357 -0.11020426  0.20044128  0.28318074 -0.16182402 
    gleason       pgg45 
 0.01216161  0.20566633 
> lm.ridge(lpsa ~ ., data = train, lambda = 5)$coef
     lcavol     lweight         age        lbph         svi         lcp 
 0.60610311  0.28418057 -0.10982178  0.20029913  0.28289364 -0.16052082 
    gleason       pgg45 
 0.01245140  0.20501262 
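## One way to probe the standardization question (a sketch of my own, not the
## book's code): fit ridge "by hand" on predictors standardized with sd()
## (divisor n-1), solving the penalized normal equations directly.  lambda = 5
## mirrors the book's choice.  Since lm.ridge centers and scales the x's
## internally with its own convention, its coefficients need not match these
## unless the same scaling (and the same effective lambda) is used.

Xs   = scale(as.matrix(train[,1:8]))        # center, divide by sd() (n-1)
yc   = train$lpsa - mean(train$lpsa)        # center the response
lam  = 5
bhat = solve(crossprod(Xs) + lam*diag(8), crossprod(Xs, yc))
drop(bhat)                                  # ridge coefficients on the sd() scale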
#########################
### predictions for ordinary least squares
#########################

> mse(predict(pr.lm, newdata=test[,1:8]), test$lpsa)
[1] 0.521274
> msese(predict(pr.lm, newdata=test[,1:8]), test$lpsa)
[1] 0.1787240

> summary(pr.best)   ## I fitted this earlier

Call:
lm(formula = lpsa ~ lcavol + lweight, data = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.58852 -0.44174  0.01304  0.52613  1.93127 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.47736    0.09305  26.625  < 2e-16 ***
lcavol       0.73971    0.09318   7.938 4.14e-11 ***
lweight      0.31633    0.08831   3.582 0.000658 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

Residual standard error: 0.7613 on 64 degrees of freedom
Multiple R-squared: 0.6148,     Adjusted R-squared: 0.6027 
F-statistic: 51.06 on 2 and 64 DF,  p-value: 5.54e-14 

> mse(predict(pr.best, newdata=test[,1:8]), test$lpsa)
[1] 0.4924823
> msese(predict(pr.best, newdata=test[,1:8]), test$lpsa)
[1] 0.1431235

## these agree with Table 3.3

############################
### Lasso
############################

> library(lars)   ## in the package "lars"
> pr.lars = lars(as.matrix(train[,1:8]), train$lpsa)
> plot(pr.lars)

## the book states that they chose s=0.36 by 10-fold CV

> predict.lars(pr.lars, newx=test[,1:8], type="fit", s=0.36, mode="fraction")
$s
[1] 0.36

$fraction
[1] 0.36

$mode
[1] "fraction"

$fit
       7        9       10       15       22       25       26       28 
2.094397 1.442933 1.780448 2.292082 2.694805 2.008802 2.282561 1.731247 
      32       34       36       42       44       48       49       50 
1.968744 1.693794 2.587807 2.486294 2.712385 2.491737 2.553715 2.345231 
      53       54       55       57       62       64       65       66 
2.053297 2.953927 3.188912 1.975948 2.945964 3.035410 2.746013 2.550283 
      73       74       80       84       95       97 
2.635603 2.697084 3.135174 3.095303 3.232944 3.698480 

> mse(.Last.value$fit, test$lpsa)
[1] 0.4890863    ## bigger than the book's value, but close

######################
##### Principal components regression
######################

> pr.pca = prcomp(train[,1:8])
> round(pr.pca$sdev, 3)
[1] 1.892 1.336 1.041 0.794 0.717 0.633 0.532 0.424   # they are not dropping off sharply
> screeplot(pr.pca)   ## this is called a scree plot; it plots the eigenvalues
> matplot(1:8, pr.pca$rot[,1:7], type="l")   ## this shows that the derived
                                             ## variables are hard to interpret

## the book used 10-fold CV on the training data to choose 7 derived variables

> pr.pcalm = lm(train$lpsa ~ pr.pca$x[,1:7])
> pr.pcalm$coef   ## these are the theta-hats (p. 79)
       (Intercept) pr.pca$x[, 1:7]PC1 pr.pca$x[, 1:7]PC2 pr.pca$x[, 1:7]PC3 
         2.4523451          0.4386140         -0.2195223          0.3005516 
pr.pca$x[, 1:7]PC4 pr.pca$x[, 1:7]PC5 pr.pca$x[, 1:7]PC6 pr.pca$x[, 1:7]PC7 
         0.1756535          0.1429723         -0.1680983          0.4506631 
> pr.pca$rot[,1:7] %*% pr.pcalm$coef[-1]   ## these are the beta-hats,
   ## but they don't agree with Table 3.3; there may be a problem with
   ## the way the x's were standardized
               [,1]
lcavol   0.55087266
lweight  0.28876032
age     -0.15471478
lbph     0.21411395
svi      0.31461483
lcp     -0.06229606
gleason  0.22754818
pgg45   -0.04782207

## below I follow some code from Faraway's book "Linear Models with R"

> mm = apply(train[,1:8], 2, mean)   # find the means of the training sample
> mm
      lcavol      lweight          age         lbph          svi 
-0.030983588 -0.006617411  0.118237130 -0.019930773  0.017840201 
         lcp      gleason        pgg45 
-0.024915031 -0.029404561  0.066912888 
> tx = as.matrix(sweep(test[,1:8], 2, mm))   # center the test sample by these means
> rmspr = numeric(8)   # a little loop to find the best number of derived vars
> for(i in 1:8){
+   nx = tx %*% pr.pca$rot[,1:i]
+   model3 = lm(train$lpsa ~ pr.pca$x[,1:i])
+   pv = cbind(1, nx) %*% model3$coef
+   rmspr[i] = mse(pv, test$lpsa)}

> ### note that this cheats by using the test data to choose the number of
> ### components (a training-data-only CV is sketched below)

> which.min(rmspr)
[1] 7
> min(rmspr)
[1] 0.44936    ## this agrees with Table 3.3
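## The book chose the number of derived variables by 10-fold CV on the
## training data only.  A minimal sketch of that idea (my own loop, not the
## book's code; the fold split is random, so results will vary with the seed):

set.seed(1)
folds = sample(rep(1:10, length.out = nrow(train)))
cv.err = sapply(1:8, function(m) {
  mean(sapply(1:10, function(k) {
    tr  = train[folds != k, ]
    ho  = train[folds == k, ]
    pca = prcomp(tr[,1:8])                        # PCA on the training fold only
    fit = lm(tr$lpsa ~ pca$x[, 1:m, drop=FALSE])  # regress on the first m scores
    zho = scale(as.matrix(ho[,1:8]), center = pca$center, scale = FALSE) %*%
            pca$rotation[, 1:m, drop=FALSE]       # project the held-out x's
    mean((ho$lpsa - cbind(1, zho) %*% fit$coef)^2)
  }))
})
which.min(cv.err)   # components chosen without touching the test set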
### trying again, using scaling in prcomp

> pr.pca2 = prcomp(train[,1:8], scale=TRUE)   # scaling is recommended in the documentation
> pr.pca2$sdev
[1] 1.8510947 1.2776593 1.0181731 0.7863293 0.6746955 0.6137560 0.5291255
[8] 0.4173260
> pr.pca$sdev
[1] 1.8924140 1.3364527 1.0406852 0.7939366 0.7168638 0.6327541 0.5318165
[8] 0.4244570
> tx2 = as.matrix(sweep(test[,1:8], 2, mm))
> mse(cbind(1, tx %*% pr.pca2$rot[,1:7]) %*% lm(train$lpsa ~ pr.pca2$x[,1:7])$coef,
      test$lpsa)   ## note: tx is only centred, not scaled; see the check at the end
[1] 0.4498896
> dim(pr.pca2$rot[,1:7])
[1] 8 7
> pr.pcalm2 = lm(train$lpsa ~ pr.pca2$x[,1:7])
> pr.pcalm2$coef[-1]
pr.pca2$x[, 1:7]PC1 pr.pca2$x[, 1:7]PC2 pr.pca2$x[, 1:7]PC3 
         0.43855386         -0.22491328          0.35083028 
pr.pca2$x[, 1:7]PC4 pr.pca2$x[, 1:7]PC5 pr.pca2$x[, 1:7]PC6 
         0.18291450          0.07310721         -0.18846741 
pr.pca2$x[, 1:7]PC7 
         0.47184182 
> pr.pca2$rot[,1:7] %*% .Last.value   ## closer
               [,1]
lcavol   0.57058087
lweight  0.32328124
age     -0.15371952
lbph     0.21599970
svi      0.32212005
lcp     -0.05040169
gleason  0.22857290
pgg45   -0.06362645
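## A final check (a sketch of my own, not from the book): when the PCA is run
## with scale=TRUE, the test x's should arguably be centred AND scaled by the
## training-sample statistics before projecting onto the rotation, rather than
## only centred as tx is above.

tx3 = scale(as.matrix(test[,1:8]), center = pr.pca2$center, scale = pr.pca2$scale)
mse(cbind(1, tx3 %*% pr.pca2$rot[,1:7]) %*% pr.pcalm2$coef, test$lpsa)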