> library(ElemStatLearn)
> library(splines)             ## needed for ns() below
> data(phoneme)
> phoneme.aa = phoneme[phoneme$g=="aa",]
> phoneme.ao = phoneme[phoneme$g=="ao",]
> dim(phoneme.aa)
[1]  695 258
> dim(phoneme.ao)
[1] 1022 258
> rowsaa = sample(695,400)     ## a random selection of 400 training "aa"s
> rowsao = sample(1022,600)    ## and 600 "ao"s (no seed was set, so a rerun gives a different split)
> aa.train = phoneme.aa[rowsaa,]
> aa.test  = phoneme.aa[-rowsaa,]
> ao.train = phoneme.ao[rowsao,]
> ao.test  = phoneme.ao[-rowsao,]
> plotaa = sample(400,15)      ## plot a few green "aa" rows (aa.train has 400 rows)
> plotao = sample(600,15)      ## and red "ao" rows (ao.train has 600)
## columns 1:256 hold the log-periodogram values; g and speaker are columns 257 and 258
> plot(1:256, unlist(aa.train[plotaa[1],1:256]), type="l", col="green")
> lines(1:256, unlist(ao.train[plotao[1],1:256]), col="red")
> lines(1:256, unlist(ao.train[plotao[2],1:256]), col="red")
> lines(1:256, unlist(ao.train[plotao[3],1:256]), col="red")
> lines(1:256, unlist(aa.train[plotaa[2],1:256]), col="green")
> lines(1:256, unlist(aa.train[plotaa[3],1:256]), col="green")
## try it -- this reproduces the top panel of Figure 5.5
> train = rbind(aa.train, ao.train)   ## we eliminated rows for other sounds
> test  = rbind(aa.test, ao.test)     ## so now we have our dataset
> rm(ao.train, ao.test, aa.train, aa.test, plotaa, plotao)
> logreg = glm(g ~ ., data=train[,-258], family=binomial)
## I dropped "speaker" (column 258) from the data set rather than writing
## "- speaker" in the formula, because I got into trouble later with predictions.
## Note that g keeps all five levels; glm's binomial coding takes the first
## level ("aa") as 0 and everything else (here, only "ao") as 1.
> plot(1:256, logreg$coef[2:257], type="l", col="gray", ylim=c(-0.4,0.4), xlab="Frequency")
> smooth.fit = lm(logreg$coef[2:257] ~ ns(1:256, df=12))
> lines(1:256, smooth.fit$fitted.values, col="red")
> abline(h=0)
## not quite as much shrinkage as they show in Fig 5.5

## classification errors
> dim(test)
[1] 717 258
> phat = rep(0, 717)
> for(j in 1:717) phat[j] = sum(unlist(test[j,1:256]) * smooth.fit$fitted.values)
## this uses the red curve as the coefficients, instead of the original
## gray coefficients; note that this score has no intercept term
> table(phat < 0, test$g)

          aa  ao dcl  iy  sh
  FALSE    1  71   0   0   0
  TRUE   294 351   0   0   0
> 72/717
[1] 0.1004184

## now see how it does on training data
> phat2 = rep(0, 1000)
> for(j in 1:1000) phat2[j] = sum(unlist(train[j,1:256]) * smooth.fit$fitted.values)
> table(phat2 < 0, train$g)

          aa  ao dcl  iy  sh
  FALSE    0  75   0   0   0
  TRUE   400 525   0   0   0
## error rate 0.075

## what about predicting with the original coefficients?
> table(predict.glm(logreg, newdata=test[,1:257], type="response") > 0.5, test$g)

          aa  ao dcl  iy  sh
  FALSE  207  87   0   0   0
  TRUE    88 335   0   0   0
> 294/717
[1] 0.4100418

## and on training data
> table(predict.glm(logreg, type="response") > 0.5, train$g)

          aa  ao dcl  iy  sh
  FALSE  366  31   0   0   0
  TRUE    34 569   0   0   0
## error rate (366+31)/1000 = 0.397

## The results look more dramatic than in the text, and the apparent switch of
## the sign is a puzzle. Two things seem to be going on. First, the "error
## rates" above sum the FALSE row of each table, but with glm's coding a
## fitted probability above 0.5 predicts "ao", so the misclassified cases are
## the off-diagonal cells: for the original coefficients that gives
## (88+87)/717 = 0.244 on test data and (34+31)/1000 = 0.065 on training data,
## far more plausible for a 256-parameter logistic fit. Second, the smoothed
## score phat omits the intercept, so thresholding it at 0 is not the decision
## rule the glm fit implies; counted honestly it misclassifies 352/717 or
## 365/717 depending on which class "phat < 0" is taken to predict, i.e.
## essentially chance. Did I also get unlucky with my random subset? Perhaps,
## but these two issues seem the likelier explanation.
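
## Postscript 1: counting errors from the glm fit directly. This sketch was
## not run in the session above; it assumes logreg and test still exist.
## With glm's coding, a fitted probability above 0.5 predicts "ao", so the
## error rate is the fraction of off-diagonal cases:

p.test    = predict(logreg, newdata = test, type = "response")
pred.test = ifelse(p.test > 0.5, "ao", "aa")
mean(pred.test != as.character(test$g))
## for the test table shown above this comes to (88+87)/717 = 0.244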
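
## Postscript 2: repairing the smoothed score. phat above has no intercept,
## so comparing it with 0 is arbitrary. One heuristic repair -- my own
## assumption here, not the book's procedure -- is to pair the original
## intercept with the smoothed slopes; again a sketch against the session's
## objects:

eta.sm  = drop(logreg$coef[1] + as.matrix(test[,1:256]) %*% smooth.fit$fitted.values)
pred.sm = ifelse(eta.sm > 0, "ao", "aa")
mean(pred.sm != as.character(test$g))   ## smoothed-coefficient test error, on the same footing as above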
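
## Postscript 3: the book does not smooth the fitted coefficients after the
## fact; in Section 5.2.3 it restricts beta(f) to a natural cubic spline
## basis H before fitting, by regressing on the filtered inputs x* = t(H) x.
## A sketch of that approach (df = 12 to match the smoother used above),
## assuming train and test as constructed in the session:

library(splines)
H     = ns(1:256, df = 12)                ## 256 x 12 basis matrix
XH.tr = as.matrix(train[,1:256]) %*% H    ## filtered training inputs
XH.te = as.matrix(test[,1:256])  %*% H
y.tr  = as.numeric(train$g == "ao")       ## "ao" = 1, "aa" = 0
filt  = glm(y.tr ~ XH.tr, family = binomial)
beta.filt = H %*% coef(filt)[-1]          ## implied smooth coefficient curve
eta.te    = drop(coef(filt)[1] + XH.te %*% coef(filt)[-1])
mean((eta.te > 0) != (test$g == "ao"))    ## test error of the filtered fit
## lines(1:256, beta.filt, col="blue")    ## overlay on the coefficient plot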