# STA 437/1005, Fall 2009, Assignment #1, Dataset 1. # Read data into a data frame called d0. d0 <- read.table("bodyfat.dat",head=TRUE) # Produce histograms and QQ plots for all variables. Puts four plots on a page. pdf("a2-bodyfat-hist.pdf",pointsize=9) par(mfrow=c(4,4)) for (i in 1:ncol(d0)) { hist(d0[,i],nclass=20,xlab=names(d0)[i],main="") } dev.off() pdf("a2-bodyfat-qq.pdf",pointsize=9) par(mfrow=c(4,4)) for (i in 1:ncol(d0)) { qqnorm(d0[,i],xlab=names(d0)[i],main="") } dev.off() # Print indexes of observations that look like they may be outliers from # the univariate plots produced above. cat("\nUnivariate outliers:\n\n") cat("weight",which(d0$weight>300),"\n") cat("height",which(d0$height<50),"\n") cat("neck",which(d0$neck>50),"\n") cat("abdom",which(d0$abdom>140),"\n") cat("hip",which(d0$hip>140),"\n") cat("thigh",which(d0$thigh>80),"\n") cat("knee",which(d0$knee>47),"\n") cat("biceps",which(d0$biceps>42),"\n") cat("ankle",which(d0$ankle>29),"\n") # Eliminate suspicious observations #39 (weight, and other variables), # #42 (height), and #31 and #86 (ankle) as outliers, new data is in d1. d1 <- d0[c(-39,-42,-31,-86),] # Look at pairwise scatterplots of the remaining data. pdf("a2-bodyfat-scatter.pdf",pointsize=9) plot(d1,pch=20) dev.off() # Look at more detailed plot of density versus pcfat, with line for how # pcfat was supposed to be computed. pdf("a2-bodyfat-dens-pcfat.pdf") plot(d1$density,d1$pcfat) densgrid <- seq(0.98,1.12,length=100) lines(densgrid,495/densgrid-450) dev.off() # Eliminate observations where pcfat and density don't relate as they should, # new data in d2. d2 <- d1[-which(abs(d1$pcfat-(495/d1$density-450))>1),] # Compute squared statistical distances from the sample mean, omitting the # density variable (redundant given pcfat) and the age variable (no reason to # think it should have a normal distribution). Data is converted to matrix # form so that matrix operations can be used. d2s <- as.matrix (d2[-c(1,3),]) S <- cov(d2s) m <- apply(d2s,2,mean) M <- matrix(rep(m,each=nrow(d2s)),nrow(d2s),ncol(d2s)) dist <- diag((d2s-M) %*% solve(S) %*% t(d2s-M)) # Look at a QQ plot of squared statistical distance versus the appropriate # chi-squared distribution. pdf("a2-bodyfat-dist.pdf") qqgrid <- ((1:length(dist))-0.5) / length(dist) plot(qchisq(qqgrid,ncol(d2s)),sort(dist)) abline(0,1) dev.off() # Record which observations have big statistical distances from the mean. # Set up a colour vector in which they are red, others black. bigdist <- which(dist>40) colours <- rep("black",length(dist)) colours[bigdist] <- "red" # Look at pairwise scatterplots (except density) with these points marked. pdf("a2-bodyfat-scatter2.pdf",pointsize=9) plot(d2[,-1],pch=20,col=colours) dev.off() # Compute BMI and variations on it, and add as additional variables. Convert # units to kilograms and metres by multiplying appropriately. d2$bmi <- (d2$weight*2.2) / (d2$height*0.0254)^2 d2$bmiA <- (d2$weight*2.2) / (d2$height*0.0254)^1.8 d2$bmiB <- (d2$weight*2.2) / (d2$height*0.0254)^2.2 d2$bmiC <- (d2$weight*2.2)^0.8 / (d2$height*0.0254)^2 d2$bmiD <- (d2$weight*2.2)^1.2 / (d2$height*0.0254)^2 d2$bmiE <- (d2$weight*2.2)^1.2 / (d2$height*0.0254)^2.2 d2$bmiF <- (d2$weight*2.2)^1.2 / (d2$height*0.0254)^1.8 d2$bmiG <- (d2$weight*2.2)^0.8 / (d2$height*0.0254)^2.2 d2$bmiH <- (d2$weight*2.2)^0.8 / (d2$height*0.0254)^1.8 # Print correlations of density and pcfat with these BMI measures options(width=130) cat("\nCorrelations of pcfat with BMI measures:\n\n") print(round(cor(d2$pcfat,d2[,c(16:24)]),3)) # Plot pcfat versus bmi, bmiB, bmiC, and bmiH. Plot points with large # statistical distances from the mean in red. pdf("a2-bodyfat-bmi.pdf",pointsize=9) par(mfrow=c(2,2)) plot(d2$bmi,d2$pcfat,pch=20,col=colours) plot(d2$bmiB,d2$pcfat,pch=20,col=colours) plot(d2$bmiC,d2$pcfat,pch=20,col=colours) plot(d2$bmiH,d2$pcfat,pch=20,col=colours) dev.off()