## Note that data(prostate) needs to be corrected to duplicate book results str( prostate ) ## gives information about the variables cor( prostate[,1:8] ) ## shows pairwise correlations of all the features pairs( prostate[,1:9], col="violet" ) ## reproduces Figure 1.1 train <- subset( prostate, train==TRUE )[,1:9] ## the variables have not been standardized test <- subset( prostate, train=FALSE )[,1:9] ## but this will not affect the RSS, # if( require(leaps)) { ## will load this package # The book (page 56) uses only train subset, so we the same: prostate.leaps <- regsubsets( lpsa ~ . , data=train, nbest=70, #all! really.big=TRUE ) ## I did this differently: I did lpsa ~ . - train, data = pr.std, subset=train, ... prostate.leaps.sum <- summary( prostate.leaps ) ## gives all possible models prostate.models <- prostate.leaps.sum$which ## puts them in a matrix; try prostate.models[1:2,] prostate.models.size <- as.numeric(attr(prostate.models, "dimnames")[[1]]) ## the row names hist( prostate.models.size ) ## for information prostate.models.rss <- prostate.leaps.sum$rss ## rss is one of the few summary statistics given prostate.models.best.rss <- tapply( prostate.models.rss, prostate.models.size, min ) ## find the one with the smallest rss prostate.models.best.rss # Let us add results for the only intercept model prostate.dummy <- lm( lpsa ~ 1, data=train ) prostate.models.best.rss <- c( sum(resid(prostate.dummy)^2), prostate.models.best.rss) # Making a plot: plot( 0:8, prostate.models.best.rss, ylim=c(0, 100), type="b", xlab="subset size", ylab="Residual Sum Square", col="red2" ) points( prostate.models.size, prostate.models.rss, pch=17, col="brown",cex=0.7 )