# Model Selection Handout ----
#
# Variables in the election data set:
#   V: votes for a presidential candidate
#   I: are they incumbent?
#   D: democrat or republican incumbent?
#   W: wartime election?
#   G: GDP growth rate in election year
#   P: absolute GDP deflator growth rate
#   N: number of quarters in which GDP growth rate > 3.2%
#
# Lines starting with "#>" are console output recorded from one run of the
# handout. Columns are always referenced through `election.table` (no
# attach()), so every section can be read and run on its own terms.

# This is the library with the leaps function
# install.packages("leaps")
library(leaps)
# this package has the cross-validation function (cv.glm) in it
library(boot)

# Election data ----

election.table <- read.table(
  "C:/Classes/Stat214/election.txt",
  sep = "\t", quote = "", header = TRUE
)
election.table[1:5, ]
#>   Year      V  I  D W       G      P  N
#> 1 1916 0.5168  1  1 0   2.229  4.252  3
#> 2 1920 0.3612  1  0 1 -11.463 16.535  5
#> 3 1924 0.4176 -1 -1 0  -3.872  5.161 10
#> 4 1928 0.4118 -1  0 0   4.623  0.183  7
#> 5 1932 0.5916 -1 -1 0 -14.901  7.069  4

# Scatterplot matrix of everything except the first column (Year).
pairs(
  election.table[, 2:ncol(election.table)],
  cex.labels = 3, pch = 23, bg = "orange", cex = 2
)

# Leaps takes a design matrix as argument: throw away the intercept
# column or leaps will complain.
X <- model.matrix(V ~ I + D + W + G + P + N, data = election.table)[, -1]

# Best subsets: R^2 ----
# Notice that R^2 increases with model size.

election.leaps <- leaps(X, election.table$V, nbest = 3, method = "r2")
plot(election.leaps$size, election.leaps$r2, pch = 23, bg = "orange", cex = 2)
# which.max() picks exactly one row, even if two models tie.
best.model.r2 <- election.leaps$which[which.max(election.leaps$r2), ]
print(best.model.r2)
#>    1    2    3    4    5    6
#> TRUE TRUE TRUE TRUE TRUE TRUE

# Best subsets: adjusted R^2 ----

election.leaps <- leaps(X, election.table$V, nbest = 3, method = "adjr2")
plot(
  election.leaps$size, election.leaps$adjr2,
  pch = 23, bg = "orange", cex = 2
)
best.model.adjr2 <- election.leaps$which[which.max(election.leaps$adjr2), ]
print(best.model.adjr2)
#>     1     2     3     4     5     6
#> FALSE  TRUE FALSE FALSE  TRUE FALSE

# Best subsets: Mallow's Cp ----
# Smaller Cp is better, so take the minimum here.

election.leaps <- leaps(X, election.table$V, nbest = 3, method = "Cp")
plot(election.leaps$size, election.leaps$Cp, pch = 23, bg = "orange", cex = 2)
best.model.Cp <- election.leaps$which[which.min(election.leaps$Cp), ]
print(best.model.Cp)
#>     1     2     3     4     5     6
#> FALSE  TRUE FALSE FALSE FALSE FALSE

# Cross Validation of Prediction Error ----
#
# We will look at the cross-validated estimate of error for each model leaps
# thought was interesting in terms of adjusted R^2. The design matrix X built
# above is reused.
#
# cv.glm() assigns observations to folds at random, so the numbers recorded
# below change from run to run; call set.seed() first for reproducible folds.

election.leaps <- leaps(X, election.table$V, nbest = 4, method = "adjr2")
nmodel <- nrow(election.leaps$which) # how many models
CV.error <- numeric(nmodel)          # numeric vector of 0's of length nmodel
K <- 9
for (i in seq_len(nmodel)) {
  # Columns of X selected by the i-th candidate model; drop = FALSE keeps a
  # matrix even when a single predictor is selected.
  curX <- X[, election.leaps$which[i, ], drop = FALSE]
  # Response and selected predictors travel together in one data frame so
  # cv.glm() can refit the same formula on each training fold.
  cur.data <- data.frame(V = election.table$V)
  cur.data$curX <- curX
  cur.lm <- glm(V ~ curX, data = cur.data)
  # delta[1] is the raw cross-validation estimate of prediction error.
  CV.error[i] <- cv.glm(cur.data, cur.lm, K = K)$delta[1]
}
CV.error
#>  [1] 0.005123864 0.005969539 0.005737972          NA 0.005354762 0.005428094
#>  [7] 0.005908259 0.005166203          NA 0.005653756 0.004516925 0.005843429
#> [13]          NA          NA 0.008059973 0.006544132          NA 0.011622195
#> [19] 0.007633662 0.006313123 0.010701813
# NOTE(review): the NA entries presumably come from cross-validation splits
# that cv.glm() could not evaluate with K = 9 on 21 observations -- confirm.

# CV error against adjusted R^2
plot(
  election.leaps$adjr2, CV.error,
  pch = 23, bg = "orange", cex = 2,
  ylab = "CV error", xlab = "Adjusted R^2"
)
good.subset <- !is.na(CV.error) # exclude NAs
CV.error <- CV.error[good.subset]
best.model.CV <-
  election.leaps$which[good.subset, , drop = FALSE][which.min(CV.error), ]
print(best.model.CV)
#>     1     2     3     4     5     6
#> FALSE  TRUE FALSE FALSE  TRUE  TRUE

# Same for Cp ----

election.leaps <- leaps(X, election.table$V, nbest = 4, method = "Cp")
nmodel <- nrow(election.leaps$which)
CV.error <- numeric(nmodel)
K <- 9
for (i in seq_len(nmodel)) {
  curX <- X[, election.leaps$which[i, ], drop = FALSE]
  cur.data <- data.frame(V = election.table$V)
  cur.data$curX <- curX
  cur.lm <- glm(V ~ curX, data = cur.data)
  CV.error[i] <- cv.glm(cur.data, cur.lm, K = K)$delta[1]
}

# CV error against Mallow's Cp
plot(
  election.leaps$Cp, CV.error,
  pch = 23, bg = "orange", cex = 2,
  ylab = "CV error", xlab = "Mallow's Cp"
)

# These estimates are variable -- in general the "best" model
# will change with "sampling" of the cross-validation groups.
# They are more stable for larger data sets -- here we only
# have 21 observations.

# Recompute the best model for this section before printing it; printing
# first would just show the result left over from the adjusted R^2 section.
good.subset <- !is.na(CV.error)
CV.error <- CV.error[good.subset]
best.model.CV <-
  election.leaps$which[good.subset, , drop = FALSE][which.min(CV.error), ]
print(best.model.CV)
#>     1     2     3     4     5     6
#> FALSE  TRUE FALSE FALSE FALSE FALSE