Model Selection Handout
####
####
## V : votes for a presidential candidate
## I: are they incumbent?
## D: democrat or republican incumbent?
## W: wartime election?
## G: GDP growth rate in election year
## P: absolute GDP deflator growth rate
## N: number of quarters in which GDP growth rate > 3.2%
####
# electiondata
# Read the tab-separated election data; the first line of the file holds
# the column names.
election.table <- read.table("C:/Classes/Stat214/election.txt",
                             sep = "\t", quote = "", header = TRUE)
# attach() puts the columns on the search path so they can be referenced
# by bare name (e.g. V instead of election.table$V).
attach(election.table)
# Show the first five rows
election.table[1:5, ]
Year V I D W G P N
1 1916 0.5168 1 1 0 2.229 4.252 3
2 1920 0.3612 1 0 1 -11.463 16.535 5
3 1924 0.4176 -1 -1 0 -3.872 5.161 10
4 1928 0.4118 -1 0 0 4.623 0.183 7
5 1932 0.5916 -1 -1 0 -14.901 7.069 4
# Scatterplot matrix of every column except the first one
pairs(election.table[, -1], pch = 23, bg = "orange", cex = 2,
      cex.labels = 3)
# This is the library with the leaps function
# install.packages('leaps')
library(leaps)
# leaps() wants the predictors as a design matrix: build the model matrix
# and drop its first column (the intercept), or leaps will complain
X <- model.matrix(V ~ I + D + W + G + P + N, data = election.table)[, -1]
# R^2 for the best 3 subsets of each size -- notice it increases
election.leaps <- leaps(x = X, y = election.table$V, method = "r2", nbest = 3)
plot(election.leaps$size, election.leaps$r2, cex = 2, pch = 23, bg = "orange")
# Row of the inclusion matrix that attains the largest R^2
best.model.r2 <-
  election.leaps$which[election.leaps$r2 == max(election.leaps$r2), ]
print(best.model.r2)
1 2 3 4 5 6
TRUE TRUE TRUE TRUE TRUE TRUE
# Adjusted R^2 for the best 3 subsets of each size
election.leaps <- leaps(x = X, y = election.table$V, method = "adjr2",
                        nbest = 3)
plot(election.leaps$size, election.leaps$adjr2, cex = 2, pch = 23,
     bg = "orange")
# Row of the inclusion matrix that attains the largest adjusted R^2
best.model.adjr2 <-
  election.leaps$which[election.leaps$adjr2 == max(election.leaps$adjr2), ]
print(best.model.adjr2)
1 2 3 4 5 6
FALSE TRUE FALSE FALSE TRUE FALSE
# Mallow's Cp for the best 3 subsets of each size
election.leaps <- leaps(x = X, y = election.table$V, method = "Cp", nbest = 3)
plot(election.leaps$size, election.leaps$Cp, cex = 2, pch = 23, bg = "orange")
# Row of the inclusion matrix that attains the smallest Cp
best.model.Cp <-
  election.leaps$which[election.leaps$Cp == min(election.leaps$Cp), ]
print(best.model.Cp)
1 2 3 4 5 6
FALSE TRUE FALSE FALSE FALSE FALSE
Cross Validation of Prediction Error
# this package has the cross-validation function in it
library(boot)
# We will look at the cross-validated estimate of error for each model
# leaps thought was interesting in terms of adjusted R^2.
# leaps() takes a design matrix as argument: throw away the intercept
# column or leaps will complain. The data frame is passed explicitly so
# nothing here depends on the columns having been attach()ed.
X <- model.matrix(glm(V ~ I + D + W + G + P + N, data = election.table))[, -1]
election.leaps <- leaps(X, election.table$V, nbest = 4, method = "adjr2")
nmodel <- nrow(election.leaps$which)  # how many models
CV.error <- numeric(nmodel)           # a vector of zeros of length nmodel
# NOTE(review): cv.glm() documents that a K which does not split the data
# into groups of approximately equal size is replaced by the closest value
# that does (with a warning). The NA entries that show up in CV.error are
# possibly related to that adjustment -- TODO confirm; a K that divides the
# number of rows evenly might avoid them.
K <- 9
for (i in seq_len(nmodel)) {
  # Response plus only the predictors included in the i-th candidate model;
  # drop = FALSE keeps single-predictor models as a named one-column matrix
  cur.data <- data.frame(V = election.table$V,
                         X[, election.leaps$which[i, ], drop = FALSE])
  cur.lm <- glm(V ~ ., data = cur.data)
  # delta[1] is the raw K-fold cross-validation estimate of prediction error
  CV.error[i] <- cv.glm(cur.data, cur.lm, K = K)$delta[1]
}
CV.error
 [1] 0.005123864 0.005969539 0.005737972          NA 0.005354762 0.005428094
 [7] 0.005908259 0.005166203          NA 0.005653756 0.004516925 0.005843429
[13]          NA          NA 0.008059973 0.006544132          NA 0.011622195
[19] 0.007633662 0.006313123 0.010701813
# CV error against adjusted R^2 for the candidate subsets
plot(election.leaps$adjr2, CV.error, xlab = "Adjusted R^2", ylab = "CV error",
     pch = 23, bg = "orange", cex = 2)
# Drop the models whose CV error came out as NA, then keep the row of the
# inclusion matrix with the smallest remaining CV error
good.subset <- !is.na(CV.error)
CV.error <- CV.error[good.subset]
best.model.CV <-
  election.leaps$which[good.subset, ][CV.error == min(CV.error), ]
print(best.model.CV)
1 2 3 4 5 6
FALSE TRUE FALSE FALSE TRUE TRUE
# Same for Cp: cross-validated error for each model leaps kept by Cp.
# The response is taken from election.table explicitly so nothing here
# depends on the columns having been attach()ed.
election.leaps <- leaps(X, election.table$V, nbest = 4, method = "Cp")
nmodel <- nrow(election.leaps$which)  # how many models
CV.error <- numeric(nmodel)
# NOTE(review): cv.glm() documents that a K which does not split the data
# into groups of approximately equal size is replaced by the closest value
# that does (with a warning) -- TODO confirm whether this is what produces
# NA entries in CV.error.
K <- 9
for (i in seq_len(nmodel)) {
  # Response plus only the predictors included in the i-th candidate model;
  # drop = FALSE keeps single-predictor models as a named one-column matrix
  cur.data <- data.frame(V = election.table$V,
                         X[, election.leaps$which[i, ], drop = FALSE])
  cur.lm <- glm(V ~ ., data = cur.data)
  # delta[1] is the raw K-fold cross-validation estimate of prediction error
  CV.error[i] <- cv.glm(cur.data, cur.lm, K = K)$delta[1]
}
# CV error against Mallow's Cp for the candidate subsets
plot(election.leaps$Cp, CV.error, pch = 23, bg = "orange", cex = 2,
     ylab = "CV error", xlab = "Mallow's Cp")
# These estimates are variable -- in general the "best" model
# will change with "sampling" of the cross-validation groups.
# They are more stable for larger data sets -- here we only
# have 21 observations
# NOTE: best.model.CV has not been recomputed for the Cp candidates yet,
# so this prints the model selected in the adjusted R^2 section.
print(best.model.CV)
1 2 3 4 5 6
FALSE TRUE FALSE FALSE TRUE TRUE
# Drop the models whose CV error came out as NA, then keep the row of the
# inclusion matrix with the smallest remaining CV error
good.subset <- !is.na(CV.error)
CV.error <- CV.error[good.subset]
best.model.CV <-
  election.leaps$which[good.subset, ][CV.error == min(CV.error), ]
print(best.model.CV)
1 2 3 4 5 6
FALSE TRUE FALSE FALSE FALSE FALSE
Download