Stepwise and Cross-validation techniques for model selection

Stat 480/580: Applied Statistics with R
Part 1: Forwards and Backwards model selection using the Akaike Information Criterion (AIC)
Part 2: Cross-validation for comparing prediction abilities of models
General ideas:
AIC: A commonly accepted criterion for choosing a model that predicts well while remaining
parsimonious in the number of parameters. The smaller the AIC, the better.
AIC = n*log(RSS/n) + 2p, where n is the sample size, RSS is the residual sum of squares, and
p is the number of parameters.
Forwards model selection: Start with a very small model and add variables one at a time until the
selection criterion (AIC, adjusted R-square, F-statistic, etc.) no longer improves.
Backwards model selection: Start with a large model and peel off variables one at a time.
Stepwise model selection (both): Start with an intermediate model and at each step consider
dropping or adding one variable. Stop when no single change improves the criterion. (A minimal
sketch of these strategies follows.)
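Here is a minimal sketch (not from the original handout) of these tools on R's built-in mtcars
data; the four predictors used are an arbitrary illustration:

fit0 <- lm( mpg ~ 1, data=mtcars )                     # intercept-only starting model
step( fit0, scope= ~ cyl + disp + hp + wt, direction="forward" )  # forward selection
fitA <- lm( mpg ~ cyl + disp + hp + wt, data=mtcars )  # full starting model
step( fitA, direction="backward" )                     # backward elimination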
> birds <- read.table(url("http://www.humboldt.edu/~mar13/Data.dir/bird.txt"), skip=15,
+ header=T)
> names( birds )
[1] "sv" "ag" "tl" "ae" "wt" "bh" "hl" "fl" "tt" "sk" "kl"
>
> fit.all <- lm( wt ~ ., data=birds) # use all variables to predict wt
> fit.all
Call:
lm(formula = wt ~ ., data = birds)
Coefficients:
(Intercept)           sv           ag           tl           ae           bh
   -21.6441       0.8673      -0.1075       0.0639       0.0148       0.3037
         hl           fl           tt           sk           kl
    17.8480     -11.9086       6.3948       8.8568       6.6162
>
> fit.none <- lm( wt ~ 1, data=birds) # y = beta0 model, no x variables
> fit.none
Call:
lm(formula = wt ~ 1, data = birds)
Coefficients:
(Intercept)
25.8
> formula.all <- formula( wt ~ ag + tl + ae + bh + hl + fl + tt + sk + kl + sv )
>
> # add just one x variable.
> # formula.all tells what variables it can add from.
> add1( fit.none, scope= formula.all)
Single term additions

Model:
wt ~ 1
       Df Sum of Sq     RSS    AIC
<none>              174.040 62.324
ag      1     0.010 174.030 64.319
tl      1    48.005 126.035 36.246
ae      1    43.345 130.695 39.405
bh      1    30.827 143.213 47.363
hl      1    40.868 133.172 41.038
fl      1    34.197 139.842 45.291
tt      1    38.892 135.147 42.320
sk      1    22.474 151.565 52.294
kl      1    28.137 145.903 48.982
sv      1    13.085 160.955 57.524
>
> fit.tl <- lm( wt ~ tl, data=birds )
> sum(fit.tl$resid^2) #RSS
[1] 126.0347
> sum( fit.none$resid^2 ) - sum(fit.tl$resid^2 ) # change in SS
[1] 48.00484
> # AIC = n*log(RSS/n)+2p, where p=number of parameters
> # Goal: minimize AIC
> 87*log(126.0347/87)+2*2
[1] 36.24648
>
> extractAIC(fit.tl) # The equivalent.
[1] 2.00000 36.24648
> # Note: extractAIC() is for lm() objects where the likelihood is not known.
> #       AIC() is for glm() objects where the likelihood is available.
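An aside (not in the original handout): for lm() fits the two functions differ only by an additive
constant that does not depend on the model, so they always rank models identically. A quick check:

extractAIC(fit.tl)                    # n*log(RSS/n) + 2p
AIC(fit.tl)                           # -2*logLik + 2*k (k counts sigma as well)
AIC(fit.tl) - extractAIC(fit.tl)[2]   # constant offset, the same for every model on these data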
> drop1( fit.all ) # drop only 1 variable
Single term deletions

Model:
wt ~ sv + ag + tl + ae + bh + hl + fl + tt + sk + kl
       Df Sum of Sq     RSS    AIC
<none>               90.822 25.740
sv      1     9.569 100.391 32.455
ag      1     0.202  91.024 23.934
tl      1     1.335  92.156 25.009
ae      1     0.109  90.931 23.845
bh      1     1.942  92.764 25.581
hl      1     2.632  93.453 26.225
fl      1     1.222  92.044 24.903
tt      1     1.855  92.676 25.499
sk      1     0.956  91.777 24.651
kl      1     3.167  93.989 26.722
> # forwards model selection
> step( fit.none, direction="forward", scope=formula.all )
Start: AIC=62.32
wt ~ 1

       Df Sum of Sq     RSS    AIC
+ tl    1    48.005 126.035 36.246
+ ae    1    43.345 130.695 39.405
+ hl    1    40.868 133.172 41.038
+ tt    1    38.892 135.147 42.320
+ fl    1    34.197 139.842 45.291
+ bh    1    30.827 143.213 47.363
+ kl    1    28.137 145.903 48.982
+ sk    1    22.474 151.565 52.294
+ sv    1    13.085 160.955 57.524
<none>              174.040 62.324
+ ag    1     0.010 174.030 64.319

Step: AIC=36.25
wt ~ tl

       Df Sum of Sq     RSS    AIC
+ hl    1    15.811 110.224 26.584
+ tt    1    15.252 110.783 27.025
+ bh    1    13.383 112.652 28.480
+ kl    1    11.159 114.875 30.181
+ fl    1     9.596 116.439 31.357
+ ae    1     8.446 117.588 32.212
+ sk    1     5.005 121.030 34.721
<none>              126.035 36.246
+ sv    1     0.428 125.607 37.951
+ ag    1     0.195 125.840 38.112

Step: AIC=26.58
wt ~ tl + hl

       Df Sum of Sq     RSS    AIC
+ sv    1     7.154 103.069 22.746
+ bh    1     4.251 105.973 25.163
+ kl    1     3.344 106.880 25.904
<none>              110.224 26.584
+ tt    1     2.174 108.049 26.851
+ sk    1     0.639 109.584 28.078
+ ag    1     0.530 109.693 28.165
+ fl    1     0.302 109.921 28.345
+ ae    1     0.064 110.159 28.534

Step: AIC=22.75
wt ~ tl + hl + sv

       Df Sum of Sq     RSS    AIC
+ kl    1     5.471  97.598 20.000
+ bh    1     5.300  97.770 20.154
<none>              103.069 22.746
+ tt    1     2.341 100.729 22.747
+ sk    1     1.519 101.550 23.454
+ ag    1     0.605 102.464 24.234
+ ae    1     0.259 102.810 24.527
+ fl    1     0.046 103.023 24.707

Step: AIC=20
wt ~ tl + hl + sv + kl

       Df Sum of Sq    RSS    AIC
+ bh    1     3.212 94.386 19.089
<none>              97.598 20.000
+ tt    1     1.897 95.701 20.293
+ sk    1     1.697 95.901 20.474
+ ag    1     0.214 97.384 21.809
+ fl    1     0.104 97.494 21.908
+ ae    1     0.093 97.504 21.917

Step: AIC=19.09
wt ~ tl + hl + sv + kl + bh

       Df Sum of Sq    RSS    AIC
<none>              94.386 19.089
+ tt    1     1.037 93.349 20.128
+ sk    1     0.954 93.432 20.205
+ fl    1     0.340 94.046 20.775
+ ae    1     0.293 94.093 20.818
+ ag    1     0.172 94.214 20.930

Call:
lm(formula = wt ~ tl + hl + sv + kl + bh, data = birds)
Coefficients:
(Intercept)           tl           hl           sv           kl           bh
  -18.77819      0.07717     18.51592      0.85592      6.73364      0.36799
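A side note (not in the original handout): step() returns the chosen model as an lm object, and
trace=0 suppresses the step-by-step tables, so the search above can be captured directly:

fit.fwd <- step( fit.none, direction="forward", scope=formula.all, trace=0 )
formula(fit.fwd)    # wt ~ tl + hl + sv + kl + bh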
> step( fit.all, direction="backward" ) # backwards model selection
Start: AIC=25.74
wt ~ sv + ag + tl + ae + bh + hl + fl + tt + sk + kl

       Df Sum of Sq     RSS    AIC
- ae    1     0.109  90.931 23.845
- ag    1     0.202  91.024 23.934
- sk    1     0.956  91.777 24.651
- fl    1     1.222  92.044 24.903
- tl    1     1.335  92.156 25.009
- tt    1     1.855  92.676 25.499
- bh    1     1.942  92.764 25.581
<none>               90.822 25.740
- hl    1     2.632  93.453 26.225
- kl    1     3.167  93.989 26.722
- sv    1     9.569 100.391 32.455

Step: AIC=23.84
wt ~ sv + ag + tl + bh + hl + fl + tt + sk + kl

       Df Sum of Sq     RSS    AIC
- ag    1     0.280  91.211 22.112
- sk    1     0.971  91.902 22.769
- fl    1     1.199  92.130 22.984
- bh    1     1.842  92.773 23.590
- tt    1     2.001  92.932 23.739
<none>               90.931 23.845
- tl    1     2.186  93.117 23.911
- kl    1     3.305  94.235 24.950
- hl    1     3.491  94.422 25.123
- sv    1     9.467 100.398 30.461

Step: AIC=22.11
wt ~ sv + tl + bh + hl + fl + tt + sk + kl

       Df Sum of Sq     RSS    AIC
- sk    1     0.979  92.190 21.041
- fl    1     1.132  92.343 21.185
- bh    1     1.889  93.100 21.896
- tt    1     1.905  93.116 21.911
- tl    1     2.095  93.306 22.088
<none>               91.211 22.112
- hl    1     3.347  94.558 23.248
- kl    1     3.613  94.824 23.492
- sv    1     9.520 100.731 28.750

Step: AIC=21.04
wt ~ sv + tl + bh + hl + fl + tt + kl

       Df Sum of Sq     RSS    AIC
- fl    1     1.159  93.349 20.128
- tt    1     1.856  94.046 20.775
<none>               92.190 21.041
- bh    1     2.547  94.736 21.412
- tl    1     3.194  95.384 22.004
- kl    1     3.352  95.542 22.148
- hl    1     4.032  96.222 22.765
- sv    1     8.813 101.003 26.984

Step: AIC=20.13
wt ~ sv + tl + bh + hl + tt + kl

       Df Sum of Sq     RSS    AIC
- tt    1     1.037  94.386 19.089
<none>               93.349 20.128
- bh    1     2.352  95.701 20.293
- tl    1     2.550  95.899 20.473
- hl    1     2.916  96.265 20.804
- kl    1     3.332  96.680 21.179
- sv    1     9.746 103.095 26.768

Step: AIC=19.09
wt ~ sv + tl + bh + hl + kl
       Df Sum of Sq     RSS    AIC
<none>               94.386 19.089
- tl    1     2.839  97.225 19.668
- bh    1     3.212  97.598 20.000
- kl    1     3.384  97.770 20.154
- hl    1     8.725 103.111 24.781
- sv    1     9.752 104.139 25.644

Call:
lm(formula = wt ~ sv + tl + bh + hl + kl, data = birds)
Coefficients:
(Intercept)           sv           tl           bh           hl           kl
  -18.77819      0.85592      0.07717      0.36799     18.51592      6.73364

Note: backwards selection arrives at the same five-variable model (wt ~ sv + tl + bh + hl + kl)
as forwards selection did, though the two searches need not agree in general.
> # both ways "step-wise"
> fit.some <- lm( wt ~ tl + ae, data=birds )
> step( fit.some, direction="both", scope=formula.all )
Start:  AIC=32.21
wt ~ tl + ae

       Df Sum of Sq     RSS    AIC
+ bh    1     9.127 108.461 27.182
+ tt    1     7.600 109.989 28.399
+ hl    1     7.429 110.159 28.534
+ kl    1     6.074 111.515 29.598
+ sv    1     3.363 114.225 31.687
+ fl    1     2.745 114.844 32.157
<none>              117.588 32.212
+ sk    1     2.339 115.249 32.463
+ ag    1     0.024 117.565 34.194
- ae    1     8.446 126.035 36.246
- tl    1    13.106 130.695 39.405

Step: AIC=27.18
wt ~ tl + ae + bh

       Df Sum of Sq     RSS    AIC
+ sv    1     6.172 102.289 24.085
+ tt    1     2.769 105.692 26.932
+ hl    1     2.692 105.769 26.996
<none>              108.461 27.182
+ kl    1     2.345 106.116 27.281
- ae    1     4.190 112.652 28.480
+ sk    1     0.501 107.960 28.779
+ fl    1     0.223 108.238 29.003
+ ag    1     0.029 108.432 29.159
- bh    1     9.127 117.588 32.212
- tl    1    11.135 119.596 33.685

Step: AIC=24.08
wt ~ tl + ae + bh + sv

       Df Sum of Sq     RSS    AIC
+ hl    1     5.085  97.204 21.649
+ kl    1     4.008  98.281 22.608
+ tt    1     3.679  98.611 22.899
- tl    1     1.860 104.149 23.653
<none>              102.289 24.085
+ sk    1     1.362 100.927 24.918
+ fl    1     1.144 101.145 25.106
+ ag    1     0.001 102.288 26.084
- sv    1     6.172 108.461 27.182
- ae    1     7.439 109.728 28.192
- bh    1    11.936 114.225 31.687

Step: AIC=21.65
wt ~ tl + ae + bh + sv + hl

       Df Sum of Sq     RSS    AIC
- ae    1     0.566  97.770 20.154
+ kl    1     3.111  94.093 20.818
- tl    1     1.960  99.164 21.385
<none>               97.204 21.649
+ tt    1     0.901  96.303 22.839
+ sk    1     0.683  96.521 23.036
+ fl    1     0.409  96.795 23.281
+ ag    1     0.247  96.957 23.427
- hl    1     5.085 102.289 24.085
- bh    1     5.606 102.810 24.527
- sv    1     8.565 105.769 26.996

Step: AIC=20.15
wt ~ tl + bh + sv + hl

       Df Sum of Sq     RSS    AIC
+ kl    1     3.384  94.386 19.089
<none>               97.770 20.154
+ tt    1     1.089  96.680 21.179
+ sk    1     0.695  97.074 21.533
+ ae    1     0.566  97.204 21.649
+ ag    1     0.415  97.354 21.783
- tl    1     4.255 102.025 21.860
+ fl    1     0.318  97.451 21.870
- bh    1     5.300 103.069 22.746
- sv    1     8.203 105.973 25.163
- hl    1    11.958 109.728 28.192

Step: AIC=19.09
wt ~ tl + bh + sv + hl + kl
       Df Sum of Sq     RSS    AIC
<none>               94.386 19.089
- tl    1     2.839  97.225 19.668
- bh    1     3.212  97.598 20.000
+ tt    1     1.037  93.349 20.128
- kl    1     3.384  97.770 20.154
+ sk    1     0.954  93.432 20.205
+ fl    1     0.340  94.046 20.775
+ ae    1     0.293  94.093 20.818
+ ag    1     0.172  94.214 20.930
- hl    1     8.725 103.111 24.781
- sv    1     9.752 104.139 25.644

Call:
lm(formula = wt ~ tl + bh + sv + hl + kl, data = birds)
Coefficients:
(Intercept)           tl           bh           sv           hl           kl
  -18.77819      0.07717      0.36799      0.85592     18.51592      6.73364

Again the stepwise ("both") search ends at the same five-variable model.
Demonstration of Cross-Validation: Test set method and leave-one-out CV
> attach(birds)
> # Let's try to decide between 2 models from the sparrow dataset
> fit1 <- lm( wt ~ tl )
> fit2 <- lm( wt ~ tl + I(tl^2) )
> plot( tl, wt )
> tlspan <- seq(from=153,to=167,length=1000) # save some typing
> points( tlspan, predict(fit1,newdata=data.frame(tl=tlspan)), type="l", lty=1 )
> points( tlspan, predict(fit2,newdata=data.frame(tl=tlspan)), type="l", lty=2 )
[Figure: scatterplot of wt versus tl with the linear (solid) and quadratic (dashed) fits overlaid]
> # AIC suggests model 2
> extractAIC(fit1)
[1] 2.00000 36.24648
> extractAIC(fit2)
[1] 3.00000 36.06651
>
> # traditional techniques suggest model 1
> summary(fit1)
Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -11.59756    6.57429  -1.764   0.0813 .
tl            0.23313    0.04097   5.690 1.76e-07 ***
---
Residual standard error: 1.218 on 85 degrees of freedom
Multiple R-squared: 0.2758,     Adjusted R-squared: 0.2673
F-statistic: 32.38 on 1 and 85 DF,  p-value: 1.765e-07
> summary(fit2)
Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 381.07819  269.04879   1.416    0.160
tl           -4.66515    3.35540  -1.390    0.168
I(tl^2)       0.01527    0.01046   1.460    0.148
---
Residual standard error: 1.21 on 84 degrees of freedom
Multiple R-squared: 0.2937,     Adjusted R-squared: 0.2769
F-statistic: 17.47 on 2 and 84 DF,  p-value: 4.532e-07
> # Residual plots are not all that informative; perhaps model 2
> windows()
> plot( predict(fit1),residuals(fit1)); abline(h=0); title("Model 1")
[Figure: "Model 1" residuals versus fitted values]
> windows()
> plot( predict(fit2),residuals(fit2)); abline(h=0); title("Model 2")
[Figure: "Model 2" residuals versus fitted values]
>
> # save some typing
> formula1 <- formula( wt ~ tl )
> formula2 <- formula( wt ~ tl + I(tl^2) )
>
> #######################
> # Test set method: Training set is 70 randomly selected birds from the 87 birds
> training <- sample( 87, size=70, replace=F ) # determine training set
>
> fit1a <- lm( formula1, subset=training )
> mean(sum( (wt[-training]-predict(fit1a, newdata=birds[-training,]))^2 )) # prediction SSE, model 1
[1] 20.04031
>
> fit2a <- lm( formula2, subset=training )
> mean(sum( (wt[-training]-predict(fit2a, newdata=birds[-training,]))^2 )) # prediction SSE, model 2
[1] 18.87475
> windows()
> plot( tl, wt)
> title(main="Original and training set (red) fits" )
> points( tl[training], wt[training], pch="x")
> points( tlspan , predict(fit1,newdata=data.frame(tl=tlspan)), type="l", lty=1 )
> points( tlspan, predict(fit2,newdata=data.frame(tl=tlspan)), type="l", lty=2 )
> points( tlspan, predict(fit1a,newdata=data.frame(tl=tlspan)), type="l", lty=3, lwd=2,
+ col="red")
> points( tlspan, predict(fit2a,newdata=data.frame(tl=tlspan)), type="l", lty=4, lwd=2,
+ col="red")
[Figure: "Original and training set (red) fits" — scatterplot of wt versus tl with training points
marked "x" and the full-data and training-set fitted curves for both models]
>
> # Different training set gives different results
> training <- sample( 87, size=70, replace=F )
> fit1a <- lm( formula1, subset=training )
> mean(sum( (wt[-training]-predict(fit1a, newdata=birds[-training,]))^2 )) # prediction SSE, model 1
[1] 32.52064
> fit2a <- lm( formula2, subset=training )
> mean(sum( (wt[-training]-predict(fit2a, newdata=birds[-training,]))^2 )) # prediction SSE, model 2
[1] 34.66781
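A single random split is noisy, so one remedy (a sketch, not part of the original handout; nrep
and the seed below are arbitrary choices) is to repeat the split many times and average the
test-set errors:

set.seed(1)                  # arbitrary seed, for reproducibility
nrep <- 200
sse <- matrix(NA, nrep, 2)   # one row of (model 1, model 2) errors per split
for ( r in 1:nrep )
{
  tr <- sample( 87, size=70, replace=F )
  f1 <- lm( formula1, data=birds, subset=tr )
  f2 <- lm( formula2, data=birds, subset=tr )
  sse[r,1] <- sum( (wt[-tr]-predict(f1, newdata=birds[-tr,]))^2 )
  sse[r,2] <- sum( (wt[-tr]-predict(f2, newdata=birds[-tr,]))^2 )
}
colMeans(sse)                # average test-set SSE for each model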
> ########################
> # Leave-one-out CV:
> # Have n training sets, each of size n-1.
> error1 <- 0; error2 <- 0   # to keep track of squared errors
> for ( i in 1:87 )
+ {
+   fit1b <- lm( formula1, subset=-i )
+   error1 <- error1 + (wt[i]-predict(fit1b, newdata=birds[i,]))^2
+   fit2b <- lm( formula2, subset=-i )
+   error2 <- error2 + (wt[i]-predict(fit2b, newdata=birds[i,]))^2
+ }
> error1 # prediction SSE, model 1
       1
132.3479
> error2 # prediction SSE, model 2
       1
131.3796
>
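An aside (not in the original handout): for linear models the leave-one-out prediction error for
observation i equals e_i/(1 - h_ii), where e_i is the ordinary residual and h_ii the leverage, so
the totals above can be reproduced from a single fit (the PRESS statistic):

# PRESS = sum of squared leave-one-out prediction errors, computed without refitting
press <- function(fit) sum( (residuals(fit) / (1 - hatvalues(fit)))^2 )
press(fit1)   # should match error1
press(fit2)   # should match error2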
> windows()
> plot( tl, wt)
> title( main="87 different fits for each model")
> for( i in 1:87 )
+ {
+   fit1b <- lm( formula1, subset=-i )
+   points( tlspan, predict(fit1b,newdata=data.frame(tl=tlspan)), type="l", lty=1 )
+   fit2b <- lm( formula2, subset=-i )
+   points( tlspan, predict(fit2b,newdata=data.frame(tl=tlspan)), type="l", lty=2 )
+ }
[Figure: "87 different fits for each model" — scatterplot of wt versus tl with the 87
leave-one-out fitted curves overlaid for each model]