Stat 480/580: Applied Statistics with R

Part 1: Forwards and backwards model selection using the Akaike Information Criterion (AIC)
Part 2: Cross-validation for comparing the prediction abilities of models

General ideas:

AIC: A commonly accepted criterion for choosing a model that predicts well
while staying parsimonious in its number of parameters. The smaller the AIC,
the better. For a linear model,

    AIC = n*log(RSS/n) + 2p,

where n is the sample size, RSS is the residual sum of squares, and p is the
number of parameters.

Forwards model selection: Start with a very small model and add variables one
at a time until the selection criterion (AIC, adjusted R-squared, F-statistic,
etc.) no longer improves.

Backwards model selection: Start with a large model and peel off variables one
at a time.

Stepwise model selection (both): Start with an intermediate model and, at each
step, consider both dropping one variable and adding one variable. Stop when
no single change improves the criterion.

> birds <- read.table(url("http://www.humboldt.edu/~mar13/Data.dir/bird.txt"),
+                     skip=15, header=T)
> names( birds )
 [1] "sv" "ag" "tl" "ae" "wt" "bh" "hl" "fl" "tt" "sk" "kl"
>
> fit.all <- lm( wt ~ ., data=birds )   # use all variables to predict wt
> fit.all

Call:
lm(formula = wt ~ ., data = birds)

Coefficients:
(Intercept)           sv           ag           tl           ae           bh
   -21.6441       0.8673      -0.1075       0.0639       0.0148       0.3037
         hl           fl           tt           sk           kl
    17.8480     -11.9086       6.3948       8.8568       6.6162

>
> fit.none <- lm( wt ~ 1, data=birds )   # y = beta0 model, no x variables
> fit.none

Call:
lm(formula = wt ~ 1, data = birds)

Coefficients:
(Intercept)
       25.8

> formula.all <- formula( wt ~ ag + tl + ae + bh + hl + fl + tt + sk + kl + sv )
>
> # add just one x variable.
> # formula.all tells add1() what variables it can add from.
> add1( fit.none, scope=formula.all )
Single term additions

Model:
wt ~ 1
       Df Sum of Sq     RSS    AIC
<none>              174.040 62.324
ag      1     0.010 174.030 64.319
tl      1    48.005 126.035 36.246
ae      1    43.345 130.695 39.405
bh      1    30.827 143.213 47.363
hl      1    40.868 133.172 41.038
fl      1    34.197 139.842 45.291
tt      1    38.892 135.147 42.320
sk      1    22.474 151.565 52.294
kl      1    28.137 145.903 48.982
sv      1    13.085 160.955 57.524
>
> fit.tl <- lm( wt ~ tl, data=birds )
> sum( fit.tl$resid^2 )                             # RSS
[1] 126.0347
> sum( fit.none$resid^2 ) - sum( fit.tl$resid^2 )   # change in SS
[1] 48.00484
> # AIC = n*log(RSS/n) + 2p, where p = number of parameters
> # Goal: minimize AIC
> 87*log(126.0347/87) + 2*2
[1] 36.24648
>
> extractAIC(fit.tl)   # the equivalent computation
[1]  2.00000 36.24648
> # Note: extractAIC() is the RSS-based version that step() uses for lm()
> # objects; AIC() is the generic likelihood-based version (and the one to
> # use for glm() objects, where the likelihood is available).
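For lm() fits the two functions differ only by an additive constant (the part
of the Gaussian log-likelihood that does not depend on which predictors are in
the model), so they rank models identically. A quick check of this, as a
sketch using the fit.tl object from above:

> extractAIC(fit.tl)                    # RSS-based: n*log(RSS/n) + 2*p
> AIC(fit.tl)                           # likelihood-based: -2*logLik + 2*(p+1),
>                                       # counting sigma^2 as a parameter
> AIC(fit.tl) - extractAIC(fit.tl)[2]   # offset n*(log(2*pi)+1) + 2, the same
>                                       # for every lm fit to these 87 birds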
> drop1( fit.all )   # drop only 1 variable
Single term deletions

Model:
wt ~ sv + ag + tl + ae + bh + hl + fl + tt + sk + kl
       Df Sum of Sq     RSS    AIC
<none>               90.822 25.740
sv      1     9.569 100.391 32.455
ag      1     0.202  91.024 23.934
tl      1     1.335  92.156 25.009
ae      1     0.109  90.931 23.845
bh      1     1.942  92.764 25.581
hl      1     2.632  93.453 26.225
fl      1     1.222  92.044 24.903
tt      1     1.855  92.676 25.499
sk      1     0.956  91.777 24.651
kl      1     3.167  93.989 26.722

> # forwards model selection
> step( fit.none, direction="forward", scope=formula.all )
Start:  AIC=62.32
wt ~ 1

       Df Sum of Sq     RSS    AIC
+ tl    1    48.005 126.035 36.246
+ ae    1    43.345 130.695 39.405
+ hl    1    40.868 133.172 41.038
+ tt    1    38.892 135.147 42.320
+ fl    1    34.197 139.842 45.291
+ bh    1    30.827 143.213 47.363
+ kl    1    28.137 145.903 48.982
+ sk    1    22.474 151.565 52.294
+ sv    1    13.085 160.955 57.524
<none>              174.040 62.324
+ ag    1     0.010 174.030 64.319

Step:  AIC=36.25
wt ~ tl

       Df Sum of Sq     RSS    AIC
+ hl    1    15.811 110.224 26.584
+ tt    1    15.252 110.783 27.025
+ bh    1    13.383 112.652 28.480
+ kl    1    11.159 114.875 30.181
+ fl    1     9.596 116.439 31.357
+ ae    1     8.446 117.588 32.212
+ sk    1     5.005 121.030 34.721
<none>              126.035 36.246
+ sv    1     0.428 125.607 37.951
+ ag    1     0.195 125.840 38.112

Step:  AIC=26.58
wt ~ tl + hl

       Df Sum of Sq     RSS    AIC
+ sv    1     7.154 103.069 22.746
+ bh    1     4.251 105.973 25.163
+ kl    1     3.344 106.880 25.904
<none>              110.224 26.584
+ tt    1     2.174 108.049 26.851
+ sk    1     0.639 109.584 28.078
+ ag    1     0.530 109.693 28.165
+ fl    1     0.302 109.921 28.345
+ ae    1     0.064 110.159 28.534

Step:  AIC=22.75
wt ~ tl + hl + sv

       Df Sum of Sq     RSS    AIC
+ kl    1     5.471  97.598 20.000
+ bh    1     5.300  97.770 20.154
<none>              103.069 22.746
+ tt    1     2.341 100.729 22.747
+ sk    1     1.519 101.550 23.454
+ ag    1     0.605 102.464 24.234
+ ae    1     0.259 102.810 24.527
+ fl    1     0.046 103.023 24.707

Step:  AIC=20
wt ~ tl + hl + sv + kl

       Df Sum of Sq    RSS    AIC
+ bh    1     3.212 94.386 19.089
<none>              97.598 20.000
+ tt    1     1.897 95.701 20.293
+ sk    1     1.697 95.901 20.474
+ ag    1     0.214 97.384 21.809
+ fl    1     0.104 97.494 21.908
+ ae    1     0.093 97.504 21.917

Step:  AIC=19.09
wt ~ tl + hl + sv + kl + bh

       Df Sum of Sq    RSS    AIC
<none>              94.386 19.089
+ tt    1     1.037 93.349 20.128
+ sk    1     0.954 93.432 20.205
+ fl    1     0.340 94.046 20.775
+ ae    1     0.293 94.093 20.818
+ ag    1     0.172 94.214 20.930

Call:
lm(formula = wt ~ tl + hl + sv + kl + bh, data = birds)

Coefficients:
(Intercept)           tl           hl           sv           kl           bh
  -18.77819      0.07717     18.51592      0.85592      6.73364      0.36799
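The trace above prints every candidate at every step. When only the chosen
model is of interest, the same forward search can be run quietly; a sketch,
assuming the fit.none and formula.all objects from above:

> fwd <- step( fit.none, direction="forward", scope=formula.all, trace=0 )
> formula(fwd)      # should match the final model above: wt ~ tl + hl + sv + kl + bh
> extractAIC(fwd)   # its number of parameters and AIC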
> step( fit.all, direction="backward" )   # backwards model selection
Start:  AIC=25.74
wt ~ sv + ag + tl + ae + bh + hl + fl + tt + sk + kl

       Df Sum of Sq     RSS    AIC
- ae    1     0.109  90.931 23.845
- ag    1     0.202  91.024 23.934
- sk    1     0.956  91.777 24.651
- fl    1     1.222  92.044 24.903
- tl    1     1.335  92.156 25.009
- tt    1     1.855  92.676 25.499
- bh    1     1.942  92.764 25.581
<none>               90.822 25.740
- hl    1     2.632  93.453 26.225
- kl    1     3.167  93.989 26.722
- sv    1     9.569 100.391 32.455

Step:  AIC=23.84
wt ~ sv + ag + tl + bh + hl + fl + tt + sk + kl

       Df Sum of Sq     RSS    AIC
- ag    1     0.280  91.211 22.112
- sk    1     0.971  91.902 22.769
- fl    1     1.199  92.130 22.984
- bh    1     1.842  92.773 23.590
- tt    1     2.001  92.932 23.739
<none>               90.931 23.845
- tl    1     2.186  93.117 23.911
- kl    1     3.305  94.235 24.950
- hl    1     3.491  94.422 25.123
- sv    1     9.467 100.398 30.461

Step:  AIC=22.11
wt ~ sv + tl + bh + hl + fl + tt + sk + kl

       Df Sum of Sq     RSS    AIC
- sk    1     0.979  92.190 21.041
- fl    1     1.132  92.343 21.185
- bh    1     1.889  93.100 21.896
- tt    1     1.905  93.116 21.911
- tl    1     2.095  93.306 22.088
<none>               91.211 22.112
- hl    1     3.347  94.558 23.248
- kl    1     3.613  94.824 23.492
- sv    1     9.520 100.731 28.750

Step:  AIC=21.04
wt ~ sv + tl + bh + hl + fl + tt + kl

       Df Sum of Sq     RSS    AIC
- fl    1     1.159  93.349 20.128
- tt    1     1.856  94.046 20.775
<none>               92.190 21.041
- bh    1     2.547  94.736 21.412
- tl    1     3.194  95.384 22.004
- kl    1     3.352  95.542 22.148
- hl    1     4.032  96.222 22.765
- sv    1     8.813 101.003 26.984

Step:  AIC=20.13
wt ~ sv + tl + bh + hl + tt + kl

       Df Sum of Sq     RSS    AIC
- tt    1     1.037  94.386 19.089
<none>               93.349 20.128
- bh    1     2.352  95.701 20.293
- tl    1     2.550  95.899 20.473
- hl    1     2.916  96.265 20.804
- kl    1     3.332  96.680 21.179
- sv    1     9.746 103.095 26.768

Step:  AIC=19.09
wt ~ sv + tl + bh + hl + kl

       Df Sum of Sq     RSS    AIC
<none>               94.386 19.089
- tl    1     2.839  97.225 19.668
- bh    1     3.212  97.598 20.000
- kl    1     3.384  97.770 20.154
- hl    1     8.725 103.111 24.781
- sv    1     9.752 104.139 25.644

Call:
lm(formula = wt ~ sv + tl + bh + hl + kl, data = birds)

Coefficients:
(Intercept)           sv           tl           bh           hl           kl
  -18.77819      0.85592      0.07717      0.36799     18.51592      6.73364

> # both ways "step-wise"
> fit.some <- lm( wt ~ tl + ae, data=birds )
> step( fit.some, direction="both", scope=formula.all )
Start:  AIC=32.21
wt ~ tl + ae

       Df Sum of Sq     RSS    AIC
+ bh    1     9.127 108.461 27.182
+ tt    1     7.600 109.989 28.399
+ hl    1     7.429 110.159 28.534
+ kl    1     6.074 111.515 29.598
+ sv    1     3.363 114.225 31.687
+ fl    1     2.745 114.844 32.157
<none>              117.588 32.212
+ sk    1     2.339 115.249 32.463
+ ag    1     0.024 117.565 34.194
- ae    1     8.446 126.035 36.246
- tl    1    13.106 130.695 39.405

Step:  AIC=27.18
wt ~ tl + ae + bh

       Df Sum of Sq     RSS    AIC
+ sv    1     6.172 102.289 24.085
+ tt    1     2.769 105.692 26.932
+ hl    1     2.692 105.769 26.996
<none>              108.461 27.182
+ kl    1     2.345 106.116 27.281
- ae    1     4.190 112.652 28.480
+ sk    1     0.501 107.960 28.779
+ fl    1     0.223 108.238 29.003
+ ag    1     0.029 108.432 29.159
- bh    1     9.127 117.588 32.212
- tl    1    11.135 119.596 33.685

Step:  AIC=24.08
wt ~ tl + ae + bh + sv

       Df Sum of Sq     RSS    AIC
+ hl    1     5.085  97.204 21.649
+ kl    1     4.008  98.281 22.608
+ tt    1     3.679  98.611 22.899
- tl    1     1.860 104.149 23.653
<none>              102.289 24.085
+ sk    1     1.362 100.927 24.918
+ fl    1     1.144 101.145 25.106
+ ag    1     0.001 102.288 26.084
- sv    1     6.172 108.461 27.182
- ae    1     7.439 109.728 28.192
- bh    1    11.936 114.225 31.687

Step:  AIC=21.65
wt ~ tl + ae + bh + sv + hl

       Df Sum of Sq     RSS    AIC
- ae    1     0.566  97.770 20.154
+ kl    1     3.111  94.093 20.818
- tl    1     1.960  99.164 21.385
<none>               97.204 21.649
+ tt    1     0.901  96.303 22.839
+ sk    1     0.683  96.521 23.036
+ fl    1     0.409  96.795 23.281
+ ag    1     0.247  96.957 23.427
- hl    1     5.085 102.289 24.085
- bh    1     5.606 102.810 24.527
- sv    1     8.565 105.769 26.996

Step:  AIC=20.15
wt ~ tl + bh + sv + hl

       Df Sum of Sq     RSS    AIC
+ kl    1     3.384  94.386 19.089
<none>               97.770 20.154
+ tt    1     1.089  96.680 21.179
+ sk    1     0.695  97.074 21.533
+ ae    1     0.566  97.204 21.649
+ ag    1     0.415  97.354 21.783
- tl    1     4.255 102.025 21.860
+ fl    1     0.318  97.451 21.870
- bh    1     5.300 103.069 22.746
- sv    1     8.203 105.973 25.163
- hl    1    11.958 109.728 28.192

Step:  AIC=19.09
wt ~ tl + bh + sv + hl + kl

       Df Sum of Sq     RSS    AIC
<none>               94.386 19.089
- tl    1     2.839  97.225 19.668
- bh    1     3.212  97.598 20.000
+ tt    1     1.037  93.349 20.128
- kl    1     3.384  97.770 20.154
+ sk    1     0.954  93.432 20.205
+ fl    1     0.340  94.046 20.775
+ ae    1     0.293  94.093 20.818
+ ag    1     0.172  94.214 20.930
- hl    1     8.725 103.111 24.781
- sv    1     9.752 104.139 25.644

Call:
lm(formula = wt ~ tl + bh + sv + hl + kl, data = birds)

Coefficients:
(Intercept)           tl           bh           sv           hl           kl
  -18.77819      0.07717      0.36799      0.85592     18.51592      6.73364
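Notice that the forward, backward, and stepwise searches all stop at the same
five-predictor model (AIC = 19.09), just with the terms listed in different
orders. As a sanity check, the selected model can be refit directly:

> fit.final <- lm( wt ~ tl + hl + sv + kl + bh, data=birds )
> extractAIC( fit.final )   # 6 parameters (intercept + 5 slopes); AIC should be 19.089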
Demonstration of Cross-Validation: Test set method and leave-one-out CV

> attach(birds)
> ## Let's try to decide between the 2 models
> # The 2 models to consider from the sparrow dataset
> fit1 <- lm( wt ~ tl )
> fit2 <- lm( wt ~ tl + I(tl^2) )
> plot( tl, wt )
> tlspan <- seq(from=153, to=167, length=1000)   # save some typing
> points( tlspan, predict(fit1, newdata=data.frame(tl=tlspan)), type="l", lty=1 )
> points( tlspan, predict(fit2, newdata=data.frame(tl=tlspan)), type="l", lty=2 )

[Figure: wt versus tl with the linear (solid, lty=1) and quadratic (dashed,
lty=2) fits overlaid.]

> # AIC suggests model 2
> extractAIC(fit1)
[1]  2.00000 36.24648
> extractAIC(fit2)
[1]  3.00000 36.06651
>
> # traditional techniques suggest model 1
> summary(fit1)

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -11.59756    6.57429  -1.764   0.0813 .
tl            0.23313    0.04097   5.690 1.76e-07 ***
---
Residual standard error: 1.218 on 85 degrees of freedom
Multiple R-squared: 0.2758,     Adjusted R-squared: 0.2673
F-statistic: 32.38 on 1 and 85 DF,  p-value: 1.765e-07

> summary(fit2)

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 381.07819  269.04879   1.416    0.160
tl           -4.66515    3.35540  -1.390    0.168
I(tl^2)       0.01527    0.01046   1.460    0.148
---
Residual standard error: 1.21 on 84 degrees of freedom
Multiple R-squared: 0.2937,     Adjusted R-squared: 0.2769
F-statistic: 17.47 on 2 and 84 DF,  p-value: 4.532e-07

> # Residual plots are not all that informative; perhaps model 2
> windows()
> plot( predict(fit1), residuals(fit1) ); abline(h=0); title("Model 1")

[Figure: "Model 1" -- residuals(fit1) versus predict(fit1), with a horizontal
line at zero.]

> windows()
> plot( predict(fit2), residuals(fit2) ); abline(h=0); title("Model 2")

[Figure: "Model 2" -- residuals(fit2) versus predict(fit2), with a horizontal
line at zero.]

> # save some typing
> formula1 <- formula( wt ~ tl )
> formula2 <- formula( wt ~ tl + I(tl^2) )
>
> #######################
> # Test set method: training set is 70 birds randomly selected from the 87 birds
> training <- sample( 87, size=70, replace=F )   # determine training set
>
> fit1a <- lm( formula1, subset=training )
> mean(sum( (wt[-training]-predict(fit1a, newdata=birds[-training,]))^2 ))   # test error, model 1
[1] 20.04031
> # (sum() already gives the total squared prediction error over the 17 test
> #  birds, so the mean() wrapper is a no-op; divide by 17 for the test MSE.
> #  The comparison between the two models is the same either way.)
>
> fit2a <- lm( formula2, subset=training )
> mean(sum( (wt[-training]-predict(fit2a, newdata=birds[-training,]))^2 ))   # test error, model 2
[1] 18.87475

> windows()
> plot( tl, wt )
> title( main="Original and training set (red) fits" )
> points( tl[training], wt[training], pch="x" )
> points( tlspan, predict(fit1, newdata=data.frame(tl=tlspan)), type="l", lty=1 )
> points( tlspan, predict(fit2, newdata=data.frame(tl=tlspan)), type="l", lty=2 )
> points( tlspan, predict(fit1a, newdata=data.frame(tl=tlspan)), type="l", lty=3,
+         lwd=2, col="red" )
> points( tlspan, predict(fit2a, newdata=data.frame(tl=tlspan)), type="l", lty=4,
+         lwd=2, col="red" )

[Figure: "Original and training set (red) fits" -- wt versus tl with the
training birds marked "x"; full-data fits in black, training-set fits in red.]

> # A different training set gives different results
> training <- sample( 87, size=70, replace=F )
> fit1a <- lm( formula1, subset=training )
> mean(sum( (wt[-training]-predict(fit1a, newdata=birds[-training,]))^2 ))   # test error, model 1
[1] 32.52064
> fit2a <- lm( formula2, subset=training )
> mean(sum( (wt[-training]-predict(fit2a, newdata=birds[-training,]))^2 ))   # test error, model 2
[1] 34.66781
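Because a single random 70/17 split is noisy (the two splits above even
disagree about which model looks better), one common remedy is to average the
test-set error over many random splits. A sketch, with a hypothetical seed and
number of splits that are not part of the original handout:

> set.seed(480)    # hypothetical seed, only so the sketch is reproducible
> nsplits <- 200   # hypothetical number of random splits
> mse1 <- mse2 <- numeric(nsplits)
> for ( s in 1:nsplits )
+ {
+   training <- sample( 87, size=70, replace=F )
+   fit1a <- lm( formula1, data=birds, subset=training )
+   fit2a <- lm( formula2, data=birds, subset=training )
+   mse1[s] <- mean( (wt[-training]-predict(fit1a, newdata=birds[-training,]))^2 )
+   mse2[s] <- mean( (wt[-training]-predict(fit2a, newdata=birds[-training,]))^2 )
+ }
> mean(mse1)   # average test MSE, model 1
> mean(mse2)   # average test MSE, model 2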
> ########################
> # Leave-one-out CV:
> # Have n training sets, each of size n-1.
> error1 <- 0; error2 <- 0   # to keep track of the squared errors
> for ( i in 1:87 )
+ {
+   fit1b <- lm( formula1, subset=-i )
+   error1 <- error1 + (wt[i]-predict(fit1b, newdata=birds[i,]))^2
+
+   fit2b <- lm( formula2, subset=-i )
+   error2 <- error2 + (wt[i]-predict(fit2b, newdata=birds[i,]))^2
+ }
> error1   # sum of the 87 squared leave-one-out errors, model 1
       1
132.3479
> error2   # sum of the 87 squared leave-one-out errors, model 2
       1
131.3796
> # (divide by 87 to express these as mean squared errors; the comparison
> #  between the two models is the same either way)
>
> windows()
> plot( tl, wt )
> title( main="87 different fits for each model" )
> for ( i in 1:87 )
+ {
+   fit1b <- lm( formula1, subset=-i )
+   points( tlspan, predict(fit1b, newdata=data.frame(tl=tlspan)), type="l", lty=1 )
+
+   fit2b <- lm( formula2, subset=-i )
+   points( tlspan, predict(fit2b, newdata=data.frame(tl=tlspan)), type="l", lty=2 )
+ }

[Figure: "87 different fits for each model" -- the 87 leave-one-out linear
(solid) and quadratic (dashed) fits overlaid on the wt versus tl scatterplot.]
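For linear models the 87 refits are not strictly necessary: the leave-one-out
residual for observation i equals the ordinary residual divided by (1 - h_ii),
where h_ii is the leverage, so the entire leave-one-out error sum (the PRESS
statistic) can be computed from a single fit. A sketch, assuming the fit1 and
fit2 objects from above:

> press <- function(fit) sum( (residuals(fit)/(1-hatvalues(fit)))^2 )
> press(fit1)   # should reproduce error1 above, 132.3479
> press(fit2)   # should reproduce error2 above, 131.3796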