Chapter 9, Statistical Modelling, of “The R Book”, 2nd ed.

Common terms:
  Regression: all explanatory variables are continuous
  ANOVA (Analysis of Variance): all explanatory variables are categorical
  ANCOVA (Analysis of Covariance): explanatory variables are both continuous and categorical

> trillium <- read.table(file="http://users.humboldt.edu/rizzardi/Data.dir/trillium",
+                        header=T, skip=9)
>
> # Show which variables are numeric, integers, and factors
> str( trillium )   # flower is a factor, site is an integer
'data.frame':   582 obs. of  4 variables:
 $ leaf  : num  11.6 8.2 12.7 10.6 10.1 13.7 16 15.7 10.4 5.7 ...
 $ stem  : num  3.4 2.9 4.6 4.2 3.4 4.7 5.3 6.3 3.4 5.5 ...
 $ flower: Factor w/ 3 levels "p","s","w": 1 3 3 3 1 1 3 1 1 1 ...
 $ site  : int  4 4 4 4 4 4 4 4 4 4 ...
>
> # Want site to be a factor
> trillium$site[1:60]
 [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[49] 2 2 2 2 4 4 4 4 4 4 4 4
> trillium$site <- factor( trillium$site )
> trillium$site[1:60]
 [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[49] 2 2 2 2 4 4 4 4 4 4 4 4
Levels: 1 2 3 4
> levels(trillium$site) <- c("Lake","RS","Park","Road")
> trillium$site[1:60]
 [1] Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road
[20] Road Road Road Road Road Road Road Road RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS
[39] RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   Road Road Road Road Road
[58] Road Road Road
Levels: Lake RS Park Road
>
>
> attach( trillium )
> # Example of 1-way ANOVA
> # Predict Leaf using collection site
> boxplot( leaf ~ site, data=trillium )

[page 1]

> fitA <- lm( leaf ~ site, data=trillium )
> summary(fitA)

Call:
lm(formula = leaf ~ site, data = trillium)

Residuals:
    Min      1Q  Median      3Q     Max
-7.5860 -1.6019 -0.0247  1.4525  9.1140

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  11.9605     0.1767  67.677  < 2e-16 ***
siteRS        1.1415     0.2622   4.353 1.59e-05 ***
sitePark      2.0871     0.2803   7.445 3.55e-13 ***
siteRoad      0.4255     0.2810   1.514    0.131
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.404 on 578 degrees of freedom
Multiple R-squared:  0.09607,   Adjusted R-squared:  0.09138
F-statistic: 20.48 on 3 and 578 DF,  p-value: 1.271e-12

> anova(fitA)
Analysis of Variance Table

Response: leaf
           Df Sum Sq Mean Sq F value    Pr(>F)
site        3  355.0 118.323  20.478 1.271e-12 ***
Residuals 578 3339.8   5.778
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> fitB <- lm( leaf ~ -1 + site, data=trillium)
> summary( fitB )

[page 2]

Call:
lm(formula = leaf ~ -1 + site, data = trillium)

Residuals:
    Min      1Q  Median      3Q     Max
-7.5860 -1.6019 -0.0247  1.4525  9.1140

Coefficients:
         Estimate Std. Error t value Pr(>|t|)
siteLake  11.9605     0.1767   67.68   <2e-16 ***
siteRS    13.1019     0.1937   67.64   <2e-16 ***
sitePark  14.0475     0.2176   64.55   <2e-16 ***
siteRoad  12.3860     0.2185   56.68   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.404 on 578 degrees of freedom
Multiple R-squared:  0.9662,    Adjusted R-squared:  0.966
F-statistic:  4134 on 4 and 578 DF,  p-value: < 2.2e-16

> mean( leaf[site=="Park"])
[1] 14.04754
> mean( leaf[site=="RS"])
[1] 13.10195
> mean( leaf[site=="Road"])
[1] 12.38595
> mean( leaf[site=="Lake"])
[1] 11.96049
> anova(fitB)
Analysis of Variance Table

Response: leaf
           Df Sum Sq Mean Sq F value    Pr(>F)
site        4  95538 23884.5  4133.6 < 2.2e-16 ***
Residuals 578   3340     5.8
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> dev.new()
> stripchart( leaf ~ site, data=trillium)
> abline(v=fitB$coef, lty=c(1:4))

[page 3]

>
>
> # Example of ANCOVA
> # Predict Leaf using collection site and stem length
> # Should intercepts be the same?
> dev.new()
> plot( stem, leaf, type="n" )
> points( stem, leaf, pch=as.numeric(site), cex=.9 )
> legend(6, 10, legend=c("Lake","Redwood Science","Park","Road"), pch=1:4 )
> fitC <- lm( leaf ~ stem + site )
> summary(fitC)

Call:
lm(formula = leaf ~ stem + site)

Residuals:
    Min      1Q  Median      3Q     Max
-7.5510 -1.2634  0.0639  1.2324  9.3789

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  7.47396    0.29584  25.264  < 2e-16 ***
stem         1.10491    0.06372  17.339  < 2e-16 ***
siteRS       0.42129    0.21680   1.943 0.052483 .
sitePark     0.81628    0.23902   3.415 0.000682 ***
siteRoad    -0.32491    0.23214  -1.400 0.162168
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.951 on 577 degrees of freedom
Multiple R-squared:  0.4057,    Adjusted R-squared:  0.4016
F-statistic: 98.48 on 4 and 577 DF,  p-value: < 2.2e-16

[page 4]

> anova(fitC)
Analysis of Variance Table

Response: leaf
           Df  Sum Sq Mean Sq  F value    Pr(>F)
stem        1 1408.12 1408.12 370.0405 < 2.2e-16 ***
site        3   90.94   30.31   7.9665 3.273e-05 ***
Residuals 577 2195.67    3.81
---
Signif.
codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> abline( fitC$coef[1], fitC$coef[2], lwd=1.5 )
> abline( (fitC$coef[1]+fitC$coef[3]), fitC$coef[2], lty=2 )
> abline( (fitC$coef[1]+fitC$coef[4]), fitC$coef[2] , lty=2 )
> abline( (fitC$coef[1]+fitC$coef[5]), fitC$coef[2], lty=2 )

# ---------------------------------------------------------------------------
# Commands-only script for the session above: a 1-way ANOVA (leaf ~ site) and
# an ANCOVA (leaf ~ stem + site) on the trillium data.
#
# Differences from the commands as typed in the transcript:
#   * header = TRUE instead of header = T (T is an ordinary, reassignable
#     variable).
#   * stringsAsFactors = TRUE is spelled out. Since R 4.0.0 read.table() no
#     longer converts strings to factors by default, so without it `flower`
#     would be read as character rather than as the factor shown by str().
#   * No attach(): every model names its data with data = trillium and the
#     remaining column look-ups go through with(trillium, ...), so nothing is
#     left on the search path.
#   * coef(fit) instead of fit$coef, which only works through partial
#     matching of the `coefficients` component.
# ---------------------------------------------------------------------------

trillium <- read.table(
  file = "http://users.humboldt.edu/rizzardi/Data.dir/trillium",
  header = TRUE,
  skip = 9,
  stringsAsFactors = TRUE
)

# Show which variables are numeric, integers, and factors
str(trillium) # flower is a factor, site is an integer

# Want site to be a factor
trillium$site[1:60]
trillium$site <- factor(trillium$site)
trillium$site[1:60]
# Labels are assigned positionally to the existing levels 1, 2, 3, 4
levels(trillium$site) <- c("Lake", "RS", "Park", "Road")
trillium$site[1:60]

# Example of 1-way ANOVA ----
# Predict Leaf using collection site
boxplot(leaf ~ site, data = trillium)
fitA <- lm(leaf ~ site, data = trillium)
summary(fitA)
anova(fitA)

# Same model without an intercept: one coefficient per site
fitB <- lm(leaf ~ -1 + site, data = trillium)
summary(fitB)

# Per-site sample means, for comparison with the coefficients of fitB
with(trillium, mean(leaf[site == "Park"]))
with(trillium, mean(leaf[site == "RS"]))
with(trillium, mean(leaf[site == "Road"]))
with(trillium, mean(leaf[site == "Lake"]))
anova(fitB)

dev.new()
stripchart(leaf ~ site, data = trillium)
# One vertical line per fitB coefficient, each with its own line type
abline(v = coef(fitB), lty = 1:4)

# Example of ANCOVA ----
# Predict Leaf using collection site and stem length
# Should intercepts be the same?
dev.new()
with(trillium, {
  plot(stem, leaf, type = "n")
  # Plotting symbol 1-4 follows the integer code of the site factor
  points(stem, leaf, pch = as.numeric(site), cex = 0.9)
})
legend(6, 10, legend = c("Lake", "Redwood Science", "Park", "Road"), pch = 1:4)

fitC <- lm(leaf ~ stem + site, data = trillium)
summary(fitC)
anova(fitC)

# Fitted lines: common slope (coefficient 2), site-specific intercepts.
# Coefficient 1 is the intercept of the baseline site; coefficients 3-5 are
# the offsets of the other three sites from that baseline.
cf <- coef(fitC)
abline(cf[1], cf[2], lwd = 1.5)
abline(cf[1] + cf[3], cf[2], lty = 2)
abline(cf[1] + cf[4], cf[2], lty = 2)
abline(cf[1] + cf[5], cf[2], lty = 2)