Beginning Chapter 9: ANCOVA

Chapter 9, Statistical Modelling, of “The R Book”, 2nd ed.
Common terms:
Regression: all explanatory variables are continuous
ANOVA (Analysis of Variance): all explanatory variables categorical
ANCOVA (Analysis of Covariance): explanatory variables both continuous and categorical
> trillium <- read.table(file="http://users.humboldt.edu/rizzardi/Data.dir/trillium",
+ header=T, skip=9)
>
> # Show which variables are numeric, integers, and factors
> str( trillium ) # flower is a factor, site is an integer
'data.frame':	582 obs. of  4 variables:
$ leaf : num 11.6 8.2 12.7 10.6 10.1 13.7 16 15.7 10.4 5.7 ...
$ stem : num 3.4 2.9 4.6 4.2 3.4 4.7 5.3 6.3 3.4 5.5 ...
$ flower: Factor w/ 3 levels "p","s","w": 1 3 3 3 1 1 3 1 1 1 ...
$ site : int 4 4 4 4 4 4 4 4 4 4 ...
>
> # Want site to be a factor
> trillium$site[1:60]
 [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[49] 2 2 2 2 4 4 4 4 4 4 4 4
> trillium$site <- factor( trillium$site )
> trillium$site[1:60]
 [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[49] 2 2 2 2 4 4 4 4 4 4 4 4
Levels: 1 2 3 4
> levels(trillium$site) <- c("Lake","RS","Park","Road")
> trillium$site[1:60]
 [1] Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road Road
[20] Road Road Road Road Road Road Road Road RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS
[39] RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   RS   Road Road Road Road Road
[58] Road Road Road
Levels: Lake RS Park Road
>
>
> attach( trillium )
> # Example of 1-way ANOVA
> # Predict Leaf using collection site
> boxplot( leaf ~ site, data=trillium )
1
> fitA <- lm( leaf ~ site, data=trillium )
> summary(fitA)
Call:
lm(formula = leaf ~ site, data = trillium)

Residuals:
    Min      1Q  Median      3Q     Max
-7.5860 -1.6019 -0.0247  1.4525  9.1140

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  11.9605     0.1767  67.677  < 2e-16 ***
siteRS        1.1415     0.2622   4.353 1.59e-05 ***
sitePark      2.0871     0.2803   7.445 3.55e-13 ***
siteRoad      0.4255     0.2810   1.514    0.131
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.404 on 578 degrees of freedom
Multiple R-squared:  0.09607,	Adjusted R-squared:  0.09138
F-statistic: 20.48 on 3 and 578 DF,  p-value: 1.271e-12
> anova(fitA)
Analysis of Variance Table

Response: leaf
           Df Sum Sq Mean Sq F value    Pr(>F)
site        3  355.0 118.323  20.478 1.271e-12 ***
Residuals 578 3339.8   5.778
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> fitB <- lm( leaf ~ -1 + site, data=trillium)
> summary( fitB )
2
Call:
lm(formula = leaf ~ -1 + site, data = trillium)

Residuals:
    Min      1Q  Median      3Q     Max
-7.5860 -1.6019 -0.0247  1.4525  9.1140

Coefficients:
         Estimate Std. Error t value Pr(>|t|)
siteLake  11.9605     0.1767   67.68   <2e-16 ***
siteRS    13.1019     0.1937   67.64   <2e-16 ***
sitePark  14.0475     0.2176   64.55   <2e-16 ***
siteRoad  12.3860     0.2185   56.68   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.404 on 578 degrees of freedom
Multiple R-squared:  0.9662,	Adjusted R-squared:  0.966
F-statistic:  4134 on 4 and 578 DF,  p-value: < 2.2e-16
> mean( leaf[site=="Park"])
[1] 14.04754
> mean( leaf[site=="RS"])
[1] 13.10195
> mean( leaf[site=="Road"])
[1] 12.38595
> mean( leaf[site=="Lake"])
[1] 11.96049
> anova(fitB)
Analysis of Variance Table

Response: leaf
           Df Sum Sq Mean Sq F value    Pr(>F)
site        4  95538 23884.5  4133.6 < 2.2e-16 ***
Residuals 578   3340     5.8
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> dev.new()
> stripchart( leaf ~ site, data=trillium)
> abline(v=fitB$coef, lty=c(1:4))
3
>
>
> # Example of ANCOVA
> # Predict Leaf using collection site and stem length
> # Should intercepts be the same?
> dev.new()
> plot( stem, leaf, type="n" )
> points( stem, leaf, pch=as.numeric(site), cex=.9 )
> legend(6, 10, legend=c("Lake","Redwood Science","Park","Road"), pch=1:4 )
> fitC <- lm( leaf ~ stem + site )
> summary(fitC)
Call:
lm(formula = leaf ~ stem + site)

Residuals:
    Min      1Q  Median      3Q     Max
-7.5510 -1.2634  0.0639  1.2324  9.3789

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  7.47396    0.29584  25.264  < 2e-16 ***
stem         1.10491    0.06372  17.339  < 2e-16 ***
siteRS       0.42129    0.21680   1.943 0.052483 .
sitePark     0.81628    0.23902   3.415 0.000682 ***
siteRoad    -0.32491    0.23214  -1.400 0.162168
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.951 on 577 degrees of freedom
Multiple R-squared:  0.4057,	Adjusted R-squared:  0.4016
F-statistic: 98.48 on 4 and 577 DF,  p-value: < 2.2e-16
4
> anova(fitC)
Analysis of Variance Table

Response: leaf
           Df  Sum Sq Mean Sq  F value    Pr(>F)
stem        1 1408.12 1408.12 370.0405 < 2.2e-16 ***
site        3   90.94   30.31   7.9665 3.273e-05 ***
Residuals 577 2195.67    3.81
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> abline( fitC$coef[1], fitC$coef[2], lwd=1.5 )
> abline( (fitC$coef[1]+fitC$coef[3]), fitC$coef[2], lty=2 )
> abline( (fitC$coef[1]+fitC$coef[4]), fitC$coef[2] , lty=2 )
> abline( (fitC$coef[1]+fitC$coef[5]), fitC$coef[2], lty=2 )
# Load the trillium data set: columns leaf, stem, flower and site.
# skip = 9 jumps over the first nine lines of the remote file
# (presumably a descriptive preamble -- confirm against the file).
# stringsAsFactors = TRUE keeps `flower` a factor on R >= 4.0, where the
# default changed to FALSE; the rest of the handout assumes a factor.
trillium <- read.table(
  file = "http://users.humboldt.edu/rizzardi/Data.dir/trillium",
  header = TRUE, skip = 9, stringsAsFactors = TRUE
)

# Show which variables are numeric, integers, and factors
str(trillium) # flower is a factor, site is an integer

# Want site to be a factor
trillium$site[1:60]
# Fix the level set to the four site codes explicitly. Relying on the codes
# that happen to occur in the data would silently shift the labels below
# if one of the sites were missing.
trillium$site <- factor(trillium$site, levels = 1:4)
trillium$site[1:60]
# Codes 1..4 become Lake, RS, Park, Road (in that order).
levels(trillium$site) <- c("Lake", "RS", "Park", "Road")
trillium$site[1:60]

# NOTE(review): attach() puts the columns on the search path; later code in
# this handout refers to leaf, stem and site as bare names and depends on
# it. Prefer data = trillium / trillium$col in new code.
attach(trillium)
# Example of 1-way ANOVA
# Predict leaf using collection site
boxplot(leaf ~ site, data = trillium)

# Fit with an intercept: under R's default treatment contrasts the
# intercept is the mean of the first site level and the remaining
# coefficients are differences from that level.
fitA <- lm(leaf ~ site, data = trillium)
summary(fitA)
anova(fitA)

# Dropping the intercept (-1) gives one coefficient per site, so each
# coefficient is that site's mean leaf value.
fitB <- lm(leaf ~ -1 + site, data = trillium)
summary(fitB)

# Cross-check: the raw group means should match the fitB coefficients.
# Columns are referenced through trillium so this does not depend on
# attach() having been called.
mean(trillium$leaf[trillium$site == "Park"])
mean(trillium$leaf[trillium$site == "RS"])
mean(trillium$leaf[trillium$site == "Road"])
mean(trillium$leaf[trillium$site == "Lake"])
anova(fitB)

# Strip chart of leaf by site, with a vertical line at each site mean
# (one line type per site, in level order).
dev.new()
stripchart(leaf ~ site, data = trillium)
# coef() instead of fitB$coef, which only works through partial matching
# of `$` on the "coefficients" component.
abline(v = coef(fitB), lty = 1:4)
# Example of ANCOVA
# Predict leaf using collection site and stem length
# Should intercepts be the same?
dev.new()

# Empty frame first, then points with one plotting symbol per site.
# with(trillium, ...) resolves the columns from the data frame so the plot
# does not depend on attach() having been called.
with(trillium, plot(stem, leaf, type = "n"))
# as.numeric(site) yields the level index 1..4, matching pch = 1:4 in the
# legend (level order: Lake, RS, Park, Road).
with(trillium, points(stem, leaf, pch = as.numeric(site), cex = .9))
legend(6, 10, legend = c("Lake", "Redwood Science", "Park", "Road"), pch = 1:4)

# Common slope for stem, separate intercept shift for each site.
fitC <- lm(leaf ~ stem + site, data = trillium)
summary(fitC)
anova(fitC)

# Fitted lines: coefficient 1 is the intercept, 2 the stem slope, and
# 3..5 the site offsets relative to the first level (see summary(fitC)).
# coef() replaces fitC$coef, which relied on partial matching of `$`.
ancova_coef <- coef(fitC)
abline(ancova_coef[1], ancova_coef[2], lwd = 1.5)
for (offset_idx in 3:5) {
  # Same slope, intercept shifted by the site effect.
  abline(ancova_coef[1] + ancova_coef[offset_idx], ancova_coef[2], lty = 2)
}