401 Hw 6 solution Problem 1 .

advertisement
401 Hw 6 solution
Problem 1
# Load the data into R. file.choose() opens a dialog to pick the data file
# from your local path. My local path is:
# rawdata <- read.table("E:/2015 fall course/data/data.txt", sep = ",")
rawdata <- read.table(file.choose(), sep = ",")
# Keep columns V1..V11 of the raw table.
data <- data.frame(
  rawdata$V1, rawdata$V2, rawdata$V3, rawdata$V4, rawdata$V5, rawdata$V6,
  rawdata$V7, rawdata$V8, rawdata$V9, rawdata$V10, rawdata$V11
)
# For colnames, either case is OK
colnames(data) <- c(
  "ID", "RI", "NA20", "MGO", "AL203", "SI02", "K2O", "CAO", "BAO", "FE203",
  "TYPE"
)
# Work with the first 146 rows and rename the nine predictors to x1..x9
# (same column order as above) and the last column to glasstype.
GlassData <- data[1:146, ]
colnames(GlassData) <- c(
  "ID", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "glasstype"
)
mydata <- GlassData
# The response must be a factor for the binomial fits below.
mydata$glasstype <- factor(mydata$glasstype)
# Fit a logistic regression model of glasstype on all nine predictors x1..x9.
mylogitF <- glm(
  glasstype ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9,
  data = mydata, family = "binomial"
)
summary(mylogitF)
##
## Call:
## glm(formula = glasstype ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 +
##     x8 + x9, family = "binomial", data = mydata)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -1.91622  -0.83112   0.05402   0.81930   2.15093
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  279.591    403.809   0.692  0.48870
## x1           275.973    239.176   1.154  0.24856
## x2            -6.269      2.628  -2.386  0.01704 *
## x3            -8.554      2.688  -3.182  0.00146 **
## x4            -2.206      2.832  -0.779  0.43603
## x5            -7.130      2.565  -2.780  0.00543 **
## x6            -5.983      3.245  -1.844  0.06525 .
## x7            -7.188      2.741  -2.622  0.00873 **
## x8            -9.042      3.863  -2.341  0.01924 *
## x9             1.697      2.359   0.720  0.47181
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 202.15  on 145  degrees of freedom
## Residual deviance: 141.20  on 136  degrees of freedom
## AIC: 161.2
Based on p-value
# c) Side-by-side box plot of the fitted probabilities, by glass type.
p <- predict(mylogitF, type = "response")
plot(p ~ glasstype, data = mydata)
# or: evaluate the inverse logit of the fitted linear predictor by hand.
# The helper has its own name (ps_fun) so that storing its result in `ps`
# does not overwrite the function and it stays callable afterwards.
ps_fun <- function(x1, x2, x3, x4, x5, x6, x7, x8, x9) {
  # b[1] is the intercept, b[2]..b[10] are the slopes of x1..x9.
  b <- mylogitF$coefficients
  eta <- b[1] +
    b[2] * x1 + b[3] * x2 +
    b[4] * x3 + b[5] * x4 +
    b[6] * x5 + b[7] * x6 +
    b[8] * x7 + b[9] * x8 +
    b[10] * x9
  1 / (1 + exp(-eta))
}
ps <- ps_fun(
  mydata$x1, mydata$x2, mydata$x3, mydata$x4, mydata$x5, mydata$x6,
  mydata$x7, mydata$x8, mydata$x9
)
plot(ps ~ glasstype, data = mydata)
# d) Partial fitting: logistic regression on x2, x3, x5 and x7 only.
mylogitP <- glm(
  glasstype ~ x2 + x3 + x5 + x7,
  data = mydata, family = "binomial"
)
summary(mylogitP)
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -1.82105  -0.88596   0.06957   0.92089   2.02122
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) 362.8419    83.4540   4.348 1.38e-05 ***
## x2           -2.6675     0.7884  -3.383 0.000716 ***
## x3           -4.8217     0.9445  -5.105 3.31e-07 ***
## x5           -3.9143     0.9276  -4.220 2.45e-05 ***
## x7           -3.0635     0.6354  -4.821 1.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 202.15  on 145  degrees of freedom
## Residual deviance: 149.51  on 141  degrees of freedom
## AIC: 159.51
# Side-by-side box plot of the fitted probabilities from the partial model.
pp <- predict(mylogitP, type = "response")
# A factor on the x axis makes plot() draw one box per glass type; the axis
# labels match those of the formula call pp ~ glasstype.
plot(mydata$glasstype, pp, xlab = "glasstype", ylab = "pp")
The difference between the two plots is whether the boxes overlap: in one plot the two boxes do not overlap, in the other they do.
# f) Identify the Na-Ca pairs: logistic fit on x2 and x7 only
#    (the columns originally labelled NA20 and CAO).
mylogitPP <- glm(
  glasstype ~ x2 + x7,
  data = mydata,
  family = "binomial"
)
summary(mylogitPP)
## glm(formula = glasstype ~ x2 + x7, family = "binomial", data = mydata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.3228  -1.2075   0.7125   1.1617   1.4537
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   3.4031     4.4452   0.766    0.444
## x2           -0.3246     0.3078  -1.055    0.292
## x7            0.1075     0.1304   0.824    0.410
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 202.15  on 145  degrees of freedom
## Residual deviance: 199.64  on 143  degrees of freedom
## AIC: 205.64
# Fitted probability from mylogitPP at the given Na and Ca values
# (inverse logit of intercept + slope * Na + slope * Ca).
pNC <- function(Na, Ca) {
  b <- mylogitPP$coefficients
  eta <- b[1] + b[2] * Na + b[3] * Ca
  1 / (1 + exp(-eta))
}
# Evaluate on a 0..25 grid (step 0.1) in both directions and draw the
# probability contours at 0.1, 0.3, 0.5, 0.7 and 0.9.
Nav <- seq(from = 0, to = 25, by = 0.1)
Cav <- seq(from = 0, to = 25, by = 0.1)
GtContour <- outer(Nav, Cav, FUN = pNC)
contour(
  Nav, Cav, GtContour,
  levels = seq(.1, .9, .2),
  xlab = "Na", ylab = "Ca"
)
Problem 2
# Create the data sets: six x values, each used twice, with the responses
# listed for the first pass through the x values and then the second.
# EOPutrt has one missing response (x = 1.10, first pass).
EOPutrt <- data.frame(
  x = rep(c(0.02, 0.06, 0.11, 0.22, 0.56, 1.10), 2),
  y = c(67, 84, 98, 131, 144, NA, 51, 86, 115, 124, 158, 160)
)
EOPtrt <- data.frame(
  x = rep(c(0.02, 0.06, 0.11, 0.22, 0.56, 1.10), 2),
  y = c(76, 97, 123, 159, 191, 207, 47, 107, 139, 152, 201, 200)
)
# a) Scatter plot of the EOPtrt data set
library(ggplot2)
ggplot(EOPtrt, aes(x = x, y = y)) +
  geom_point()
# b) Now fit the nonlinear model y = theta1 * x / (theta2 + x) to EOPtrt,
#    starting from theta1 = 207, theta2 = 0.06 and printing each iteration.
REACT.fm <- nls(
  y ~ theta1 * x / (theta2 + x),
  data = EOPtrt,
  start = c(theta1 = 207, theta2 = 0.06),
  trace = TRUE
)
## 1278.509 :  207.00   0.06
## 1195.771 :  212.37855610   0.06367124
## 1195.452 :  212.65533061   0.06407672
## 1195.449 :  212.68098164   0.06411698
## 1195.449 :  212.68347663   0.06412087
## 1195.449 :  212.68371770   0.06412124
# Print the fitted nls object (model, estimates, residual sum-of-squares).
REACT.fm
## Nonlinear regression model
##   model: y ~ theta1 * x/(theta2 + x)
##    data: EOPtrt
##    theta1    theta2
## 212.68372   0.06412
##  residual sum-of-squares: 1195
##
## Number of iterations to convergence: 5
## Achieved convergence tolerance: 1.37e-06
# c) Plot the fitted function and the data.
x <- seq(0, 1.2, by = .01)
# The fitted mean function is already vectorised in x, so it is evaluated on
# the whole grid at once instead of element by element.
y <- coef(REACT.fm)[1] * x / (coef(REACT.fm)[2] + x)
fitteddata <- data.frame(x = x, y = y)
ggplot() +
  geom_point(data = EOPtrt, aes(x = x, y = y)) +
  geom_line(data = fitteddata, aes(x = x, y = y))
#d)more information on fitting
# Coefficient table (estimates, standard errors, t tests) and the residual
# standard error of the fit.
summary(REACT.fm)
##
## Formula: y ~ theta1 * x/(theta2 + x)
##
## Parameters:
##         Estimate Std. Error t value Pr(>|t|)
## theta1 2.127e+02  6.947e+00  30.615 3.24e-11 ***
## theta2 6.412e-02  8.281e-03   7.743 1.57e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.93 on 10 degrees of freedom
##
## Number of iterations to convergence: 5
## Achieved convergence tolerance: 1.37e-06
# Confidence intervals for theta1 and theta2 at the default level; the
# printed bounds are labelled 2.5% and 97.5%.
confint(REACT.fm)
##                2.5%        97.5%
## theta1 197.30212755 229.29006410
## theta2   0.04692517   0.08615995
# e) Sensible point estimate: the x at which the fitted mean equals 100,
#    found as the root of 100 - fitted(x) on the interval [0, 8].
fun <- function(x) {
  est <- coef(REACT.fm)
  100 - est[1] * x / (est[2] + x)
}
uniroot(fun, interval = c(0, 8))$root
## [1] 0.05690968
# f) Routine nls minimization: collect the quantities needed to evaluate the
#    error sum of squares on a grid around the estimates.
# Point estimates of theta1 and theta2.
theta <- coef(REACT.fm)
theta
##       theta1       theta2
## 212.68371770   0.06412124
# Standard errors: square roots of the diagonal of the estimated covariance.
se <- sqrt(diag(vcov(REACT.fm)))
se
##      theta1      theta2
## 6.947153758 0.008280945
# Residual sum of squares at the estimates.
dv <- deviance(REACT.fm)
dv
## [1] 1195.449
# Number of grid points per parameter.
gsize <- 101
gsize
## [1] 101
# Grids spanning +/- 4 standard errors around each estimate.
th1 <- theta[1] + seq(-4 * se[1], 4 * se[1], length.out = gsize)
th2 <- theta[2] + seq(-4 * se[2], 4 * se[2], length.out = gsize)
# Error sum of squares at a candidate t = (theta1, theta2). It is computed on
# the observed data in EOPtrt: the globals x and y were overwritten in part c)
# with the plotting grid and the fitted curve, so they must not be used here.
ss <- function(t) {
  sum((EOPtrt$y - t[1] * EOPtrt$x / (t[2] + EOPtrt$x))^2)
}
# All (theta1, theta2) combinations, one per row, with theta1 varying fastest
# so that the results reshape column-wise into a matrix indexed [th1, th2].
th <- as.matrix(expand.grid(th1, th2))
SumofSquares <- apply(th, 1, ss)
SumofSquares
SumofSquares <- matrix(SumofSquares, gsize, gsize)
SumofSquares
plot(th1, th2, type = "n", main = "Error Sum of Squares Contours")
contour(th1, th2, SumofSquares, levels = c(seq(1000, 4000, 200)), add = TRUE)
# Contour corresponding to 90% confidence region
# (level = dv * (1 + (2/10) * F quantile with 2 and 10 df)).
plot(th1, th2, type = "n", main = "Error Sum of Squares Contours")
contour(th1, th2, SumofSquares,
  levels = dv * c(1 + .2 * qf(.90, 2, 10)), add = TRUE
)
# g) Contours corresponding to 95% confidence regions, using the F quantiles
#    with (1, 10) and with (2, 10) degrees of freedom.
plot(th1, th2, type = "n", main = "Error Sum of Squares Contours")
contour(th1, th2, SumofSquares,
  levels = dv * c(
    (1 + .1 * qf(.95, 1, 10)),
    (1 + .2 * qf(.95, 2, 10))
  ),
  add = TRUE
)
# h) Make 95% t intervals for theta1 and theta2: estimate +/- t quantile * SE.
# Residual degrees of freedom are read from the fit (10 for this model)
# instead of being hard-coded.
res_df <- df.residual(REACT.fm)
coef(REACT.fm)[1] + (c(-1, 1) * se[1] * qt(.975, res_df))
## (197.2045, 228.1629)
coef(REACT.fm)[2] + (c(-1, 1) * se[2] * qt(.975, res_df))
## (0.04567015, 0.08257234)
# i) Make an approximate 95% confidence interval for sigma.
# NOTE: despite its name, MSE holds the square root of the mean squared
# error, i.e. the residual standard error.
MSE <- sqrt(dv / res_df)
MSE * c(
  sqrt(res_df / qchisq(.975, res_df)),
  sqrt(res_df / qchisq(.025, res_df))
)
## (7.639533, 19.187844)
# j) Use the R function confint() to get 95% intervals for theta1 and theta2
confint(REACT.fm, level = 0.95)
##                2.5%        97.5%
## theta1 197.30212755 229.29006410
## theta2   0.04692517   0.08615995
# k) Pool both data sets and let the numerator parameter differ between them.
y <- c(EOPutrt$y, EOPtrt$y)
x <- c(EOPutrt$x, EOPtrt$x)
# z = 1 for the 12 rows taken from EOPutrt, 0 for the 12 rows from EOPtrt,
# so theta3 is the shift applied to theta1 for the EOPutrt rows.
z <- c(rep(1, 12), rep(0, 12))
# NOTE: this reuses (and overwrites) the name mydata from Problem 1.
mydata <- data.frame(x, y, z)
nREACT.fm <- nls(
  y ~ (theta1 + theta3 * z) * x / (theta2 + x),
  data = mydata,
  start = c(theta1 = 207, theta3 = 40, theta2 = .06),
  trace = TRUE
)
## 35110.55 :  207.00  40.00   0.06
## 2245.56 :  208.22200269 -42.80960545   0.05727877
## 2240.904 :  208.58155244 -42.00904492   0.05790703
## 2240.892 :  208.62438946 -42.02409629   0.05796415
## 2240.891 :  208.62940060 -42.02575155   0.05797093
## 2240.891 :  208.62999169 -42.02594755   0.05797173
# Coefficient table for the pooled three-parameter fit.
summary(nREACT.fm)
##
## Formula: y ~ (theta1 + theta3 * z) * x/(theta2 + x)
##
## Parameters:
##         Estimate Std. Error t value Pr(>|t|)
## theta1 208.62999    5.80399  35.946  < 2e-16 ***
## theta3 -42.02595    6.27214  -6.700 1.61e-06 ***
## theta2   0.05797    0.00591   9.809 4.37e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.59 on 20 degrees of freedom
##
## Number of iterations to convergence: 5
## Achieved convergence tolerance: 3.609e-06
##   (1 observation deleted due to missingness)
# Confidence intervals for theta1, theta3 and theta2 at the default level.
confint(nREACT.fm)
##                2.5%        97.5%
## theta1 196.39379459 221.50898553
## theta3 -55.19924348 -28.95657683
## theta2   0.04599081   0.07234273
# Another method: a t interval for theta3 (second coefficient of nREACT.fm).
# The t quantile must use this model's residual degrees of freedom (20, per
# its summary); the value 10 used before belongs to the two-parameter fit.
coef(nREACT.fm)[2] +
  (c(-1, 1) * sqrt(diag(vcov(nREACT.fm)))[2] *
    qt(.975, df.residual(nREACT.fm)))
## approximately -55.11 -28.94 (the 10-df version printed -56.00115 -28.05075)
The p-value of theta3 is less than 0.05, which means that theta3 is significant at the 5% level.
# Plot the fitted functions and the data.
x <- seq(0, 2, by = .01)
# Curve from REACT.fm, the model fitted to EOPtrt alone.
y <- coef(REACT.fm)[1] * x / (coef(REACT.fm)[2] + x)
# Curve from the pooled model nREACT.fm with z = 1: its coefficients are
# ordered theta1, theta3, theta2, so the numerator is theta1 + theta3.
yt <- (coef(nREACT.fm)[1] + coef(nREACT.fm)[2]) * x / (coef(nREACT.fm)[3] + x)
fitteddata <- data.frame(x = x, y = y)
fitteddatat <- data.frame(x = x, y = yt)
ggplot() +
  geom_point(data = EOPtrt, aes(x = x, y = y)) +
  geom_line(data = fitteddata, aes(x = x, y = y)) +
  # na.rm = TRUE silences the warning for the missing EOPutrt response.
  geom_point(data = EOPutrt, aes(x = x, y = y), na.rm = TRUE) +
  geom_line(data = fitteddatat, aes(x = x, y = y))
Problem 3
library(xlsx)
library(leaps)
# Read the first sheet of the workbook chosen in the file dialog.
# The author's local copy:
#AHdata <- read.xlsx("E:/2015 fall course/book1.xlsx",1)
AHdata <- read.xlsx(file.choose(), 1)
# Convert these six columns to factors, in the same order as before.
for (column in c(
  "Garage", "Mutiple.Car", "Central.Air", "Bsmt.Bath",
  "Style..2.Story.", "Zone..Town.Center."
)) {
  AHdata[[column]] <- factor(AHdata[[column]])
}
# Best single model of each size (up to 14 predictors), shown on the R^2 scale.
a <- regsubsets(Price ~ ., nbest = 1, nvmax = 14, data = AHdata)
plot(a, scale = "r2")
#From the mod with highest R2 for all numbers of predictors
library(DAAG)
Based on the figure above, we have:
# Cross-validate one candidate model per size with cv.lm (m = 11 folds).
# The seed is set once, before the first call.
set.seed(0)
mod1 <- cv.lm(AHdata, Price ~ Size, m = 11)
mod2 <- cv.lm(AHdata, Price ~ Size + Fireplace, m = 11)
mod3 <- cv.lm(AHdata, Price ~ Size + Fireplace + Land, m = 11)
mod4 <- cv.lm(AHdata, Price ~ Size + Fireplace + Land + Bsmt.Bath, m = 11)
mod5 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air,
  m = 11
)
# NOTE(review): mod6 omits Central.Air, unlike mod5 and mod7; presumably this
# is the best six-variable subset from the regsubsets plot. Confirm.
mod6 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Full.Bath + Half.Bath,
  m = 11
)
mod7 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath,
  m = 11
)
mod8 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath + Basement..Total.,
  m = 11
)
mod9 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath + Basement..Total. + Garage,
  m = 11
)
mod10 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath + Basement..Total. + Garage + Style..2.Story.,
  m = 11
)
mod11 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath + Basement..Total. + Garage + Style..2.Story. + Finished.Bsmt,
  m = 11
)
mod12 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath + Basement..Total. + Garage + Style..2.Story. + Finished.Bsmt +
    Bed.Rooms,
  m = 11
)
mod13 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath + Basement..Total. + Garage + Style..2.Story. + Finished.Bsmt +
    Bed.Rooms + Zone..Town.Center.,
  m = 11
)
mod14 <- cv.lm(AHdata,
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath + Basement..Total. + Garage + Style..2.Story. + Finished.Bsmt +
    Bed.Rooms + Zone..Town.Center. + Mutiple.Car,
  m = 11
)
# Collect the "ms" attribute of each cv.lm result (presumably the overall
# cross-validation mean square; confirm against the DAAG documentation) and
# find the model with the smallest value.
mod.ms <- c(
  attr(mod1, "ms"), attr(mod2, "ms"), attr(mod3, "ms"), attr(mod4, "ms"),
  attr(mod5, "ms"), attr(mod6, "ms"), attr(mod7, "ms"), attr(mod8, "ms"),
  attr(mod9, "ms"), attr(mod10, "ms"), attr(mod11, "ms"), attr(mod12, "ms"),
  attr(mod13, "ms"), attr(mod14, "ms")
)
which.min(mod.ms)
## [1] 7
# mod 7 gives the smallest ms value, so refit that model with lm().
# (The stray "++" before Central.Air was a typo and is now a single "+".)
mod.lm <- lm(
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath,
  data = AHdata
)
summary(mod.lm)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -45633 -11108  -2756  10470  65101
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.02e+04   1.17e+04    0.87   0.3860
## Size         3.89e+01   8.68e+00    4.48  2.4e-05 ***
## Fireplace    1.47e+04   3.48e+03    4.23  6.1e-05 ***
## Land         1.95e+00   7.71e-01    2.53   0.0134 *
## Bsmt.Bath1   1.88e+04   6.02e+03    3.13   0.0025 **
## Central.Air1 9.59e+03   5.36e+03    1.79   0.0777 .
## Full.Bath    1.39e+04   5.84e+03    2.38   0.0196 *
## Half.Bath    1.11e+04   5.69e+03    1.95   0.0546 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20600 on 80 degrees of freedom
## Multiple R-squared:  0.72,  Adjusted R-squared:  0.695
## F-statistic: 29.4 on 7 and 80 DF,  p-value: <2e-16
# Confidence intervals for the coefficients of the selected linear model,
# at the default level (bounds printed as 2.5 % and 97.5 %).
confint(mod.lm)
##                  2.5 %   97.5 %
## (Intercept)  -1.31e+04 33585.41
## Size          2.16e+01    56.20
## Fireplace     7.81e+03 21668.26
## Land          4.15e-01     3.48
## Bsmt.Bath1    6.85e+03 30825.80
## Central.Air1 -1.09e+03 20261.96
## Full.Bath     2.29e+03 25543.15
## Half.Bath    -2.24e+02 22405.50
Download