# 401 Hw 6 solution
#
# Lines starting with "##" are console output recorded in the original
# write-up; they are kept for reference and are not executed.

# Problem 1 ----

# Load the data into R; file.choose() lets you pick the file from your
# local path. My local path is:
# rawdata <- read.table("E:/2015 fall course/data/data.txt", sep = ",")
rawdata <- read.table(file.choose(), sep = ",")
data <- rawdata[, paste0("V", 1:11)]
# For colnames, either case is OK
colnames(data) <- c(
  "ID", "RI", "NA20", "MGO", "AL203", "SI02", "K2O", "CAO", "BAO",
  "FE203", "TYPE"
)

# Only the first 146 rows are used; the predictors are renamed x1..x9.
GlassData <- data[1:146, ]
colnames(GlassData) <- c("ID", paste0("x", 1:9), "glasstype")
mydata <- GlassData
mydata$glasstype <- factor(mydata$glasstype)

# Fit a logistic regression model with all nine predictors
mylogitF <- glm(
  glasstype ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9,
  data = mydata, family = "binomial"
)
summary(mylogitF)
##
## Call:
## glm(formula = glasstype ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 +
##     x8 + x9, family = "binomial", data = mydata)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -1.91622  -0.83112   0.05402   0.81930   2.15093
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  279.591    403.809   0.692  0.48870
## x1           275.973    239.176   1.154  0.24856
## x2            -6.269      2.628  -2.386  0.01704 *
## x3            -8.554      2.688  -3.182  0.00146 **
## x4            -2.206      2.832  -0.779  0.43603
## x5            -7.130      2.565  -2.780  0.00543 **
## x6            -5.983      3.245  -1.844  0.06525 .
## x7            -7.188      2.741  -2.622  0.00873 **
## x8            -9.042      3.863  -2.341  0.01924 *
## x9             1.697      2.359   0.720  0.47181
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 202.15  on 145  degrees of freedom
## Residual deviance: 141.20  on 136  degrees of freedom
## AIC: 161.2

# c) Side-by-side box plot of the fitted probabilities
p <- predict(mylogitF, type = "response")
plot(p ~ glasstype, data = mydata)

# or, equivalently, apply the inverse logit to the linear predictor by hand
design_full <- cbind(1, as.matrix(mydata[, paste0("x", 1:9)]))
ps <- plogis(drop(design_full %*% coef(mylogitF)))
plot(ps ~ glasstype, data = mydata)

# d) Partial fitting
# Based on the p-values above, the reduced model keeps x2, x3, x5 and x7.
mylogitP <- glm(
  glasstype ~ x2 + x3 + x5 + x7,
  data = mydata, family = "binomial"
)
summary(mylogitP)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -1.82105  -0.88596   0.06957   0.92089   2.02122
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) 362.8419    83.4540   4.348 1.38e-05 ***
## x2           -2.6675     0.7884  -3.383 0.000716 ***
## x3           -4.8217     0.9445  -5.105 3.31e-07 ***
## x5           -3.9143     0.9276  -4.220 2.45e-05 ***
## x7           -3.0635     0.6354  -4.821 1.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 202.15  on 145  degrees of freedom
## Residual deviance: 149.51  on 141  degrees of freedom
## AIC: 159.51

# Side-by-side box plot of the fitted probabilities for the reduced model
pp <- predict(mylogitP, type = "response")
plot(pp ~ glasstype, data = mydata)
# The difference between the two plots is: non-overlap and overlap.

# f) Identify the Na-Ca pairs
mylogitPP <- glm(glasstype ~ x2 + x7, data = mydata, family = "binomial")
summary(mylogitPP)
## glm(formula = glasstype ~ x2 + x7, family = "binomial", data = mydata)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.3228  -1.2075   0.7125   1.1617   1.4537
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   3.4031     4.4452   0.766    0.444
## x2           -0.3246     0.3078  -1.055    0.292
## x7            0.1075     0.1304   0.824    0.410
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 202.15  on 145  degrees of freedom
## Residual deviance: 199.64  on 143  degrees of freedom
## AIC: 205.64

# Fitted probability from the two-predictor model as a function of (Na, Ca);
# vectorised so that it can be handed to outer().
pNC <- function(Na, Ca) {
  b <- unname(coef(mylogitPP))
  plogis(b[1] + b[2] * Na + b[3] * Ca)
}
Nav <- seq(0, 25, 0.1)
Cav <- seq(0, 25, 0.1)
GtContour <- outer(Nav, Cav, FUN = pNC)
contour(Nav, Cav, GtContour, levels = seq(.1, .9, .2),
        xlab = "Na", ylab = "Ca")

# Problem 2 ----

# Create the data sets (untreated has one missing response)
conc <- rep(c(0.02, 0.06, 0.11, 0.22, 0.56, 1.10), 2)
EOPutrt <- data.frame(
  x = conc,
  y = c(67, 84, 98, 131, 144, NA, 51, 86, 115, 124, 158, 160)
)
EOPtrt <- data.frame(
  x = conc,
  y = c(76, 97, 123, 159, 191, 207, 47, 107, 139, 152, 201, 200)
)

# a) Plot the data set
library(ggplot2)
ggplot() +
  geom_point(data = EOPtrt, aes(x = x, y = y))

# b) Now fit the nonlinear model
REACT.fm <- nls(
  y ~ theta1 * x / (theta2 + x),
  data = EOPtrt,
  start = c(theta1 = 207, theta2 = 0.06),
  trace = TRUE
)
## 1278.509 :  207.00   0.06
## 1195.771 :  212.37855610   0.06367124
## 1195.452 :  212.65533061   0.06407672
## 1195.449 :  212.68098164   0.06411698
## 1195.449 :  212.68347663   0.06412087
## 1195.449 :  212.68371770   0.06412124
REACT.fm
## Nonlinear regression model
##   model: y ~ theta1 * x/(theta2 + x)
##    data: EOPtrt
##    theta1    theta2
## 212.68372   0.06412
##  residual sum-of-squares: 1195
##
## Number of iterations to convergence: 5
## Achieved convergence tolerance: 1.37e-06

# c) Plot the fitted function and the data
# The curve grid gets its own names so the observed data are never masked.
curve_x <- seq(0, 1.2, by = .01)
fitteddata <- data.frame(
  x = curve_x,
  y = predict(REACT.fm, newdata = data.frame(x = curve_x))
)
ggplot() +
  geom_point(data = EOPtrt, aes(x = x, y = y)) +
  geom_line(data = fitteddata, aes(x = x, y = y))

# d) More information on the fit
summary(REACT.fm)
## Formula: y ~ theta1 * x/(theta2 + x)
##
## Parameters:
##         Estimate Std. Error t value Pr(>|t|)
## theta1 2.127e+02  6.947e+00  30.615 3.24e-11 ***
## theta2 6.412e-02  8.281e-03   7.743 1.57e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.93 on 10 degrees of freedom
##
## Number of iterations to convergence: 5
## Achieved convergence tolerance: 1.37e-06
confint(REACT.fm)
##              2.5%        97.5%
## theta1 197.30212755 229.29006410
## theta2   0.04692517   0.08615995

# e) Sensible point estimate: the x at which the fitted mean equals 100
fun <- function(x) {
  100 - coef(REACT.fm)[1] * x / (coef(REACT.fm)[2] + x)
}
uniroot(fun, c(0, 8))$root
## [1] 0.05690968

# f) Routine nls minimization: error sum of squares over a parameter grid
theta <- coef(REACT.fm)
theta
##       theta1       theta2
## 212.68371770   0.06412124
se <- sqrt(diag(vcov(REACT.fm)))
se
##      theta1      theta2
## 6.947153758 0.008280945
dv <- deviance(REACT.fm)
dv
## [1] 1195.449
gsize <- 101
gsize
## [1] 101
th1 <- theta[1] + seq(-4 * se[1], 4 * se[1], length.out = gsize)
th2 <- theta[2] + seq(-4 * se[2], 4 * se[2], length.out = gsize)
# Every (theta1, theta2) pair on the grid. expand.grid() varies th1 fastest,
# which is the ordering matrix() below needs for contour(th1, th2, z).
th <- as.matrix(expand.grid(th1 = th1, th2 = th2))
# Error sum of squares of the model at t = c(theta1, theta2). It reads the
# observed data explicitly from EOPtrt instead of relying on global x / y.
ss <- function(t) {
  sum((EOPtrt$y - t[1] * EOPtrt$x / (t[2] + EOPtrt$x))^2)
}
SumofSquares <- apply(th, 1, ss)
SumofSquares
SumofSquares <- matrix(SumofSquares, gsize, gsize)
SumofSquares
plot(th1, th2, type = "n", main = "Error Sum of Squares Contours")
contour(th1, th2, SumofSquares, levels = seq(1000, 4000, 200), add = TRUE)

# Contour corresponding to the 90% confidence region
# (.2 = 2 parameters / 10 residual degrees of freedom)
plot(th1, th2, type = "n", main = "Error Sum of Squares Contours")
contour(th1, th2, SumofSquares,
        levels = dv * (1 + .2 * qf(.90, 2, 10)), add = TRUE)

# g) Contours corresponding to 95% confidence regions
plot(th1, th2, type = "n", main = "Error Sum of Squares Contours")
contour(th1, th2, SumofSquares,
        levels = dv * c(1 + .1 * qf(.95, 1, 10), 1 + .2 * qf(.95, 2, 10)),
        add = TRUE)

# h) Make 95% t intervals for theta1 and theta2
t_crit <- qt(.975, df.residual(REACT.fm))
coef(REACT.fm)[1] + c(-1, 1) * se[1] * t_crit
## (197.2045, 228.1629)
coef(REACT.fm)[2] + c(-1, 1) * se[2] * t_crit
## (0.04567015, 0.08257234)

# i) Make an approximate 95% confidence interval for sigma
resid_df <- df.residual(REACT.fm)
sigma_hat <- sqrt(dv / resid_df)
sigma_hat * sqrt(resid_df / qchisq(c(.975, .025), resid_df))
## (7.639533, 19.187844)

# j) Use the R function confint() to get 95% intervals for theta1 and theta2
confint(REACT.fm, level = .95)
##              2.5%        97.5%
## theta1 197.30212755 229.29006410
## theta2   0.04692517   0.08615995

# k) Joint fit of both groups; z = 1 marks the untreated rows, z = 0 the
# treated rows, so theta3 is the shift in theta1 for the untreated group.
EOPall <- data.frame(
  x = c(EOPutrt$x, EOPtrt$x),
  y = c(EOPutrt$y, EOPtrt$y),
  z = c(rep(1, 12), rep(0, 12))
)
nREACT.fm <- nls(
  y ~ (theta1 + theta3 * z) * x / (theta2 + x),
  data = EOPall,
  start = c(theta1 = 207, theta3 = 40, theta2 = .06),
  trace = TRUE
)
## 35110.55 :  207.00  40.00   0.06
## 2245.56  :  208.22200269 -42.80960545   0.05727877
## 2240.904 :  208.58155244 -42.00904492   0.05790703
## 2240.892 :  208.62438946 -42.02409629   0.05796415
## 2240.891 :  208.62940060 -42.02575155   0.05797093
## 2240.891 :  208.62999169 -42.02594755   0.05797173
summary(nREACT.fm)
## Formula: y ~ (theta1 + theta3 * z) * x/(theta2 + x)
##
## Parameters:
##         Estimate Std. Error t value Pr(>|t|)
## theta1 208.62999    5.80399  35.946  < 2e-16 ***
## theta3 -42.02595    6.27214  -6.700 1.61e-06 ***
## theta2   0.05797    0.00591   9.809 4.37e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.59 on 20 degrees of freedom
##
## Number of iterations to convergence: 5
## Achieved convergence tolerance: 3.609e-06
##   (1 observation deleted due to missingness)
confint(nREACT.fm)
##              2.5%        97.5%
## theta1 196.39379459 221.50898553
## theta3 -55.19924348 -28.95657683
## theta2   0.04599081   0.07234273

# Another method, based on a t interval for theta3. The joint fit has 20
# residual degrees of freedom (see the summary above), so the quantile is
# taken from the fitted model rather than hard-coded. The original write-up
# used 10 df and recorded -56.00115 -28.05075; with 20 df the interval is
# approximately (-55.109, -28.943).
se_joint <- sqrt(diag(vcov(nREACT.fm)))
coef(nREACT.fm)["theta3"] +
  c(-1, 1) * se_joint["theta3"] * qt(.975, df.residual(nREACT.fm))
# The p-value of theta3 is less than 0.05, which means that theta3 is
# significant.

# Plot the fitted functions and the data: the treated curve comes from
# REACT.fm, the untreated curve from the joint fit with z = 1.
plot_x <- seq(0, 2, by = .01)
joint <- coef(nREACT.fm)
fitteddata <- data.frame(
  x = plot_x,
  y = predict(REACT.fm, newdata = data.frame(x = plot_x))
)
fitteddatat <- data.frame(
  x = plot_x,
  y = (joint["theta1"] + joint["theta3"]) * plot_x /
    (joint["theta2"] + plot_x)
)
ggplot() +
  geom_point(data = EOPtrt, aes(x = x, y = y)) +
  geom_line(data = fitteddata, aes(x = x, y = y)) +
  geom_point(data = EOPutrt, aes(x = x, y = y), na.rm = TRUE) +
  geom_line(data = fitteddatat, aes(x = x, y = y))

# Problem 3 ----

library(xlsx)
library(leaps)
# AHdata <- read.xlsx("E:/2015 fall course/book1.xlsx", 1)
AHdata <- read.xlsx(file.choose(), 1)

# Indicator-style columns are treated as factors.
factor_cols <- c(
  "Garage", "Mutiple.Car", "Central.Air", "Bsmt.Bath",
  "Style..2.Story.", "Zone..Town.Center."
)
AHdata[factor_cols] <- lapply(AHdata[factor_cols], factor)

a <- regsubsets(Price ~ ., nbest = 1, nvmax = 14, data = AHdata)
plot(a, scale = "r2")

# Take the model with the highest R2 for each number of predictors.
library(DAAG)
# Based on the figure above, we have:
set.seed(0)
first_terms <- c("Size", "Fireplace", "Land", "Bsmt.Bath")
mod7_terms <- c(first_terms, "Central.Air", "Full.Bath", "Half.Bath")
later_terms <- c(
  "Basement..Total.", "Garage", "Style..2.Story.", "Finished.Bsmt",
  "Bed.Rooms", "Zone..Town.Center.", "Mutiple.Car"
)
# Predictor sets for mod1 .. mod14, in that order. Models 7-14 are nested:
# each adds the next entry of later_terms to the mod7 set.
term_sets <- c(
  list(
    first_terms[1],
    first_terms[1:2],
    first_terms[1:3],
    first_terms,
    c(first_terms, "Central.Air"),
    c(first_terms, "Full.Bath", "Half.Bath")
  ),
  lapply(0:7, function(k) c(mod7_terms, later_terms[seq_len(k)]))
)
cv_formulas <- lapply(term_sets, reformulate, response = "Price")
# 11-fold cross-validation of every candidate model
cv_fits <- lapply(cv_formulas, function(form) cv.lm(AHdata, form, m = 11))
names(cv_fits) <- paste0("mod", seq_along(cv_fits))

# The overall cross-validation "ms" attribute of each model
mod.ms <- vapply(
  cv_fits, function(fit) attr(fit, "ms"), numeric(1),
  USE.NAMES = FALSE
)
which.min(mod.ms)
## [1] 7

# mod 7 gives the smallest ms value, so choose it for the final lm fit
mod.lm <- lm(
  Price ~ Size + Fireplace + Land + Bsmt.Bath + Central.Air + Full.Bath +
    Half.Bath,
  data = AHdata
)
summary(mod.lm)
## Residuals:
##    Min     1Q Median     3Q    Max
## -45633 -11108  -2756  10470  65101
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.02e+04   1.17e+04    0.87   0.3860
## Size         3.89e+01   8.68e+00    4.48  2.4e-05 ***
## Fireplace    1.47e+04   3.48e+03    4.23  6.1e-05 ***
## Land         1.95e+00   7.71e-01    2.53   0.0134 *
## Bsmt.Bath1   1.88e+04   6.02e+03    3.13   0.0025 **
## Central.Air1 9.59e+03   5.36e+03    1.79   0.0777 .
## Full.Bath    1.39e+04   5.84e+03    2.38   0.0196 *
## Half.Bath    1.11e+04   5.69e+03    1.95   0.0546 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20600 on 80 degrees of freedom
## Multiple R-squared:  0.72,  Adjusted R-squared:  0.695
## F-statistic: 29.4 on 7 and 80 DF,  p-value: <2e-16
confint(mod.lm)
##                  2.5 %   97.5 %
## (Intercept)  -1.31e+04 33585.41
## Size          2.16e+01    56.20
## Fireplace     7.81e+03 21668.26
## Land          4.15e-01     3.48
## Bsmt.Bath1    6.85e+03 30825.80
## Central.Air1 -1.09e+03 20261.96
## Full.Bath     2.29e+03 25543.15
## Half.Bath    -2.24e+02 22405.50