# Logistic regression
#
# Logit Lab
# Material borrowed from a tutorial by
# William B. King
# Coastal Carolina
# See: ww2.coastal.edu/kingw/statistics/R-tutorials/logistic.html
# Attach the MASS package: functions and datasets to support
# Venables and Ripley, "Modern Applied Statistics with S".
library(MASS)
# Load the data set used throughout this analysis.
data("menarche", package = "MASS")
# Inspect the structure of the data frame.
str(menarche)
# There are 3 variables with 25 observations:
#   Age: average age of each cohort, i.e., partitioned by age
#   Total: total number of girls in each cohort
#   Menarche: number of girls that have reached menarche
# Summary statistics: shows the range of each variable along with
# information about its distribution.
summary(menarche)
# Plot the proportion of girls in each cohort who have reached menarche
# against the cohort's age.
plot(Menarche / Total ~ Age, data = menarche)
# Wow! Looks like a really good data set for logistic regression
# What does the logistic regression command look like?
# The fitted model is stored in glm.out (the name must not contain a space,
# otherwise R stops with an "unexpected symbol" syntax error).
glm.out <- glm(cbind(Menarche, Total - Menarche) ~ Age,
               family = binomial(logit), data = menarche)
# So what is glm? Open its help page (?glm is the shorthand for this call).
help("glm")
# The help page shows that glm is a generalized linear model function.
# Let's parse the command piece by piece.
# The result is assigned to glm.out (written without a space, which would
# otherwise be a syntax error).
glm.out <- glm(cbind(Menarche, Total - Menarche) ~ Age,
               family = binomial(logit), data = menarche)
# glm - generalized linear model: the function that fits the model
# What is cbind(Menarche, Total-Menarche) ~ Age?
# Try evaluating the left-hand side on its own. The call is wrapped in try()
# so that the expected error is printed without aborting the script when it
# is run with source() or Rscript.
try(cbind(Menarche, Total - Menarche))
# Why do you get an error?
# You get an error because Menarche and Total are variables in a data frame
# and not top-level variables.
# Recall the plot command used earlier. Its data = menarche argument names
# the data frame in which the formula's variables are looked up:
plot(Menarche / Total ~ Age, data = menarche)
# Without data =, the equivalent call spells out every column with $:
plot(menarche$Menarche / menarche$Total ~ menarche$Age)
# So what is cbind(Menarche, Total-Menarche)?
# With data=menarche, cbind(Menarche, Total-Menarche) stands for
# cbind(menarche$Menarche, menarche$Total-menarche$Menarche).
# Type it in to see the values:
cbind(menarche$Menarche, menarche$Total - menarche$Menarche)
# Each row holds, for one cohort, the number of girls who have reached
# menarche and the number who have not: the two sides of the dichotomy.
# Thus in cbind(Menarche, Total-Menarche) ~ Age the left-hand side is Y and
# Age is X, the Y ~ X values passed to the model.
# What about family=binomial(logit)?
# This tells the glm function to fit the data using the logit model.
# Altogether (the result name glm.out is written without a space, which
# would otherwise be a syntax error):
glm.out <- glm(cbind(Menarche, Total - Menarche) ~ Age,
               family = binomial(logit), data = menarche)
# Ok, let's examine the result of fitting the data with the logit model:
# draw the observed proportions, then overlay the fitted values in red.
plot(Menarche / Total ~ Age, data = menarche)
# fitted() is used instead of glm.out$fitted, which only worked through
# partial matching of the component name fitted.values.
lines(menarche$Age, fitted(glm.out), type = "l", col = "red")
title(main = "Menarche Data with Fitted Logistic Regression Line")
# Good fit!
# Check the statistics
summary(glm.out)
# Observe that the estimated coefficient of Age is 1.63197.
# Recall that the model is linear in the log odds, so a one-unit change in
# Age multiplies the odds by exp(1.632) = 5.11.
# Interpretation: for every one-year increase in age, the odds of having
# reached menarche increase by a factor of exp(1.632) = 5.11.
# Download