/* This program uses the GENMOD procedure in SAS
   to fit loglinear models to the Wisconsin
   driver data. */

/* This program code is stored in the file
        driverw.sas
   The data are stored in the file
        drivall.dat */

/* Enter the data. To prevent PROC GENMOD
   from deleting combinations of
   categories for which the observed count
   is zero, add a small amount 1.0e-20
   to each zero count */

DATA SET1;
  INFILE 'drivall.dat';
  INPUT AGE SEX D V R Y;
  IF(Y EQ 0) THEN Y=Y+1.0E-20;
  LABEL AGE = AGE GROUP
        D = DRIVER GROUP
        V = VIOLATION STATUS
        R = RESIDENTIAL AREA
        Y = COUNT;
run;

871 872

proc format;
  value sex 1 = 'Male' 2 = 'Female';
  value age 1 = '16-36' 2 = '36-55' 3 = 'over 55';
  value d 1 = 'Disease' 2 = 'Control';
  value v 1 = 'Some' 2 = 'None';
  value r 1 = '> 150000' 2 = '39-150000' 3 = '10-39000'
          4 = '< 10000' 5 = 'rural';
run;

873

/* Conditional analysis of the association
   between disease status and traffic violations */

proc freq data=set1;
  tables age*sex*d*v / chisq nopercent nocol norow cmh relrisk out=set2;
  weight Y;
run;

proc print data=set2; run;

874

/* Fit the model for conditional
   independence of D and V given any
   combination of AGE and SEX categories */

proc genmod data=set2 order=internal;
  class age sex d v;
  model count = d|sex|age v|sex|age /
        dist = poisson link=log
        maxit=50 covb itprint obstats ;
  make 'parmest' out=est1;
  make 'obstats' out=obstat1;
run;

/* Delete the v*sex*age interaction
   of D and V given AGE and SEX */

proc genmod data=set2 order=internal;
  class age sex d v;
  model count = d|sex|age v|sex v|age /
        dist = poisson link=log
        maxit=50 covb itprint obstats ;
  make 'parmest' out=est2;
  make 'obstats' out=obstat2;
run;

proc print data=est2; run;

875 876

/* Analyze the V*D*S marginal table */

proc freq data=set1;
  tables sex*d*v / chisq nopercent nocol norow cmh relrisk out=set3;
  weight Y;
run;

/* Analyze the V*D marginal table */

proc freq data=set1;
  tables d*v / chisq nopercent nocol norow cmh relrisk;
  weight Y;

/* Fit a no three factor interaction model */

proc
genmod data=set3 order=internal; run; class sex d v; model count = d|sex v|sex v|d / dist = poisson link=log maxit=50 covb itprint obstats ; run; 877 878 /* Analyze the V*D*A marginal table */ The FREQ Procedure proc freq data=set1; tables age*d*v Summary Statistics for D by V / chisq nopercent Controlling for AGE and SEX nocol norow cmh relrisk out=set4; Cochran-Mantel-Haenszel Statistics weight Y; Alternative Hypothesis run; /* Fit a no 3-factor interaction model */ proc genmod data=set4 order=internal; Value Prob 1 Nonzero Correlation 1 2.1067 0.1467 2 Row Mean Scores Differ 1 2.1067 0.1467 3 General Association 1 2.1067 0.1467 Estimates of the Common Relative Risk (Row1/Row2) class age d v; model count = d|age v|age DF Type of Study v|d / dist = poisson link=log maxit=50 covb itprint obstats ; Method Value 95% Conf. Limits Case-Control M-H 1.2877 0.9166 1.8092 (Odds Ratio) Logit 1.1902 0.8364 1.6937 run; 880 879 The GENMOD Procedure Model Information Distribution Poisson Link Function Homogeneity of the Odds Ratios Chi-Square Observations Used COUNT Frequency 24 8.7473 DF Pr > ChiSq Log Dependent Variable Breslow-Day Test for Class Level Information 5 0.1196 Class 881 Levels Values AGE 3 1 2 3 SEX 2 1 2 D 2 1 2 V 2 1 2 882 Analysis Of Parameter Estimates Standard AGE*SEX 1 1 1 -0.5189 0.1387 AGE*SEX Confidence Limits AGE*SEX 1 2 0 0.0000 0.0000 0.000 2 1 1 -0.4027 0.1482 -0.693 AGE*SEX 4.9950 AGE*SEX 0.4520 2 2 0 0.0000 0.0000 0.000 0.000 Wald 95% Parameter DF Estimate Error Intercept 1 4.8208 0.0888 4.6467 D 1 1 0.2184 0.1192 -0.0152 D 2 0 0.0000 0.0000 0.0000 SEX 1 1 0.3373 0.1151 0.1116 SEX 2 0 0.0000 0.0000 0.0000 SEX*D 1 1 1 1.1244 0.1446 0.8410 -0.790 3 1 0 0.0000 0.0000 AGE*SEX 0.0000 AGE*SEX*D 0.5629 3 2 0 0.0000 0.0000 0.000 1 1 1 1 -0.5757 0.3239 -1.210 AGE*SEX*D 0.0000 AGE*SEX*D 1.4078 1 1 2 0 0.0000 0.0000 0.000 1 2 1 0 0.0000 0.0000 0.000 SEX*D 1 2 0 0.0000 0.0000 0.0000 AGE*SEX*D 0.0000 1 2 2 0 0.0000 0.0000 0.000 SEX*D 2 1 0 0.0000 0.0000 0.0000 
2 1 1 1 0.2871 0.2185 -0.141 2 0.0000 AGE*SEX*D 0.0000 AGE*SEX*D 2 1 2 0 0.0000 0.0000 0.000 1.1915 AGE*SEX*D 0.7964 AGE*SEX*D 2 2 1 0 0.0000 0.0000 0.000 2 2 2 0 0.0000 0.0000 0.000 0.0000 AGE*SEX*D -2.6155 AGE*SEX*D 3 1 1 0 0.0000 0.0000 0.000 3 1 2 0 0.0000 0.0000 0.000 3 2 1 0 0.0000 0.0000 0.000 3 2 2 0 0.0000 0.0000 0.000 1 1 -3.7452 0.1923 -4.122 2 0 0.0000 0.0000 0.000 SEX*D 2 0 0.0000 0.0000 0.0000 AGE 1 1 0.9884 0.1036 0.7853 AGE 2 1 0.5795 0.1107 0.3625 AGE 3 0 0.0000 0.0000 0.0000 AGE*D 1 1 1 -3.1226 0.2587 -3.6297 AGE*D 1 2 0 0.0000 0.0000 0.0000 AGE*D 2 1 1 -1.4370 0.1824 -1.7944 AGE*D 2 2 0 0.0000 0.0000 0.0000 AGE*D 3 1 0 0.0000 0.0000 0.0000 AGE*D 3 2 0 0.0000 0.0000 0.0000 0.0000 AGE*SEX*D -1.0796 AGE*SEX*D 0.0000 V 0.0000 V 0.0000 883 884 SEX*V 1 1 1 1.2380 0.1658 0.9131 1.5629 SEX*V 1 2 0 0.0000 0.0000 0.0000 Obs 0.0000 SEX*V 2 1 0 0.0000 0.0000 0.0000 0.0000 SEX*V 2 2 0 0.0000 0.0000 0.0000 1 0.0000 8 1 1 1 8.63105 26.36895 Count SEX GROUP STATUS Pred AGE*V 1 1 1 1.3904 0.1578 1.0812 2 1.6996 27 1 1 2 AGE*V 1 2 0 0.0000 0.0000 0.0000 3 0.0000 94 1 2 1 90.99597 AGE*V 2 1 1 0.4882 0.1732 0.1486 4 0.8277 275 1 2 2 278.00404 AGE*V 2 2 0 0.0000 0.0000 0.0000 5 0.0000 1 2 1 1 1.73367 AGE*V 3 1 0 0.0000 0.0000 0.0000 6 0.0000 19 2 1 2 18.26634 AGE*V 3 2 Scale 0 0.0000 0.0000 0.0000 7 0.0000 30 2 2 1 31.63948 0 1.0000 0.0000 1.0000 8 1.0000 335 2 2 2 333.36068 885 886 Total Sample Size = 2800 Model Information Data Set WORK.SET3 Distribution Criteria For Assessing Goodness Of Fit Poisson Link Function Log Dependent Variable COUNT Observations Used Frequency 8 Class Level Information Class Levels Values SEX 2 1 2 D 2 1 2 V 2 1 2 Criterion DF Value Value/DF Deviance Scaled Deviance 1 0.2899 0.2899 1 0.2899 0.2899 Pearson Chi-Square 1 0.2995 0.2995 Scaled Pearson X2 1 0.2995 0.2995 Log Likelihood 14935.9024 888 887 Standard Parameter DF Intercept Estimate Error Wald 95% Confidence Limits 1 6.5216 0.0382 6.4467 6.5964 D 1 1 -1.0521 0.0741 -1.1973 -0.9069 D 
2 0 0.0000 0.0000 0.0000 0.0000 SEX 1 1 -0.0198 0.0540 -0.1257 0.0861 SEX 2 0 0.0000 0.0000 0.0000 0.0000 SEX*D 1 1 1 1.3926 0.0885 1.2190 1.5661 Obs SEX*D 1 2 0 0.0000 0.0000 0.0000 0.0000 1 SEX*D 2 1 0 0.0000 0.0000 0.0000 SEX*D 2 2 0 0.0000 0.0000 0.0000 V 1 1 -2.7757 0.1459 V 2 0 0.0000 SEX*V 1 1 1 SEX*V 1 2 SEX*V 2 1 SEX*V 2 2 D*V 1 1 D*V 1 D*V D*V Scale Observation Statistics DRIVER VIOLATION SEX GROUP STATUS 102 1 1 1 103.34706 0.0000 2 938 1 1 2 936.65296 0.0000 3 127 1 2 1 125.65297 -3.0616 -2.4898 4 665 1 2 2 666.34704 0.0000 0.0000 0.0000 5 10 2 1 1 8.65302 1.1074 0.1653 0.7835 1.4313 6 236 2 1 2 237.34705 0 0.0000 0.0000 0.0000 0.0000 7 41 2 2 1 42.34713 0 0.0000 0.0000 0.0000 0.0000 8 681 2 2 2 679.65296 0 0.0000 0.0000 0.0000 0.0000 1 -0.5359 0.1332 -0.7969 -0.2750 2 0 0.0000 0.0000 0.0000 0.0000 2 1 0 0.0000 0.0000 0.0000 0.0000 2 2 0 0.0000 0.0000 0.0000 0.0000 0 1.0000 0.0000 1.0000 1.0000 889 Count Pred 890 Total Sample Size = 2800 Parameter Estimates The GENMOD Procedure Standard Class Level Information Class Levels Values AGE 3 1 2 3 D 2 1 2 V 2 1 2 Criteria For Assessing Goodness Of Fit Criterion DF Value Value/DF Deviance 2 3.1816 1.5908 Scaled Deviance 2 3.1816 1.5908 Pearson Chi-Square 2 3.0433 1.5217 Scaled Pearson X2 2 3.0433 1.5217 Log Likelihood Wald 9 Parameter DF Estimate Error Confiden Intercept 1 5.7062 0.0569 5.5946 D 1 1 0.9998 0.0663 0.8699 D 2 0 0.0000 0.0000 0.0000 AGE 1 1 0.7136 0.0694 0.5775 AGE 2 1 0.3608 0.0737 0.2163 AGE 3 0 0.0000 0.0000 0.0000 AGE*D 1 1 1 -3.6794 0.1574 -3.9880 AGE*D 1 2 0 0.0000 0.0000 0.0000 AGE*D 2 1 1 -1.3154 0.0968 -1.5051 AGE*D 2 2 0 0.0000 0.0000 0.0000 AGE*D 3 1 0 0.0000 0.0000 0.0000 AGE*D 3 2 0 0.0000 0.0000 0.0000 V 1 1 -3.0477 0.1793 -3.3991 V 2 0 0.0000 0.0000 0.0000 14075.4288 892 891 Observation Statistics Frequency AGE DRIVER VIOLATION GROUP GROUP STATUS 9 1 1 1 46 1 1 2 124 1 2 1 610 1 2 2 5 43 2 1 1 0.0000 6 310 2 1 2 0.0000 7 29 2 2 1 0.0000 0.0000 8 436 2 2 2 1.0000 1.0000 9 60 3 1 1 
10 818 3 1 2 11 15 3 2 1 12 300 3 2 2 AGE*V 1 1 1 1.4162 0.1935 1.0369 1.7956 Observation AGE*V 1 2 0 0.0000 0.0000 0.0000 0.0000 AGE*V 2 1 1 0.4958 0.1782 0.1466 0.8450 1 AGE*V 2 2 0 0.0000 0.0000 0.0000 0.0000 2 AGE*V 3 1 0 0.0000 0.0000 0.0000 0.0000 3 AGE*V 3 2 0 0.0000 0.0000 0.0000 0.0000 4 D*V 1 1 1 0.4481 0.1660 0.1227 0.7735 D*V 1 2 0 0.0000 0.0000 0.0000 D*V 2 1 0 0.0000 0.0000 0.0000 D*V 2 2 0 0.0000 0.0000 0 1.0000 0.0000 Scale 893 Count 894

# This is an illustration of using
# the glm function in Splus to fit
# log-linear models to the Wisconsin
# Drivers data.
# This file is stored as driverw.ssc

# The data are in the file drivall.txt
# Enter the data into a data frame and

cvd <- read.table("drivall.dat",
         col.names=c("a","s","d","v","r","y"))

# Compute total counts summing
# across the levels of r

cvdg<-aggregate(cvd, list(a=cvd$a, s=cvd$s, v=cvd$v, d=cvd$d), FUN=sum)

# Change the age (a), sex (s),
# disease status (d), traffic violation (v)
# into factors

cvdg$a <- as.factor(cvdg$a)
cvdg$s <- as.factor(cvdg$s)
cvdg$v <- as.factor(cvdg$v)
cvdg$d <- as.factor(cvdg$d)

# Print the table

cvdg

# Use the glm function to fit the
# log-linear model for conditional
# independence of v and d given any
# combination of levels for a and s.

options(contrasts=c("contr.treatment", "contr.poly"))

895 896

cvd2 <- glm(y ~ v + d + s + a + v*s + v*a + v*s*a + s*a + d*s + d*a + d*s*a,
            family=poisson, data=cvdg)

# Print the results

goftests(cvdg$y, cvd2$fit, cvdg$y, cvd2$df)
summary(cvd2, correlation=F)

# Use the step function to search for
# a better model. The scope function sets
# a lower bound as the null model and an
# upper bound as the four factor interaction
# model. trace=T shows results for various
# steps in the search. It uses the AIC criterion
# to determine which terms should be added to
# or deleted from the model. It restricts
# attention to hierarchical models.

cvd2.step <- step(cvd2, scope= ~ . ^4, scale=1, trace=T)

# After reviewing the results of the previous
# steps obtain results for the final model

cvd3 <- glm(y ~ v + d + s + a + v*s + v*a + s*a + d*s + d*a + d*s*a,
            family=poisson, data=cvdg, x=T)

summary(cvd3, correlation=F)
cvd3$fit
cvd3$parms
goftests(cvdg$y, cvd3$fit, cvdg$y, cvd3$df)
anova(cvd3, test="Chisq")
cvd3$cov <- solve(t(cvd3$x)%*% diag(cvd3$weight)%*%cvd3$x)
cvd3$cov

897 898

# This is an illustration of using
# the glm function in Splus to fit
# log-linear models to the Wisconsin
# Drivers data.
# This file is stored as driverw.ssc

# The data are in the file drivall.txt
# Enter the data into a data frame and

cvd <- read.table("drivall.dat",
         col.names=c("a","s","d","v","r","y"))

# Compute total counts summing
# across the levels of r

cvdg<-aggregate(cvd, list(a=cvd$a, s=cvd$s, v=cvd$v, d=cvd$d), FUN=sum)

# Change the age (a), sex (s),
# disease status (d), traffic violation (v)
# into factors

cvdg$a <- as.factor(cvdg$a)
cvdg$s <- as.factor(cvdg$s)
cvdg$v <- as.factor(cvdg$v)
cvdg$d <- as.factor(cvdg$d)

# Print the table

cvdg

    a s v d a1 s2 d3 v4  r   y
 1  1 1 1 1  5  5  5  5 15   8
 2  2 1 1 1 10  5  5  5 15  40
 3  3 1 1 1 15  5  5  5 15  54
 4  1 2 1 1  5 10  5  5 15   1
 5  2 2 1 1 10 10  5  5 15   3
 6  3 2 1 1 15 10  5  5 15   6

899 900

 7  1 1 2 1  5  5  5 10 15  27
 8  2 1 2 1 10  5  5 10 15 245
 9  3 1 2 1 15  5  5 10 15 666
10  1 2 2 1  5 10  5 10 15  19
11  2 2 2 1 10 10  5 10 15  65
12  3 2 2 1 15 10  5 10 15 152
13  1 1 1 2  5  5 10  5 15  94
14  2 1 1 2 10  5 10  5 15  18
15  3 1 1 2 15  5 10  5 15  15
16  1 2 1 2  5 10 10  5 15  30
17  2 2 1 2 10 10 10  5 15  11
18  3 2 1 2 15 10 10  5 15   0
19  1 1 2 2

# Use the glm function to fit the
# log-linear model for conditional
# independence of v and d given any
# combination of levels for a and s.
options(contrasts=c("contr.treatment", "contr.poly")) cvd2 <- glm(y ~ v + d + s + a + v*s + v*a + v*s*a + s*a + d*s + d*a + d*s*a, family=poisson, data=cvdg) # Print the results goftests(cvdg$y, cvd2$fit, cvdg$y, cvd2$df) 5 5 10 10 15 275 20 2 1 2 2 10 5 10 10 15 217 21 3 1 2 2 15 5 10 10 15 173 Pearson test = 5 10 10 10 15 335 Degrees of freedom = 22 1 2 2 2 23 2 2 2 2 10 10 10 10 15 219 p-value = 24 3 2 2 2 15 10 10 10 15 127 Deviance test = df = p-value = 901 10.66 6 0.1 13.11 6 0.04 902 summary(cvd2, correlation=F) Call: glm(formula = y ~ v + d + s + a + v*s + v*a d:s + v*s*a + s*a + d*s + d*a + d*s*a, family = poisson, data = cvdg) Deviance Residuals: 0.548716497 0.2898641 1.89301313 da2 -2.548352249 0.1975995 -12.89655475 da3 -3.698257832 0.1949089 -18.97428563 Max v:sa2 -0.415060477 0.3775628 -1.09931504 -2.31245 -0.136422 0.00550854 0.146847 1.399646 v:sa3 -0.008519217 Min 1Q Median 3Q Coefficients: Value Std. Error (Intercept) 2.178905997 0.1894732 0.4830368 -0.01763679 sa2d 0.862758772 0.3329249 2.59145172 sa3d 0.575684805 0.3239304 1.77718685 t value 11.49980958 v 1.085454204 0.1145215 9.47816751 d 2.355448583 0.1768656 13.31772832 s -1.702429853 0.3399423 -5.00799657 a2 1.280197382 0.2339384 5.47236983 a3 1.823207341 0.2251231 8.09871177 v:s 1.349855505 0.2195407 6.14854433 va2 0.989667676 0.1803359 5.48791123 va3 1.412649998 0.1697051 8.32414471 sa2 -0.595201978 0.4660114 -1.27722635 sa3 -1.097806340 0.5468260 -2.00759720 (Dispersion Parameter for Poisson family taken to be 1 ) Null Deviance: 3839.919 on 23 df Residual Deviance: 13.10699 on 6 df Number of Fisher Scoring Iterations: 4 904 903 Start: # Use the step function to search for # a better # a lower bound as the null model and an model. The scope function sets # upper bound as the four factor interaction # model. Single term deletions Model: y ~ v + d + s + a + v*s + v*a + v*s*a + s*a + d*s + d*a + d*s*a trace=T shows results for various # steps in the search. 
# to determine which terms should be added to It uses the AIC criterion # or deleted from the model. # attention to hierarchical models. It restricts cvd2.step <- step(cvd2, scope= ~ . ^4, scale=1, trace=T) AIC= 49.107 Df Sum of Sq <none> Cp v:s:a 2 1.288553 11.92078 43.92078 d:s:a 2 6.864393 17.49662 49.49662 Single term additions Df Sum of Sq <none> v:d 905 RSS 10.63223 46.63223 RSS Cp 10.63223 46.63223 1 2.110434 8.52180 46.52180 906 Step: # After reviewing the results of the previous AIC= 46.3509 # steps obtain results for the final model Single term deletions cvd3 <- glm(y ~ v + d + s + a + v*s + v*a + s*a + d*s + d*a + d*s*a, Model: family=poisson, data=cvdg, x=T) y ~ v + d + s + a + v:s + v:a + s:a + d:s + d:a + d:s:a Df Sum of Sq <none> summary(cvd3, correlation=F) RSS Cp 11.27280 43.2728 97.1632 v:s 1 55.89040 67.16320 v:a 2 83.03208 94.30488 122.3049 d:s:a 2 6.86525 18.13805 goftests(cvdg$y, cvd3$fit, cvdg$y, cvd3$df) anova(cvd3, test="Chisq") Call: glm(formula = y ~ v + d + s + a + v*s + 46.1380 v*a + s*a + d*s + d*a + d*s*a, family = poisson, data = cvdg, x = T) Single term additions Df Sum of Sq <none> RSS Deviance Residuals: Cp Min 11.27280 43.27280 v:d 1 1.729600 9.54319 43.54319 v:s:a 2 1.295216 9.97758 45.97758 1Q Median 3Q -2.421475 -0.1967607 -0.01235606 0.2698361 Max 1.125939 907 Coefficients: (Intercept) v d s a2 a3 v:s va2 va3 sa2 sa3 d:s da2 da3 sa2d sa3d 908 cvd3$fit Value Std. 
Error t value 2.1553668 0.1877495 11.480016 1.1168202 0.1084668 10.296428 2.3554486 0.1768634 13.317895 -1.6051295 0.3153001 -5.090800 1.3533819 0.2228837 6.072144 1.8383183 0.2192511 8.384535 1.2380025 0.1655972 7.475988 0.9022461 0.1593530 5.661932 1.3904023 0.1576925 8.817175 -0.9789585 0.3119324 -3.138368 -1.0945409 0.2947236 -3.713788 0.5487165 0.2898466 1.893127 -2.5483522 0.1975963 -12.896762 -3.6982578 0.1949062 -18.974554 0.8627588 0.3329042 2.591612 0.5756849 0.3239111 1.777293 (Dispersion Parameter for Poisson family taken to be 1 ) Null Deviance: 3839.919 on 23 degrees of freedom Residual Deviance: 14.35092 on 8 degrees of freedom 1 2 6 7 909 4 5 8 9 10 3.647401 26.36894 251.5936 665.7456 18.26634 11 12 13 14 15 65.479 154.3526 90.99599 27.54566 14.16644 16 17 18 19 20 31.63937 8.526955 2.931771 278.004 207.4543 21 22 23 24 173.8336 333.3607 221.4731 124.0683 goftests(cvdg$y, cvd3$fit, cvdg$y, cvd3$df) Pearson test = Degrees of freedom = p-value = Deviance test = df = p-value = Number of Fisher Scoring Iterations: 3 3 8.631056 33.40644 54.25445 1.733664 2.521013 11.3 8 0.19 14.35 8 0.07 910 anova(cvd3, test="Chisq") Analysis of Deviance Table Poisson model Response: y Terms added sequentially Df Deviance Resid. NULL v 1 2061.160 d 1 18.586 s 1 271.006 a 2 104.791 v:s 1 40.487 v:a 2 57.061 s:a 2 158.850 d:s 1 259.640 d:a 2 847.342 d:s:a 2 6.645 (first to last) Df Resid. Dev Pr(Chi) 23 3839.919 22 1778.759 0.00000000 21 1760.173 0.00001624 20 1489.166 0.00000000 18 1384.375 0.00000000 17 1343.889 0.00000000 15 1286.827 0.00000000 13 1127.978 0.00000000 12 868.338 0.00000000 10 20.996 0.00000000 8 14.351 0.03606229 cvd3$cov <- solve(t(cvd3$x)%*% diag(cvd3$weight)%*%cvd3$x) 911