DATA SET1; INFILE 'c:\courses\st557\sas\bpd.dat'; INPUT SEX 1 YOB 2-3 APGAR 4-5 GEST 6-8 BWT 9-12 AGSYM 13-14 AGVEN 15-17 INTUB 18-21 VENTL 22-25 LOWO2 26-30 MEDO2 31-34 HIO2 35-38 SURV 39 DSURV 40-44 RDS 45 BPDHI 46-47; /* This program uses the TREEDISC macro in SAS to apply a CHAID algorithm to the BPD data. This code is stored in the file chaidbpd.sas */ /* First set some graphics options */ /* To print postscipt files in UNIX */ /* goptions cback=white ctext=black targetdevice=ps300 rotate=landscape; */ /* To print postscript files from Windows */ goptions cback=white ctext=black device=WIN target=ps rotate=landscape ; IF BPDHI>20 THEN BPD=1; ELSE BPD=2; LABEL YOB APGAR GEST BWT AGSYM = = = = = YEAR OF BIRTH APGAR SCORE GESTATIONAL AGE (WEEKSx10) BIRTH WEIGHT (GRAMS) AGE AT ONSET OF RESPIRATORY SYMPTOMS (HRSx10) AGVEN = AGE WHEN VENTILATORY ASSISTANCE BEGAN (HRS) INTUB = DURATION OF ENDOTRACHEAL INTUBATION (HRS) VENTL = DURATION OF ASSISTED VENTILATION (HRS) 1228 1227 /* Establish formats for classification variables */ LOWO2 MEDO2 HIO2 SURV DSURV = = = = = EXPOSURE TO 22-39% OXYGEN (HRS) EXPOSURE TO 40-79% OXYGEN (HRS) EXPOSURE TO 80-100% OXYGEN (HRS) SURVIVAL AS OF 5/1/1975 DURATION OF SURVIVAL AS OF 5/1/75 (HRS) RDS = SEVERITY OF RDS BPDHI = BPD ASSESSMENT BPD = BPD INDICATOR; /* Retain only babies who lived at least 72 hours */ PROC FORMAT; VALUE SEX 0 = 'FEMALE' 1 = 'MALE'; VALUE SURV 0 = 'DEAD' 1 = 'ALIVE'; VALUE RDS 0 = 'NONE' 1 = 'SLIGHT' 2 = 'MODERATE' 3 = 'SUBSTANTIAL' 4 = 'SEVERE' 5 = 'VERY SEVERE'; VALUE BPD 1 = 'YES' 2 = 'NO'; IF(DSURV > 72); run; 1229 1230 /* Load in the xmacros file */ /* Draw the tree on one page */ %inc 'c:\courses\st557\sas\xmacro.sas'; %treedisc(intree=trd, draw=graphics); /* Load in the TREEDISC macro */ /* Draw a larger tree on several pages */ %inc 'c:\courses\st557\sas\treedisc.sas'; goptions cback=white ctext=black device=WIN target=ps rotate=portrait; /* Compute a incidence levels of and other %treedisc(intree=trd, 
draw=graphics, pos=120 120); tree for predicting BPD from exposure to elevated oxygen, use of ventilation, explanatory variables*/ %treedisc(data=set1, depvar=bpd, nominal=rds:, ordinal=ventl: lowo2: medo2: hio2:, outtree=trd, options=noformat, trace=long); 1231 All infants BPD:78 no BPD:170 Med O2 < 157 hrs BPD:21 no BPD:152 157 Summary: H@HH @@HHH @@ HHHH @@ HHHH HH @ Med O2 ≤ 450 BPD:22 no BPD:18 Low O2 < 170 hrs BPD:5 no BPD:14 @ High O2 > 90 hrs BPD:3 no BPD:0 P@PPP @@ PPPPP PPPP @ PPPP @@ PPP @ PP High O2 ≤ 35 hrs BPD:2 no BPD:78 36 ≤ High O2 ≤ 159 BPD:7 no BPD:64 1232 @@ @ Med O2 > 450 hrs BPD:35 no BPD:0 Low O2 > 170 hrs BPD:17 no BPD:4 @@ High O2 ≤ 90 hrs BPD:2 no BPD:14 Keep exposure to the medium O2 levels below 450 hours. When exposure to the medium O2 levels is between 157 and 450 hours, keep - exposure to low O2 levels below 170 hours - exposure to high O2 levels below 90 hours. When exposure to the medium level of oxygen is below 157 hours, keep exposure to high O2 levels below 160 hours. High O2 > 160 hrs BPD:12 no BPD:10 1233 1234 The Algorithm: Classification and Regression Trees (CART) Breiman, et al. (1984), Classification and Regression Trees, Wadsworth. CART software sold by Salford Systems, CA. Available in the SAS data mining package. Chambers and Hastie (1992), Statistical Models in S, (Chapter 9), Wadsworth. Tree( ) and related functions in Splus. All cases @ @ @@ @@ X7 ≤ 18.3 1235 Deviance @@ @ DevianceLeft @@ X7 > 18.3 Build a larger tree than you really need and prune it back - cross validation - validation sample Can have - continuous variables - nominal (categorical) variables Missing data? 
1236 For a discrete response (at the k-th node): $\text{Deviance} = -2\, n_k \sum_{i=1}^{I} P_{ik} \log(P_{ik})$ Determination of a split: @@ Binary Splits @@ @ DevianceRight Choose (i) an explanatory variable (ii) boundary (or cut point) that maximizes the change in deviance $\Delta\text{deviance} = \text{deviance} - (\text{deviance}_{\text{Left}} + \text{deviance}_{\text{Right}})$ 1237 where $P_{ik}$ is the proportion of cases at the k-th node in the i-th response category. For a continuous response (at the k-th node): $\text{Deviance} = \sum_{j=1}^{n_k} (y_{jk} - \bar{y}_k)^2$ 1238 Stop splitting if - node deviance is less than some fraction of root node deviance (say < 1%), or - number of cases at the node is too small (say < 10 cases). Use mindev and minsize in tree.control ( ) Prune the tree: - Crossvalidation - Validation sample # Use the tree function in Splus to # build a classification tree. This # file is stored as bpdtree.ssc # # # # # # # Use the bpdsp3.dat data file which does not have missing values. The tree function in Splus cannot handle missing values. The APGAR score variable was not included because it contained too many missing values. The tree is pruned with the prune function in Splus # Enter the data as a matrix treec.dat <- matrix(scan("c:/courses/st557/splus/b ncol=17,byrow=T) # Select the columns to be used in the # analysis and define row and column labels ID<-treec.dat[,1] 1240 1239 # Create a data frame and a factor treec.dat<-data.frame(treec.dat) treec.dat$BPD<-as.factor(treec.dat$BPD) # Compute the classification tree and put # it in the file trees.out treec.dat <- treec.dat[, c(2:12,15,17)] dimnames(treec.dat)<-list(ID,c("Sex","Year", "Gage","Bweight","AgeSYM","AgeVEN", "Intub","Ventl","LowO2", "MedO2","HiO2","RDS","BPD")) ftree <- formula(factor(BPD)~Sex+Year+Gage+Bweight AgeSYM+AgeVEN+Intub+Ventl+ LowO2+MedO2+HiO2+RDS) trees.out<-tree(ftree,treec.dat) # Print a summary description of the tree summary(trees.out) # Print a description of what happens at # each node of the tree print(trees.out) 1241 # # # # # # # Display the tree. 
Unix users should # first use the motif( ) function to open # a graphics window. plot(trees.out) text(trees.out) Use crossvalidation provided by the cv.tree function to determine how to prune the tree. You should run this several times, because different random grouping of cases can yield very different results in small samples trees.cv <- cv.tree(trees.out, FUN=prune.tree) plot(trees.cv) # The following crossvalidation of the # misclassification rates only works if # the response is a factor. trees.cv <- cv.tree(trees.out, FUN=prune.misclass) plot(trees.cv) # Plot misclassification rates for simple # resubstitution. This is not a good way # to decide how much to prune. 1242 tree.mis<-prune.misclass(trees.out) plot(tree.mis) # The resulting plot suggests that the tree # should be pruned at about 6 nodes # Display the pruned tree treep.out <- prune.tree(tree = trees.out, best=6) summary(treep.out) plot(treep.out) text(treep.out) Description of the tree created for the BPD data. Each line provides the information for one node in the tree. 
The information is presented in the following order: [1] node number [2] decision rule for going left at this node [3] Number of sample cases that reached this node [4] value of the deviance [5] average value for the response: in this case it is the proportion of babies with BPD at this node [6] An * denotes a terminal node 1243 node), split, n, deviance, yval * denotes terminal node 5) HiO2>159.5 23 5.6520 0.56520 10) MedO2<71.5 17 4.1180 0.41180 20) Gage<342.5 10 2.4000 0.60000 40) Year<67.5 5 0.8000 0.80000 * 41) Year>67.5 5 1.2000 0.40000 * 21) Gage>342.5 7 0.8571 0.14290 * 11) MedO2>71.5 6 0.0000 1.00000 * 3) MedO2>183 60 7.6500 0.85000 6) Ventl<220.5 13 3.2310 0.46150 12) HiO2<26.5 8 0.8750 0.12500 * 13) HiO2>26.5 5 0.0000 1.00000 * 7) Ventl>220.5 47 1.9150 0.95740 14) Intub<430.5 14 1.7140 0.85710 28) Gage<330 5 1.2000 0.60000 * 29) Gage>330 9 0.0000 1.00000 * 15) Intub>430.5 33 0.0000 1.00000 * 1) root 242 52.5000 0.31820 2) MedO2<183 182 22.2900 0.14290 4) HiO2<159.5 159 11.9400 0.08176 8) LowO2<527.5 146 6.6640 0.04795 16) Ventl<146 106 0.0000 0.00000 * 17) Ventl>146 40 5.7750 0.17500 34) MedO2<158 35 3.5430 0.11430 68) Bweight<2239.5 28 0.9643 0.03571 136) Year<69.5 5 0.8000 0.20000 * 137) Year>69.5 23 0.0000 0.00000 * 69) Bweight>2239.5 7 1.7140 0.42860 * 35) MedO2>158 5 1.2000 0.60000 * 9) LowO2>527.5 13 3.2310 0.46150 18) Gage<305 6 0.0000 0.00000 * 19) Gage>305 7 0.8571 0.85710 * 1244 1245 100.0 19.0 15.0 8.5 7.9 6.7 5.7 3.3 LowO2<527.5 MedO2<71.5 0.12501.0000 Gage<342.5 Ventl<146 Gage<305 Year<67.5 1.0000 MedO2<158 Bweight<2239.5 0.80000.40000.1429 0.0000 Year<69.5 0.60000.00000.8571 0.20000.00000.4286 280 240 Ventl<220.5 HiO2<26.5 Gage<330 Intub<430.5 0.60001.00001.0000 220 HiO2<159.5 260 deviance 300 320 340 MedO2<183 | 5 10 15 size 1246 1247 -Inf 19.0 15.0 8.5 7.9 6.7 5.7 3.3 -Inf 42.0 2.3 2.0 1.3 1.0 0.0 -Inf 200 40 50 60 misclass 300 250 deviance 70 350 100.0 5 10 15 5 10 size 15 size 1248 42.0 2.3 2.0 1.3 1.0 0.0 1249 -Inf 60 HiO2<159.5 50 
HiO2<45.5 1 LowO2<247 1 1 Ventl<146 0 40 misclass 70 MedO2<183 | 0 5 10 0 15 size 1250 1251 Summary Do not exceed 160 hours of exposure to the high oxygen levels Do not exceed 183 hours of exposure to the medium oxygen levels Do not exceed 528 hours of exposure to the lower oxygen levels Do not ventilate for more than 146 hours 1252