Lynn Lethbridge SHRUG November, 2010 What is Bootstrapping? A method to estimate a statistic’s sampling distribution Bootstrap samples are drawn repeatedly with replacement from the original data From each new sample, the statistic is re-calculated and saved in a dataset (i.e., 200 bootstraps yield 200 statistics) The standard error of the statistic is calculated as the standard deviation of the bootstrap statistics Bootstrapping is not used for the point estimate When to Use Bootstrapping Distribution has no clear analytical solution, e.g., Gini coefficient, poverty intensity Test for sensitivity Complex survey design (not a simple random sample), e.g., Statistics Canada surveys use a stratified, multistage design Households within clusters within strata are selected Observations will not be independent – variance calculated the usual way will be underestimated Two Programs One is ‘traditional’ bootstrapping, re-sampling from the original sample The second is bootstrapping using Statistics Canada survey data Statistics Canada does the re-sampling heavy lifting in most of its surveys Use the bootstrap weights provided to calculate the standard error Program 1 Project where we examined the effect of trade on ‘poverty intensity’ in Canada/US Used state/province level measures in regression analysis Used bootstrapping to measure robustness of results given a different mix of policies Our dataset consists of 61 unique observations of states and provinces. 
Re-sample to see if results are affected if we had a different make-up of regions
/*----------------------------------------------------------------------
  Program 1: resampling bootstrap for a weighted regression.

  Steps:
    1. Fit the model on the original sample to get the point estimates.
    2. Draw &bt bootstrap samples (with replacement, same size as the
       original), re-fit the model on each and stack the coefficients.
    3. Use the standard deviation of the stacked coefficients as the
       standard error of each point estimate.

  The regressor list must be identical, and in the same order, in both
  MODEL statements and in the std= list of the final PROC MEANS,
  because std= names its output variables by position.
----------------------------------------------------------------------*/

/** run the regression with original sample to get point estimates */
proc reg data=orig.pov97
         outest=work.estpoint(keep=intercept lmurate aveuiben tradeimp tradeexp sambearn can);
   model sst = lmurate aveuiben tradeimp tradeexp sambearn can;
   weight invse;
   title " 1997";
run;

/* one row per coefficient; the estimate is stored in COEF */
proc transpose data=work.estpoint
               out=work.estpoint2(drop=_label_ rename=(col1=coef));
run;

/* put sample size in a macro */
proc means data=orig.pov97 noprint;
   var year;
   output out=work.out n=totnum;
run;

data _null_;
   set work.out;
   /* symputx converts the number itself and strips blanks */
   call symputx('totnum', totnum);
run;

/** make a temporary file of original dataset */
data work.pov97;
   set orig.pov97;
run;

/* initiate bootstrap dataset */
data work.boot97fin;
   set _null_;
run;

options nonotes;

/* create macro for number of bootstraps */
%let bt=1000;

/* One bootstrap replicate. Reads &x (replicate number, set by %reps)
   and &totnum (sample size); appends one row of coefficients to
   work.boot97fin. */
%macro boot;

   /** construct new sample of &totnum observations randomly drawn with replacement */
   data work.boot;
      do i=1 to &totnum;
         /* random row number between 1 and &totnum; the seed expression
            depends on &x so replicates differ */
         _p=ceil(ranuni(i+&x)*&totnum);
         do obsnum=_p to _p;
            set work.pov97 point=obsnum;
            if _error_ then abort;
            output;
         end;
      end;
      stop;   /* required with point=: there is no end-of-file to stop the step */
   run;

   /* estimate coefficients from bootstrap sample */
   proc reg data=work.boot noprint
            outest=work.est(keep=intercept lmurate aveuiben tradeimp tradeexp sambearn can);
      model sst = lmurate aveuiben tradeimp tradeexp sambearn can;
      weight invse;
      title " 1997";
   run;

   /** add coefficients to dataset */
   data work.boot97fin;
      set work.boot97fin work.est;
   run;

%mend boot;

/** invoke the boot macro &bt times */
%macro reps;
   %do x=1 %to &bt;
      %boot;
   %end;
%mend reps;

%reps;

options notes;

/** calculate the standard deviation of each bootstrapped coefficient */
proc means data=work.boot97fin n mean std;
   output out=work.std std=intercept lmurate aveuiben tradeimp tradeexp sambearn can;
run;

proc transpose data=work.std(drop=_type_ _freq_)
               out=work.std2(drop=_label_ rename=(col1=se));
run;

/** merge point estimates together with standard errors and calculate statistics */
/* merged row by row (no BY statement): both inputs list the coefficients
   in the same order */
data work.final;
   merge work.estpoint2 work.std2;
   t=coef/se;
   pvalue=(1-probnorm(abs(t)))*2;   /* two-sided p-value from the standard normal */
run;

proc print data=work.final;
run;
Parameter Estimates Variable Parameter DF Estimate Standard Error Intercept lmurate aveuiben tradeimp tradeexp sambearn can 1 0.05648 1 0.06210 1 -0.00009479 1 -0.07186 1 0.02107 1 -0.06155 1 -0.03489 0.02317 0.01433 0.00003002 0.12541 0.13190 0.04973 0.02739 t Value 2.44 4.33 -3.16 -0.57 0.16 -1.24 -1.27 Pr > |t| 0.0181 <.0001 0.0026 0.5690 0.8737 0.2212 0.2081 1997 The MEANS Procedure Variable Label N Mean Std Dev ƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒ Intercept Intercept 1000 0.0581707 0.0305142 lmurate 1000 0.0616976 0.0178248 aveuiben 1000 -0.000101532 0.000037820 tradeimp 1000 -0.0258204 0.1743886 tradeexp 1000 -0.0355008 0.1880651 sambearn 1000 -0.0635708 0.0673242 can 1000 -0.0228619 0.0402765 ƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒ Obs _NAME_ 1 2 3 4 5 6 7 intercept lmurate aveuiben tradeimp tradeexp sambearn can coef 0.056482 0.062098 -0.000095 -0.071862 0.021066 -0.061547 -0.034891 se 0.03051 0.01782 0.00004 0.17439 0.18807 0.06732 0.04028 t 1.85102 3.48378 -2.50627 -0.41208 0.11202 -0.91419 -0.86628 pvalue 0.06417 0.00049 0.01220 0.68028 0.91081 0.36062 0.38634 Program 2 Project using the National Longitudinal Survey of Children and Youth (NLSCY) Examined the effect of having a child with disabilities on the health of mothers and fathers Ordered Probit utilizing Statistics Canada NLSCY bootstrap weights to estimate standard errors Weighting Many survey datasets include sampling weights so results will represent the population The mechanics of using bootstrap weights are the same as for sampling weights Each individual in survey has a sample weight and all the bootstrap weights Re-estimate your model or statistic over and over using a different weight each time Bootstrap Weight Derivation Resampling A Miracle Occurs Bootstrap Weights
/** macros to indicate the dependent variable and
independent variables */
%let depvar=momhealth00;
%let indepvars=hhdis00 momage00 momlthigh00 momcertdip00 momunivdeg00 momimm
               eqinc00 hhchlt500 kids01700 momvg94 momg94 momfp94 momsmokesdaily00;

/** separate macro for the independent variables and intercept */
/* used only in keep= lists, so the order of names here does not matter */
%let allrhs=intercept_2 intercept_3 intercept_4 intercept_5 &indepvars;

/*** get point estimates using sample weight */
proc logistic data=nlscy.age615validboot descending
              outest=work.point(keep=&allrhs);
   model &depvar = &indepvars / link=normit maxiter=50 rsq;
   weight dwtcwd1l / norm;
   where validdis=1;
   title " mom 2000 ";
run;

/** transpose the data which contains the point estimates */
proc transpose data=work.point
               out=work.pointtrans(drop=_label_ rename=(col1=coef));
run;

/** copy the data into the WORK library */
data work.age615validboot;
   set nlscy.age615validboot;
run;

/** create empty dataset for coefficients */
data work.probitboot;
   set _null_;
run;

%global bt;
%let bt=1000; /** 1000 bootstrap weights provided; */

/* Re-fit the model once per bootstrap weight (bsw1 ... bsw&bt) and stack
   the coefficient rows in work.probitboot. */
%macro boot;
   options nonotes;
   %do i=1 %to &bt;
      proc logistic data=work.age615validboot noprint descending
                    outest=work.est(keep=&allrhs);
         model &depvar = &indepvars / link=normit maxiter=50 rsq;
         weight bsw&i / norm;
         where validdis=1;
         title " mom 2000 ";
      run;

      data work.probitboot;
         set work.probitboot work.est;
      run;
   %end;
   options notes;
%mend boot;

%boot;

/** calculate the standard deviation */
/* std= with no names keeps each analysis variable's own name. Listing
   &allrhs here would rename by position, and the stored variable order
   (Intercept_5 first in the MEANS listing) differs from the order in
   &allrhs, so the intercept rows would be mislabelled. */
proc means data=work.probitboot n mean std;
   output out=work.std std=;
run;

proc transpose data=work.std(drop=_type_ _freq_)
               out=work.std2(drop=_label_ rename=(col1=se));
run;

/* merged row by row (no BY statement): both inputs hold the coefficients
   in the same stored order */
data work.final;
   merge work.pointtrans work.std2;
   /** Wald chi square */
   z=coef/se;
   chi=z*z;
   pvaluechi=1-probchi(chi,1);
run;

proc print data=work.final;
   title " married moms";
run;
Analysis of Maximum Likelihood Estimates Parameter Standard Wald Error Chi-Square Pr > ChiSq DF Estimate <.0001 5 1 -2.9050 0.1513 368.5150 Intercept <.0001 4 1 -2.0956 0.1451 208.6086 Intercept <.0001 50.9855 3 1 -1.0202 0.1429 Intercept 0.1145 2.4906 2 1 0.2247 0.1424
Intercept <.0001 51.1371 1 0.3052 0.0427 hhdis00 0.0648 3.4098 1 0.00579 0.00314 momage00 0.0102 6.6078 1 0.1499 0.0583 momlthigh00 0.0570 3.6231 momcertdip00 1 -0.0731 0.0384 <.0001 16.9065 momunivdeg00 1 -0.1781 0.0433 <.0001 64.9256 1 0.3377 0.0419 momimm <.0001 1 -2.95E-6 6.018E-7 24.0756 eqinc00 0.0327 4.5628 1 -0.1872 0.0876 hhchlt500 <.0001 61.0665 1 -0.1262 0.0161 kids01700 <.0001 1 0.6181 0.0350 312.6018 momvg94 <.0001 1 1.1116 0.0458 589.8279 momg94 <.0001 1 1.5644 0.0912 294.0294 momfp94 <.0001 15.7629 momsmokesdaily00 1 0.1706 0.0430 The MEANS Procedure Variable N Mean Std Dev ƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒ Intercept_5 1000 -2.9650753 0.3107804 Intercept_4 1000 -2.1470196 0.2770212 Intercept_3 1000 -1.0465351 0.2621726 Intercept_ 1000 0.2091371 0.2622451 hhdis00 1000 0.2846419 0.0973226 momage00 1000 0.0057067 0.0055820 momlthigh00 1000 0.1293874 0.0932894 momcertdip00 1000 -0.0739417 0.0772243 momunivdeg00 1000 -0.1852935 0.0980241 momimm 1000 0.3191519 0.1181139 eqinc00 1000 -3.090889E-6 1.1721765E-6 hhchlt500 1000 -0.1760001 0.1143188 kids01700 1000 -0.1148346 0.0351904 momvg94 1000 0.6399775 0.0754143 momg94 1000 1.1403891 0.1000578 momfp94 1000 1.6089774 0.1664408 momsmokesdaily00 1000 0.1618192 0.0882162 ƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒƒ Obs _NAME_ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 coef se chi intercept_2 -2.90503 0.31078 87.376 intercept_3 -2.09565 0.27702 57.228 intercept_4 -1.02021 0.26217 15.143 intercept_5 0.22473 0.26225 0.734 hhdis00 0.30519 0.09732 9.834 momage00 0.00579 0.00558 1.076 momlthigh00 0.14987 0.09329 2.581 momcertdip00 -0.07309 0.07722 0.896 momunivdeg00 -0.17806 0.09802 3.300 momimm 0.33771 0.11811 8.175 eqinc00 -0.00000 0.00000 6.346 hhchlt500 -0.18722 0.11432 2.682 kids01700 -0.12618 0.03519 12.857 momvg94 0.61807 0.07541 67.169 momg94 1.11157 0.10006 123.417 momfp94 1.56445 0.16644 88.349 momsmokesdaily00 0.17064 
0.08822 3.742 pvaluechi 0.00000 0.00000 0.00010 0.39147 0.00171 0.29961 0.10815 0.34390 0.06930 0.00425 0.01176 0.10149 0.00034 0.00000 0.00000 0.00000 0.05307 Thank you for your attention!