missing data

advertisement
The data set hsb_mar.sas7bdat, which is based on hsb2.sas7bdat and is used for this seminar, can be
downloaded by following the link.
1.
Set the library and label the dataset
/* Assign the libref MISS to the folder that holds the seminar data set. */
libname miss "C:\Users\lqi\Dropbox\missing data talk";

/* Value labels for the categorical variables. */
proc format;
  value female
    0 = "male"
    1 = "female";

  value prog
    1 = "general"
    2 = "academic"
    3 = "vocation";

  value race
    1 = "hispanic"
    2 = "asian"
    3 = "african-amer"
    4 = "white";

  value schtyp
    1 = "public"
    2 = "private";

  value ses
    1 = "low"
    2 = "middle"
    3 = "high";
run;

/* Add the MISS library to the format search path. */
options fmtsearch=(miss);

/* Work on a copy of the permanent data set so MISS.HSB_MAR is never modified. */
data hsb_mar;
  set miss.hsb_mar;
run;
2.
Explore the missing pattern
/* Session options for the listings that follow. */
options nofmterr nocenter nodate nolabel;

/* Number of levels per variable; MISSING counts missing values as a level,
   NOPRINT suppresses the individual frequency tables. */
proc freq data=hsb_mar nlevels;
  tables _all_ / noprint missing;
run;

/* Missing and non-missing counts for every variable. */
proc means data=hsb_mar nmiss n;
  var _all_;
run;

/* Same missing counts, this time captured in data set T (one row,
   one auto-named NMISS column per variable). */
proc means data=hsb_mar nmiss;
  var _all_;
  output out=t (drop=_type_ _freq_) nmiss= / autoname;
run;

/* Flip T so that each variable becomes a row; the count lands in NMISS1. */
proc transpose data=t prefix=nmiss out=s1;
  var _numeric_;
run;

/* Convert counts to percentages.
   NOTE(review): 200 is hard-coded; presumably the number of observations
   in hsb_mar. Confirm before reusing with another data set. */
data s2;
  set s1;
  pmiss = nmiss1/200*100;
run;

proc print data=s2;
run;

/* Show only the missing-data pattern table; NIMPUTE=0 requests no imputations. */
ods select missPattern;
proc mi data=hsb_mar nimpute=0;
  var female prog read write math science socst;
run;
3.
Complete case analysis
/* Descriptive statistics and missing counts for all numeric variables. */
proc means data=hsb_mar nmiss n min max mean std;
  var _numeric_;
run;

/* Complete case regression of socst on write, read, female and math.
   The regressor list belongs to the MODEL statement; it had been displaced
   to after QUIT, leaving "model socst =" without any predictors. */
proc reg data=hsb_mar;
  model socst = write read female math;
run;
quit;
4.
Multiple imputation (MI)
We generate multiply imputed data sets using SAS proc mi. Our imputation model includes all
the variables in the analysis model together with some other auxiliary variables.
/* Step 1 - imputation: create 10 imputed copies of the data, stacked in T
   and indexed by _IMPUTATION_. The VAR list holds the analysis variables
   plus the auxiliary variables ses, schtyp and prog. */
proc mi data=hsb_mar
        nimpute=10
        seed=4321567
        out=t;
  var socst write read female math ses schtyp prog;
run;

/* Step 2 - analysis: fit the same regression within each imputed data set.
   OUTEST= with COVOUT stores the estimates and their covariance matrices. */
proc reg data=t outest=outreg covout;
  by _imputation_;
  model socst = write read female math;
run;
quit;

/* Step 3 - pooling: combine the per-imputation results.
   EDF= supplies the complete-data error degrees of freedom. */
proc mianalyze data=outreg edf=195;
  modeleffects Intercept write read female math;
run;
5.
Maximum likelihood estimation of covariance and mean parameters
For more information on using this approach, see Maximum Likelihood Parameter Estimation with
Incomplete Data.
/* EM (maximum likelihood) estimates of the means and covariance matrix of
   the analysis variables, written to EMCOVHSB by OUTEM=.
   NIMPUTE=0 skips the imputation phase: only the EM estimates are needed
   here, and without it PROC MI would still generate imputations and write
   them to a default-named output data set. */
proc mi data=hsb_mar noprint seed=12121 nimpute=0;
  var socst write read female math;
  em outem=emcovhsb;
run;
/* Column-wise average n used as the nominal sample size:
   200 - (17 + 9 + 15 + 18)/4 = 185.25, rounded to 185. */

/* Append an _TYPE_ = "N" row to the EM mean/covariance data set so it can be
   read as a TYPE=COV data set with a sample size. */
data mycov;
  set emcovhsb end=last;
  output;                        /* pass every EM row through unchanged */
  array _a_(*) _numeric_;
  if last then do;
    _type_ = "N";
    do i = 1 to dim(_a_);
      _a_(i) = 185;              /* same n for every numeric variable */
    end;
    output;                      /* one N row, written after all values are set */
  end;
  drop i;
run;

proc print data=mycov;
run;

/* Regression computed from the EM covariance matrix instead of raw data.
   The regressor list belongs to the MODEL statement; it had been displaced
   to after QUIT together with a stray END. */
proc reg data=mycov (type=cov);
  model socst = write read female math;
run;
quit;
Download