Good practices in programming and data analysis and parallel

advertisement
Aldi Kraja
October 17, 2008








A. where is the location of my work?
mylogin@dsgcluster.dsg.wustl.edu
/dsguser/mylogin
pgms
Where do I go from here?
For example:
/dsgmnt/dsg200/genetics/users/jim/impfhs/
/dsgmnt/dsg200/genetics/data/jim/impfhs/
Let’s assume that the user created a lot of
programs and data (or a few of them)
data














B. After work the results will go in a central directory, maybe in the
/dsgmnt/dsg200/genetics/data/fhsanalysis/impute/
Who is in charge of that directory? Is that directory related with my
project? What are the permissions of that directory? Do I need to
change permissions of the directory, or files to read only? Do I need to
notify anybody?
Examples:
drwx------ 2 aldi genetics 4096 Feb 4 2008 mail
cd /dsgmnt/dsg200/genetics
[aldi@blade6-3-1 genetics]$ ls -al
total 16
drwxrwsr-x 4 aldi genetics 160 Sep 15 11:38 .
drwxr-xr-x 4 root root
128 Oct 14 11:38 ..
drwxrwsr-x 12 aldi genetics 312 Oct 15 12:00 data
-rw-rw-r-- 1 warwick genetics 0 Aug 26 10:50 loki.head
-rw-rw-r-- 1 warwick genetics 0 Aug 26 10:50 loki.nohead
drwxrwsr-x 16 aldi genetics 1824 Oct 16 14:26 users






Line command editors: emacs, xemacs, vi, less,
etc
sas &
Data: sas viewer (free software), sas, jmp,
through samba (windows explorer, go to
network, mercury5, login and passwd, /dsg1;
/dsg2; /dsg3; /dsg4; /dsg100; /dsg200;
/dsgweb
Batch work through LSF: bsub sas pgm.sas
proc print data=mydata (obs=10); title “my
data”; run;
proc contents data=mydata; title “contents”
run;


My SAS output will be called “results”. Do I need
to add anything else in the name of the set?
Do I need to keep the output all in one directory?

Does a colleague has the right to change your
data?

Do I have the right to delete

your data?
Do I have the right to delete your tmp data?

Can I open

Can I delete
your data in a sas viewer?
your program run?





Assume I am reading text data with sas, but
they are very large
SAS viewer does not open the created sas
data, because they have more than 37,000
columns. What do you do?
JMP can go beyond that limit; proc print by
keeping a few selected columns, compare
with the original
If my variables are genotypes of this kind
1/3 2/2 3/3 1/4
1/1 what do I need to do
to save space in our servers?
markname chrom pos
M1
1
10001
M2
1
10011
M3
1
10015
M4
1
10021
M5
2
20001
map1
genefreq1
markname
M1
M2
M3
M4
locdes1
Subject
1
2
3
4
markname alele percent
M1
1
10.0
M1
3
90.0
M2
2
25.0
M2
4
75.0
M3
2
0.10
M3
4
99.9
M4
1
0.10
M4
2
99.9
M1 M2 M3 M4… Mn
1/1 2/4 2/2 2/2 … 1/1
1/3 2/2 2/4 1/1 … 1/1
3/3 4/4 2/4 2/2 … 1/1
1/3 2/4 4/4 1/1 … 1/1
…
ganon1

*--------------------------------------;

* split.sas

* split data based on a decided number ;

* magicNumber=,

* By: Aldi Kraja

* October 3, 2008

*--------------------------------------;

options mprint mlogic symbolgen;

%macro split(magicNumber=200,
;
;
;
chroms=1,

chrome=1);


;
%do j=&chroms %to &chrome ;

libname ind&j "/dsgmnt/dsg200/genetics/data/mydata/impute/c&j";

libname out&j "/dsgmnt/dsg200/genetics/data/mydata/split/c&j";


data genefreqc&j ;
set ind&j..genefreqc&j ; run;


data mlgeno&j ;
set ind&j..mlgenoc&j ; run;


data _null_;
file "./myscript.sh";


put " cd /dsgmnt/dsg200/genetics/data/mydata/split/c&j" ";

put " find . -name '*sas7bdat' | xargs /bin/rm -f

put " cd /dsgmnt/dsg200/genetics/users/mydir/split/c&&j ";

run;

%sysexec chmod +x ./myscript.sh ;

%sysexec ./myscript.sh ;





data _null_;
set ind&j..mapc&j end=eod;
if eod then call symput("totm",trim(left(_n_)));
run;


%let counter=0;
";






















%do m=1 %to &totm %by &magicNumber;
%let counter=%eval(&counter +1);
%let s=&m ;
%let e=%eval(&s + &magicNumber -1);
%if &m = %eval((&totm /&magicNumber )* &magicNumber +1) %then %do;
%let stringcomp&counter =%quote( &s <= _n_ <= &totm ) ;
%end;
%else %do;
%let stringcomp&counter =%quote( &s <= _n_ <= &e );
%end;
markname chrom pos
%put &&stringcomp&counter ;
%end;
M1
1
10001
%do k=1 %to &counter;
data out&j..mapc&j.p&k ;
format markname $12.; length markname $12;
set ind&j..mapc&j ;
markname=compress(SNP);
if &&stringcomp&k then output;
run;
data _null_;
set out&j..mapc&j.p&k end=eod;
call symput("m"||trim(left(_n_)),trim(left(markname)));
if eod then call symput("ptotm",trim(left(_n_)));
M2
M3
M4
M5
1
1
1
2
mapc&j
run;
10011
10015
10021
20001
Subject
1
2
3
4











mlgenoc&j
data out&j..mlgenoc&j.p&k ;
set mlgenoc&j (keep=subject %do ii=1 %to &ptotm; &&m&ii %end;) ;
run;
data out&j..genefreqc&j.p&k ;
set genefreqc&j ;
%do ii=1 %to &ptotm ;
if markname ="&&m&ii" then output;
%end;
run;

%end; %* k counter loop;

%end; %* j chromosome loop;


M1 M2 M3 M4… Mn
1/1 2/4 2/2 2/2 … 1/1
1/3 2/2 2/4 1/1 … 1/1
3/3 4/4 2/4 2/2 … 1/1
1/3 2/4 4/4 1/1 … 1/1
…
%mend split;
%split
genefreqc&j
markname alele percent
M1
1
10.0
M1
3
90.0
M2
2
25.0
M2
4
75.0
M3
2
0.10
M3
4
99.9
M4
1
0.10
M4
2
99.9

*------------------------------------------------------------;

* program: parallel mixed.interface.sas

*------------------------------------------------------------;

* Purpose: run mixed model
;

*
;

* By: Aldi Kraja
;

* February 17, 2008
;

*
;

;

*------------------------------------------------------------;

%macro parallel(v=,

dsplit=,

pheno=,

type=,

rotation=,

subject=,

pedid=,

fid=,

mid=,

sex=,

dirout=,

genlabel=,

markname=);

;
data _null_;
file "./rmscript.sh";


put " cd &dirout ";

put " find . -name '*mixed*sas7bdat' | xargs /bin/rm -f

put " find . -name '*freq*.sas7bdat' | xargs /bin/rm -f


put " cd /dsgmnt/dsg200/genetics/users/mydir/split/c&v./";
run;

%sysexec chmod +x ./rmscript.sh ;

%sysexec ./rmscript.sh ;



";
";
%do y=1 %to &dsplit ;
%sysexec rm -f ./test&y..sas ;





%*---------------------------------;
%* let start to program in parallel;
%*---------------------------------;
data _null_;
file "./test&y..sas" lrecl=36000;

put "options nofmterr; options nomprint nomlogic nosymbolgen;

put '%include "/dsgmnt/dsg200/genetics/users/mydir1/macroutil.sas";
";

put '%include "/dsgmnt/dsg200//genetics/users/mydir2/mixed.sas"; ';

put " libname ph22

put " libname trip

put " libname snpc&v

put " *------------- map ------------------ ;

put " data cmap&v.p&y ;

put "

put "

put " run;

put "%sortit(cmap&v.p&y,cmap&v.p&y, &markname,nodupkey)

put " *------------- locdes ------------------ ;

put "%sortit(cmap&v.p&y,clocdes&v.p&y,&markname,nodupkey)

put " data cmap&v.p&y ; ";

put "

put "
by &markname; if in1 and in2;

put "
run;

put " %sortit(cmap&v.p&y ,cmap&v.p&y ,&markname)

put " *------------- genefreq

put " data genefreq&v.p&y ;

put "

put " set snpc&v..genefreqc&v.p&y ;

put "run;

put " %sortit(genefreq&v.p&y,genefreq&v.p&y,&markname)

put " data genefreq&v.p&y;

put "

put "

put " *------------- ganon ------------------ ;

put " data aganon&v.p&y ; set snpc&v..mlgenoc&v.p&y ; run;

put " *------------- triplet ------------------ ;

put " data trip; set trip.phenos (keep=&pedid &subject &fid &mid &sex ); run;
'/dsgmnt/dsg200/genetics/data/mydir/pheno';
";
'/dsgmnt/dsg200/genetics/data/mydir/geno/';
";
'/dsgmnt/dsg200/genetics/data/mydir/split/c&v';

";
";
";
format &markname $12.; length &markname $12 ;
";
set snpc&v..mapc&v.p&y (where=( chrom=&v ));
";
";
";
";
";
merge cmap&v.p&y (in=in1) clocdes&v.p&y (in=in2);
";
";
";
";
------------------ ;
";
";
format &markname $12.; length &markname $12 ;
";
";
";
";
";
merge genefreq&v.p&y (in=in1) clocdes&v.p&y (in=in2 ) cmap&v.p&y ;
by &markname; if in1 and in2; run;
put " %sortit(trip,trip,&subject ,nodupkey )
";
";
";
";
";


';
";
";

put " *-------------phenodata------------------ ;

put " %sortit(ph22.fanomiss,phenodata (keep=&subject &pheno ),&subject)
";

put " data phenodata; merge phenodata (in=in1) trip (in=in2); by &subject;
";

put "
";

put "

put "
put '-------------------------------------------------------------------'; ";

put "
put 'This is a note for showing that you have missing &subject in the data';";

put "
put '-------------------------------------------------------------------'; ";

put "
put _all_ ; abort abend;

put " end;
";

put " run;
";

put " %sortit(phenodata,phenodata,&subject )
";

put " %sortit(aganon&v.p&y,aganon&v.p&y,&subject )
";

put "data aganon&v.p&y ; merge aganon&v.p&y (in=in1) phenodata (in=in2); by &subject ;";

put " if in1 ;
";

put " run;
";

put " proc datasets lib=work; delete phenodata; run;
";

put " *------------- finally set up the macro ------------------ ;

put "

put "
datain=aganon&v.p&y,

put "
clocdes=clocdes&v.p&y,
";

put "
cmap=cmap&v.p&y,
";

put "
dist=position,
";

put "
cgenefreq=genefreq&v.p&y,
";

put "
triplet=trip,
";

put "
chrom=chrom,
";

put "
marshvar=cmap,
";

put "
markvar=markname,
";

put "
pedid=&pedid,

put "
subject=&subject,
";

put "
fid=&fid,
";

put "
mid=&mid,
";

put "
sex=&sex,
";

put "
qualaff=,
";

put "
phenodata=aganon&v.p&y,
";

put "
pheno=&pheno,
";

put "
qualtrait=,
";

put "
qualcovars=&sex,
";

put "
quantcovars=,
";

put "
correctForMean=NO,
";

put "
bymark=500,
";

put "
programd=MIXED,
";

put "
model=a,
";

put "
time=,

put "
genlabel=&genlabel&y,
";

put "
barplot=0,
";
if in1 and in2;
if &subject =. then do;
%alleled(
";
";
";
";
“;
";
";

put "

run;
whohasmarkers=NO)
";



%end ;
%* end of y loop for each split dataset ;
data _null_;

file "./mixedscript.sh ";

%do j=1 %to &dsplit;

put "bsub nohup sas -memsize 1G ./test&j..sas ";

put "sleep 5 ";

%end;

put "echo Finished the submission for chrom: &v a total of &dsplit jobs";


run;

X 'chmod +x ./mixedscript.sh ';

%sysexec ./mixedscript.sh ;

%mend parallel;

%parallel(v=22,

dsplit=170,

pheno=Factor1mleod,

type=allf1,

rotation=varimax,

subject=subject,

pedid=pedid,

fid=dadsubj,

mid=momsubj,

sex=sex,

dirout=/dsgmnt/dsg200/genetics/data/mydir/results/w/c&v./&type./&rot./,

genlabel=f1fhs,

markname=markname)

*------------------------------------------------------------;

* summarize.sas
;

* Purpose: summarize the results of Mixed model
;

*------------------------------------------------------------;

%global nobs;

%let nobs=0;



%include "/dsgmnt/dsg200/genetics/users/mydir/macroutil.sas";
%let study=mystudy;
libname inncbi "/dsgmnt/dsg200/genetics/data/generaldir/annot/";


%macro test(totloop=968 1105 872 816 841 912 717 738 611 693 651 625 521 420 362 357 293 385 186 318 170 170,

chroms=1,

chrome=22,

fact=1,

dirout=/dsgmnt/dsg200/genetics/data/mydir/results/final&study./f&fact./);

libname out&study "&dirout";

%let count=1;

%let tloop1=%scan(&totloop,&count);





%do %while(%scan(&totloop,&count) ^= );
%let count=%eval(&count+1);
%let tloop&count=%scan(&totloop,&count);
%end;

%let count=%eval(&count-1);

%do k=&chroms %to &chrome;

%* delete what you will append to;

proc datasets lib=out&study ;






delete mixed&study.c&k &study.c&k ; run;
libname freq&k "/dsgmnt/dsg200/genetics/data/mydir/split/c&k";
%do f=&fact %to &fact;
libname fhs&k.f&f "/dsgmnt/dsg200/genetics/data/mydir/results/w/c&k./allf&f./var";
%do j=1 %to &&tloop&k ;
%isempty(data=fhs&k.f&f..amixedf&f.&study&j)

%if &nobs =0 %then %do; %*--------------work on empty set check--------------;

data a; factor=&f ; notempty=.; empty=&j ;output; run;

%if &j=1 %then %do;

data out&study..&study.c&k ;
set a; run;


proc datasets lib=work; delete a; run;

%end; %* loop when j is 1;

%else %do;

data out&study..&study.c&k ;
set out&study..&study.c&k a; run;



proc datasets lib=work; delete a; run;
%end; %* end of loop j is not 1;

%end;

%else %do; %* the nobs is not 0 =======================;


%*-----------------------------;
data a;
factor=&f ; notempty=&j; empty=. ;output; run;

%*--------------------------------------------;

%* capture the frequencies present in the data;

%*--------------------------------------------;


data b (drop=markname);
format markn $12.; length markn $12;

set fhs&k.f&f..ameanfreqpermarkf&f.&study&j ;

markn=substr(markname,2); markn=compress(markn||"XXXXXXXXX"); run;

data b; set b (rename=(markn=markname)) ; run;

%sortit(b,b ,markname model)

proc transpose data=b out=c (rename=(COL1=AA COL2=AB COL3=BB));




by markname; var freq; run;
%*-----------work on freq ********;
data p;
set freq&k..genefreqc&k.p&j ;
run;

proc sort data=p; by markname percent allele; run;

proc transpose data=p out=q (rename=(COL1=a1 COL2=a2));


by markname; var allele; run;
proc transpose data=p out=r (rename=(COL1=MAF COL2=MOA));

by markname; var percent; run;

proc sort data=q; by markname; run;

proc sort data=r; by markname; run;




data s (drop=_NAME_);
merge q (in=in1) r (in=in2); by markname; if in1 and in2;
run;
%sortit(s,s,markname)
proc datasets lib=work; delete p q r ; run;

%*--------------------------------------------;

%* work on the mixed model output

%*--------------------------------------------;



;
%if &j=1 %then %do;
data out&study..&study.c&k ; set a; run;
%sortit(fhs&k.f&f..amixedf&f.&study&j,d,markname)
%sortit(fhs&k.f&f..ameanfreqpermarkf&f.&study&j,b,markname model)

%sortit(c,c(drop=_NAME_),markname)

data d;


merge c (in=in1) d (in=in2) s;

by markname;

if in1 and in2;

factor=&f ;

run;

data out&study..mixed&study.c&k (drop=model analysis _LABEL_); set d ;run;


proc datasets lib=work; delete a c d s; run;


%end; %* loop when j is 1;

%else %do;
data out&study..&study.c&k ;

set out&study..&study.c&k a; run;


%sortit(fhs&k.f&f..amixedf&f.&study&j,d,markname)

%sortit(c,c(drop=_NAME_),markname)
data d;


merge c (in=in1) d (in=in2) s;

by markname;

if in1 or in2; run;

data out&study..mixed&study.c&k (drop=model analysis _LABEL_); set out&study..mixed&study.c&k d ;

factor=&f ;

run;

proc datasets lib=work; delete a c d s; run;

%end; %* end of loop j is not 1;

%end;



%* end of condition the nobs is not 0;
%end; %* end of j loop (splits of the same chromosome) ;
%end;
%* end of f loop (factors)
;

%*--------------------------------------------------------------;

%* finally annotate based on the newest version of NCBI dbSNP ;

%*--------------------------------------------------------------;

%sortit(inncbi.ncbisnpb128_c&k,annot&k,markname,nodupkey)

%sortit(out&study..mixed&study.c&k,out&study..mixed&study.c&k,markname)

data out&study..mixed&study.c&k ;

merge out&study..mixed&study.c&k (in=in1) annot&k (in=in2);

by markname ; if in1; run;




proc datasets lib=work; delete annot&k ; run;
%end;
%* end of k loop (chromosome)
%mend test;
%test
;




Remove all split data
Do NOT remove the original source data
gzip unused results (you can gunzip results
at the time you need them again)
Multi-threaded systems and parallel
programming are common tools in modern
software applications, and are used to
enhance the scalability and performance of
large jobs









The statistical tasks included but were not
limited to:
􀁹 Drafting your analysis objectives
􀁹 Creating specifications for analysis datasets
􀁹 Creating specifications for tables, figures, and
listings
􀁹 Identifying first look analyses
􀁹 Performing data checking
􀁹 Parallel programming analysis datasets
􀁹 Checking data results and parallel
programming tables, figures, and listings
􀁹 Removal of the un-necessary data
Download