/**************************************************************************************************** ** geneblk_pc.sas imputes haplotypes by group, reconciles naming conventions across groups, ** ** calculates haplotype frequencies and creates data set for "expectation-substitution" ** ** analysis. ** ** ** ** Modified by Peter Kraft (pkraft@hsph.harvard.edu) from original code by Dan Stram. ** ** ** ** Last Modified 7 Oct 2005. ** ** ** ** 10/07/04 - Included &id=right(id) in all_race data step to ensure correct merging ** ** 10/07/04 - Added group-sepcific haplotype frequency output (not strat by status) ** ** 10/07/04 - Removed cosmetic %scan() that kept giving an error ** ** 11/02/04 - Added 'ploidy' option for haploids (X-linked genes in men) ** ** 10/07/05 - Added important instruction: ID variable MUST appear before SNPs in input data set. ** ** Otherwise a SNP will get read in as "ID". ** ** Hey, I didn't make the rules. I'm just telling you how it is. ** ** 10/07/05 - Another important instruction: ID variable MUST have $8. format in the input data. ** ** Otherwise merging gets all fouled up. Again, "ours is not to question why." ** ****************************************************************************************************/ * SAS macros to create haplotype inferences -- see documentation for macro genblk below; * Macros defined Main Macros: geneblk, checkfreq geneblk computes the haplotype frequency variables (as described below) checkfreq runs some basic checks on the output of geneblk Utility Macros: allblks, blk, seqn, numargs Other (SAS defined) macros used %SCAN, %EVAL, etc. See SAS documentation for their use (SCAN is used to separate lists into component values) see numargs below ; options xsync noxwait; * these options are required; options ls=132; * this can be changed by user; * options spool; * these options are for debugging turn off for production; * options symobolgen; * options macrogen; * options mprint; * Utility macro definitions -------------------------------------------------------------------------------; %macro allblks(ngrps); %do jx = 1 %to &ngrps; oxo&jx %end; %mend; %macro blk(snps,bstart,bend); %do jx = &bstart %to &bend; %scan(&snps,%eval(&jx),%str( )) %end; %mend; %macro seqn(bstart,bend,inc=1); %do jx = &bstart %to &bend %by &inc; %eval(&jx) %end; %mend; * numargs counts the number of arguments in an argument list; %macro numargs(arg); %let n=1; %do %while (%scan(&arg,%eval(&n),%str( ))^=%str()); %let n=%eval(&n+1); %end; %eval(&n-1) %mend numargs; *********************************************************************************************************************; %macro geneblk(in=in, out=out,dir=genex, lname=geneblk, id=id, groupvar=groupvar, usegrps=, snps=, blocks=,titl=titl,status=newstat, uncommon=5,tagsnps=tagsnps,ploidy=diploids); ********************************************************************************************************************** SAS Macro to create haplotype imputations based on the genotype data -- programmed by Dan Stram, Malcolm Pike, Peggy Wan at USC in -- input data set name out -- output data set name dir -- directory containing in and out lname -- libname pointing to dir id -- id variable (must be of type character) if this is longer than 8 charecters in length there may be problems groupvar -- center or ethnic group variable must be character variable -- EM algorithm is run separately for each group usegrps -- values of groupvar to use eg "W" "J" etc for Whites and Japanese in the MEC synthetic data snps -- character variables containg SNP genotypes coded as "1 1" "1 2" ... "4 4" .. with "0 0" as missing blocks -- block definition eg if snps=snp1 snp2 snp3 snp4 snp5 and the first 2 SNPs form block1 and the last 3 form block 2 then specify blocks=1 2 3 5 as the beginning and ending SNP for of each of the 2 blocks titl -- Title status -- status variable -- must equal 0 for controls and 1 for cases! uncommon -- definition of a rare haplotype (default 5 percent) in creating the blk1nhap1 variables tagsnps -- pathname for the tagSNPs program (needed if tagSNPs is not in your path) Output Created (on *.lst file): 1) Haplotype frequencies by group and case/control status for each block 2) Most common haplotype in reference group controls for each block 3) Maximum frequency (over all the groups) of each haplotype in controls for each block 4) List, for each block, of haplotypes with frequency greater than "uncommon" in at least one group 5) List, for each block, of (renumbered) haplotypes with frequency greater than "uncommon" in at least one group 6) Proc contents of all created variables See also the CheckFreqs macro for consistency checks (to make sure haplotype imputation seems consistent) The output data set contains the following expected haplotype count variables for each block (the "nhap" variables) these are named blk1thap blk1nhap1 blk1nhap2 ... blk1nhapm for block 1 these are named blk1thap blk2nhap1 blk2nhap2 ... blk2nhapm for block 2 etc here blk1thap is the expected number of copies of the most common haplotype in block 1. Here "most common haplotype" is defined as the most common in the refernce (first) group listed in the usegrps argument i.e. in the example supplied "W" is first and hence the reference group blk1nhap1 -- blk1nhapm-1 consist of the expected counts for the other common haplotypes in block 1 (those with greater frequency than specified in uncommon argument above in at least one group) blk1nhapm -- this a "garbage collection" -- the expected number of copies of all other haplotypes (the uncommon ones) combined save as above for all the other haplotype blocks *--------------------------------------------------------------------------------------------------------------------; * start; * make up scratch data set for later use; data statusonly (keep=&id &status &groupvar); libname &lname "&dir"; set &lname..∈ proc sort;by &id; * define working directory; libname &lname "&dir"; * define macro variables for number of SNPs and number of blocks (nsnps, nb) by parsing input parameters; %let x = %numargs(&snps); %let nsnps=%eval(&x); %let x=%numargs(&blocks); %let nb2=%eval(&x); %let nb=%eval(&x/2); * for each block create BLOCK directory and write tagSNPs code ; * loop over blocks; %do i=1 %to &nb; * initialize temp data sets will hold expectation variables (haplotype count and haplotype indicator); data temp_h; data temp_b; data temp_c; * creates scratch directories called block1 block2 etc. X is a shell command -- expecutes external programs; x "mkdir ""&dir\block&i"""; * defines ith block; %let blockstart=%scan(&blocks,2*(&i-1)+1,%str( )); %let blockend=%scan(&blocks,2*&i,%str( )); data _null_; * Writes a program file, predict_b.pgm, for the TAGSNPS program which will be executed for the current block will read data file, predict_b.txt created next ; file "&dir\block&i\predict_b.pgm"; put 'title= "' &titl '";'; put "SNPS = %seqn(%scan(&blocks,%eval(2*(&i-1)+1),%str( )),%scan(&blocks,%eval(2*&i),%str( ))); "; put "&ploidy.;" put "maxit=50;"; put "file='predict_b.txt' code=2 format= csv;"; put "use = %seqn(%scan(&blocks,%eval(2*(&i-1)+1),%str( )),%scan(&blocks,%eval(2*&i),%str( )));"; put "em;"; put "predict recode id='&id' idlen=8 predinfile='predict_b.txt' predoutfile='h_b.dat' sas='h_b.sas' vname='b&i._h' ;"; put "predict recode id='&id' idlen=8 indicator=0 predinfile='predict_b.txt' predoutfile='h_b_0.dat' sas='h_b_0.sas' vname='b&i._a' ;"; put "predict recode id='&id' idlen=8 indicator=1 predinfile='predict_b.txt' predoutfile='h_b_1.dat' sas='h_b_1.sas' vname='b&i._b' ;"; put "predict recode id='&id' idlen=8 indicator=2 predinfile='predict_b.txt' predoutfile='h_b_2.dat' sas='h_b_2.sas' vname='b&i._c' ;"; run; * define macro variable for number of groups in usegrps; %let ngrps=%numargs(&usegrps); * Loop over all the groups -- write data -- run tagSNPs -- and read back haplotype data from tagSNPs; %do j=1 %to &ngrps; * Write the data out to "predict_b.txt" for current group using proc export; * make up dataset containing only the data needed by tagSNPs; data predict_b; set &lname..&in (keep=&id &groupvar %blk(&snps,%scan(&blocks,%eval(2*(&i-1)+1),%str( )),%scan(&blocks,%eval(2*&i),%str( )))); if &groupvar eq %scan(&usegrps,%eval(&j),%str( )); drop &groupvar; run; * Create comma deliminted file, predict_b.txt, using PROC export; PROC EXPORT data=predict_b OUTFILE= "&dir\block&i\predict_b.txt" DBMS=CSV REPLACE; run; * Run the tagSNPs program; x "cd &dir\block&i"; x "&tagsnps predict_b.pgm haplo.out"; * Include the SAS code written out by tagsnps -- this defines datasets to read in the haplotype inferences; %include "&dir\block&i.\h_b.sas"; * expected haplotype count, E[(delta_h)|genotypes] for each haplotype h for each subject (in current group) these are the "H" variables; %include "&dir\block&i.\h_b_1.sas"; * expected haplotype indicator P(delta_h=1|genotypes] for each haplotype h for each subject these are the "B" variables; %include "&dir\block&i.\h_b_2.sas"; * expected haplotype indicator P(delta_h=2|genotypes] for each haplotype h for each subject the "C" variables; * initially the H, B, and C variables are named according to haplotype identity: ie variable h13242 referes to haplotype 13242 etc -- these will later renamed and the most common haplotype will be called thap, with the remaining called nhap1 nhap2, etc. ; run; * append haplotype inferences for current group into temp files for this block; data temp_h; set temp_h b&i._h; data temp_b; set temp_b b&i._b; data temp_c; set temp_c b&i._c; %end; * end of loop over each group; * remove unwanted first observation from temp_ files; data temp_h;set temp_h; if _n_ gt 1;run; data temp_b;set temp_b; if _n_ gt 1;run; data temp_c;set temp_c; if _n_ gt 1;run; * sort and merge temp files; proc sort data=temp_h; by &id; proc sort data=temp_b; by &id; proc sort data=temp_c; by &id; * Now for this current block, i, set all the haplotype files into temp file h_b_i; data h_b_&i; merge temp_h temp_b temp_c; by &id; proc sort data=h_b_&i; by &id; run; ******************************************************************** this begins is the trickyest part of the code -- renaming the haplotypes in this block from haplotype identity (e.g. h13242 etc) to: thap -- most common haplotype in reference group nhap1, nhap2 ... -- haplotypes with frequency >= uncommon nhap&ncount -- garbage collection -- sum of all uncommon haplotypes as well as creating variable labels that preserve the haplotype identities ; * For each block reconcile all the haplotypes within each ethnic group or center and rename varibables to blk&i.hap1 -blk&i.hap&count etc. ; * first find out how many haplotypes have been defined by tagSNPs (number of H, B, and C variables) and store as macro variable &count; data _null_; if 0 then set h_b_&i; array temp [*] _numeric_; * this defines an array containing variable names for all numerical variables in the dataset in this case these are the haplotype count variables (H variables) and the indicator variables (B and C variables) ; x=dim(temp)/3; call symput ('count',left(put(x,4.))); stop; *creates the macro variable containing the number of H (and B and C) variables -- ie the total number of haplotypes common or uncommon; run; * Make up variable Labels containing the actual haplotype identities e.g. "h13242", etc, for the renamed haplotype count variables; * write the current (haplotype identity variable names out to a dataset for processing) ; proc contents data=h_b_&i position out= vnames (keep=name varnum type) noprint ; run; proc sort; by varnum; * Now write out the code in SAS label format to "labels.dat" note that "labels.dat" is not used directly by this program it must first be further processed (to reflect the fact that the uncommon haplotypes are to be merged into one category); %let nw=0; data x; * write out haplotype identities (variable names stored in vnames) to labels.dat; file "&dir\labels.dat"; %let nw=1; set vnames; %let nw=%eval(&nw+1); n1=_n_-2; * these first variables are haplotype count variables; n2=_n_- &count-2 ; * next are the indictor B variables; n3=_n_- 2*&count-2 ; * now the C variables; sname=scanq(name,2,"_"); * create labels first for haplotype count variables; if _n_ gt 2 and _n_ le %eval(2+&count) then do; put "label blk&i.hap" n1 @17 '="' @19 sname '";'; end; * then create labels for the B variables; if _n_ gt %eval(2+&count) and _n_ le %eval(2+2*&count) then do; put "label blk&i.b" n2 @17 '="' @19 sname '";'; end; * then create labels for the C variables; if _n_ gt %eval(2+2*&count) and _n_ le %eval(2+3*&count) then do; put "label blk&i.c" n3 @17 '="' @19 sname '";'; end; run; * now rename the haplotype variables from haplotype h13242 etc to hap1, hap2 ... hap&count; * the hap1, hap2, etc names are only used temporarily -- when the uncommon haplotypes are merged the variables will be renamed to thap, nhap1, nhap2, etc.; data all_race&i ; set h_b_&i; array miss[*] _numeric_; array blk&i.b (&count) ; array blk&i.c (&count); array blk&i.hap (&count); do i=1 to dim(miss); if miss[i] = . then miss[i]=0; end; &id = right(&id); do i=1 to &count; blk&i.hap[i]=miss[i]; blk&i.b[i]=miss[&count+i]; blk&i.c[i]=miss[2*&count+i]; end; keep &id genotype blk&i.hap1-blk&i.hap&count blk&i.b1-blk&i.b&count blk&i.c1-blk&i.c&count; * Now create the nhap variables (merging the uncommon haplotypes); proc sort data=all_race&i; by &id; * merge in STATUS variable with haplotype variables -- will define the most common haplotype as the most common in controls for the reference group (first group named in the USEGRPs parameter); data ttt; merge all_race&i (in=inar) statusonly ; by &id; if inar; proc sort; by &groupvar &status; run; * create dataset (O) containing the haplotype sums and sample sizes by case control status and group; proc means data = ttt noprint; var blk&i.hap1-blk&i.hap&count; by &groupvar &status; output out=o sum=s1-s&count; proc means data = ttt noprint; var blk&i.hap1-blk&i.hap&count; by &groupvar; output out=opoolcc sum=s1-s&count; run; ******************************************************************************************************************; *** THIS SECTION DIFFERS DEPENDING ON NUMBER OF CHROMOSOMES = 2n FOR AUTOSOMES, n FOR GENES ON X OR Y IN MEN *; ******************************************************************************************************************; %if &ploidy eq diploids %then %do; * divide haplotype sums by 2N and multiply by 100 to get haplotype frequencies (in percent); data o; set o; n=_freq_; array s{&count} s1-s&count; array pct{&count} pct1-pct&count; do i=1 to &count; pct{i}=s{i}/(2*n)*100; end; data opoolcc; set opoolcc; n=_freq_; array s{&count} s1-s&count; array pct{&count} pct1-pct&count; do i=1 to &count; pct{i}=s{i}/(2*n)*100; end; * Print a report of haplotype frequencies; proc print data=opoolcc; var &groupvar n pct1-pct&count ; title 'Haplotype Frequencies for each haplotype by group'; proc print data=o; var &groupvar &status n pct1-pct&count ; title 'Haplotype Frequencies for each haplotype by group and case control status'; run; %end; %if &ploidy eq haploids %then %do; * divide haplotype sums by N and multiply by 100 to get haplotype frequencies (in percent); data o; set o; n=_freq_; array s{&count} s1-s&count; array pct{&count} pct1-pct&count; do i=1 to &count; pct{i}=s{i}/(n)*100; end; data opoolcc; set opoolcc; n=_freq_; array s{&count} s1-s&count; array pct{&count} pct1-pct&count; do i=1 to &count; pct{i}=s{i}/(n)*100; end; * Print a report of haplotype frequencies; proc print data=opoolcc; var &groupvar n pct1-pct&count ; title 'Haplotype Frequencies for each haplotype by group'; proc print data=o; var &groupvar &status n pct1-pct&count ; title 'Haplotype Frequencies for each haplotype by group and case control status'; run; %end; ******************************************************************************************************************; * identify the most common haplotype in the reference group (save position as hapnumber); data om (keep = junk maxhap hapnumber); set o ; if (&groupvar eq %scan(&usegrps,1,%str( )) and &status eq 0); * keep only controls from reference group; maxhap=max(of pct1-pct&count); * frequency of most common haplotype; array pct{&count} pct1-pct&count; *; do i=1 to &count; * pick out position of the most common haplotype save in hapnumber; if pct{i} eq maxhap then hapnumber=i; end; junk=1; run; proc print; * report which is the most common haplotype in this block; var hapnumber; title "Most common hap in reference group controls for block &i "; run; * create MPCT variables -- maximum frequency (over all the groups) of each haplotype in controls -- to identify "common" and "uncommon" haplotypes; * store in dataset O1; * drop the cases from data o; data o; set o; if (&status eq 0); * keep only controls; * save mpct variables in dataset o1; proc means noprint data=o; var pct1-pct&count; output out=o1 max=mpct1-mpct&count; run; * add a junk variable for use in later merging; data o1; set o1; junk=1; proc sort; by junk; run; * print report; proc print; var mpct1- mpct&count; title "Max of all controls for block &i "; run; data o2; set ttt; * add JUNK to TTT for merge; junk=1; proc sort; by junk; run; data oo; * merge MPCT variables with all the haplotype variables (from dataset TTT) ; merge o1 o2(in=a) om; by junk; if a; * Create the thap, and nhap variables by renaming the most common haplotype to thap, and merging the uncommon together in one final &ncount; * Do same for B and C variables; data oo1; set oo; * contains haplotype data for all subjects; * Arrays used; * old (hap) variables; array hap{&count} blk&i.hap1-blk&i.hap&count; * currently defined hap1, hap2 ... hap&ncount variables; array b{&count} blk&i.b1-blk&i.b&count; * currently defined b1, b2 ... variables; array c{&count} blk&i.c1-blk&i.c&count; * C variables; array mpct{&count} mpct1-mpct&count; * maxpercent variables if any of these are less than &uncommon merge that haplotype into an uncommon category ; * new (nhap) variables; * renumbered merging uncommon; array nhap{&count} blk&i.nhap1-blk&i.nhap&count; * nhap variables; array nb{&count} blk&i.nb1-blk&i.nb&count; * the nb variables; array nc{&count} blk&i.nc1-blk&i.nc&count; * the nc; * pointers: for each common haplotype thap or nhap inumber gives the earlier (hap1 hap2 ... ) name/position of the haplotype -- will be used to make labels for the nhap variables using the information now stored in "labels.dat"; array inumber{&count} blk&i.inumber1-blk&i.inumber&count; j=0; c_all=sum(of blk&i.c1-blk&i.c&count); b_all=sum(of blk&i.b1-blk&i.b&count); * loop over all haplotype variables; do i= 1 to &count; * identify "most common" haplotype rename hap, b, and c, to thap, tb, tc, etc; if (i eq hapnumber) then do; blk&i.thap=hap{I}; blk&i.tb=b{I}; blk&i.tc=c{i};end; * set both uncommon and most common (old) haplotype variables to missing values; if (mpct{i} lt &uncommon or i eq hapnumber ) then do; hap{i}=.;b{i}=.;c{i}=.;end; if (hap{I} ne . ) then do; * add new nhap variable for common (but not most common) haplotype; j=j+1; nhap{j}=hap{i}; nb{j}=b{i}; nc{j}=c{i}; inumber{j}=i; end; end; j=j+1; * j is now equal to 1 + the number of common haplotypes; * Now pool the uncommon haplotypes into final nhap variable; happred=sum(of blk&i.hap1-blk&i.hap&count); *Following code differs by ploidy; *It works since haplotypes must sum to 2 for diploids, 1 for haploids; %if &ploidy eq diploids %then %do; hapother=2-happred-blk&i.thap; %end; %if &ploidy eq haploids %then %do; hapother=1-happred-blk&i.thap; %end; bpred=sum(of blk&i.b1-blk&i.b&count); bother=b_all-bpred-blk&i.tb; * b_all contains sum of all B indicator variables; cpred=sum(of blk&i.c1-blk&i.c&count); cother=c_all-cpred-blk&i.tc; * c_all contains sum of all C indicator variable; nhap{j}=hapother; * final nhap variables; nb{j}=bother; nc{j}=cother; run; data oo2; set oo1(obs=1); * write report ; proc print; var blk&i.inumber1-blk&i.inumber&count; title "List of haplotypes in block &i with freq >= &uncommon pct for at least one group"; run; data oxo&i; * now create macro variable, ncount, which contains the number of common haplotypes perserved in the loop above; set oo1; length jj $1; length jjj $2; if (j lt 10) then do; jj=j; call symput ('ncount',jj); end; if (j ge 10) then do; jjj=j; call symput ('ncount',jjj); end; data oo2; set oxo&i (obs=20); * write report of first 20 observations on dataset; proc print; var &id &groupvar &status blk&i.nhap1-blk&i.nhap&ncount ; title "List, for block &i, of (renumbered) haplotypes with freq >= &uncommon percent in at least one group"; run; *; * Finally make up labels for the nhap1 -- nhap&ncount variables; *; * first create file that contains the pointers from the nhap to the hap variable names; data _null_; set oo1(obs=1); file "&dir\labels2.dat"; ncount=&ncount; put hapnumber blk&i.inumber1-blk&i.inumber&ncount; * treating the labels.dat file (from way above) as data create an indexed file containing the haplotype identity (h13234 type) names; data x; infile "&dir\labels.dat"; do i=1 to &count; input @20 name :$20.; output; end; stop; * declare variable i as an index ; proc datasets library=work; modify x; index create i; run; * read the pointers from labels2.dat and use them to index the correct labels in dataset X; data y ; infile "&dir\labels2.dat"; array h{&ncount} hnum1-hnum&ncount; input hnum1-hnum&ncount; do j=1 to &ncount; i=h{j}; output; end; stop; data z; * this will contain both the nhap variable names and the haplotype identity (h12323 type) names; set y; set x key=i; run; * Now write a SAS program file that contains the labels information; data xy; file "&dir\labels.sas"; set z; nm1=_n_-1; if(_n_=1) then do; put "label blk&i.thap=" '"h' name '";'; put "label blk&i.tb=" '"b' name '";'; put "label blk&i.tc=" '"c' name '";'; end; else do; put "label blk&i.nhap" nm1 "=" '"h' name '";'; put "label blk&i.nb" nm1 "=" '"b' name '";'; put "label blk&i.nc" nm1 "=" '"c' name '";'; end; run; * finally drop all scrach variables and add the number of common and total number of haplotypes to the file and use the lables just created; data oxo&i ( keep=&id &groupvar &status blk&i.nhap1-blk&i.nhap&ncount blk&i.nb1-blk&i.nb&ncount blk&i.nc1-blk&i.nc&ncount blk&i.thap blk&i.tb blk&i.tc ncommon&i ntotalhaps&i ); set oxo&i; ncommon&i=j; ntotalhaps&i=i-1; %include "&dir\labels.sas"; run; proc sort data=oxo&i;by &id; %end; * end of loop over all blocks; * Finally merge all the block files into one file; data allblocks ; merge %allblks(&nb); by &id; run; title "All haplotype related variables created in this run"; proc contents data=allblocks position; run; * And create final output dataset; proc sort data=&lname..∈ by &id; data &lname..&out; merge allblocks &lname..∈ by &id; run; %mend; ****************** End of Macro geneblk ****************************************************************; /* DEFINE MACROFUNCTION NUMARGS */ %macro numargs(arg); %if &arg= %then %do; 0 %end; %else %do; %let n=1; %do %until (%scan(&arg,%eval(&n),%str( ))=%str()); %let n=%eval(&n+1); %end; %eval(&n-1) %end; %mend numargs;