[2] | 1 | """ |
---|
| 2 | # galaxy tool xml files can define a galaxy supplied output filename |
---|
| 3 | # that must be passed to the tool and used to return output |
---|
| 4 | # here, the plink log file is copied to that file and removed |
---|
| 5 | # took a while to figure this out! |
---|
| 6 | # use exec_before_job to give files sensible names |
---|
| 7 | # |
---|
| 8 | # ross april 14 2007 |
---|
| 9 | # plink cleanup script |
---|
| 10 | # ross lazarus March 2007 for camp illumina whole genome data |
---|
| 11 | # note problems with multiple commands being ignored - eg --freq --missing --mendel |
---|
| 12 | # only the first seems to get done... |
---|
| 13 | # |
---|
| 14 | ##Summary statistics versus inclusion criteria |
---|
| 15 | ## |
---|
| 16 | ##Feature As summary statistic As inclusion criteria |
---|
| 17 | ##Missingness per individual --missing --mind N |
---|
| 18 | ##Missingness per marker --missing --geno N |
---|
| 19 | ##Allele frequency --freq --maf N |
---|
| 20 | ##Hardy-Weinberg equilibrium --hardy --hwe N |
---|
| 21 | ##Mendel error rates --mendel --me N M |
---|
| 22 | # |
---|
| 23 | # call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile |
---|
| 24 | # note plinkClean_code.py does some renaming before the job starts |
---|
| 25 | |
---|
| 26 | |
---|
| 27 | <command interpreter="python2.4"> |
---|
| 28 | rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf' |
---|
| 29 | '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId' |
---|
| 30 | |
---|
| 31 | |
---|
| 32 | """ |
---|
| 33 | import sys,shutil,os,subprocess, glob, string, tempfile, time |
---|
| 34 | from rgutils import galhtmlprefix, timenow, plinke |
---|
| 35 | prog = os.path.split(sys.argv[0])[-1] |
---|
| 36 | myversion = 'January 4 2010' |
---|
| 37 | verbose=False |
---|
| 38 | |
---|
| 39 | |
---|
| 40 | def fixoutaff(outpath='',newaff='1'): |
---|
| 41 | """ quick way to create test data sets - set all aff to 1 or 2 for |
---|
| 42 | some hapmap data and then merge |
---|
| 43 | [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam |
---|
| 44 | 1341 14 0 0 2 1 |
---|
| 45 | 1341 2 13 14 2 1 |
---|
| 46 | 1341 13 0 0 1 1 |
---|
| 47 | 1340 9 0 0 1 1 |
---|
| 48 | 1340 10 0 0 2 1 |
---|
| 49 | """ |
---|
| 50 | nchanged = 0 |
---|
| 51 | fam = '%s.fam' % outpath |
---|
| 52 | famf = open(fam,'r') |
---|
| 53 | fl = famf.readlines() |
---|
| 54 | famf.close() |
---|
| 55 | for i,row in enumerate(fl): |
---|
| 56 | lrow = row.split() |
---|
| 57 | if lrow[-1] <> newaff: |
---|
| 58 | lrow[-1] = newaff |
---|
| 59 | fl[i] = ' '.join(lrow) |
---|
| 60 | fl[i] += '\n' |
---|
| 61 | nchanged += 1 |
---|
| 62 | fo = open(fam,'w') |
---|
| 63 | fo.write(''.join(fl)) |
---|
| 64 | fo.close() |
---|
| 65 | return nchanged |
---|
| 66 | |
---|
| 67 | |
---|
| 68 | |
---|
| 69 | def clean(): |
---|
| 70 | """ |
---|
| 71 | """ |
---|
| 72 | if len(sys.argv) < 16: |
---|
| 73 | print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv) |
---|
| 74 | print >> sys.stdout, """this script will filter a linkage format ped |
---|
| 75 | and map file containing genotypes. It takes 14 parameters - the plink --f parameter and" |
---|
| 76 | a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei" |
---|
| 77 | documented in the plink docs plus the file to be returned to Galaxy |
---|
| 78 | called as: |
---|
| 79 | <command interpreter="python"> |
---|
| 80 | rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' |
---|
| 81 | '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path' |
---|
| 82 | '$relfilter' '$afffilter' '$sexfilter' '$fixaff' |
---|
| 83 | </command> |
---|
| 84 | |
---|
| 85 | """ |
---|
| 86 | sys.exit(1) |
---|
| 87 | plog = [] |
---|
| 88 | inpath = sys.argv[1] |
---|
| 89 | inbase = sys.argv[2] |
---|
| 90 | killme = string.punctuation + string.whitespace |
---|
| 91 | trantab = string.maketrans(killme,'_'*len(killme)) |
---|
| 92 | title = sys.argv[3].translate(trantab) |
---|
| 93 | mind = sys.argv[4] |
---|
| 94 | geno = sys.argv[5] |
---|
| 95 | hwe = sys.argv[6] |
---|
| 96 | maf = sys.argv[7] |
---|
| 97 | me1 = sys.argv[8] |
---|
| 98 | me2 = sys.argv[9] |
---|
| 99 | outfname = sys.argv[10] |
---|
| 100 | outfpath = sys.argv[11] |
---|
| 101 | relf = sys.argv[12] |
---|
| 102 | afff = sys.argv[13] |
---|
| 103 | sexf = sys.argv[14] |
---|
| 104 | fixaff = sys.argv[15] |
---|
| 105 | output = os.path.join(outfpath,outfname) |
---|
| 106 | outpath = os.path.join(outfpath,title) |
---|
| 107 | outprunepath = os.path.join(outfpath,'ldprune_%s' % title) |
---|
| 108 | try: |
---|
| 109 | os.makedirs(outfpath) |
---|
| 110 | except: |
---|
| 111 | pass |
---|
| 112 | bfile = os.path.join(inpath,inbase) |
---|
| 113 | outf = file(outfname,'w') |
---|
| 114 | vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out', |
---|
| 115 | outpath,'--set-hh-missing','--mind',mind, |
---|
| 116 | '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2] |
---|
| 117 | # yes - the --me parameter takes 2 values - mendels per snp and per family |
---|
| 118 | if relf == 'oo': # plink filters are what they leave... |
---|
| 119 | vcl.append('--filter-nonfounders') # leave only offspring |
---|
| 120 | elif relf == 'fo': |
---|
| 121 | vcl.append('--filter-founders') |
---|
| 122 | if afff == 'affonly': |
---|
| 123 | vcl.append('--filter-controls') |
---|
| 124 | elif relf == 'unaffonly': |
---|
| 125 | vcl.append('--filter-cases') |
---|
| 126 | if sexf == 'fsex': |
---|
| 127 | vcl.append('--filter-females') |
---|
| 128 | elif relf == 'msex': |
---|
| 129 | vcl.append('--filter-males') |
---|
| 130 | p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath) |
---|
| 131 | retval = p.wait() |
---|
| 132 | plog.append('%s started, called as %s' % (prog,' '.join(sys.argv))) |
---|
| 133 | outf.write(galhtmlprefix % prog) |
---|
| 134 | outf.write('<ul>\n') |
---|
| 135 | plogf = '%s.log' % os.path.join(outfpath,title) |
---|
| 136 | try: |
---|
| 137 | plogl = file(plogf,'r').readlines() |
---|
| 138 | plog += [x.strip() for x in plogl] |
---|
| 139 | except: |
---|
| 140 | plog += ['###Cannot open plink log file %s' % plogf,] |
---|
| 141 | # if fixaff, want to 'fix' the fam file |
---|
| 142 | if fixaff <> '0': |
---|
| 143 | nchanged = fixoutaff(outpath=outpath,newaff=fixaff) |
---|
| 144 | plog += ['## fixaff was requested %d subjects affection status changed to %s' % (nchanged,fixaff)] |
---|
| 145 | pf = file(plogf,'w') |
---|
| 146 | pf.write('\n'.join(plog)) |
---|
| 147 | pf.close() |
---|
| 148 | globme = os.path.join(outfpath,'*') |
---|
| 149 | flist = glob.glob(globme) |
---|
| 150 | flist.sort() |
---|
| 151 | for i, data in enumerate( flist ): |
---|
| 152 | outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) |
---|
| 153 | outf.write('</ul>\n') |
---|
| 154 | outf.write("</ul></br></div></body></html>") |
---|
| 155 | outf.close() |
---|
| 156 | |
---|
| 157 | |
---|
| 158 | if __name__ == "__main__": |
---|
| 159 | clean() |
---|
| 160 | |
---|