1 | """ |
---|
2 | # galaxy tool xml files can define a galaxy supplied output filename |
---|
3 | # that must be passed to the tool and used to return output |
---|
4 | # here, the plink log file is copied to that file and removed |
---|
5 | # took a while to figure this out! |
---|
6 | # use exec_before_job to give files sensible names |
---|
7 | # |
---|
8 | # ross april 14 2007 |
---|
9 | # plink cleanup script |
---|
10 | # ross lazarus March 2007 for camp illumina whole genome data |
---|
11 | # note problems with multiple commands being ignored - eg --freq --missing --mendel |
---|
12 | # only the first seems to get done... |
---|
13 | # |
---|
14 | ##Summary statistics versus inclusion criteria |
---|
15 | ## |
---|
16 | ##Feature As summary statistic As inclusion criteria |
---|
17 | ##Missingness per individual --missing --mind N |
---|
18 | ##Missingness per marker --missing --geno N |
---|
19 | ##Allele frequency --freq --maf N |
---|
20 | ##Hardy-Weinberg equilibrium --hardy --hwe N |
---|
21 | ##Mendel error rates --mendel --me N M |
---|
22 | # |
---|
23 | # call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile |
---|
24 | # note plinkClean_code.py does some renaming before the job starts |
---|
25 | |
---|
26 | |
---|
27 | <command interpreter="python2.4"> |
---|
28 | rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf' |
---|
29 | '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId' |
---|
30 | |
---|
31 | |
---|
32 | """ |
---|
33 | import sys,shutil,os,subprocess, glob, string, tempfile, time |
---|
34 | from rgutils import galhtmlprefix, timenow, plinke |
---|
35 | prog = os.path.split(sys.argv[0])[-1] |
---|
36 | myversion = 'January 4 2010' |
---|
37 | verbose=False |
---|
38 | |
---|
39 | |
---|
40 | def fixoutaff(outpath='',newaff='1'): |
---|
41 | """ quick way to create test data sets - set all aff to 1 or 2 for |
---|
42 | some hapmap data and then merge |
---|
43 | [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam |
---|
44 | 1341 14 0 0 2 1 |
---|
45 | 1341 2 13 14 2 1 |
---|
46 | 1341 13 0 0 1 1 |
---|
47 | 1340 9 0 0 1 1 |
---|
48 | 1340 10 0 0 2 1 |
---|
49 | """ |
---|
50 | nchanged = 0 |
---|
51 | fam = '%s.fam' % outpath |
---|
52 | famf = open(fam,'r') |
---|
53 | fl = famf.readlines() |
---|
54 | famf.close() |
---|
55 | for i,row in enumerate(fl): |
---|
56 | lrow = row.split() |
---|
57 | if lrow[-1] <> newaff: |
---|
58 | lrow[-1] = newaff |
---|
59 | fl[i] = ' '.join(lrow) |
---|
60 | fl[i] += '\n' |
---|
61 | nchanged += 1 |
---|
62 | fo = open(fam,'w') |
---|
63 | fo.write(''.join(fl)) |
---|
64 | fo.close() |
---|
65 | return nchanged |
---|
66 | |
---|
67 | |
---|
68 | |
---|
69 | def clean(): |
---|
70 | """ |
---|
71 | """ |
---|
72 | if len(sys.argv) < 16: |
---|
73 | print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv) |
---|
74 | print >> sys.stdout, """this script will filter a linkage format ped |
---|
75 | and map file containing genotypes. It takes 14 parameters - the plink --f parameter and" |
---|
76 | a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei" |
---|
77 | documented in the plink docs plus the file to be returned to Galaxy |
---|
78 | called as: |
---|
79 | <command interpreter="python"> |
---|
80 | rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' |
---|
81 | '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path' |
---|
82 | '$relfilter' '$afffilter' '$sexfilter' '$fixaff' |
---|
83 | </command> |
---|
84 | |
---|
85 | """ |
---|
86 | sys.exit(1) |
---|
87 | plog = [] |
---|
88 | inpath = sys.argv[1] |
---|
89 | inbase = sys.argv[2] |
---|
90 | killme = string.punctuation + string.whitespace |
---|
91 | trantab = string.maketrans(killme,'_'*len(killme)) |
---|
92 | title = sys.argv[3].translate(trantab) |
---|
93 | mind = sys.argv[4] |
---|
94 | geno = sys.argv[5] |
---|
95 | hwe = sys.argv[6] |
---|
96 | maf = sys.argv[7] |
---|
97 | me1 = sys.argv[8] |
---|
98 | me2 = sys.argv[9] |
---|
99 | outfname = sys.argv[10] |
---|
100 | outfpath = sys.argv[11] |
---|
101 | relf = sys.argv[12] |
---|
102 | afff = sys.argv[13] |
---|
103 | sexf = sys.argv[14] |
---|
104 | fixaff = sys.argv[15] |
---|
105 | output = os.path.join(outfpath,outfname) |
---|
106 | outpath = os.path.join(outfpath,title) |
---|
107 | outprunepath = os.path.join(outfpath,'ldprune_%s' % title) |
---|
108 | try: |
---|
109 | os.makedirs(outfpath) |
---|
110 | except: |
---|
111 | pass |
---|
112 | bfile = os.path.join(inpath,inbase) |
---|
113 | outf = file(outfname,'w') |
---|
114 | vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out', |
---|
115 | outpath,'--set-hh-missing','--mind',mind, |
---|
116 | '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2] |
---|
117 | # yes - the --me parameter takes 2 values - mendels per snp and per family |
---|
118 | if relf == 'oo': # plink filters are what they leave... |
---|
119 | vcl.append('--filter-nonfounders') # leave only offspring |
---|
120 | elif relf == 'fo': |
---|
121 | vcl.append('--filter-founders') |
---|
122 | if afff == 'affonly': |
---|
123 | vcl.append('--filter-controls') |
---|
124 | elif relf == 'unaffonly': |
---|
125 | vcl.append('--filter-cases') |
---|
126 | if sexf == 'fsex': |
---|
127 | vcl.append('--filter-females') |
---|
128 | elif relf == 'msex': |
---|
129 | vcl.append('--filter-males') |
---|
130 | p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath) |
---|
131 | retval = p.wait() |
---|
132 | plog.append('%s started, called as %s' % (prog,' '.join(sys.argv))) |
---|
133 | outf.write(galhtmlprefix % prog) |
---|
134 | outf.write('<ul>\n') |
---|
135 | plogf = '%s.log' % os.path.join(outfpath,title) |
---|
136 | try: |
---|
137 | plogl = file(plogf,'r').readlines() |
---|
138 | plog += [x.strip() for x in plogl] |
---|
139 | except: |
---|
140 | plog += ['###Cannot open plink log file %s' % plogf,] |
---|
141 | # if fixaff, want to 'fix' the fam file |
---|
142 | if fixaff <> '0': |
---|
143 | nchanged = fixoutaff(outpath=outpath,newaff=fixaff) |
---|
144 | plog += ['## fixaff was requested %d subjects affection status changed to %s' % (nchanged,fixaff)] |
---|
145 | pf = file(plogf,'w') |
---|
146 | pf.write('\n'.join(plog)) |
---|
147 | pf.close() |
---|
148 | globme = os.path.join(outfpath,'*') |
---|
149 | flist = glob.glob(globme) |
---|
150 | flist.sort() |
---|
151 | for i, data in enumerate( flist ): |
---|
152 | outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) |
---|
153 | outf.write('</ul>\n') |
---|
154 | outf.write("</ul></br></div></body></html>") |
---|
155 | outf.close() |
---|
156 | |
---|
157 | |
---|
158 | if __name__ == "__main__": |
---|
159 | clean() |
---|
160 | |
---|