1 | #!/usr/local/bin/python |
---|
2 | # hack to run and process a plink tdt |
---|
3 | # expects args as |
---|
4 | # bfilepath outname jobname outformat (wig,xls) |
---|
5 | # ross lazarus |
---|
6 | # for wig files, we need annotation so look for map file or complain |
---|
7 | |
---|
8 | """ |
---|
9 | Parameters for wiggle track definition lines |
---|
10 | All options are placed in a single line separated by spaces: |
---|
11 | |
---|
12 | track type=wiggle_0 name=track_label description=center_label \ |
---|
13 | visibility=display_mode color=r,g,b altColor=r,g,b \ |
---|
14 | priority=priority autoScale=on|off \ |
---|
15 | gridDefault=on|off maxHeightPixels=max:default:min \ |
---|
16 | graphType=bar|points viewLimits=lower:upper \ |
---|
17 | yLineMark=real-value yLineOnOff=on|off \ |
---|
18 | windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 |
---|
19 | """ |
---|
20 | |
---|
21 | import sys,math,shutil,subprocess,os,time,tempfile,shutil,string |
---|
22 | from os.path import abspath |
---|
23 | from optparse import OptionParser |
---|
24 | from rgutils import timenow, plinke |
---|
25 | myversion = 'v0.003 January 2010' |
---|
26 | verbose = False |
---|
27 | |
---|
28 | |
---|
29 | def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): |
---|
30 | """ |
---|
31 | score must be scaled to 0-1000 |
---|
32 | |
---|
33 | Want to make some wig tracks from each analysis |
---|
34 | Best n -log10(p). Make top hit the window. |
---|
35 | we use our tab output which has |
---|
36 | rs chrom offset ADD_stat ADD_p ADD_log10p |
---|
37 | rs3094315 1 792429 1.151 0.2528 0.597223 |
---|
38 | |
---|
39 | """ |
---|
40 | |
---|
41 | def is_number(s): |
---|
42 | try: |
---|
43 | float(s) |
---|
44 | return True |
---|
45 | except ValueError: |
---|
46 | return False |
---|
47 | header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) |
---|
48 | column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] |
---|
49 | halfwidth=100 |
---|
50 | resfpath = os.path.join(twd,resf) |
---|
51 | resf = open(resfpath,'r') |
---|
52 | resfl = resf.readlines() # dumb but convenient for millions of rows |
---|
53 | resfl = [x.split() for x in resfl] |
---|
54 | headl = resfl[0] |
---|
55 | resfl = resfl[1:] |
---|
56 | headl = [x.strip().upper() for x in headl] |
---|
57 | headIndex = dict(zip(headl,range(0,len(headl)))) |
---|
58 | # s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' |
---|
59 | chrpos = headIndex.get('CHROM',None) |
---|
60 | rspos = headIndex.get('RS',None) |
---|
61 | offspos = headIndex.get('OFFSET',None) |
---|
62 | ppos = headIndex.get('-LOG10TDTP',None) |
---|
63 | wewant = [chrpos,rspos,offspos,ppos] |
---|
64 | if None in wewant: # missing something |
---|
65 | logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) |
---|
66 | return |
---|
67 | resfl = [x for x in resfl if x[ppos] > ''] |
---|
68 | resfl = [(float(x[ppos]),x) for x in resfl] # decorate |
---|
69 | resfl.sort() |
---|
70 | resfl.reverse() # using -log10 so larger is better |
---|
71 | resfl = resfl[:topn] # truncate |
---|
72 | pvals = [x[0] for x in resfl] # need to scale |
---|
73 | resfl = [x[1] for x in resfl] # drop decoration |
---|
74 | maxp = max(pvals) # need to scale |
---|
75 | minp = min(pvals) |
---|
76 | prange = abs(maxp-minp) + 0.5 # fudge |
---|
77 | scalefact = 1000.0/prange |
---|
78 | logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) |
---|
79 | for i,row in enumerate(resfl): |
---|
80 | row[ppos] = '%d' % (int(scalefact*pvals[i])) |
---|
81 | resfl[i] = row # replace |
---|
82 | outf = file(outfname,'w') |
---|
83 | outf.write(header) |
---|
84 | outres = [] # need to resort into chrom offset order |
---|
85 | for i,lrow in enumerate(resfl): |
---|
86 | chrom,snp,offset,p, = [lrow[x] for x in wewant] |
---|
87 | gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), |
---|
88 | '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) |
---|
89 | outres.append(gff) |
---|
90 | outres = [(x[0],int(x[3]),x) for x in outres] # decorate |
---|
91 | outres.sort() # into chrom offset |
---|
92 | outres=[x[2] for x in outres] # undecorate |
---|
93 | outres = ['\t'.join(x) for x in outres] |
---|
94 | outf.write('\n'.join(outres)) |
---|
95 | outf.write('\n') |
---|
96 | outf.close() |
---|
97 | |
---|
98 | |
---|
99 | |
---|
100 | def xformTDT(infname='',resf='',outfname='',name='foo',mapf='/usr/local/galaxy/data/rg/lped/x.bim'): |
---|
101 | """munge a plink .tdt file into either a ucsc track or an xls file |
---|
102 | CHR SNP A1:A2 T:U_TDT OR_TDT CHISQ_TDT P_TDT |
---|
103 | 0 MitoT217C 2:3 0:0 NA NA NA |
---|
104 | 0 MitoG228A 1:4 0:0 NA NA NA |
---|
105 | 0 MitoT250C 2:3 0:0 NA NA NA |
---|
106 | map file has |
---|
107 | 1 rs4378174 0 003980745 |
---|
108 | 1 rs10796404 0 005465256 |
---|
109 | 1 rs2697965 0 014023092 |
---|
110 | |
---|
111 | grrr! |
---|
112 | Changed in 1.01 to |
---|
113 | [rerla@hg fresh]$ head database/job_working_directory/445/rgTDT.tdt |
---|
114 | CHR SNP BP A1 A2 T U OR CHISQ P |
---|
115 | 1 rs12562034 758311 1 3 71 79 0.8987 0.4267 0.5136 |
---|
116 | 1 rs3934834 995669 4 2 98 129 0.7597 4.233 0.03963 |
---|
117 | |
---|
118 | |
---|
119 | """ |
---|
120 | if verbose: |
---|
121 | print 'Rgenetics Galaxy Tools, rgTDT.py.xformTDT got resf=%s, outtype=%s, outfname=%s' % (resf,outtype,outfname) |
---|
122 | wewantcols = ['SNP','CHR','BP','A1','A2','T','U','OR','CHISQ','P'] |
---|
123 | res = [] |
---|
124 | s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' # header |
---|
125 | res.append(s) |
---|
126 | rsdict = {} |
---|
127 | if not mapf: |
---|
128 | sys.stderr.write('rgTDT called but no map file provided - cannot determine locations') |
---|
129 | sys.exit(1) |
---|
130 | map = file(mapf,'r') |
---|
131 | for l in map: # plink map |
---|
132 | ll = l.strip().split() |
---|
133 | if len(ll) >= 3: |
---|
134 | rs=ll[1].strip() |
---|
135 | chrom = ll[0] |
---|
136 | if chrom.lower() == 'x': |
---|
137 | chrom = '23' |
---|
138 | if chrom.lower() == 'y': |
---|
139 | chrom = '24' |
---|
140 | if chrom.lower() == 'mito': |
---|
141 | chrom = '25' |
---|
142 | offset = ll[3] |
---|
143 | rsdict[rs] = (chrom,offset) |
---|
144 | f = open(resf,'r') |
---|
145 | headl = f.next().strip() |
---|
146 | headl = headl.split() |
---|
147 | wewant = [headl.index(x) for x in wewantcols] |
---|
148 | llen = len(headl) |
---|
149 | lnum = anum = 0 |
---|
150 | for l in f: |
---|
151 | lnum += 1 |
---|
152 | ll = l.strip().split() |
---|
153 | if len(ll) >= llen: # valid line |
---|
154 | snp,chrom,offset,a1,a2,t,u,orat,chisq,p = [ll[x] for x in wewant] |
---|
155 | if chisq == 'NA' or p == 'NA' or orat == 'NA': |
---|
156 | continue # can't use these lines - gg gets unhappy |
---|
157 | snp = snp.strip() |
---|
158 | lp = '0.0' |
---|
159 | fp = '1.0' |
---|
160 | fakeorat = '1.0' |
---|
161 | if p.upper().strip() <> 'NA': |
---|
162 | try: |
---|
163 | fp = float(p) |
---|
164 | if fp <> 0: |
---|
165 | lp = '%6f' % -math.log10(fp) |
---|
166 | fp = '%6f' % fp |
---|
167 | except: |
---|
168 | pass |
---|
169 | else: |
---|
170 | p = '1.0' |
---|
171 | if orat.upper().strip() <> 'NA': |
---|
172 | try: |
---|
173 | fakeorat = orat |
---|
174 | if float(orat) < 1.0: |
---|
175 | fakeorat = '%6f' % (1.0/float(orat)) # invert so large values big |
---|
176 | except: |
---|
177 | pass |
---|
178 | else: |
---|
179 | orat = '1.0' |
---|
180 | outl = '\t'.join([snp,chrom,offset,a1,a2,t,u,chisq,p,lp,fakeorat,orat]) |
---|
181 | res.append(outl) |
---|
182 | f = file(outfname,'w') |
---|
183 | res.append('') |
---|
184 | f.write('\n'.join(res)) |
---|
185 | f.close() |
---|
186 | |
---|
187 | |
---|
188 | if __name__ == "__main__": |
---|
189 | """ called as |
---|
190 | <command interpreter="python"> |
---|
191 | rgTDT.py -i '$infile.extra_files_path/$infile.metadata.base_name' -o '$title' -f '$outformat' -r '$out_file1' -l '$logf' -x '${GALAXY_DATA_INDEX_DIR}/rg/bin/pl$ |
---|
192 | |
---|
193 | </command> |
---|
194 | |
---|
195 | """ |
---|
196 | u = """ called in xml as |
---|
197 | <command interpreter="python2.4"> |
---|
198 | rgTDT.py -i $i -o $out_prefix -f $outformat -r $out_file1 -l $logf |
---|
199 | </command> |
---|
200 | """ |
---|
201 | if len(sys.argv) < 6: |
---|
202 | s = '## Error rgTDT.py needs 5 command line params - got %s \n' % (sys.argv) |
---|
203 | if verbose: |
---|
204 | print >> sys.stdout, s |
---|
205 | sys.exit(0) |
---|
206 | parser = OptionParser(usage=u, version="%prog 0.01") |
---|
207 | a = parser.add_option |
---|
208 | a("-i","--infile",dest="bfname") |
---|
209 | a("-o","--oprefix",dest="oprefix") |
---|
210 | a("-f","--formatOut",dest="outformat") |
---|
211 | a("-r","--results",dest="outfname") |
---|
212 | a("-l","--logfile",dest="logf") |
---|
213 | a("-d","--du",dest="uId") |
---|
214 | a("-e","--email",dest="uEmail") |
---|
215 | a("-g","--gff",dest="gffout",default="") |
---|
216 | (options,args) = parser.parse_args() |
---|
217 | killme = string.punctuation + string.whitespace |
---|
218 | trantab = string.maketrans(killme,'_'*len(killme)) |
---|
219 | title = options.oprefix |
---|
220 | title = title.translate(trantab) |
---|
221 | map_file = '%s.bim' % (options.bfname) # |
---|
222 | me = sys.argv[0] |
---|
223 | alogf = options.logf # absolute paths |
---|
224 | od = os.path.split(alogf)[0] |
---|
225 | try: |
---|
226 | os.path.makedirs(od) |
---|
227 | except: |
---|
228 | pass |
---|
229 | aoutf = options.outfname # absolute paths |
---|
230 | od = os.path.split(aoutf)[0] |
---|
231 | try: |
---|
232 | os.path.makedirs(od) |
---|
233 | except: |
---|
234 | pass |
---|
235 | vcl = [plinke,'--noweb', '--bfile',options.bfname,'--out',title,'--mind','0.5','--tdt'] |
---|
236 | logme = [] |
---|
237 | if verbose: |
---|
238 | s = 'Rgenetics %s http://rgenetics.org Galaxy Tools rgTDT.py started %s\n' % (myversion,timenow()) |
---|
239 | print >> sys.stdout,s |
---|
240 | logme.append(s) |
---|
241 | s ='rgTDT.py: bfname=%s, logf=%s, argv = %s\n' % (options.bfname,alogf, sys.argv) |
---|
242 | print >> sys.stdout,s |
---|
243 | logme.append(s) |
---|
244 | s = 'rgTDT.py: vcl=%s\n' % (' '.join(vcl)) |
---|
245 | print >> sys.stdout,s |
---|
246 | logme.append(s) |
---|
247 | twd = tempfile.mkdtemp(suffix='rgTDT') # make sure plink doesn't spew log file into the root! |
---|
248 | tname = os.path.join(twd,title) |
---|
249 | p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) |
---|
250 | retval = p.wait() |
---|
251 | shutil.copy('%s.log' % tname,alogf) |
---|
252 | sto = file(alogf,'a') |
---|
253 | sto.write('\n'.join(logme)) |
---|
254 | resf = '%s.tdt' % tname # plink output is here we hope |
---|
255 | xformTDT(options.bfname,resf,aoutf,title,map_file) # leaves the desired summary file |
---|
256 | gffout = options.gffout |
---|
257 | if gffout > '': |
---|
258 | makeGFF(resf=aoutf,outfname=gffout,logf=sto,twd='.',name='rgTDT_Top_Table',description=title,topn=1000) |
---|
259 | shutil.rmtree(twd) |
---|
260 | sto.close() |
---|
261 | |
---|
262 | |
---|
263 | |
---|