root/galaxy-central/tools/rgenetics/rgCaCo.py

リビジョン 2, 10.4 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/local/bin/python
2# hack to run and process a plink case control association
3# expects args as 
4# bfilepath outname jobname outformat (wig,xls)
5# ross lazarus
6# for wig files, we need annotation so look for map file or complain
7"""
8Parameters for wiggle track definition lines
9All options are placed in a single line separated by spaces:
10
11  track type=wiggle_0 name=track_label description=center_label \
12        visibility=display_mode color=r,g,b altColor=r,g,b \
13        priority=priority autoScale=on|off \
14        gridDefault=on|off maxHeightPixels=max:default:min \
15        graphType=bar|points viewLimits=lower:upper \
16        yLineMark=real-value yLineOnOff=on|off \
17        windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
18"""
19
20import sys,math,shutil,subprocess,os,time,tempfile,string
21from os.path import abspath
22from rgutils import timenow, plinke
23
24imagedir = '/static/rg' # if needed for images
25myversion = 'V000.1 April 2007'
26verbose = False
27
28def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
29    """
30    score must be scaled to 0-1000
31   
32    Want to make some wig tracks from each analysis
33    Best n -log10(p). Make top hit the window.
34    we use our tab output which has
35    rs  chrom   offset  ADD_stat        ADD_p   ADD_log10p
36    rs3094315   1       792429  1.151   0.2528  0.597223
37
38    """
39
40    def is_number(s):
41        try:
42            float(s)
43            return True
44        except ValueError:
45            return False
46    header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)         
47    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
48    halfwidth=100
49    resfpath = os.path.join(twd,resf)
50    resf = open(resfpath,'r')
51    resfl = resf.readlines() # dumb but convenient for millions of rows
52    resfl = [x.split() for x in resfl]
53    headl = resfl[0]
54    resfl = resfl[1:]
55    headl = [x.strip().upper() for x in headl]
56    headIndex = dict(zip(headl,range(0,len(headl))))
57    chrpos = headIndex.get('CHR',None)
58    rspos = headIndex.get('RS',None)
59    offspos = headIndex.get('OFFSET',None)
60    ppos = headIndex.get('LOG10ARMITAGEP',None)
61    wewant = [chrpos,rspos,offspos,ppos]
62    if None in wewant: # missing something
63       logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex)
64       return
65    resfl = [x for x in resfl if x[ppos] > '']
66    resfl = [(float(x[ppos]),x) for x in resfl] # decorate
67    resfl.sort()
68    resfl.reverse() # using -log10 so larger is better
69    resfl = resfl[:topn] # truncate
70    pvals = [x[0] for x in resfl] # need to scale
71    resfl = [x[1] for x in resfl] # drop decoration 
72    maxp = max(pvals) # need to scale
73    minp = min(pvals)
74    prange = abs(maxp-minp) + 0.5 # fudge
75    scalefact = 1000.0/prange
76    logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
77    for i,row in enumerate(resfl):
78        row[ppos] = '%d' % (int(scalefact*pvals[i]))
79        resfl[i] = row # replace
80    outf = file(outfname,'w')
81    outf.write(header)
82    outres = [] # need to resort into chrom offset order
83    for i,lrow in enumerate(resfl):
84        chrom,snp,offset,p, = [lrow[x] for x in wewant]
85        gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth),
86               '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
87        outres.append(gff)
88    outres = [(x[0],int(x[3]),x) for x in outres] # decorate
89    outres.sort() # into chrom offset
90    outres=[x[2] for x in outres] # undecorate
91    outres = ['\t'.join(x) for x in outres]   
92    outf.write('\n'.join(outres))
93    outf.write('\n')
94    outf.close()
95
96
97
98def plink_assocToGG(plinkout="hm",tag='test'):
99   """ plink --assoc output looks like this
100   #  CHR         SNP   A1      F_A      F_U   A2        CHISQ            P           OR
101   #   1   rs3094315    G   0.6685   0.1364    A        104.1    1.929e-24        12.77
102   # write as a genegraph input file
103   """
104   inf = file('%s.assoc' % plinkout,'r')
105   outf = file('%sassoc.xls' % plinkout,'w')
106   res = ['rs\tlog10p%s\tFakeInvOR%s\tRealOR%s' % (tag,tag,tag),] # output header for ucsc genome graphs
107   head = inf.next()
108   for l in inf:
109    ll = l.split()
110    if len(ll) >= 8:
111      p = float(ll[7])
112      if p <> 'NA': # eesh
113          logp = '%9.9f' % -math.log10(p)
114      else:
115          logp = 'NA'
116      try:
117         orat = ll[8]
118      except:
119         orat = 'NA'
120      orat2 = orat
121      # invert large negative odds ratios
122      if float(orat) < 1 and float(orat) > 0.0:
123         orat2 = '%9.9f' % (1.0/float(orat))
124      outl = [ll[1],logp, orat2, orat]
125      res.append('\t'.join(outl))
126   outf.write('\n'.join(res))
127   outf.write('\n')
128   outf.close()
129   inf.close()
130
131def xformModel(infname='',resf='',outfname='',
132               name='foo',mapf='/usr/local/galaxy/data/rg/ped/x.bim',flog=None):
133    """munge a plink .model file into either a ucsc track or an xls file
134    rerla@meme ~/plink]$ head hmYRI_CEU.model
135    CHR         SNP     TEST            AFF          UNAFF        CHISQ   DF            P
136    1   rs3094315     GENO       41/37/11        0/24/64           NA   NA           NA
137    1   rs3094315    TREND         119/59         24/152        81.05    1    2.201e-19
138    1   rs3094315  ALLELIC         119/59         24/152        104.1    1    1.929e-24
139    1   rs3094315      DOM          78/11          24/64           NA   NA           NA
140
141    bim file has
142[rerla@beast pbed]$ head plink_wgas1_example.bim
1431       rs3094315       0.792429        792429  G       A
1441       rs6672353       0.817376        817376  A       G
145    """
146    if verbose:
147        print 'Rgenetics rgCaCo.xformModel got resf=%s,  outfname=%s' % (resf,outfname)
148    res = []
149    rsdict = {}       
150    map = file(mapf,'r')
151    for l in map: # plink map
152        ll = l.strip().split()
153        if len(ll) >= 3:
154            rs=ll[1].strip()
155            chrom = ll[0]
156            if chrom.lower() == 'x':
157                chrom='23'
158            elif chrom.lower() == 'y':
159                chrom = 24
160            elif chrom.lower() == 'mito':
161                chrom = 25
162            offset = ll[3]
163            rsdict[rs] = (chrom,offset)
164    res.append('rs\tChr\tOffset\tGenop\tlog10Genop\tArmitagep\tlog10Armitagep\tAllelep\tlog10Allelep\tDomp\tlog10Domp')
165    f = open(resf,'r')
166    headl = f.readline()
167    if headl.find('\t') <> -1:
168       headl = headl.split('\t')
169       delim = '\t'
170    else:
171       headl = headl.split()
172       delim = None
173    chrpos = headl.index('CHR')
174    rspos = headl.index('SNP')
175    testpos = headl.index('TEST')
176    naffpos = headl.index('AFF')
177    nuaffpos = headl.index('UNAFF')
178    chisqpos = headl.index('CHISQ')
179    ppos = headl.index('P')   
180    wewant = [chrpos,rspos,testpos,naffpos,nuaffpos,chisqpos,ppos]
181    llen = len(headl)
182    lnum = anum = 0
183    lastsnp = None # so we know when to write out a gg line
184    outl = {}
185    f.seek(0)
186    for lnum,l in enumerate(f):
187        if lnum == 0:
188            continue
189        ll = l.split()
190        if delim:
191           ll = l.split(delim)
192        if len(ll) >= llen: # valid line
193            chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant]
194            snp = snp.strip()
195            chrom,offset = rsdict.get(snp,(None,None))
196            anum += 1
197            fp = 1.0 # if NA
198            lp = 0.0
199            try:
200                fp = float(p)
201                if fp > 0:
202                  lp = -math.log10(fp)
203                else:
204                    fp = 9e-100
205                    flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test))
206                    flog.write('### offending line #%d in %s = %s' % (lnum,l))
207            except:
208                pass
209            if snp <> lastsnp:
210                if len(outl.keys()) > 3:
211                    sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
212                    res.append('\t'.join(sl)) # last snp line
213                outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line
214                lastsnp = snp # reset for next marker
215            #if p == 'NA':
216            #      p = 1.0
217            # let's pass downstream for handling R is fine?
218            outl[test] = '%s\t%f' % (p,lp)
219    if len(outl.keys()) > 3:
220        l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
221        res.append('\t'.join(l)) # last snp line
222    f = file(outfname,'w')
223    res.append('')
224    f.write('\n'.join(res))
225    f.close()
226
227
228
229               
230if __name__ == "__main__":
231    """
232    # called as
233    <command interpreter="python">
234        rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name"
235        '$out_file1' '$logf' '$logf.files_path' '$gffout'
236    </command>    </command>
237    """
238    if len(sys.argv) < 7:
239       s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv)
240       print >> sys.stdout, s
241       sys.exit(0)
242    bfname = sys.argv[1]
243    name = sys.argv[2]
244    killme = string.punctuation + string.whitespace
245    trantab = string.maketrans(killme,'_'*len(killme))
246    name = name.translate(trantab)
247    outfname = sys.argv[3]
248    logf = sys.argv[4]
249    logoutdir = sys.argv[5]
250    gffout = sys.argv[6]
251    topn = 1000
252    try:
253        os.makedirs(logoutdir)
254    except:
255        pass
256    map_file = None
257    me = sys.argv[0]
258    amapf = '%s.bim' % bfname # to decode map in xformModel
259    flog = file(logf,'w')
260    logme = []
261    cdir = os.getcwd()
262    s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow())
263    print >> sys.stdout, s # so will appear as blurb for file
264    logme.append(s)
265    if verbose:
266        s = 'rgCaCo.py:  bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv)
267        print >> sys.stdout, s # so will appear as blurb for file
268        logme.append(s)
269    twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root!
270    tname = os.path.join(twd,name)
271    vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model']
272    p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd)
273    retval = p.wait()
274    resf = '%s.model' % tname # plink output is here we hope
275    xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file
276    makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgGLM_TopTable',description=name,topn=topn)
277    flog.write('\n'.join(logme))
278    flog.close() # close the log used
279    #shutil.copytree(twd,logoutdir)
280    shutil.rmtree(twd) # clean up
281
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。