root/galaxy-central/tools/rgenetics/rgHaploView.py

リビジョン 2, 19.3 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2released under the terms of the LGPL
3copyright ross lazarus August 2007
4for the rgenetics project
5
6Special galaxy tool for the camp2007 data
7Allows grabbing genotypes from an arbitrary region and estimating
8ld using haploview
9
10stoopid haploview won't allow control of dest directory for plots - always end
11up where the data came from - need to futz to get it where it belongs
12
13Needs a mongo results file in the location hardwired below or could be passed in as
14a library parameter - but this file must have a very specific structure
15rs chrom offset float1...floatn
16
17
18"""
19
20
21import sys, array, os, string, tempfile, shutil, subprocess, glob
22from rgutils import galhtmlprefix
23
24progname = os.path.split(sys.argv[0])[1]
25
26javabin = 'java'
27#hvbin = '/usr/local/bin/Haploview.jar'
28#hvbin = '/home/universe/linux-i686/haploview/Haploview.jar'
29# get this from tool as a parameter - can use
30
31
32
33atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
34
35class NullDevice:
36    """ """
37    def write(self, s):
38        pass
39
40def ld():
41    """  ### Sanity check the arguments
42
43    <command interpreter="python">
44    rgHaploView.py "$ucsc_region" "$rslist" "$title" "$out_file1"
45    "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"
46    "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$out_file1.files_path"
47    "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/rg/bin/haploview.jar
48    </command>
49
50    remember to figure out chromosome and complain if > 1?
51    and use the -chromosome <1-22,X,Y> parameter to haploview
52    skipcheck?
53    """
54    progname = os.path.split(sys.argv[0])[-1]
55    ts = '%s%s' % (string.punctuation,string.whitespace)
56    ptran =  string.maketrans(ts,'_'*len(ts))
57    if len(sys.argv) < 16:
58        s = '##!%s: Expected 16 params in sys.argv, got %d (%s)' % (progname,len(sys.argv), sys.argv)
59        print s
60        sys.exit(1)
61
62    ### Figure out what genomic region we are interested in
63    region = sys.argv[1]
64    orslist = sys.argv[2].replace('X',' ').lower() # galaxy replaces newlines with XX - go figure
65    title = sys.argv[3].translate(ptran)
66    # for outputs
67    outfile = sys.argv[4]
68    logfn = 'Log_%s.txt' % (title)
69    histextra = sys.argv[5]
70    base_name = sys.argv[6]
71    pedFileBase = os.path.join(histextra,base_name)
72    print 'pedfilebase=%s' % pedFileBase
73    minMaf=sys.argv[7]
74    if minMaf:
75        try:
76            minMaf = float(minMaf)
77        except:
78            minMaf = 0.0
79    maxDist=sys.argv[8] or None
80    ldType=sys.argv[9] or 'RSQ'
81    hiRes = (sys.argv[10].lower() == 'hi')
82    memSize= sys.argv[11] or '1000'
83    memSize = int(memSize)
84    outfpath = sys.argv[12]
85    infotrack = False # note that otherwise this breaks haploview in headless mode
86    #infotrack = sys.argv[13] == 'info'
87    # this fails in headless mode as at april 2010 with haploview 4.2
88    tagr2 = sys.argv[14] or '0.8'
89    hmpanels = sys.argv[15] # eg "['CEU','YRI']"
90    if hmpanels:
91       hmpanels = hmpanels.replace('[','')
92       hmpanels = hmpanels.replace(']','')
93       hmpanels = hmpanels.replace("'",'')
94       hmpanels = hmpanels.split(',')
95    hvbin = sys.argv[16] # added rml june 2008
96    bindir = os.path.split(hvbin)[0]
97    # jan 2010 - always assume utes are on path to avoid platform problems
98    pdfjoin = 'pdfjoin' # os.path.join(bindir,'pdfjoin')
99    pdfnup = 'pdfnup' # os.path.join(bindir,'pdfnup')
100    mogrify = 'mogrify' # os.path.join(bindir,'mogrify')
101    convert = 'convert' # os.path.join(bindir,'convert')
102    log_file = os.path.join(outfpath,logfn)
103    MAP_FILE = '%s.map' % pedFileBase
104    DATA_FILE = '%s.ped' % pedFileBase
105    try:
106        os.makedirs(outfpath)
107        s = '## made new path %s\n' % outfpath
108    except:
109        pass
110    lf = file(log_file,'w')
111    s = 'PATH=%s\n' % os.environ.get('PATH','?')
112    lf.write(s)
113    hlogf = os.path.join(outfpath,'%s.log' % pedFileBase)
114    chromosome = ''
115    spos = epos = -9
116    rslist = []
117    rsdict = {}
118    hlog = []
119
120    if region > '':
121        useRs = []
122        useRsdict={}
123        try: # TODO make a regexp?
124            c,rest = region.split(':')
125            chromosome = c.replace('chr','')
126            rest = rest.replace(',','') # remove commas
127            spos,epos = rest.split('-')
128            spos = int(spos)
129            epos = int(epos)
130            s = '## %s parsing chrom %s from %d to %d\n' % (progname,chromosome,spos,epos)
131            lf.write(s)
132            lf.write('\n')
133            print >> sys.stdout, s
134        except:
135            s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region)
136            print >> sys.stdout, s
137            lf.write(s)
138            lf.write('\n')
139            lf.close()
140            sys.exit(1)
141    else:
142        useRs = orslist.split() # galaxy replaces newlines with XX - go figure
143        useRsdict = dict(zip(useRs,useRs))
144    useTemp = False
145    try:
146        dfile = open(DATA_FILE, 'r')
147    except: # bad input file name?
148        s = '##! RGeno unable to open file %s\n' % (DATA_FILE)
149        lf.write(s)
150        lf.write('\n')
151        lf.close()
152        print >> sys.stdout, s
153        raise
154        sys.exit(1)
155    try:
156        mfile = open(MAP_FILE, 'r')
157    except: # bad input file name?
158        s = '##! RGeno unable to open file %s' % (MAP_FILE)
159        lf.write(s)
160        lf.write('\n')
161        lf.close()
162        print >> sys.stdout, s
163        raise
164        sys.exit(1)
165    if len(useRs) > 0 or spos <> -9 : # subset region
166        useTemp = True
167        ### Figure out which markers are in this region
168        markers = []
169        snpcols = {}
170        chroms = {}
171        minpos = 2**32
172        maxpos = 0
173        for lnum,row in enumerate(mfile):
174            line = row.strip()
175            if not line: continue
176            chrom, snp, genpos, abspos = line.split()
177            try:
178                ic = int(chrom)
179            except:
180                ic = None
181            if ic and ic <= 23:
182                try:
183                    abspos = int(abspos)
184                    if abspos > maxpos:
185                        maxpos = abspos
186                    if abspos < minpos:
187                        minpos = abspos
188                except:
189                    abspos = epos + 999999999 # so next test fails
190            if useRsdict.get(snp,None) or (spos <> -9 and chrom == chromosome and (spos <= abspos <= epos)):
191                if chromosome == '':
192                    chromosome = chrom
193                chroms.setdefault(chrom,chrom)
194                markers.append((chrom,abspos,snp)) # decorate for sort into genomic
195                snpcols[snp] = lnum # so we know which col to find genos for this marker
196        markers.sort()
197        rslist = [x[2] for x in markers] # drop decoration
198        rsdict = dict(zip(rslist,rslist))
199        if len(rslist) == 0:
200            s = '##! %s found no rs numbers in %s' % (progname,sys.argv[1:3])
201            lf.write(s)
202            lf.write('\n')
203            lf.close()
204            print >> sys.stdout, s
205            sys.exit(1)
206        if spos == -9:
207            spos = minpos
208            epos = maxpos
209        s = '## %s looking for %d rs (%s)' % (progname,len(rslist),rslist[:5])
210        lf.write(s)
211        print >> sys.stdout, s
212        wewant = [(6+(2*snpcols[x])) for x in rslist] #
213        # column indices of first geno of each marker pair to get the markers into genomic
214        ### ... and then parse the rest of the ped file to pull out
215        ### the genotypes for all subjects for those markers
216        # /usr/local/galaxy/data/rg/1/lped/
217        tempMapName = os.path.join(outfpath,'%s.info' % title)
218        tempMap = file(tempMapName,'w')
219        tempPedName = os.path.join(outfpath,'%s.ped' % title)
220        tempPed = file(tempPedName,'w')
221        pngpath = '%s.LD.PNG' % tempPedName
222        map = ['%s\t%s' % (x[2],x[1]) for x in markers] # snp,abspos in genomic order for haploview
223        tempMap.write('%s\n' % '\n'.join(map))
224        tempMap.close()
225        nrows = 0
226        for line in dfile:
227            line = line.strip()
228            if not line:
229                continue
230            fields = line.split()
231            preamble = fields[:6]
232            g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
233            g = ' '.join(g)
234            g = g.split() # we'll get there
235            g = [atrandic.get(x,'0') for x in g] # numeric alleles...
236            tempPed.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
237            nrows += 1
238        tempPed.close()
239        s = '## %s: wrote %d markers, %d subjects for region %s\n' % (progname,len(rslist),nrows,region)
240        lf.write(s)
241        lf.write('\n')
242        print >> sys.stdout,s
243    else: # even if using all, must set up haploview info file instead of map
244        markers = []
245        chroms = {}
246        spos = sys.maxint
247        epos = -spos
248        for lnum,row in enumerate(mfile):
249          line = row.strip()
250          if not line: continue
251          chrom, snp, genpos, abspos = line.split()
252          try:
253            ic = int(chrom)
254          except:
255            ic = None
256          if ic and ic <= 23:
257            if chromosome == '':
258                chromosome = chrom
259            chroms.setdefault(chrom,chrom)
260            try:
261                p = int(abspos)
262                if p < spos and p <> 0:
263                    spos = p
264                if p > epos and p <> 0:
265                    epos = p
266            except:
267                pass
268            markers.append('%s %s' % (snp,abspos)) # no sort - pass
269        # now have spos and epos for hapmap if hmpanels
270        tempMapName = os.path.join(outfpath,'%s.info' % title)
271        tempMap = file(tempMapName,'w')
272        tempMap.write('\n'.join(markers))
273        tempMap.close()
274        tempPedName = os.path.join(outfpath,'%s.ped' % title)
275        try: # will fail on winblows!
276            os.symlink(DATA_FILE,tempPedName)
277        except:
278            shutil.copy(DATA_FILE,tempPedName) # wasteful but..
279    nchroms = len(chroms) # if > 1 can't really do this safely
280    if nchroms > 1:
281        s = '## warning - multiple chromosomes found in your map file - %s\n' % ','.join(chroms.keys())
282        lf.write(s)
283        print >> sys.stdout,s
284        sys.exit(1)
285    DATA_FILE = tempPedName # for haploview
286    INFO_FILE = tempMapName
287    dfile.close()
288    mfile.close()
289    fblog,blog = tempfile.mkstemp()
290    ste = open(blog,'w') # to catch the blather
291    # if no need to rewrite - set up names for haploview call
292    vcl = [javabin,'-jar',hvbin,'-n','-memory','%d' % memSize,'-pairwiseTagging',
293           '-pedfile',DATA_FILE,'-info',INFO_FILE,'-tagrsqcounts',
294           '-tagrsqcutoff',tagr2,
295           '-ldcolorscheme',ldType]
296    if minMaf:
297        vcl += ['-minMaf','%f' % minMaf]
298    if maxDist:
299        vcl += ['-maxDistance',maxDist]
300    if hiRes:
301        vcl.append('-png')
302    else:
303        vcl.append('-compressedpng')
304    if nchroms == 1:
305        vcl += ['-chromosome',chromosome]
306    if infotrack:
307        vcl.append('-infoTrack')
308    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=ste,stdout=lf)
309    retval = p.wait()
310    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
311    lf.write(s)
312    vcl = [mogrify, '-resize 800x400!', '*.PNG']
313    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
314    retval = p.wait()
315    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
316    lf.write(s)
317    inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle
318    inpng = inpng.replace(' ','')
319    inpng = os.path.split(inpng)[-1]
320    tmppng = '%s.tmp.png' % title
321    tmppng = tmppng.replace(' ','')
322    outpng = '1_%s.png' % title
323    outpng = outpng.replace(' ','')
324    outpng = os.path.split(outpng)[-1]
325    vcl = [convert, '-resize 800x400!', inpng, tmppng]
326    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
327    retval = p.wait()
328    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
329    lf.write(s)
330    s = "text 10,300 '%s'" % title[:40]
331    vcl = ['convert', '-pointsize 25','-fill maroon',
332          '-draw "%s"' % s, tmppng, outpng]
333    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
334    retval = p.wait()
335    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
336    lf.write(s)
337    try:
338        os.remove(os.path.join(outfpath,tmppng))
339    except:
340        pass    # label all the plots then delete all the .PNG files before munging
341    fnum=1
342    if hmpanels:
343        sp = '%d' % (spos/1000.) # hapmap wants kb
344        ep = '%d' % (epos/1000.)
345        for panel in hmpanels:
346            if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :)
347                ptran = panel.strip()
348                ptran = ptran.replace('+','_')
349                fnum += 1 # preserve an order or else we get sorted
350                vcl = [javabin,'-jar',hvbin,'-n','-memory','%d' % memSize,
351                  '-chromosome',chromosome, '-panel',panel.strip(),
352                  '-hapmapDownload','-startpos',sp,'-endpos',ep,
353                  '-ldcolorscheme',ldType]
354                if minMaf:
355                    vcl += ['-minMaf','%f' % minMaf]
356                if maxDist:
357                    vcl += ['-maxDistance',maxDist]
358                if hiRes:
359                    vcl.append('-png')
360                else:
361                    vcl.append('-compressedpng')
362                if infotrack:
363                    vcl.append('-infoTrack')
364                p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=ste,stdout=lf)
365                retval = p.wait()
366                inpng = 'Chromosome%s%s.LD.PNG' % (chromosome,panel)
367                inpng = inpng.replace(' ','') # mysterious spaces!
368                outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,chromosome,)
369                # hack for stupid chb+jpt
370                outpng = outpng.replace(' ','')
371                tmppng = '%s.tmp.png' % title
372                tmppng = tmppng.replace(' ','')
373                outpng = os.path.split(outpng)[-1]
374                vcl = [convert, '-resize 800x400!', inpng, tmppng]
375                p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
376                retval = p.wait()
377                s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
378                lf.write(s)
379                s = "text 10,300 'HapMap %s'" % ptran.strip()
380                vcl = [convert, '-pointsize 25','-fill maroon',
381                      '-draw "%s"' % s, tmppng, outpng]
382                p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
383                retval = p.wait()
384                s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
385                lf.write(s)
386                try:
387                    os.remove(os.path.join(outfpath,tmppng))
388                except:
389                    pass
390    nimages = len(glob.glob(os.path.join(outfpath,'*.png'))) # rely on HaploView shouting - PNG @!
391    lf.write('### nimages=%d\n' % nimages)
392    if nimages > 0: # haploview may fail?
393        vcl = '%s -format pdf -resize 800x400! *.png' % mogrify
394        p=subprocess.Popen(vcl,shell=True,cwd=outfpath,stderr=lf,stdout=lf)
395        retval = p.wait()
396        lf.write('## executing %s returned %d\n' % (vcl,retval))
397        vcl = '%s "*.pdf" --fitpaper true --outfile alljoin.pdf' % pdfjoin
398        p=subprocess.Popen(vcl,shell=True,cwd=outfpath,stderr=lf,stdout=lf)
399        retval = p.wait()
400        lf.write('## executing %s returned %d\n' % (vcl,retval))
401        vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (pdfnup,nimages)
402        p=subprocess.Popen(vcl,shell=True,cwd=outfpath,stderr=lf,stdout=lf)
403        retval = p.wait()
404        lf.write('## executing %s returned %d\n' % (vcl,retval))
405        #vcl = ['convert', 'allnup.pdf', 'allnup.png'] # this fails - bad pdf?
406        #p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
407        #retval = p.wait()
408        #lf.write('## executing %s returned %d\n' % (vcl,retval))
409    lf.write('\n'.join(hlog))
410    ste.close() # temp file used to catch haploview blather
411    hblather = open(blog,'r').readlines() # to catch the blather   
412    os.unlink(blog)
413    if len(hblather) > 0:
414       lf.write('## In addition, Haploview complained:')
415       lf.write(''.join(hblather))
416       lf.write('\n')
417    lf.close()
418    flist = glob.glob(os.path.join(outfpath, '*'))
419    flist.sort()
420    ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace
421    ftran =  string.maketrans(ts,'_'*len(ts))
422    outf = file(outfile,'w')
423    outf.write(galhtmlprefix % progname)
424    s = '<h4>rgenetics for Galaxy %s, wrapping HaploView</h4>' % (progname)
425    outf.write(s)
426    """
427    as at ashg 2009, convert fails on allnup.pdf - probably too complex...
428    mainthumb = 'allnup.png'
429    mainpdf = 'allnup.pdf'
430    if os.path.exists(mainpdf):
431        if not os.path.exists(mainthumb):
432            outf.write('<table><tr><td colspan="3"><a href="%s">Main combined LD plot</a></td></tr></table>\n' % (mainpdf))
433        else:
434            outf.write('<table><tr><td><a href="%s"><img src="%s" alt="Main combined LD image" hspace="10" align="middle">')
435            outf.write('</td><td>Click this thumbnail to display the main combined LD image</td></tr></table>\n' % (mainpdf,mainthumb))
436    else:
437        outf.write('(No main image was generated - this usually means a Haploview error connecting to Hapmap site - please try later)<br/>\n')
438    outf.write('## Called as %s' % sys.argv)
439    """
440    outf.write('<br><div><hr><ul>\n')
441    for i, data in enumerate( flist ):
442        dn = os.path.split(data)[-1]
443        if dn[:3] <> 'all':
444            continue
445        newdn = dn.translate(ftran)
446        if dn <> newdn:
447            os.rename(os.path.join(outfpath,dn),os.path.join(outfpath,newdn))
448            dn = newdn
449        dnlabel = dn
450        ext = dn.split('.')[-1]
451        if dn == 'allnup.pdf':
452            dnlabel = 'All pdf plots on a single page'
453        elif dn == 'alljoin.pdf':
454            dnlabel = 'All pdf plots, each on a separate page'
455        outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
456    for i, data in enumerate( flist ):
457        dn = os.path.split(data)[-1]
458        if dn[:3] == 'all':
459            continue
460        newdn = dn.translate(ftran)
461        if dn <> newdn:
462            os.rename(os.path.join(outfpath,dn),os.path.join(outfpath,newdn))
463            dn = newdn
464        dnlabel = dn
465        ext = dn.split('.')[-1]
466        if dn == 'allnup.pdf':
467            dnlabel = 'All pdf plots on a single page'
468        elif dn == 'alljoin.pdf':
469            dnlabel = 'All pdf plots, each on a separate page'
470        elif ext == 'info':
471            dnlabel = '%s map data for Haploview input' % title
472        elif ext == 'ped':
473            dnlabel = '%s genotype data for Haploview input' % title
474        elif dn.find('CEU') <> -1 or dn.find('YRI') <> -1 or dn.find('CHB_JPT') <> -1: # is hapmap
475            dnlabel = 'Hapmap data'
476        if ext == 'TAGS' or ext == 'TESTS' or ext == 'CHAPS':
477            dnlabel = dnlabel + ' Tagger output'
478        outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
479    outf.write('</ol><br>')
480    outf.write("</div><div><hr>Job Log follows below (see %s)<pre>" % logfn)
481    s = file(log_file,'r').readlines()
482    s = '\n'.join(s)
483    outf.write('%s</pre><hr></div>' % s)
484    outf.write('</body></html>')
485    outf.close()
486    if useTemp:
487        try:
488            os.unlink(tempMapName)
489            os.unlink(tempPedName)
490        except:
491            pass
492       
493if __name__ == "__main__":
494    ld()
495
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。