root/galaxy-central/tools/rgenetics/rgManQQ.py @ 2

リビジョン 2, 13.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/local/bin/python
2
3import sys,math,shutil,subprocess,os,time,tempfile,string
4from os.path import abspath
5from rgutils import timenow, RRun, galhtmlprefix, galhtmlpostfix, galhtmlattr
6progname = os.path.split(sys.argv[0])[1]
7myversion = 'V000.1 March 2010'
8verbose = False
9
10rcode="""
11# license not stated so I'm assuming LGPL is ok for my derived work?
12# generalised so 3 core fields passed as parameters ross lazarus March 24 2010 for rgenetics
13# Originally created as qqman with the following
14# attribution:
15#--------------
16# Stephen Turner
17# http://StephenTurner.us/
18# http://GettingGeneticsDone.blogspot.com/
19
20# Last updated: Tuesday, December 22, 2009
21# R code for making manhattan plots and QQ plots from plink output files.
22# With GWAS data this can take a lot of memory. Recommended for use on
23# 64bit machines only, for now.
24
25#
26
27library(ggplot2)
28
29coloursTouse = c('firebrick','darkblue','goldenrod','darkgreen')
30# not too fugly but need a colour expert please...
31
32
33manhattan = function(chrom=NULL,offset=NULL,pvals=NULL, title=NULL, max.y="max",
34   suggestiveline=0, genomewide=T, size.x.labels=9, size.y.labels=10, annotate=F, SNPlist=NULL,grey=0) {
35
36        if (annotate & is.null(SNPlist)) stop("You requested annotation but provided no SNPlist!")
37        genomewideline=NULL # was genomewideline=-log10(5e-8)
38        if (genomewide) { # use bonferroni since might be only a small region?
39            genomewideline = -log10(0.05/length(pvals)) }
40        d=data.frame(CHR=chrom,BP=offset,P=pvals)
41
42        #limit to only chrs 1-23?
43        d=d[d$CHR %in% 1:23, ]
44
45        if ("CHR" %in% names(d) & "BP" %in% names(d) & "P" %in% names(d) ) {
46                d=na.omit(d)
47                d=d[d$P>0 & d$P<=1, ]
48                d$logp = -log10(d$P)
49
50                d$pos=NA
51                ticks=NULL
52                lastbase=0
53                chrlist = unique(d$CHR)
54                nchr = length(chrlist) # may be any number?
55                if (nchr >= 2) {
56                for (x in c(1:nchr)) {
57                        i = chrlist[x] # need the chrom number - may not == index
58                        if (x == 1) { # first time
59                                d[d$CHR==i, ]$pos=d[d$CHR==i, ]$BP
60                                tks = d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1]
61                        }       else {
62                                lastchr = chrlist[x-1] # previous whatever the list
63                                lastbase=lastbase+tail(subset(d,CHR==lastchr)$BP, 1)
64                                d[d$CHR==i, ]$pos=d[d$CHR==i, ]$BP+lastbase
65                                tks=c(tks, d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1])
66                        }
67                    ticklim=c(min(d$pos),max(d$pos))
68                    xlabs = chrlist
69                    }
70                } else { # nchr is 1
71                   nticks = 10
72                   last = max(offset)
73                   first = min(offset)
74                   tks = c()
75                   t = (last-first)/nticks # units per tick
76                   for (x in c(1:nticks)) tks = c(tks,round(x*t))
77                   xlabs = tks
78                   ticklim = c(first,last)
79                } # else
80                if (grey) {mycols=rep(c("gray10","gray60"),max(d$CHR))
81                           } else {
82                           mycols=rep(coloursTouse,max(d$CHR))
83                           }
84
85                if (max.y=="max") maxy=ceiling(max(d$logp)) else maxy=max.y
86                maxy = max(maxy,1.1*genomewideline)
87                # if (maxy<8) maxy=8
88                # only makes sense if genome wide is assumed - we could have a fine mapping region? 
89                if (annotate) d.annotate=d[as.numeric(substr(d$SNP,3,100)) %in% SNPlist, ]
90                if (nchr >= 2) {
91                        manplot=qplot(pos,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR))
92                        manplot=manplot+scale_x_continuous(name="Chromosome", breaks=tks, labels=xlabs) }
93                else {
94                        manplot=qplot(BP,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR))
95                        manplot=manplot+scale_x_continuous("BP") }                 
96                manplot=manplot+scale_y_continuous(limits=c(0,maxy), breaks=1:maxy, labels=1:maxy)
97                manplot=manplot+scale_colour_manual(value=mycols)
98                if (annotate) {  manplot=manplot + geom_point(data=d.annotate, colour=I("green3")) }
99                manplot=manplot + opts(legend.position = "none")
100                manplot=manplot + opts(title=title)
101                manplot=manplot+opts(
102                        panel.background=theme_blank(),
103                        axis.text.x=theme_text(size=size.x.labels, colour="grey50"),
104                        axis.text.y=theme_text(size=size.y.labels, colour="grey50"),
105                        axis.ticks=theme_segment(colour=NA)
106                )
107                #manplot = manplot + opts(panel.grid.y.minor=theme_blank(),panel.grid.y.major=theme_blank())
108                #manplot = manplot + opts(panel.grid.major=theme_blank())
109                 
110                if (suggestiveline) manplot=manplot+geom_hline(yintercept=suggestiveline,colour="blue", alpha=I(1/3))
111                if (genomewideline) manplot=manplot+geom_hline(yintercept=genomewideline,colour="red")
112                manplot
113        }       else {
114                stop("Make sure your data frame contains columns CHR, BP, and P")
115        }
116}
117
118
119
120qq = function(pvector, title=NULL, spartan=F) {
121        # Thanks to Daniel Shriner at NHGRI for providing this code for creating expected and observed values
122        o = -log10(sort(pvector,decreasing=F))
123        e = -log10( 1:length(o)/length(o) )
124        # you could use base graphics
125        # plot(e,o,pch=19,cex=0.25, xlab=expression(Expected~~-log[10](italic(p))),
126        # ylab=expression(Observed~~-log[10](italic(p))), xlim=c(0,max(e)), ylim=c(0,max(e)))
127        # lines(e,e,col="red")
128        #You'll need ggplot2 installed to do the rest
129        qq=qplot(e,o, xlim=c(0,max(e)), ylim=c(0,max(o))) + stat_abline(intercept=0,slope=1, col="red")
130        qq=qq+opts(title=title)
131        qq=qq+scale_x_continuous(name=expression(Expected~~-log[10](italic(p))))
132        qq=qq+scale_y_continuous(name=expression(Observed~~-log[10](italic(p))))
133        if (spartan) plot=plot+opts(panel.background=theme_rect(col="grey50"), panel.grid.minor=theme_blank())
134        qq
135}
136"""
137
138# we need another string to avoid confusion over string substitutions with %in%
139# instantiate rcode2 string with infile,chromcol,offsetcol,pvalscols,title before saving and running
140
141rcode2 = """rgqqMan = function(infile="%s",chromcolumn=%s, offsetcolumn=%s, pvalscolumns=%s,
142title="%s",grey=%d) {
143rawd = read.table(infile,head=T,sep='\\t')
144dn = names(rawd)
145cc = dn[chromcolumn]
146oc = dn[offsetcolumn]
147nams = c(cc,oc)
148plen = length(rawd[,1])
149doreorder=1
150print(paste('###',plen,'values read from',infile,'read - now running plots',sep=' '))
151if (plen > 0) {
152  for (pvalscolumn in pvalscolumns) {
153  if (pvalscolumn > 0)
154     {
155     cname = names(rawd)[pvalscolumn]
156     mytitle = paste('p=',cname,', ',title,sep='')
157     myfname = chartr(' ','_',cname)
158     myqqplot = qq(rawd[,pvalscolumn],title=mytitle)
159     ggsave(filename=paste(myfname,"qqplot.png",sep='_'),myqqplot,width=11,height=8,dpi=100)
160     print(paste('## qqplot on',cname,'done'))
161     if ((chromcolumn > 0) & (offsetcolumn > 0)) {
162         if (doreorder) {
163             rawd = rawd[do.call(order,rawd[nams]),]
164             # mmmf - suggested by http://onertipaday.blogspot.com/2007/08/sortingordering-dataframe-according.html
165             # in case not yet ordered
166             doreorder = 0
167             }
168         print(paste('## manhattan on',cname,'starting',chromcolumn,offsetcolumn,pvalscolumn))
169         mymanplot= manhattan(chrom=rawd[,chromcolumn],offset=rawd[,offsetcolumn],pvals=rawd[,pvalscolumn],title=mytitle,grey=grey)
170         print(paste('## manhattan plot on',cname,'done'))
171         ggsave(filename=paste(myfname,"manhattan.png",sep='_'),mymanplot,width=11,height=8,dpi=100)
172         }
173         else {
174              print(paste('chrom column =',chromcolumn,'offset column = ',offsetcolumn,
175              'so no Manhattan plot - supply both chromosome and offset as numerics for Manhattan plots if required'))
176              }
177     }
178  else {
179        print(paste('pvalue column =',pvalscolumn,'Cannot parse it so no plots possible'))
180      }
181  } # for pvalscolumn
182 } else { print('## Problem - no values available to plot - was there really a chromosome and offset column?') }
183}
184
185rgqqMan()
186# execute with defaults as substituted
187"""
188
189
190def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
191    """
192    we may have an interval file or a tabular file - if interval, will have chr1... so need to adjust
193    to chrom numbers
194    draw a qq for pvals and a manhattan plot if chrom/offset <> 0
195    contains some R scripts as text strings - we substitute defaults into the calls
196    to make them do our bidding - and save the resulting code for posterity
197    this can be called externally, I guess...for QC eg?
198    """
199    ffd,filtered_fname = tempfile.mkstemp(prefix='rgManQQtemp')
200    f = open(filtered_fname,'w')
201    inf = open(input_fname,'r')
202    ohead = inf.readline().strip().split('\t') # see if we have a header
203    inf.seek(0) # rewind
204    newhead = ['pval%d' % (x+1) for x in pval_cols]
205    newhead.insert(0,'Offset')
206    newhead.insert(0,'Chrom')
207    havehead = 0
208    wewant = [chrom_col,offset_col]
209    wewant += pval_cols
210    try:
211        allnums = ['%d' % x for x in ohead] # this should barf if non numerics == header row?
212        f.write('\t'.join(newhead)) # for R to read
213        f.write('\n')
214    except:
215        havehead = 1
216        newhead = [ohead[chrom_col],ohead[offset_col]]
217        newhead += [ohead[x] for x in pval_cols]
218        f.write('\t'.join(newhead)) # use the original head
219        f.write('\n')
220    for i,row in enumerate(inf):
221        if i == 0 and havehead:
222            continue # ignore header
223        sr = row.strip().split('\t')
224        if len(sr) > 1:
225            if sr[chrom_col].lower().find('chr') <> -1:
226                sr[chrom_col] = sr[chrom_col][3:]
227            newr = [sr[x] for x in wewant] # grab cols we need
228            s = '\t'.join(newr)
229            f.write(s)
230            f.write('\n')
231    f.close()
232    pvc = [x+3 for x in range(len(pval_cols))] # 2 for offset and chrom, 1 for r offset start
233    pvc = 'c(%s)' % (','.join(map(str,pvc)))
234    rcmd = '%s%s' % (rcode,rcode2 % (filtered_fname,'1','2',pvc,title,grey))
235    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
236    rlog.append('## R script=')
237    rlog.append(rcmd)
238    if beTidy:
239        os.unlink(filtered_fname)
240    return rlog,flist
241 
242
243def main():
244    u = """<command interpreter="python">
245        rgManQQ.py '$input_file' "$name" '$out_html' '$out_html.files_path' '$chrom_col' '$offset_col' '$pval_col'
246    </command>
247    """
248    npar = 8
249    if len(sys.argv) < npar:
250            print >> sys.stdout, '## error - too few command line parameters - wanting %d' % npar
251            print >> sys.stdout, u
252            sys.exit(1)
253    input_fname = sys.argv[1]
254    title = sys.argv[2]
255    killme = string.punctuation + string.whitespace
256    trantab = string.maketrans(killme,'_'*len(killme))
257    ctitle = title.translate(trantab)
258    outhtml = sys.argv[3]
259    outdir = sys.argv[4]
260    try:
261         chrom_col = int(sys.argv[5])
262    except:
263         chrom_col = -1
264    try:
265        offset_col = int(sys.argv[6])
266    except:
267        offset_col = -1
268    p = sys.argv[7].strip().split(',')
269    try:
270        p = [int(x) for x in p]
271    except:
272        p = [-1]
273    if chrom_col == -1 or offset_col == -1: # was passed as zero - do not do manhattan plots
274        chrom_col = -1
275        offset_col = -1
276    grey = 0
277    if (sys.argv[8].lower() in ['1','true']):
278       grey = 1
279    if p == [-1]:
280        print >> sys.stderr,'## Cannot run rgManQQ - missing pval column'
281        sys.exit(1)
282    rlog,flist = doManQQ(input_fname,chrom_col,offset_col,p,title,grey,ctitle,outdir)
283    flist.sort()
284    html = [galhtmlprefix % progname,]
285    html.append('<h1>%s</h1>' % title)
286    if len(flist) > 0:
287        html.append('<table>\n')
288        for row in flist:
289            fname,expl = row # RRun returns pairs of filenames fiddled for the log and R script
290            e = os.path.splitext(fname)[-1]
291            if e in ['.png','.jpg']:
292               s= '<tr><td><a href="%s"><img src="%s" alt="%s hspace="10" width="400"><br>(Click to download image %s)</a></td></tr>' \
293                 % (fname,fname,expl,expl )
294               html.append(s)
295            else:
296               html.append('<tr><td><a href="%s">%s</a></td></tr>' % (fname,expl))
297        html.append('</table>\n')
298    else:
299        html.append('<h2>### Error - R returned no files - please confirm that parameters are sane</h1>')   
300    html.append('<h3>R log follows below</h3><hr><pre>\n')
301    html += rlog
302    html.append('</pre>\n')   
303    html.append(galhtmlattr % (progname,timenow()))
304    html.append(galhtmlpostfix)
305    htmlf = file(outhtml,'w')
306    htmlf.write('\n'.join(html))
307    htmlf.write('\n')
308    htmlf.close()
309   
310 
311
312if __name__ == "__main__":
313    main()
314
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。