1 | #!/usr/local/bin/python |
---|
2 | |
---|
3 | import sys,math,shutil,subprocess,os,time,tempfile,string |
---|
4 | from os.path import abspath |
---|
5 | from rgutils import timenow, RRun, galhtmlprefix, galhtmlpostfix, galhtmlattr |
---|
6 | progname = os.path.split(sys.argv[0])[1] |
---|
7 | myversion = 'V000.1 March 2010' |
---|
8 | verbose = False |
---|
9 | |
---|
10 | rcode=""" |
---|
11 | # license not stated so I'm assuming LGPL is ok for my derived work? |
---|
12 | # generalised so 3 core fields passed as parameters ross lazarus March 24 2010 for rgenetics |
---|
13 | # Originally created as qqman with the following |
---|
14 | # attribution: |
---|
15 | #-------------- |
---|
16 | # Stephen Turner |
---|
17 | # http://StephenTurner.us/ |
---|
18 | # http://GettingGeneticsDone.blogspot.com/ |
---|
19 | |
---|
20 | # Last updated: Tuesday, December 22, 2009 |
---|
21 | # R code for making manhattan plots and QQ plots from plink output files. |
---|
22 | # With GWAS data this can take a lot of memory. Recommended for use on |
---|
23 | # 64bit machines only, for now. |
---|
24 | |
---|
25 | # |
---|
26 | |
---|
27 | library(ggplot2) |
---|
28 | |
---|
29 | coloursTouse = c('firebrick','darkblue','goldenrod','darkgreen') |
---|
30 | # not too fugly but need a colour expert please... |
---|
31 | |
---|
32 | |
---|
33 | manhattan = function(chrom=NULL,offset=NULL,pvals=NULL, title=NULL, max.y="max", |
---|
34 | suggestiveline=0, genomewide=T, size.x.labels=9, size.y.labels=10, annotate=F, SNPlist=NULL,grey=0) { |
---|
35 | |
---|
36 | if (annotate & is.null(SNPlist)) stop("You requested annotation but provided no SNPlist!") |
---|
37 | genomewideline=NULL # was genomewideline=-log10(5e-8) |
---|
38 | if (genomewide) { # use bonferroni since might be only a small region? |
---|
39 | genomewideline = -log10(0.05/length(pvals)) } |
---|
40 | d=data.frame(CHR=chrom,BP=offset,P=pvals) |
---|
41 | |
---|
42 | #limit to only chrs 1-23? |
---|
43 | d=d[d$CHR %in% 1:23, ] |
---|
44 | |
---|
45 | if ("CHR" %in% names(d) & "BP" %in% names(d) & "P" %in% names(d) ) { |
---|
46 | d=na.omit(d) |
---|
47 | d=d[d$P>0 & d$P<=1, ] |
---|
48 | d$logp = -log10(d$P) |
---|
49 | |
---|
50 | d$pos=NA |
---|
51 | ticks=NULL |
---|
52 | lastbase=0 |
---|
53 | chrlist = unique(d$CHR) |
---|
54 | nchr = length(chrlist) # may be any number? |
---|
55 | if (nchr >= 2) { |
---|
56 | for (x in c(1:nchr)) { |
---|
57 | i = chrlist[x] # need the chrom number - may not == index |
---|
58 | if (x == 1) { # first time |
---|
59 | d[d$CHR==i, ]$pos=d[d$CHR==i, ]$BP |
---|
60 | tks = d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1] |
---|
61 | } else { |
---|
62 | lastchr = chrlist[x-1] # previous whatever the list |
---|
63 | lastbase=lastbase+tail(subset(d,CHR==lastchr)$BP, 1) |
---|
64 | d[d$CHR==i, ]$pos=d[d$CHR==i, ]$BP+lastbase |
---|
65 | tks=c(tks, d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1]) |
---|
66 | } |
---|
67 | ticklim=c(min(d$pos),max(d$pos)) |
---|
68 | xlabs = chrlist |
---|
69 | } |
---|
70 | } else { # nchr is 1 |
---|
71 | nticks = 10 |
---|
72 | last = max(offset) |
---|
73 | first = min(offset) |
---|
74 | tks = c() |
---|
75 | t = (last-first)/nticks # units per tick |
---|
76 | for (x in c(1:nticks)) tks = c(tks,round(x*t)) |
---|
77 | xlabs = tks |
---|
78 | ticklim = c(first,last) |
---|
79 | } # else |
---|
80 | if (grey) {mycols=rep(c("gray10","gray60"),max(d$CHR)) |
---|
81 | } else { |
---|
82 | mycols=rep(coloursTouse,max(d$CHR)) |
---|
83 | } |
---|
84 | |
---|
85 | if (max.y=="max") maxy=ceiling(max(d$logp)) else maxy=max.y |
---|
86 | maxy = max(maxy,1.1*genomewideline) |
---|
87 | # if (maxy<8) maxy=8 |
---|
88 | # only makes sense if genome wide is assumed - we could have a fine mapping region? |
---|
89 | if (annotate) d.annotate=d[as.numeric(substr(d$SNP,3,100)) %in% SNPlist, ] |
---|
90 | if (nchr >= 2) { |
---|
91 | manplot=qplot(pos,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR)) |
---|
92 | manplot=manplot+scale_x_continuous(name="Chromosome", breaks=tks, labels=xlabs) } |
---|
93 | else { |
---|
94 | manplot=qplot(BP,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR)) |
---|
95 | manplot=manplot+scale_x_continuous("BP") } |
---|
96 | manplot=manplot+scale_y_continuous(limits=c(0,maxy), breaks=1:maxy, labels=1:maxy) |
---|
97 | manplot=manplot+scale_colour_manual(value=mycols) |
---|
98 | if (annotate) { manplot=manplot + geom_point(data=d.annotate, colour=I("green3")) } |
---|
99 | manplot=manplot + opts(legend.position = "none") |
---|
100 | manplot=manplot + opts(title=title) |
---|
101 | manplot=manplot+opts( |
---|
102 | panel.background=theme_blank(), |
---|
103 | axis.text.x=theme_text(size=size.x.labels, colour="grey50"), |
---|
104 | axis.text.y=theme_text(size=size.y.labels, colour="grey50"), |
---|
105 | axis.ticks=theme_segment(colour=NA) |
---|
106 | ) |
---|
107 | #manplot = manplot + opts(panel.grid.y.minor=theme_blank(),panel.grid.y.major=theme_blank()) |
---|
108 | #manplot = manplot + opts(panel.grid.major=theme_blank()) |
---|
109 | |
---|
110 | if (suggestiveline) manplot=manplot+geom_hline(yintercept=suggestiveline,colour="blue", alpha=I(1/3)) |
---|
111 | if (genomewideline) manplot=manplot+geom_hline(yintercept=genomewideline,colour="red") |
---|
112 | manplot |
---|
113 | } else { |
---|
114 | stop("Make sure your data frame contains columns CHR, BP, and P") |
---|
115 | } |
---|
116 | } |
---|
117 | |
---|
118 | |
---|
119 | |
---|
120 | qq = function(pvector, title=NULL, spartan=F) { |
---|
121 | # Thanks to Daniel Shriner at NHGRI for providing this code for creating expected and observed values |
---|
122 | o = -log10(sort(pvector,decreasing=F)) |
---|
123 | e = -log10( 1:length(o)/length(o) ) |
---|
124 | # you could use base graphics |
---|
125 | # plot(e,o,pch=19,cex=0.25, xlab=expression(Expected~~-log[10](italic(p))), |
---|
126 | # ylab=expression(Observed~~-log[10](italic(p))), xlim=c(0,max(e)), ylim=c(0,max(e))) |
---|
127 | # lines(e,e,col="red") |
---|
128 | #You'll need ggplot2 installed to do the rest |
---|
129 | qq=qplot(e,o, xlim=c(0,max(e)), ylim=c(0,max(o))) + stat_abline(intercept=0,slope=1, col="red") |
---|
130 | qq=qq+opts(title=title) |
---|
131 | qq=qq+scale_x_continuous(name=expression(Expected~~-log[10](italic(p)))) |
---|
132 | qq=qq+scale_y_continuous(name=expression(Observed~~-log[10](italic(p)))) |
---|
133 | if (spartan) plot=plot+opts(panel.background=theme_rect(col="grey50"), panel.grid.minor=theme_blank()) |
---|
134 | qq |
---|
135 | } |
---|
136 | """ |
---|
137 | |
---|
138 | # we need another string to avoid confusion over string substitutions with %in% |
---|
139 | # instantiate rcode2 string with infile,chromcol,offsetcol,pvalscols,title before saving and running |
---|
140 | |
---|
141 | rcode2 = """rgqqMan = function(infile="%s",chromcolumn=%s, offsetcolumn=%s, pvalscolumns=%s, |
---|
142 | title="%s",grey=%d) { |
---|
143 | rawd = read.table(infile,head=T,sep='\\t') |
---|
144 | dn = names(rawd) |
---|
145 | cc = dn[chromcolumn] |
---|
146 | oc = dn[offsetcolumn] |
---|
147 | nams = c(cc,oc) |
---|
148 | plen = length(rawd[,1]) |
---|
149 | doreorder=1 |
---|
150 | print(paste('###',plen,'values read from',infile,'read - now running plots',sep=' ')) |
---|
151 | if (plen > 0) { |
---|
152 | for (pvalscolumn in pvalscolumns) { |
---|
153 | if (pvalscolumn > 0) |
---|
154 | { |
---|
155 | cname = names(rawd)[pvalscolumn] |
---|
156 | mytitle = paste('p=',cname,', ',title,sep='') |
---|
157 | myfname = chartr(' ','_',cname) |
---|
158 | myqqplot = qq(rawd[,pvalscolumn],title=mytitle) |
---|
159 | ggsave(filename=paste(myfname,"qqplot.png",sep='_'),myqqplot,width=11,height=8,dpi=100) |
---|
160 | print(paste('## qqplot on',cname,'done')) |
---|
161 | if ((chromcolumn > 0) & (offsetcolumn > 0)) { |
---|
162 | if (doreorder) { |
---|
163 | rawd = rawd[do.call(order,rawd[nams]),] |
---|
164 | # mmmf - suggested by http://onertipaday.blogspot.com/2007/08/sortingordering-dataframe-according.html |
---|
165 | # in case not yet ordered |
---|
166 | doreorder = 0 |
---|
167 | } |
---|
168 | print(paste('## manhattan on',cname,'starting',chromcolumn,offsetcolumn,pvalscolumn)) |
---|
169 | mymanplot= manhattan(chrom=rawd[,chromcolumn],offset=rawd[,offsetcolumn],pvals=rawd[,pvalscolumn],title=mytitle,grey=grey) |
---|
170 | print(paste('## manhattan plot on',cname,'done')) |
---|
171 | ggsave(filename=paste(myfname,"manhattan.png",sep='_'),mymanplot,width=11,height=8,dpi=100) |
---|
172 | } |
---|
173 | else { |
---|
174 | print(paste('chrom column =',chromcolumn,'offset column = ',offsetcolumn, |
---|
175 | 'so no Manhattan plot - supply both chromosome and offset as numerics for Manhattan plots if required')) |
---|
176 | } |
---|
177 | } |
---|
178 | else { |
---|
179 | print(paste('pvalue column =',pvalscolumn,'Cannot parse it so no plots possible')) |
---|
180 | } |
---|
181 | } # for pvalscolumn |
---|
182 | } else { print('## Problem - no values available to plot - was there really a chromosome and offset column?') } |
---|
183 | } |
---|
184 | |
---|
185 | rgqqMan() |
---|
186 | # execute with defaults as substituted |
---|
187 | """ |
---|
188 | |
---|
189 | |
---|
190 | def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False): |
---|
191 | """ |
---|
192 | we may have an interval file or a tabular file - if interval, will have chr1... so need to adjust |
---|
193 | to chrom numbers |
---|
194 | draw a qq for pvals and a manhattan plot if chrom/offset <> 0 |
---|
195 | contains some R scripts as text strings - we substitute defaults into the calls |
---|
196 | to make them do our bidding - and save the resulting code for posterity |
---|
197 | this can be called externally, I guess...for QC eg? |
---|
198 | """ |
---|
199 | ffd,filtered_fname = tempfile.mkstemp(prefix='rgManQQtemp') |
---|
200 | f = open(filtered_fname,'w') |
---|
201 | inf = open(input_fname,'r') |
---|
202 | ohead = inf.readline().strip().split('\t') # see if we have a header |
---|
203 | inf.seek(0) # rewind |
---|
204 | newhead = ['pval%d' % (x+1) for x in pval_cols] |
---|
205 | newhead.insert(0,'Offset') |
---|
206 | newhead.insert(0,'Chrom') |
---|
207 | havehead = 0 |
---|
208 | wewant = [chrom_col,offset_col] |
---|
209 | wewant += pval_cols |
---|
210 | try: |
---|
211 | allnums = ['%d' % x for x in ohead] # this should barf if non numerics == header row? |
---|
212 | f.write('\t'.join(newhead)) # for R to read |
---|
213 | f.write('\n') |
---|
214 | except: |
---|
215 | havehead = 1 |
---|
216 | newhead = [ohead[chrom_col],ohead[offset_col]] |
---|
217 | newhead += [ohead[x] for x in pval_cols] |
---|
218 | f.write('\t'.join(newhead)) # use the original head |
---|
219 | f.write('\n') |
---|
220 | for i,row in enumerate(inf): |
---|
221 | if i == 0 and havehead: |
---|
222 | continue # ignore header |
---|
223 | sr = row.strip().split('\t') |
---|
224 | if len(sr) > 1: |
---|
225 | if sr[chrom_col].lower().find('chr') <> -1: |
---|
226 | sr[chrom_col] = sr[chrom_col][3:] |
---|
227 | newr = [sr[x] for x in wewant] # grab cols we need |
---|
228 | s = '\t'.join(newr) |
---|
229 | f.write(s) |
---|
230 | f.write('\n') |
---|
231 | f.close() |
---|
232 | pvc = [x+3 for x in range(len(pval_cols))] # 2 for offset and chrom, 1 for r offset start |
---|
233 | pvc = 'c(%s)' % (','.join(map(str,pvc))) |
---|
234 | rcmd = '%s%s' % (rcode,rcode2 % (filtered_fname,'1','2',pvc,title,grey)) |
---|
235 | rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir) |
---|
236 | rlog.append('## R script=') |
---|
237 | rlog.append(rcmd) |
---|
238 | if beTidy: |
---|
239 | os.unlink(filtered_fname) |
---|
240 | return rlog,flist |
---|
241 | |
---|
242 | |
---|
243 | def main(): |
---|
244 | u = """<command interpreter="python"> |
---|
245 | rgManQQ.py '$input_file' "$name" '$out_html' '$out_html.files_path' '$chrom_col' '$offset_col' '$pval_col' |
---|
246 | </command> |
---|
247 | """ |
---|
248 | npar = 8 |
---|
249 | if len(sys.argv) < npar: |
---|
250 | print >> sys.stdout, '## error - too few command line parameters - wanting %d' % npar |
---|
251 | print >> sys.stdout, u |
---|
252 | sys.exit(1) |
---|
253 | input_fname = sys.argv[1] |
---|
254 | title = sys.argv[2] |
---|
255 | killme = string.punctuation + string.whitespace |
---|
256 | trantab = string.maketrans(killme,'_'*len(killme)) |
---|
257 | ctitle = title.translate(trantab) |
---|
258 | outhtml = sys.argv[3] |
---|
259 | outdir = sys.argv[4] |
---|
260 | try: |
---|
261 | chrom_col = int(sys.argv[5]) |
---|
262 | except: |
---|
263 | chrom_col = -1 |
---|
264 | try: |
---|
265 | offset_col = int(sys.argv[6]) |
---|
266 | except: |
---|
267 | offset_col = -1 |
---|
268 | p = sys.argv[7].strip().split(',') |
---|
269 | try: |
---|
270 | p = [int(x) for x in p] |
---|
271 | except: |
---|
272 | p = [-1] |
---|
273 | if chrom_col == -1 or offset_col == -1: # was passed as zero - do not do manhattan plots |
---|
274 | chrom_col = -1 |
---|
275 | offset_col = -1 |
---|
276 | grey = 0 |
---|
277 | if (sys.argv[8].lower() in ['1','true']): |
---|
278 | grey = 1 |
---|
279 | if p == [-1]: |
---|
280 | print >> sys.stderr,'## Cannot run rgManQQ - missing pval column' |
---|
281 | sys.exit(1) |
---|
282 | rlog,flist = doManQQ(input_fname,chrom_col,offset_col,p,title,grey,ctitle,outdir) |
---|
283 | flist.sort() |
---|
284 | html = [galhtmlprefix % progname,] |
---|
285 | html.append('<h1>%s</h1>' % title) |
---|
286 | if len(flist) > 0: |
---|
287 | html.append('<table>\n') |
---|
288 | for row in flist: |
---|
289 | fname,expl = row # RRun returns pairs of filenames fiddled for the log and R script |
---|
290 | e = os.path.splitext(fname)[-1] |
---|
291 | if e in ['.png','.jpg']: |
---|
292 | s= '<tr><td><a href="%s"><img src="%s" alt="%s hspace="10" width="400"><br>(Click to download image %s)</a></td></tr>' \ |
---|
293 | % (fname,fname,expl,expl ) |
---|
294 | html.append(s) |
---|
295 | else: |
---|
296 | html.append('<tr><td><a href="%s">%s</a></td></tr>' % (fname,expl)) |
---|
297 | html.append('</table>\n') |
---|
298 | else: |
---|
299 | html.append('<h2>### Error - R returned no files - please confirm that parameters are sane</h1>') |
---|
300 | html.append('<h3>R log follows below</h3><hr><pre>\n') |
---|
301 | html += rlog |
---|
302 | html.append('</pre>\n') |
---|
303 | html.append(galhtmlattr % (progname,timenow())) |
---|
304 | html.append(galhtmlpostfix) |
---|
305 | htmlf = file(outhtml,'w') |
---|
306 | htmlf.write('\n'.join(html)) |
---|
307 | htmlf.write('\n') |
---|
308 | htmlf.close() |
---|
309 | |
---|
310 | |
---|
311 | |
---|
312 | if __name__ == "__main__": |
---|
313 | main() |
---|
314 | |
---|