Context Navigation

rgQC.py @ 2

リビジョン 2, 62.8 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	# oct 15 rpy replaced - temp fix until we get gnuplot working
2	# rpy deprecated - replace with RRun
3	# fixes to run functional test! oct1 2009
4	# needed to expand our path with os.path.realpath to get newpath working with
5	# all the fancy pdfnup stuff
6	# and a fix to pruneld to write output to where it should be
7	# smallish data in test-data/smallwga in various forms
8	# python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
9	# child files are deprecated and broken as at july 19 2009
10	# need to move them to the html file extrafiles path
11	# found lots of corner cases with some illumina data where cnv markers were
12	# included
13	# and where affection status was all missing !
14	# added links to tab files showing worst 1/keepfrac markers and subjects
15	# ross lazarus january 2008
16	#
17	# added named parameters
18	# to ensure no silly slippages if non required parameter in the most general case
19	# some potentially useful things here reusable in complex scripts
20	# with lots'o'html (TM)
21	# aug 17 2007 rml
22	#
23	# added marker and subject and parenting april 14 rml
24	# took a while to get the absolute paths right for all the file munging
25	# as of april 16 seems to work..
26	# getting galaxy to serve images in html reports is a little tricky
27	# we don't want QC reports to be dozens of individual files, so need
28	# to use the url /static/rg/... since galaxy's web server will happily serve images
29	# from there
30	# galaxy passes output files as relative paths
31	# these have to be munged by rgQC.py before calling this
32	# galaxy will pass in 2 file names - one for the log
33	# and one for the final html report
34	# of the form './database/files/dataset_66.dat'
35	# we need to be working in that directory so our plink output files are there
36	# so these have to be munged by rgQC.py before calling this
37	# note no ped file passed so had to remove the -l option
38	# for plinkParse.py that makes a heterozygosity report from the ped
39	# file - needs fixing...
40	# new: importing manhattan/qqplot plotter
41	# def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
42	# """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
43	# contains some R scripts as text strings - we substitute defaults into the calls
44	# to make them do our bidding - and save the resulting code for posterity
45	# this can be called externally, I guess...for QC eg?
46	# """
47	#
48	# rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
49	# rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
50	# return rlog,flist
51
52
53	from optparse import OptionParser
54
55	import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
56	from os.path import abspath
57	from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
58	import rgManQQ
59
# name this script was invoked as, without any leading directory
prog = os.path.basename(sys.argv[0])

# version banner; myversion is the same string under a second name
vers = '0.4 april 2009 rml'
myversion = vers

# separator used to build compound ids - must be something improbable in real ids
idjoiner = '_~_~_'

# NOTE: several of the settings below may need adjusting for a new install

# 1/keepfrac of the rows are kept after sorting by each interesting value
keepfrac = 20

# genotype codes treated as missing, each mapped to itself - fix me if these change!
missvals = dict((code, code) for code in ('0', 'N', '-9', '-'))

# ImageMagick geometry for the jpeg thumbnails: "x300" fixes the height at
# 300 pixels and lets the width follow the aspect ratio
mogresize = "x300"
71
72
73
74
def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
    """
    Draw the marker and subject QC plots as pdf files in newfpath using rpy,
    then join them, n-up them and make jpeg thumbnails with the external
    pdfjoin, pdfnup and mogrify programs.

    markers, subjects: lists of rows whose cells are strings. The first row
        of each is a header and is popped off the list that was passed in,
        so the caller's lists lose their header row.
        The marker header must name 'maf', 'missfrac', 'logp_hwe_all',
        'logp_hwe_unaff' and 'N_Mendel'; the subject header must name
        'FracMiss', 'Mendel_errors' and 'F_Stat'.
    newfpath: directory the plots are written to; also the working
        directory of the external programs.
    basename: prefix of every output file name and part of each plot title.
    nbreaks: breaks argument for R hist(). A numeric string such as the
        default '20' is converted to int, because R treats a character
        value as the name of a break algorithm.
    nup: when > 0 a joined pdf and an nup x nup pdf are also produced.
    height, width: passed positionally, in that order, to R pdf() - see the
        NOTE below.
    rgbin: accepted for interface compatibility; not used here.

    Returns (html, cruft): html is a list of [title, file name, full path]
    rows, one per file produced; cruft is the list of lines the external
    programs wrote to stdout and stderr.

    Raises IndexError if markers or subjects is empty and ValueError if a
    required header column is missing.

    NOTE: R's pdf() takes width before height positionally, so with the
    defaults here the pages come out 10 wide by 8 high. The call order is
    left alone so existing reports keep their shape.
    """
    try:
        nbreaks = int(nbreaks)
    except (TypeError, ValueError):
        pass # eg an R algorithm name or a vector of break points - pass through

    def sampleIndices(n, maxveclen, discountAlways=False):
        """
        Indices to keep when thinning n sorted values down to roughly
        maxveclen points: every index below min(1000, n/20) is always kept,
        after that one in every skip. skip is never less than 1, so the
        modulo below cannot divide by zero.
        """
        always = min(1000, n // 20)
        if discountAlways:
            pool = n - always
        else:
            pool = n
        skip = max(1, int(pool / float(maxveclen)))
        return [i for i in range(n) if (i < always) or (i % skip == 0)]

    def splitBounds(n, parts):
        """
        (start, end) slice bounds cutting n items into parts consecutive
        runs; the last run absorbs any remainder.
        """
        step = n // parts
        bounds = [(k * step, (k + 1) * step) for k in range(parts)]
        bounds[-1] = (bounds[-1][0], n)
        return bounds

    def columnValues(rows, column):
        """
        Floats from one column, skipping rows too short to have that column
        and cells starting with 'NA' in any case (so 'NA' and 'nan').
        """
        return [float(row[column]) for row in rows
                if len(row) > column and row[column][:2].upper() != 'NA']

    def runCmd(args):
        """
        Run an external program in newfpath with its stdout and stderr
        appended to the log file. Best effort: a program that cannot be
        started is noted in the log rather than raised.
        """
        sto.flush()
        try:
            return subprocess.Popen(args, cwd=newfpath, stderr=sto, stdout=sto).wait()
        except OSError:
            sto.write('## unable to run %s: %s\n' % (args[0], sys.exc_info()[1]))
            return None

    def slurpLog(path):
        """Return the lines of the log file at path, then delete the file."""
        logfile = open(path, 'r')
        try:
            lines = logfile.readlines()
        finally:
            logfile.close()
            try:
                os.remove(path)
            except OSError:
                pass # nothing useful to do if the temp file cannot be removed
        return lines

    def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
        """
        Histogram beside a boxplot of plotme, written to outfname as a pdf.
        The layout matrix gives the histogram 3 of the 4 columns.
        """
        maint = 'QC for %s - %s' % (basename,title)
        rpy.r.pdf(outfname, height, width)
        try:
            rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
            rpy.r.hist(plotme,main=maint,xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
            rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
        finally:
            rpy.r.dev_off() # always release the pdf device, even if a plot call fails

    def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
        """
        Plot the values against their percentile rank so the effect of
        various cutoffs can be read off. plotme must arrive already sorted
        with the most important values first. nbreaks is accepted but unused.
        """
        n = len(plotme)
        maxveclen = 1000 # for reasonable pdf sizes!
        yvec = list(plotme)
        xvec = [100.0*(n-i)/n for i in range(n)] # convert rank to centile
        if n > maxveclen: # too many points for a pdf - thin them out
            keep = sampleIndices(n, maxveclen)
            yvec = [yvec[i] for i in keep]
            xvec = [xvec[i] for i in keep]
        maint = 'QC for %s - %s' % (basename,title)
        # the x axis is supplied explicitly or rpy prints the vector on the pdf
        rpy.r.pdf(outfname, height, width)
        try:
            rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
            rpy.r.plot(xvec,yvec,type='p',main=maint,ylab=xlabname,xlab='Sample Percentile',col="maroon",cex=0.8)
            rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        finally:
            rpy.r.dev_off()

    def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """
        QQ plot of plotme (called with -log10 transformed hwe p values)
        against -log10 of the expected uniform quantiles, with the null line
        drawn from 0 to log10(n). Large inputs are thinned, always keeping
        the first sorted values.
        """
        nrows = len(plotme)
        fn = float(nrows)
        xvec = [-math.log10(x/fn) for x in range(1,nrows+1)]
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        maxveclen = 3000
        yvec = list(plotme)
        if nrows > maxveclen:
            keep = sampleIndices(nrows, maxveclen)
            yvec = [yvec[i] for i in keep]
            xvec = [xvec[i] for i in keep] # sample xvec the same way
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        rpy.r.pdf(outfname, height, width)
        try:
            rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
            rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
            rpy.r.points(mx,mx,type='l')
            rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        finally:
            rpy.r.dev_off()

    def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
        """
        Grid of nsplits x nsplits qq plots. plotme rows are (p, x, y): the
        rows are cut into nsplits runs by x order, each run into nsplits
        cells by y order, and -log10(p) of every cell is plotted against a
        simulated uniform null to show departure from the null at the
        extremes of data quality.

        Raises ValueError if nsplits < 1 or there are fewer than
        nsplits*nsplits rows (some cell would be empty).
        Not called from within makePlots.
        """
        if nsplits < 1:
            raise ValueError('rMultiQQ: nsplits must be at least 1, got %d' % nsplits)
        data = sorted(plotme, key=operator.itemgetter(1)) # into x order
        nvals = len(data)
        stepsize = nvals // nsplits
        if stepsize < nsplits:
            raise ValueError('rMultiQQ: need at least %d rows for a %dx%d grid, got %d'
                             % (nsplits*nsplits,nsplits,nsplits,nvals))
        logstep = math.log10(stepsize) # so is 3 for steps of 1000
        nullline = [0,logstep]
        # simulated null; 1.0 - random() lies in (0,1] so log10 is always defined
        yvec = sorted([-math.log10(1.0 - random.random()) for i in range(stepsize)])
        nulllab = '-log10(Uniform 0-1)'
        rpy.r.pdf(outfname, height, width)
        try:
            rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
            for rowstart,rowend in splitBounds(nvals, nsplits):
                row = sorted(data[rowstart:rowend], key=operator.itemgetter(2)) # into y order
                for colstart,colend in splitBounds(len(row), nsplits):
                    cell = row[colstart:colend]
                    xvec = [-math.log10(rec[0]) for rec in cell] # all the pvalues for this cell
                    # observed values go on the x axis here, the null on the y axis
                    rpy.r.qqplot(xvec,yvec,xlab=xlabname,ylab=nulllab,pch=19,col="maroon",cex=0.8)
                    rpy.r.points(nullline,nullline,type='l')
        finally:
            rpy.r.dev_off()

    def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """
        Normal qq plot (R qqnorm) of plotme. Large inputs are thinned,
        always keeping the first sorted values.
        """
        maxveclen = 3000
        nrows = len(plotme)
        data = list(plotme)
        if nrows > maxveclen:
            keep = sampleIndices(nrows, maxveclen, discountAlways=True)
            yvec = [data[i] for i in keep]
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            yvec = data
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        ylab = '%s' % xlabname
        xlab = 'Normal'
        rpy.r.pdf(outfname, height, width)
        try:
            rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
            rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
            rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        finally:
            rpy.r.dev_off()

    def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """
        QQ plot of the sorted values of plotme against -log10 of the
        expected uniform quantiles, with the null line drawn from 0 to
        log10(n). Large inputs are thinned, always keeping the lowest
        sorted values. Not called from within makePlots.
        """
        nrows = len(plotme)
        fn = float(nrows)
        xvec = [-math.log10(x/fn) for x in range(1,nrows+1)]
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        maxveclen = 2000
        yvec = sorted(plotme) # low to high - oversample low values
        if nrows > maxveclen:
            keep = sampleIndices(nrows, maxveclen)
            yvec = [yvec[i] for i in keep]
            xvec = [xvec[i] for i in keep] # sample xvec the same way
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        rpy.r.pdf(outfname, height, width)
        try:
            rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
            rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
            rpy.r.points(mx,mx,type='l')
            rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        finally:
            rpy.r.dev_off()

    # everything the external programs print is collected in a temp file
    fdsto,stofile = tempfile.mkstemp()
    sto = os.fdopen(fdsto,'w') # wrap the descriptor mkstemp opened so it gets closed
    html = []
    cruft = []
    try:
        import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
        mog = 'mogrify'
        pdfnup = 'pdfnup'
        pdfjoin = 'pdfjoin'
        shead = subjects.pop(0) # get rid of head
        mhead = markers.pop(0)
        maf = mhead.index('maf')
        missfrac = mhead.index('missfrac')
        logphweall = mhead.index('logp_hwe_all')
        logphweunaff = mhead.index('logp_hwe_unaff')
        m_mendel = mhead.index('N_Mendel')
        fracmiss = shead.index('FracMiss')
        s_mendel = shead.index('Mendel_errors')
        s_het = shead.index('F_Stat')
        # check for at least some unaffected rml june 2009
        if columnValues(markers, logphweunaff):
            hwecol = logphweunaff
            hwelab = 'log(p) HWE (unaff)'
        else: # try hwe all instead - maybe no affection status available
            hwecol = logphweall
            hwelab = 'log(p) HWE (all)'
        xs = [hwecol, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] # plot for each of these cols
        ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
        oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
        qqplotme = [1,0,0,0,0,0,0] # log qq plot for these
        qnplotme = [0,0,0,0,0,0,1] # normal qq plot for these
        nplots = len(xs)
        xlabnames = [hwelab, 'Missing Rate: Markers', 'Minor Allele Frequency',
                     'Marker Mendel Error Count', 'Missing Rate: Subjects',
                     'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
        # file names stay the same whichever hwe column ends up being plotted
        plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
        ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
        ordplotnames = ['%s_cum' % x for x in plotnames]
        ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
        outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
        ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
        datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
        titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
                  "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
        pdflist = []
        for n,column in enumerate(xs):
            dat = columnValues(datasources[n], column)
            # eg nada for mendel if case control? - test for any non zero value
            # rather than the sum, since signed values could cancel to zero
            if [v for v in dat if v != 0]:
                rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
                      title=titles[n],basename=basename,nbreaks=nbreaks)
                html.append([titles[n],ploturls[n],outfnames[n]])
                pdflist.append(outfnames[n])
                if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
                    otitle = 'Ranked %s' % titles[n]
                    dat.sort()
                    if oreverseme[n]:
                        dat.reverse()
                    rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
                         title=otitle,basename=basename,nbreaks=1000)
                    html.append([otitle,ordploturls[n],ordoutfnames[n]])
                    pdflist.append(ordoutfnames[n])
                if qqplotme[n]:
                    otitle = 'LogQQ plot %s' % titles[n]
                    dat.sort()
                    dat.reverse()
                    ofn = os.path.split(ordoutfnames[n])
                    ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
                    ofu = os.path.split(ordploturls[n])
                    ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
                    rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
                        title=otitle,basename=basename)
                    html.append([otitle,ofu,ofn])
                    pdflist.append(ofn)
                elif qnplotme[n]:
                    otitle = 'F Statistic %s' % titles[n]
                    dat.sort()
                    dat.reverse()
                    ofn = os.path.split(ordoutfnames[n])
                    ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
                    ofu = os.path.split(ordploturls[n])
                    ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
                    rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
                            title=otitle,basename=basename)
                    html.append([otitle,ofu,ofn])
                    pdflist.append(ofn)
            else:
                print('#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10]))
        if nup>0:
            # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
            # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
            afname = '%s_All_Paged.pdf' % (basename)
            fullafname = os.path.join(newfpath,afname)
            expl = 'All %s QC Plots joined into a single pdf' % basename
            # argument lists rather than a shell string so paths with spaces survive
            runCmd([pdfjoin] + pdflist + ['--outfile',fullafname])
            html.insert(0,[expl,afname,fullafname]) # joined pdf goes first
            nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
            fullnfname = os.path.join(newfpath,nfname)
            expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
            runCmd([pdfnup,afname,'--nup','%dx%d' % (nup,nup),'--frame','true',
                    '--outfile',fullnfname])
            html.insert(1,[expl,nfname,fullnfname]) # this goes second
        # make thumbnail images of every pdf in newfpath, independent of nup
        # NOTE(review): confirm this step was not meant to be conditional on nup
        allpdfs = sorted(glob.glob(os.path.join(newfpath,'*.pdf')))
        if allpdfs:
            runCmd([mog,'-format','jpg','-resize',mogresize] + allpdfs)
    finally:
        sto.close()
        cruft = slurpLog(stofile)
    return html,cruft # elements for an ordered list of urls or whatever..
393
394
395	def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
396	"""
397	nice try but the R scripts are huge and take forever to run if there's a lot of data
398	marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
399	'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
400	subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
401	"""
402	colour = "maroon"
403
404	def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
405	""" rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
406	# generic histogram and vertical boxplot in a 3:1 layout
407	# returns the graphic file name for inclusion in the web page
408	# broken out here for reuse
409	# ross lazarus march 2007
410	"""
411	R = []
412	maint = 'QC for %s - %s' % (basename,title)
413	screenmat = (1,2,1,2) # create a 2x2 canvas
414	widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
415	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
416	R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
417	R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
418	R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
419	R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
420	R.append('dev.off()')
421	return R
422
423	def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
424	"""
425	Useful to see what various cutoffs yield - plot percentiles
426	"""
427	R = []
428	n = len(plotme)
429	R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
430	# arrives already in decending order of importance missingness or mendel count by subj or marker
431	maint = 'QC for %s - %s' % (basename,title)
432	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
433	R.append("par(lab=c(10,10,10))")
434	R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
435	R.append('dev.off()')
436	return R
437
438	def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
439	"""
440	y is data for a qq plot and ends up on the x axis go figure
441	if sampling, oversample low values - all the top 1% ?
442	this version called with -log10 transformed hwe
443	"""
444	R = []
445	nrows = len(plotme)
446	fn = float(nrows)
447	xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
448	#mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
449	maxveclen = 3000
450	yvec = copy.copy(plotme)
451	if nrows > maxveclen:
452	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
453	# oversample part of the distribution
454	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
455	skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
456	if skip < 2:
457	skip = 2
458	samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
459	# always oversample first sorted (here lowest) values
460	yvec = [yvec[i] for i in samplei] # always get first and last
461	xvec = [xvec[i] for i in samplei] # and sample xvec same way
462	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
463	else:
464	maint='Log QQ Plot(n=%d)' % (nrows)
465	mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
466	ylab = '%s' % xlabname
467	xlab = '-log10(Uniform 0-1)'
468	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
469	x = ['%f' % x for x in xvec]
470	R.append('xvec = c(%s)' % ','.join(x))
471	y = ['%f' % x for x in yvec]
472	R.append('yvec = c(%s)' % ','.join(y))
473	R.append('mx = c(0,%f)' % (math.log10(fn)))
474	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
475	R.append("par(lab=c(10,10,10))")
476	R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
477	R.append('points(mx,mx,type="l")')
478	R.append('grid(col="lightgray",lty="dotted")')
479	R.append('dev.off()')
480	return R
481
482	def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
483	"""
484	data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
485	grid of qq plots to show departure from null at extremes of data quality
486	Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
487	and ends up on the x axis go figure
488	if sampling, oversample low values - all the top 1% ?
489	"""
490	data = copy.copy(plotme)
491	nvals = len(data)
492	stepsize = nvals/nsplits
493	logstep = math.log10(stepsize) # so is 3 for steps of 1000
494	R.append('mx = c(0,%f)' % logstep)
495	quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
496	data.sort(key=itertools.itemgetter(1)) # into x order
497	#rpy.r.pdf( outfname, h , w )
498	#rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
499	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
500	yvec = [-math.log10(random.random()) for x in range(stepsize)]
501	yvec.sort() # size of each step is expected range for xvec under null?!
502	y = ['%f' % x for x in yvec]
503	R.append('yvec = c(%s)' % ','.join(y))
504	for rowstart in quints:
505	rowend = rowstart + stepsize
506	if nvals - rowend < stepsize: # finish last split
507	rowend = nvals
508	row = data[rowstart:rowend]
509	row.sort(key=itertools.itemgetter(2)) # into y order
510	for colstart in quints:
511	colend = colstart + stepsize
512	if nvals - colend < stepsize: # finish last split
513	colend = nvals
514	cell = row[colstart:colend]
515	xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
516	x = ['%f' % x for x in xvec]
517	R.append('xvec = c(%s)' % ','.join(x))
518	R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
519	R.append('points(mx,mx,type="l")')
520	R.append('grid(col="lightgray",lty="dotted")')
521	#rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
522	#rpy.r.points(c(0,logstep),c(0,logstep),type='l')
523	R.append('dev.off()')
524	#rpy.r.dev_off()
525	return R
526
527
528	def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
529	"""
530	y is data for a qqnorm plot
531	if sampling, oversample low values - all the top 1% ?
532	"""
533	rangeunif = len(plotme)
534	nunif = 1000
535	maxveclen = 3000
536	nrows = len(plotme)
537	data = copy.copy(plotme)
538	if nrows > maxveclen:
539	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
540	# oversample part of the distribution
541	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
542	skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
543	samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
544	# always oversample first sorted (here lowest) values
545	yvec = [data[i] for i in samplei] # always get first and last
546	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
547	else:
548	yvec = data
549	maint='Log QQ Plot(n=%d)' % (nrows)
550	n = 1000
551	ylab = '%s' % xlabname
552	xlab = 'Normal'
553	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
554	#rpy.r.pdf( outfname, h , w )
555	#rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
556	#rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
557	#rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
558	#rpy.r.dev_off()
559	y = ['%f' % x for x in yvec]
560	R.append('yvec = c(%s)' % ','.join(y))
561	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
562	R.append("par(lab=c(10,10,10))")
563	R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
564	R.append('grid(col="lightgray",lty="dotted")')
565	R.append('dev.off()')
566	return R
567
568	def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
569	"""
570	layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
571	like the GAIN qc tools
572	y is data for a qq plot and ends up on the x axis go figure
573	if sampling, oversample low values - all the top 1% ?
574	"""
575	rangeunif = len(plotme)
576	nunif = 1000
577	fn = float(rangeunif)
578	xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
579	skip = max(int(rangeunif/fn),1)
580	# force include last points
581	mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
582	maxveclen = 2000
583	nrows = len(plotme)
584	data = copy.copy(plotme)
585	data.sort() # low to high - oversample low values
586	if nrows > maxveclen:
587	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
588	# oversample part of the distribution
589	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
590	skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
591	samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
592	# always oversample first sorted (here lowest) values
593	yvec = [data[i] for i in samplei] # always get first and last
594	xvec = [xvec[i] for i in samplei] # and sample xvec same way
595	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
596	else:
597	yvec = data
598	maint='Log QQ Plot(n=%d)' % (nrows)
599	n = 1000
600	mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
601	ylab = '%s' % xlabname
602	xlab = '-log10(Uniform 0-1)'
603	R.append('mx = c(0,%f)' % (math.log10(fn)))
604	x = ['%f' % x for x in xvec]
605	R.append('xvec = c(%s)' % ','.join(x))
606	y = ['%f' % x for x in yvec]
607	R.append('yvec = c(%s)' % ','.join(y))
608	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
609	R.append("par(lab=c(10,10,10))")
610	R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
611	R.append('points(mx,mx,type="l")')
612	R.append('grid(col="lightgray",lty="dotted")')
613	R.append('dev.off()')
614
615
616	shead = subjects.pop(0) # get rid of head
617	mhead = markers.pop(0)
618	maf = mhead.index('maf')
619	missfrac = mhead.index('missfrac')
620	logphweall = mhead.index('logp_hwe_all')
621	logphweunaff = mhead.index('logp_hwe_unaff')
622	# check for at least some unaffected rml june 2009
623	m_mendel = mhead.index('N_Mendel')
624	fracmiss = shead.index('FracMiss')
625	s_mendel = shead.index('Mendel_errors')
626	s_het = shead.index('F_Stat')
627	params = {}
628	h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
629	and x[logphweunaff].upper() <> 'NA']
630	if len(h) <> 0:
631	xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
632	# plot for each of these cols
633	else: # try hwe all instead - maybe no affection status available
634	xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
635	ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
636	oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
637	qqplotme = [1,0,0,0,0,0,0] #
638	qnplotme = [0,0,0,0,0,0,1] #
639	nplots = len(xs)
640	xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
641	'Marker Mendel Error Count', 'Missing Rate: Subjects',
642	'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
643	plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
644	ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
645	ordplotnames = ['%s_cum' % x for x in plotnames]
646	ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
647	outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
648	ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
649	datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
650	titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
651	"Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
652	html = []
653	pdflist = []
654	R = []
655	for n,column in enumerate(xs):
656	dfn = '%d_%s.txt' % (n,titles[n])
657	dfilepath = os.path.join(newfpath,dfn)
658	dat = [float(x[column]) for x in datasources[n] if len(x) >= column
659	and x[column][:2].upper() <> 'NA'] # plink gives both!
660	if sum(dat) <> 0: # eg nada for mendel if case control?
661	plotme = file(dfilepath,'w')
662	plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
663	tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
664	title=titles[n],basename=basename,nbreaks=nbreaks)
665	R += tR
666	row = [titles[n],ploturls[n],outfnames[n] ]
667	html.append(row)
668	pdflist.append(outfnames[n])
669	if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
670	otitle = 'Ranked %s' % titles[n]
671	dat.sort()
672	if oreverseme[n]:
673	dat.reverse()
674	ndat = len(dat)
675	xvec = range(ndat)
676	xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
677	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
678	maxveclen = 1000.0 # for reasonable pdf sizes!
679	if ndat > maxveclen: # oversample part of the distribution
680	always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
681	skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
682	samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
683	yvec = [yvec[i] for i in samplei] # always get first and last
684	xvec = [xvec[i] for i in samplei] # always get first and last
685	plotme = file(dfilepath,'w')
686	plotme.write('xvec\tyvec\n')
687	plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
688	tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
689	title=otitle,basename=basename,nbreaks=nbreaks)
690	R += tR
691	row = [otitle,ordploturls[n],ordoutfnames[n]]
692	html.append(row)
693	pdflist.append(ordoutfnames[n])
694	if qqplotme[n]: #
695	otitle = 'LogQQ plot %s' % titles[n]
696	dat.sort()
697	dat.reverse()
698	ofn = os.path.split(ordoutfnames[n])
699	ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
700	ofu = os.path.split(ordploturls[n])
701	ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
702	tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
703	title=otitle,basename=basename)
704	R += tR
705	row = [otitle,ofu,ofn]
706	html.append(row)
707	pdflist.append(ofn)
708	elif qnplotme[n]:
709	otitle = 'F Statistic %s' % titles[n]
710	dat.sort()
711	dat.reverse()
712	ofn = os.path.split(ordoutfnames[n])
713	ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
714	ofu = os.path.split(ordploturls[n])
715	ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
716	tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
717	title=otitle,basename=basename)
718	R += tR
719	row = [otitle,ofu,ofn]
720	html.append(row)
721	pdflist.append(ofn)
722	else:
723	print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
724	rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
725	if nup>0:
726	# pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
727	# pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
728	filestojoin = ' '.join(pdflist) # all the file names so far
729	afname = '%s_All_Paged.pdf' % (basename)
730	fullafname = os.path.join(newfpath,afname)
731	expl = 'All %s QC Plots joined into a single pdf' % basename
732	vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
733	# make single page pdf
734	x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
735	retval = x.wait()
736	row = [expl,afname,fullafname]
737	html.insert(0,row) # last rather than second
738	nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
739	fullnfname = os.path.join(newfpath,nfname)
740	expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
741	vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
742	# make thumbnail images
743	x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
744	retval = x.wait()
745	row = [expl,nfname,fullnfname]
746	html.insert(1,row) # this goes second
747	vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
748	# make thumbnail images
749	x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
750	retval = x.wait()
751	return html # elements for an ordered list of urls or whatever..
752
def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
    """
    NO LONGER USED - historical interest

    Count het loci for each subject to look for outliers = ? contamination.

    plink is asked to recode the binary fileset `bedf` into a ped file named
    '<froot>_recode.recode.ped'. That ped file is then read one subject per line
    and a tab delimited table (nhetloci, famid_iid, gender), largest het count
    first, is written to `outfname`.

    bedf - prefix handed to plink --bfile
    linkageped - when false the first line of the ped file is skipped as a header
    froot - prefix used to name the recoded ped file
    outfname - path of the report to write
    logf - not used here; kept so existing callers still work

    Uses the module level names plinke, idjoiner and missvals.
    Returns None.
    """
    vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
    p = subprocess.Popen(' '.join(vcl),shell=True)
    p.wait()
    hets = [] # simple count of het loci per subject. Expect poisson?
    f = open('%s_recode.recode.ped' % froot,'r')
    try:
        if not linkageped:
            f.readline() # throw away header
        for l in f:
            ll = l.strip().split()
            if len(ll) > 6: # 6 leading pedigree fields then the allele pairs
                iid = idjoiner.join(ll[:2]) # fam_iid
                gender = ll[4]
                alleles = ll[6:]
                nhet = 0
                for i in range(len(alleles)//2): # alleles come in pairs, one pair per locus
                    a1 = alleles[2*i]
                    a2 = alleles[2*i+1]
                    if a1 != a2: # must be het
                        # only count the locus if neither allele is a missing value code
                        if not missvals.get(a1,None) and not missvals.get(a2,None):
                            nhet += 1
                hets.append((nhet,iid,gender)) # for sorting later
    finally:
        f.close()
    hets.sort()
    hets.reverse() # biggest nhet now on top
    res = ['%d\t%s\t%s' % x for x in hets]
    res.insert(0,'nhetloci\tfamid_iid\tgender')
    res.append('') # so the file ends with a newline
    f = open(outfname,'w')
    try:
        f.write('\n'.join(res))
    finally:
        f.close()
793
794
795
def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
    """by subject (missingness = .imiss, mendel = .imendel, sex check = .sexcheck,
    inbreeding F = .het)

    Merge the per subject plink reports that share the prefix `froot` into one
    tab delimited table written to newfpath/outfname.
    For sorting, we need floats and integers, so a numeric copy of every row is
    kept alongside the string rows that are written out.

    froot - path prefix of the plink output files
    outfname - name of the report file to create inside newfpath
    newfpath - directory the report is written to
    logf - open file like object for progress messages; None means no logging

    Returns (res, Tops):
    res - header row followed by one row of strings per subject listed in the
          .imiss file, sorted
    Tops - dict keyed by report name; each value is the header row followed by
           the first ntokeep rows (as strings) after sorting on that measure

    Subjects are only taken from the .imiss file; the other reports just add
    columns, with '0'/'0.0' used when a subject or a whole file is absent.
    Uses the module level names idjoiner, keepfrac, timenow and operator.
    """
    def note(msg):
        # logging is optional - the default logf is None
        if logf is not None:
            logf.write(msg)

    def tofloat(s):
        try:
            return float(s)
        except (TypeError,ValueError):
            return 0.0

    isexfile = '%s.sexcheck' % froot
    imissfile = '%s.imiss' % froot
    imendfile = '%s.imendel' % froot
    ihetfile = '%s.het' % froot
    note('## subject reports starting at %s\n' % timenow())
    outfile = os.path.join(newfpath,outfname)
    idlist = []
    idparts = {} # id -> (fid,iid) so ids never need to be split apart again
    imissdict = {}
    imenddict = {}
    isexdict = {}
    ihetdict = {}
    Tops = {}
    Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
        'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
    Tsorts = [2,3,6,8] # columns of rhead below: FracMiss, Mendel_errors, Status, F_Stat
    Treverse = [True,True,True,False] # so first values are worser
    # ------------------missing--------------------
    # imiss has FID IID MISS_PHENO N_MISS N_GENO F_MISS
    ## 1552042370_A 1552042370_A N 5480 549883 0.009966
    ## 1552042410_A 1552042410_A N 1638 549883 0.002979
    # we want F_MISS
    rows = _readSubjectTable(imissfile,['FID','IID','F_MISS'])
    if rows is None:
        note('# file %s is missing - talk about irony\n' % imissfile)
    else:
        for fid,iid,fmiss in rows:
            sid = '%s%s%s' % (fid,idjoiner,iid)
            imissdict[sid] = fmiss
            idlist.append(sid)
            idparts[sid] = (fid,iid)
    note('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
    # ------------------mend-------------------
    # *.imendel has FID IID N
    # we want N - subjects not listed get '0' when the table is assembled
    rows = _readSubjectTable(imendfile,['FID','IID','N'])
    if rows is None:
        note('## error No %s file - assuming not family data\n' % imendfile)
    else:
        for fid,iid,nmend in rows:
            imenddict['%s%s%s' % (fid,idjoiner,iid)] = nmend
    # ------------------sex check------------------
    # sexcheck has FID IID PEDSEX SNPSEX STATUS F
    ## FID Family ID
    ## IID Individual ID
    ## PEDSEX Sex as determined in pedigree file (1=male, 2=female)
    ## SNPSEX Sex as determined by X chromosome
    ## STATUS Displays "PROBLEM" or "OK" for each individual
    ## F The actual X chromosome inbreeding (homozygosity) estimate
    ##
    ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
    ## ambiguous with regard to sex.
    ## A male call is made if F is more than 0.8; a female call is made if F is less than 0.2.
    rows = _readSubjectTable(isexfile,['FID','IID','PEDSEX','SNPSEX','STATUS','F'])
    if rows is None:
        # this only happens if there are no subjects!
        note('No %s file - assuming no sex errors\n' % isexfile)
    else:
        for fid,iid,pedsex,snpsex,stat,fest in rows:
            isexdict['%s%s%s' % (fid,idjoiner,iid)] = (pedsex,snpsex,stat,fest)
    # ------------------het------------------
    ## FID IID O(HOM) E(HOM) N(NM) F
    ## 457 2 490665 4.928e+05 722154 -0.009096
    ## 457 3 464519 4.85e+05 710986 -0.0908
    rows = _readSubjectTable(ihetfile,['FID','IID','F'])
    if rows is None:
        note('## No %s file - did we run --het in plink?\n' % ihetfile)
    else:
        for fid,iid,fhet in rows:
            ihetdict['%s%s%s' % (fid,idjoiner,iid)] = fhet
    # now assemble and output result list
    rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
    res = []
    fres = [] # floats for sorting
    for sid in idlist: # for each subject in found order
        fid,iid = idparts[sid]
        f_missing = imissdict.get(sid,'0.0')
        nmend = imenddict.get(sid,'0')
        (pedsex,snpsex,status,fest) = isexdict.get(sid,('0','0','0','0.0'))
        fhet = ihetdict.get(sid,'0.0')
        res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
        try:
            inmend = int(nmend)
        except (TypeError,ValueError):
            inmend = 0
        fres.append([fid,iid,tofloat(f_missing),inmend,pedsex,snpsex,status,tofloat(fest),tofloat(fhet)])
    # int() keeps the slice bound usable whatever numeric type keepfrac is
    ntokeep = max(20,int(len(res)//keepfrac))
    for i,col in enumerate(Tsorts):
        fres.sort(key=operator.itemgetter(col))
        if Treverse[i]:
            fres.reverse()
        Tops[Tnames[i]] = [rhead] + [[str(v) for v in x] for x in fres[0:ntokeep]]
    res.sort()
    res.insert(0,rhead)
    note('### writing %s report with %s\n' % ( outfile,res[0]))
    f = open(outfile,'w')
    try:
        f.write('\n'.join(['\t'.join(x) for x in res]))
        f.write('\n')
    finally:
        f.close()
    return res,Tops


def _readSubjectTable(fname,colnames):
    """
    Pull the named columns out of a whitespace delimited plink report.

    fname - path of the report; its first line is taken to be the header
    colnames - upper case header names wanted, in the order they are returned

    Returns None if fname cannot be opened, otherwise a list holding one list of
    strings (ordered as colnames) for every data line long enough to contain all
    of the wanted columns. Shorter lines are ignored.
    Raises ValueError if a wanted column is not in the header.
    """
    try:
        f = open(fname,'r')
    except IOError:
        return None
    rows = []
    try:
        colpos = None
        lastpos = 0
        for line in f:
            ll = line.strip().split()
            if colpos is None: # header line
                head = [x.upper() for x in ll]
                colpos = [head.index(c) for c in colnames]
                lastpos = max(colpos)
            elif len(ll) > lastpos: # full line - index lastpos must exist
                rows.append([ll[p] for p in colpos])
    finally:
        f.close()
    return rows
990
def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
    """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq, mendel = .lmendel)
    keep a list of marker order but keep all stats in dicts
    write out a fake xls file for R or SAS etc

    froot - path prefix of the plink output files
    outfname - name of the report file to create inside newfpath
    newfpath - directory the report is written to
    logf - open file like object for progress messages; None means no logging
    maplist - optional list of split map lines; item 1 is used as the marker id
              and items 0 and 3 as its chromosome and offset

    Returns (res, Tops):
    res - header row followed by one row of strings per marker, in chromosome
          then offset order
    Tops - dict keyed by report name; each value is the header row followed by
           the first ntokeep rows after sorting on that measure

    Markers are only taken from the .hwe file, so nothing is reported for
    markers absent from it (or if the file itself is absent).
    Uses the module level names keepfrac, timenow and math.
    """
    def note(msg):
        # logging is optional - the default logf is None
        if logf is not None:
            logf.write(msg)

    def openOrNone(fname):
        try:
            return open(fname,'r')
        except IOError:
            return None

    def msortk(item=None):
        """
        deal with non numeric sorting - numbers first in numeric order,
        then anything else (eg 'NA') in text order
        """
        try:
            v = float(item)
        except (TypeError,ValueError):
            return (1,0.0,str(item))
        if v != v: # nan has no place in a numeric ordering
            return (1,0.0,str(item))
        return (0,v,'')

    def chromkey(row):
        """chromosome then offset; numeric chromosomes first in numeric order"""
        chrom,offset = row[1],row[2]
        try:
            pos = int(offset)
        except (TypeError,ValueError):
            pos = 0
        if chrom.isdigit():
            return (0,int(chrom),'',pos)
        return (1,0,chrom,pos)

    mapdict = {}
    if maplist is not None:
        for x in maplist:
            if len(x) > 3: # ignore map lines too short to hold an offset
                mapdict[x[1]] = (x[0],x[3])
    hwefile = '%s.hwe' % froot
    lmissfile = '%s.lmiss' % froot
    freqfile = '%s.frq' % froot
    lmendfile = '%s.lmendel' % froot
    outfile = os.path.join(newfpath,outfname)
    markerlist = []
    hwedict = {}
    lmissdict = {}
    freqdict = {}
    lmenddict = {}
    Tops = {}
    Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
    Tsorts = [3,6,10,11] # columns of rhead below: maf, missfrac, logp_hwe_unaff, N_Mendel
    Treverse = [False,True,True,True] # so first values are worse(r)
    # -------------------hwe--------------------------
    # hwe has SNP TEST GENO O(HET) E(HET) P_HWD
    # we want all hwe where P_HWD <> NA
    # ah changed in 1.04 to
    ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P
    ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644
    ## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA
    ## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644
    note('## marker reports starting at %s\n' % timenow())
    f = openOrNone(hwefile)
    if f is None:
        note('## error - no hwefile %s found\n' % hwefile)
    else:
        try:
            for i,line in enumerate(f):
                ll = line.strip().split()
                if i == 0: # head
                    head = [x.upper() for x in ll] # expect above
                    if 'TEST' in head:
                        testpos = head.index('TEST')
                    else:
                        testpos = 2 # patch for 1.04 which has b0rken headers
                    if 'P' in head:
                        ppos = head.index('P')
                    else:
                        ppos = 8 # patch - for head.index('P')
                    snppos = head.index('SNP')
                    lastpos = max(testpos,ppos,snppos)
                    note('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos))
                elif len(ll) > lastpos: # full line
                    ps = ll[ppos].upper()
                    rs = ll[snppos]
                    test = ll[testpos]
                    if rs not in hwedict: # first sighting fixes the marker order
                        hwedict[rs] = {}
                        markerlist.append(rs)
                    # NOTE(review): the listing this was rebuilt from had lost its
                    # indentation; tests are assumed to be kept only when P is usable,
                    # as the comments above say - anything else is reported as NA
                    if ps != 'NA' and ps != 'NAN': # worth keeping
                        lpvals = '0'
                        if ps != '1':
                            try:
                                lpvals = '%f' % -math.log10(float(ps))
                            except (ValueError,OverflowError):
                                pass # unparseable or zero p value - leave as '0'
                        hwedict[rs][test] = (ps,lpvals)
                else:
                    note('short line #%d = %s\n' % (i,ll))
        finally:
            f.close()
    # ------------------missing--------------------
    # lmiss has
    # CHR SNP N_MISS N_GENO F_MISS
    # 1 rs12354060 0 73 0
    # 1 rs4345758 1 73 0.0137
    # 1 rs2691310 73 73 1
    # we want F_MISS
    f = openOrNone(lmissfile)
    if f is not None:
        try:
            for i,line in enumerate(f):
                ll = line.strip().split()
                if i == 0:
                    head = [x.upper() for x in ll] # expect above
                    fracpos = head.index('F_MISS')
                    snppos = head.index('SNP')
                elif len(ll) > max(fracpos,snppos): # full line
                    lmissdict[ll[snppos]] = ll[fracpos] # for now, just that?
                elif len(ll) > snppos: # short line that still names its marker
                    lmissdict[ll[snppos]] = 'NA'
        finally:
            f.close()
    # ------------------freq-------------------
    # frq has CHR SNP A1 A2 MAF NM
    # we want maf
    f = openOrNone(freqfile)
    if f is not None:
        try:
            for i,line in enumerate(f):
                ll = line.strip().split()
                if i == 0:
                    head = [x.upper() for x in ll] # expect above
                    mafpos = head.index('MAF')
                    a1pos = head.index('A1')
                    a2pos = head.index('A2')
                    snppos = head.index('SNP')
                    lastpos = max(mafpos,a1pos,a2pos,snppos)
                elif len(ll) > lastpos: # full line
                    freqdict[ll[snppos]] = (ll[mafpos],ll[a1pos],ll[a2pos])
        finally:
            f.close()
    # ------------------mend-------------------
    # lmend has CHR SNP N
    # we want N
    f = openOrNone(lmendfile)
    if f is None:
        menddefault = '0' # no file at all - every marker is given zero mendel errors
        note('No %s file - assuming not family data\n' % lmendfile)
    else:
        menddefault = 'NA' # file present but silent about this marker
        try:
            for i,line in enumerate(f):
                ll = line.strip().split()
                if i == 0:
                    head = [x.upper() for x in ll] # expect above
                    npos = head.index('N')
                    snppos = head.index('SNP')
                elif len(ll) > max(npos,snppos): # full line
                    lmenddict[ll[snppos]] = ll[npos]
        finally:
            f.close()
    # now assemble result list
    rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
    res = []
    for rs in markerlist: # for each snp in found order
        f_missing = lmissdict.get(rs,'NA')
        maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
        hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
        hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
        nmend = lmenddict.get(rs,menddefault)
        (chrom,offset) = mapdict.get(rs,('?','0'))
        res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
    # int() keeps the slice bound usable whatever numeric type keepfrac is
    ntokeep = max(10,int(len(res)//keepfrac))
    for i,col in enumerate(Tsorts):
        # the key must be applied to each row's value, not to the lambda itself
        res.sort(key=lambda x,col=col: msortk(x[col]))
        if Treverse[i]:
            res.reverse()
        Tops[Tnames[i]] = [rhead] + res[0:ntokeep]
    res.sort(key=chromkey) # in chrom offset order
    res.insert(0,rhead)
    f = open(outfile,'w')
    try:
        f.write('\n'.join(['\t'.join(x) for x in res]))
        f.write('\n')
    finally:
        f.close()
    return res,Tops
1188
1189
1190
1191
def getfSize(fpath,outpath):
    """
    format a nice file size string

    fpath - file name, joined onto outpath to locate the file
    outpath - directory the file lives in

    Returns ' (x.x MB)', ' (x.x KB)' or ' (n B)' with a leading space, ready to
    append to a link, or '' when the path is not a regular file or is empty.
    Sizes of exactly 1024 bytes or 1024 KB are shown in the larger unit.
    """
    size = ''
    fp = os.path.join(outpath,fpath)
    if os.path.isfile(fp):
        n = float(os.path.getsize(fp)) # float so the divisions below are not truncated
        if n >= 2**20:
            size = ' (%1.1f MB)' % (n/2**20)
        elif n >= 2**10:
            size = ' (%1.1f KB)' % (n/2**10)
        elif n > 0:
            size = ' (%d B)' % (int(n))
    return size
1207
1208
if __name__ == "__main__":
    # Command line driver: run the plink QC tasks on the input fileset, build the
    # subject and marker reports and the plots, then write one html page linking
    # every output file together with the run logs.
    u = """ called in xml as
    <command interpreter="python">
    rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
    -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
    -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
    </command>

    Prepare a qc report - eg:
    print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)

    """
    progname = os.path.basename(sys.argv[0])
    if len(sys.argv) < 9:
        print('%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv))
        sys.exit(1)
    parser = OptionParser(usage=u, version="%prog 0.01")
    a = parser.add_option
    a("-i","--infile",dest="infile")
    a("-o","--oprefix",dest="opref")
    a("-l","--plinkexe",dest="plinke", default=plinke)
    a("-r","--rexe",dest="rexe", default=rexe)
    a("-s","--snps",dest="htmlf")
    #a("-m","--markerRaw",dest="markf")
    #a("-r","--rawsubject",dest="subjf")
    a("-p","--patho",dest="newfpath")
    (options,args) = parser.parse_args()
    basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
    # make the output prefix safe to use in file names
    killme = string.punctuation + string.whitespace
    trantab = string.maketrans(killme,'_'*len(killme))
    opref = options.opref.translate(trantab)
    # mkstemp hands back an already open descriptor - wrap it rather than opening
    # the path a second time, which would leak the descriptor
    alogh,alog = tempfile.mkstemp(suffix='.txt')
    plogh,plog = tempfile.mkstemp(suffix='.txt')
    alogf = os.fdopen(alogh,'w')
    plogf = os.fdopen(plogh,'w')
    ahtmlf = options.htmlf
    amarkf = 'MarkerDetails_%s.xls' % opref
    asubjf = 'SubjectDetails_%s.xls' % opref
    newfpath = options.newfpath
    newfpath = os.path.realpath(newfpath)
    try:
        os.makedirs(newfpath)
    except OSError:
        pass # usually because it already exists
    ofn = basename
    bfn = options.infile
    mapf = '%s.bim' % bfn
    try:
        mf = open(mapf,'r')
    except IOError:
        maplist = None
        alogf.write('## error - cannot open %s to read map - no offsets will be available for output files\n' % mapf)
    else:
        try:
            maplist = [x.split() for x in mf]
        finally:
            mf.close()
    #rerla@beast galaxy]$ head test-data/tinywga.bim
    #22 rs2283802 0 21784722 4 2
    #22 rs2267000 0 21785366 4 2
    # NOTE(review): this reads the module level rexe, not options.rexe, so a -r
    # value from the command line is not used here - confirm that is intended
    rgbin = os.path.split(rexe)[0] # get our rg bin path
    #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
    # if we could, do all at once? Nope. Probably never.
    plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'],
        ['--mendel',],['--check-sex',]]
    vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
    runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
    plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
        ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
        ['--bfile ldp_%s --het --out %s' % (basename,basename)]]
    # subset of ld independent markers for eigenstrat and other requirements
    vclbase = [options.plinke,'--noweb']
    plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
    plogf.write('\n'.join(plogout))
    plogf.write('\n')
    repout = os.path.join(newfpath,basename)
    subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
        logf=alogf) # writes the subject_froot.xls file
    markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
        logf=alogf,maplist=maplist) # marker_froot.xls
    nbreaks = 100
    s = '## starting plotpage, newfpath=%s,m=%s,s=%s\n' % (newfpath,markers[:2],subjects[:2])
    alogf.write(s)
    sys.stdout.write(s)
    plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
        basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
    #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)

    # [titles[n],plotnames[n],outfnames[n] ]
    html = []
    html.append('<table cellpadding="5" border="0">')
    size = getfSize(amarkf,newfpath)
    html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
        (amarkf,'Click here to download the Marker QC Detail report file',size))
    size = getfSize(asubjf,newfpath)
    html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
        (asubjf,'Click here to download the Subject QC Detail report file',size))
    for (title,url,ofname) in plotpage:
        ttitle = 'Ranked %s' % title
        dat = subjectTops.get(ttitle,None)
        if not dat:
            dat = markerTops.get(ttitle,None)
        imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
        thumbnail = os.path.join(newfpath,imghref)
        if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
            imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
            thumbnail = os.path.join(newfpath,imghref)
        if not os.path.exists(thumbnail):
            # no preview image at all - plain text link to the plot
            html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title))
        else:
            html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \
                % (url,imghref,title))
            if dat: # one or the other - write as an extra file and make a link here
                t = '%s.xls' % (ttitle.replace(' ','_'))
                fname = os.path.join(newfpath,t)
                f = open(fname,'w')
                try:
                    f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
                finally:
                    f.close() # must be on disk before its size is measured
                size = getfSize(t,newfpath)
                html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size))
            else:
                html.append('</a></td><td>%s</td><td> </td></tr>' % (title))
    html.append('</table><hr><h3>All output files from the QC run are available below</h3>')
    html.append('<table cellpadding="5" border="0">\n')
    flist = os.listdir(newfpath) # we want to catch 'em all
    flist.sort()
    for f in flist:
        fname = os.path.split(f)[-1]
        size = getfSize(fname,newfpath)
        html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size))
    html.append('</table>')
    alogf.close()
    plogf.close()
    # read both temporary logs back so they can be shown on the html page
    tf = open(alog,'r')
    try:
        llog = tf.readlines()
    finally:
        tf.close()
    tf = open(plog,'r')
    try:
        plogfile = tf.readlines()
    finally:
        tf.close()
    os.unlink(alog)
    os.unlink(plog)
    llog += plogfile # add lines from pruneld log
    lf = open(ahtmlf,'w') # galaxy will show this as the default view
    try:
        lf.write(galhtmlprefix % progname)
        s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow())
        lf.write('<h4>%s</h4>\n' % s)
        lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n')
        lf.write('\n'.join(html))
        lf.write('<h4>QC run log contents</h4>')
        lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs
        if len(cruft) > 0:
            lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup
        lf.write('%s\n<hr>\n' % galhtmlpostfix)
    finally:
        lf.close()
1354

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/rgQC.py @ 2

異なるフォーマットでダウンロード: