Context Navigation

rgQC.py @ 2

リビジョン 2, 62.8 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	# oct 15 rpy replaced - temp fix until we get gnuplot working
	2	# rpy deprecated - replace with RRun
	3	# fixes to run functional test! oct1 2009
	4	# needed to expand our path with os.path.realpath to get newpath working with
	5	# all the fancy pdfnup stuff
	6	# and a fix to pruneld to write output to where it should be
	7	# smallish data in test-data/smallwga in various forms
	8	# python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
	9	# child files are deprecated and broken as at july 19 2009
	10	# need to move them to the html file extrafiles path
	11	# found lots of corner cases with some illumina data where cnv markers were
	12	# included
	13	# and where affection status was all missing !
	14	# added links to tab files showing worst 1/keepfrac markers and subjects
	15	# ross lazarus january 2008
	16	#
	17	# added named parameters
	18	# to ensure no silly slippages if non required parameter in the most general case
	19	# some potentially useful things here reusable in complex scripts
	20	# with lots'o'html (TM)
	21	# aug 17 2007 rml
	22	#
	23	# added marker and subject and parenting april 14 rml
	24	# took a while to get the absolute paths right for all the file munging
	25	# as of april 16 seems to work..
	26	# getting galaxy to serve images in html reports is a little tricky
	27	# we don't want QC reports to be dozens of individual files, so need
	28	# to use the url /static/rg/... since galaxy's web server will happily serve images
	29	# from there
	30	# galaxy passes output files as relative paths
	31	# these have to be munged by rgQC.py before calling this
	32	# galaxy will pass in 2 file names - one for the log
	33	# and one for the final html report
	34	# of the form './database/files/dataset_66.dat'
	35	# we need to be working in that directory so our plink output files are there
	36	# so these have to be munged by rgQC.py before calling this
	37	# note no ped file passed so had to remove the -l option
	38	# for plinkParse.py that makes a heterozygosity report from the ped
	39	# file - needs fixing...
	40	# new: importing manhattan/qqplot plotter
	41	# def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
	42	# """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
	43	# contains some R scripts as text strings - we substitute defaults into the calls
	44	# to make them do our bidding - and save the resulting code for posterity
	45	# this can be called externally, I guess...for QC eg?
	46	# """
	47	#
	48	# rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
	49	# rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
	50	# return rlog,flist
	51
	52
	53	from optparse import OptionParser
	54
	55	import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
	56	from os.path import abspath
	57	from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
	58	import rgManQQ
	59
	60	prog = os.path.split(sys.argv[0])[1] # file name part of the path this script was started with
	61	vers = '0.4 april 2009 rml'
	62	idjoiner = '_~_~_' # need something improbable..
	63	# many of these may need fixing for a new install
	64
	65	myversion = vers
	66	keepfrac = 20 # the header notes describe tab files of the worst 1/keepfrac markers and subjects after sorting by each interesting value
	67
	68	missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # presumably the codes read as missing values (each maps to itself) - fix me if these change!
	69
	70	mogresize = "x300" # ImageMagick geometry for the jpeg thumbnails: a bare "x300" sets the height, width follows the aspect ratio
	71
	72
	73
	74
	75	def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
	76	"""
	77	marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
	78	'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
	79	subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
	80	"""
	81
	82
	83	def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
	84	""" rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
	85	# generic histogram and vertical boxplot in a 3:1 layout
	86	# returns the graphic file name for inclusion in the web page
	87	# broken out here for reuse
	88	# ross lazarus march 2007
	89	"""
	90	screenmat = (1,2,1,2) # create a 2x2 cabvas
	91	widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
	92	rpy.r.pdf( outfname, height , width )
	93	#rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
	94	m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
	95	# in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
	96	rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
	97	maint = 'QC for %s - %s' % (basename,title)
	98	rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
	99	rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
	100	rpy.r.dev_off()
	101
	102	def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
	103	"""
	104	Useful to see what various cutoffs yield - plot percentiles
	105	"""
	106	n = len(plotme)
	107	maxveclen = 1000.0 # for reasonable pdf sizes!
	108	yvec = copy.copy(plotme)
	109	# arrives already in decending order of importance missingness or mendel count by subj or marker
	110	xvec = range(n)
	111	xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
	112	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	113	if n > maxveclen: # oversample part of the distribution
	114	always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
	115	skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
	116	samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
	117	yvec = [yvec[i] for i in samplei] # always get first and last
	118	xvec = [xvec[i] for i in samplei] # always get first and last
	119	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
	120	rpy.r.pdf( outfname, height , width )
	121	maint = 'QC for %s - %s' % (basename,title)
	122	rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
	123	rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
	124	rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
	125	rpy.r.dev_off()
	126
	127	def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
	128	"""
	129	y is data for a qq plot and ends up on the x axis go figure
	130	if sampling, oversample low values - all the top 1% ?
	131	this version called with -log10 transformed hwe
	132	"""
	133	nrows = len(plotme)
	134	fn = float(nrows)
	135	xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
	136	mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
	137	maxveclen = 3000
	138	yvec = copy.copy(plotme)
	139	if nrows > maxveclen:
	140	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	141	# oversample part of the distribution
	142	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
	143	skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
	144	samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
	145	# always oversample first sorted (here lowest) values
	146	yvec = [yvec[i] for i in samplei] # always get first and last
	147	xvec = [xvec[i] for i in samplei] # and sample xvec same way
	148	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
	149	else:
	150	maint='Log QQ Plot(n=%d)' % (nrows)
	151	mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
	152	ylab = '%s' % xlabname
	153	xlab = '-log10(Uniform 0-1)'
	154	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
	155	rpy.r.pdf( outfname, height , width )
	156	rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
	157	rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
	158	rpy.r.points(mx,mx,type='l')
	159	rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
	160	rpy.r.dev_off()
	161
	162	def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
	163	"""
	164	data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
	165	grid of qq plots to show departure from null at extremes of data quality
	166	Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
	167	and ends up on the x axis go figure
	168	if sampling, oversample low values - all the top 1% ?
	169	"""
	170	data = copy.copy(plotme)
	171	nvals = len(data)
	172	stepsize = nvals/nsplits
	173	logstep = math.log10(stepsize) # so is 3 for steps of 1000
	174	quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
	175	data.sort(key=itertools.itemgetter(1)) # into x order
	176	rpy.r.pdf( outfname, height , width )
	177	rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
	178	yvec = [-math.log10(random.random()) for x in range(stepsize)]
	179	yvec.sort() # size of each step is expected range for xvec under null?!
	180	for rowstart in quints:
	181	rowend = rowstart + stepsize
	182	if nvals - rowend < stepsize: # finish last split
	183	rowend = nvals
	184	row = data[rowstart:rowend]
	185	row.sort(key=itertools.itemgetter(2)) # into y order
	186	for colstart in quints:
	187	colend = colstart + stepsize
	188	if nvals - colend < stepsize: # finish last split
	189	colend = nvals
	190	cell = row[colstart:colend]
	191	xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
	192	rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
	193	rpy.r.points(c(0,logstep),c(0,logstep),type='l')
	194	rpy.r.dev_off()
	195
	196
	197	def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
	198	"""
	199	y is data for a qqnorm plot
	200	if sampling, oversample low values - all the top 1% ?
	201	"""
	202	rangeunif = len(plotme)
	203	nunif = 1000
	204	maxveclen = 3000
	205	nrows = len(plotme)
	206	data = copy.copy(plotme)
	207	if nrows > maxveclen:
	208	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	209	# oversample part of the distribution
	210	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
	211	skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
	212	samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
	213	# always oversample first sorted (here lowest) values
	214	yvec = [data[i] for i in samplei] # always get first and last
	215	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
	216	else:
	217	yvec = data
	218	maint='Log QQ Plot(n=%d)' % (nrows)
	219	n = 1000
	220	ylab = '%s' % xlabname
	221	xlab = 'Normal'
	222	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
	223	rpy.r.pdf( outfname, height , width )
	224	rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
	225	rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
	226	rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
	227	rpy.r.dev_off()
	228
	229	def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
	230	"""
	231	layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
	232	like the GAIN qc tools
	233	y is data for a qq plot and ends up on the x axis go figure
	234	if sampling, oversample low values - all the top 1% ?
	235	"""
	236	rangeunif = len(plotme)
	237	nunif = 1000
	238	fn = float(rangeunif)
	239	xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
	240	skip = max(int(rangeunif/fn),1)
	241	# force include last points
	242	mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
	243	maxveclen = 2000
	244	nrows = len(plotme)
	245	data = copy.copy(plotme)
	246	data.sort() # low to high - oversample low values
	247	if nrows > maxveclen:
	248	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	249	# oversample part of the distribution
	250	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
	251	skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
	252	samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
	253	# always oversample first sorted (here lowest) values
	254	yvec = [data[i] for i in samplei] # always get first and last
	255	xvec = [xvec[i] for i in samplei] # and sample xvec same way
	256	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
	257	else:
	258	yvec = data
	259	maint='Log QQ Plot(n=%d)' % (nrows)
	260	n = 1000
	261	mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
	262	ylab = '%s' % xlabname
	263	xlab = '-log10(Uniform 0-1)'
	264	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
	265	rpy.r.pdf( outfname, height , width )
	266	rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
	267	rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
	268	rpy.r.points(mx,mx,type='l')
	269	rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
	270	rpy.r.dev_off()
	271
	272
	273	fdsto,stofile = tempfile.mkstemp()
	274	sto = open(stofile,'w')
	275	import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
	276	mog = 'mogrify'
	277	pdfnup = 'pdfnup'
	278	pdfjoin = 'pdfjoin'
	279	shead = subjects.pop(0) # get rid of head
	280	mhead = markers.pop(0)
	281	maf = mhead.index('maf')
	282	missfrac = mhead.index('missfrac')
	283	logphweall = mhead.index('logp_hwe_all')
	284	logphweunaff = mhead.index('logp_hwe_unaff')
	285	# check for at least some unaffected rml june 2009
	286	m_mendel = mhead.index('N_Mendel')
	287	fracmiss = shead.index('FracMiss')
	288	s_mendel = shead.index('Mendel_errors')
	289	s_het = shead.index('F_Stat')
	290	params = {}
	291	hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
	292	and x[logphweunaff].upper() <> 'NA']
	293	if len(hweres) <> 0:
	294	xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
	295	# plot for each of these cols
	296	else: # try hwe all instead - maybe no affection status available
	297	xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
	298	ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
	299	oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
	300	qqplotme = [1,0,0,0,0,0,0] #
	301	qnplotme = [0,0,0,0,0,0,1] #
	302	nplots = len(xs)
	303	xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
	304	'Marker Mendel Error Count', 'Missing Rate: Subjects',
	305	'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
	306	plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
	307	ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
	308	ordplotnames = ['%s_cum' % x for x in plotnames]
	309	ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
	310	outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
	311	ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
	312	datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
	313	titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
	314	"Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
	315	html = []
	316	pdflist = []
	317	for n,column in enumerate(xs):
	318	dat = [float(x[column]) for x in datasources[n] if len(x) >= column
	319	and x[column][:2].upper() <> 'NA'] # plink gives both!
	320	if sum(dat) <> 0: # eg nada for mendel if case control?
	321	rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
	322	title=titles[n],basename=basename,nbreaks=nbreaks)
	323	row = [titles[n],ploturls[n],outfnames[n] ]
	324	html.append(row)
	325	pdflist.append(outfnames[n])
	326	if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
	327	otitle = 'Ranked %s' % titles[n]
	328	dat.sort()
	329	if oreverseme[n]:
	330	dat.reverse()
	331	rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
	332	title=otitle,basename=basename,nbreaks=1000)
	333	row = [otitle,ordploturls[n],ordoutfnames[n]]
	334	html.append(row)
	335	pdflist.append(ordoutfnames[n])
	336	if qqplotme[n]: #
	337	otitle = 'LogQQ plot %s' % titles[n]
	338	dat.sort()
	339	dat.reverse()
	340	ofn = os.path.split(ordoutfnames[n])
	341	ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
	342	ofu = os.path.split(ordploturls[n])
	343	ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
	344	rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
	345	title=otitle,basename=basename)
	346	row = [otitle,ofu,ofn]
	347	html.append(row)
	348	pdflist.append(ofn)
	349	elif qnplotme[n]:
	350	otitle = 'F Statistic %s' % titles[n]
	351	dat.sort()
	352	dat.reverse()
	353	ofn = os.path.split(ordoutfnames[n])
	354	ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
	355	ofu = os.path.split(ordploturls[n])
	356	ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
	357	rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
	358	title=otitle,basename=basename)
	359	row = [otitle,ofu,ofn]
	360	html.append(row)
	361	pdflist.append(ofn)
	362	else:
	363	print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
	364	if nup>0:
	365	# pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
	366	# pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
	367	filestojoin = ' '.join(pdflist) # all the file names so far
	368	afname = '%s_All_Paged.pdf' % (basename)
	369	fullafname = os.path.join(newfpath,afname)
	370	expl = 'All %s QC Plots joined into a single pdf' % basename
	371	vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
	372	# make single page pdf
	373	x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
	374	retval = x.wait()
	375	row = [expl,afname,fullafname]
	376	html.insert(0,row) # last rather than second
	377	nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
	378	fullnfname = os.path.join(newfpath,nfname)
	379	expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
	380	vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
	381	# make thumbnail images
	382	x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
	383	retval = x.wait()
	384	row = [expl,nfname,fullnfname]
	385	html.insert(1,row) # this goes second
	386	vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
	387	# make thumbnail images
	388	x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
	389	retval = x.wait()
	390	sto.close()
	391	cruft = open(stofile,'r').readlines()
	392	return html,cruft # elements for an ordered list of urls or whatever..
	393
	394
	395	def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
	396	"""
	397	nice try but the R scripts are huge and take forever to run if there's a lot of data
	398	marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
	399	'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
	400	subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
	401	"""
	402	colour = "maroon"
	403
	404	def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
	405	""" rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
	406	# generic histogram and vertical boxplot in a 3:1 layout
	407	# returns the graphic file name for inclusion in the web page
	408	# broken out here for reuse
	409	# ross lazarus march 2007
	410	"""
	411	R = []
	412	maint = 'QC for %s - %s' % (basename,title)
	413	screenmat = (1,2,1,2) # create a 2x2 canvas
	414	widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
	415	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
	416	R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
	417	R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
	418	R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
	419	R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
	420	R.append('dev.off()')
	421	return R
	422
	423	def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
	424	"""
	425	Useful to see what various cutoffs yield - plot percentiles
	426	"""
	427	R = []
	428	n = len(plotme)
	429	R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
	430	# arrives already in decending order of importance missingness or mendel count by subj or marker
	431	maint = 'QC for %s - %s' % (basename,title)
	432	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
	433	R.append("par(lab=c(10,10,10))")
	434	R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
	435	R.append('dev.off()')
	436	return R
	437
	438	def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
	439	"""
	440	y is data for a qq plot and ends up on the x axis go figure
	441	if sampling, oversample low values - all the top 1% ?
	442	this version called with -log10 transformed hwe
	443	"""
	444	R = []
	445	nrows = len(plotme)
	446	fn = float(nrows)
	447	xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
	448	#mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
	449	maxveclen = 3000
	450	yvec = copy.copy(plotme)
	451	if nrows > maxveclen:
	452	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	453	# oversample part of the distribution
	454	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
	455	skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
	456	if skip < 2:
	457	skip = 2
	458	samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
	459	# always oversample first sorted (here lowest) values
	460	yvec = [yvec[i] for i in samplei] # always get first and last
	461	xvec = [xvec[i] for i in samplei] # and sample xvec same way
	462	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
	463	else:
	464	maint='Log QQ Plot(n=%d)' % (nrows)
	465	mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
	466	ylab = '%s' % xlabname
	467	xlab = '-log10(Uniform 0-1)'
	468	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
	469	x = ['%f' % x for x in xvec]
	470	R.append('xvec = c(%s)' % ','.join(x))
	471	y = ['%f' % x for x in yvec]
	472	R.append('yvec = c(%s)' % ','.join(y))
	473	R.append('mx = c(0,%f)' % (math.log10(fn)))
	474	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
	475	R.append("par(lab=c(10,10,10))")
	476	R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
	477	R.append('points(mx,mx,type="l")')
	478	R.append('grid(col="lightgray",lty="dotted")')
	479	R.append('dev.off()')
	480	return R
	481
	482	def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
	483	"""
	484	data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
	485	grid of qq plots to show departure from null at extremes of data quality
	486	Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
	487	and ends up on the x axis go figure
	488	if sampling, oversample low values - all the top 1% ?
	489	"""
	490	data = copy.copy(plotme)
	491	nvals = len(data)
	492	stepsize = nvals/nsplits
	493	logstep = math.log10(stepsize) # so is 3 for steps of 1000
	494	R.append('mx = c(0,%f)' % logstep)
	495	quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
	496	data.sort(key=itertools.itemgetter(1)) # into x order
	497	#rpy.r.pdf( outfname, h , w )
	498	#rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
	499	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
	500	yvec = [-math.log10(random.random()) for x in range(stepsize)]
	501	yvec.sort() # size of each step is expected range for xvec under null?!
	502	y = ['%f' % x for x in yvec]
	503	R.append('yvec = c(%s)' % ','.join(y))
	504	for rowstart in quints:
	505	rowend = rowstart + stepsize
	506	if nvals - rowend < stepsize: # finish last split
	507	rowend = nvals
	508	row = data[rowstart:rowend]
	509	row.sort(key=itertools.itemgetter(2)) # into y order
	510	for colstart in quints:
	511	colend = colstart + stepsize
	512	if nvals - colend < stepsize: # finish last split
	513	colend = nvals
	514	cell = row[colstart:colend]
	515	xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
	516	x = ['%f' % x for x in xvec]
	517	R.append('xvec = c(%s)' % ','.join(x))
	518	R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
	519	R.append('points(mx,mx,type="l")')
	520	R.append('grid(col="lightgray",lty="dotted")')
	521	#rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
	522	#rpy.r.points(c(0,logstep),c(0,logstep),type='l')
	523	R.append('dev.off()')
	524	#rpy.r.dev_off()
	525	return R
	526
	527
	528	def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
	529	"""
	530	y is data for a qqnorm plot
	531	if sampling, oversample low values - all the top 1% ?
	532	"""
	533	rangeunif = len(plotme)
	534	nunif = 1000
	535	maxveclen = 3000
	536	nrows = len(plotme)
	537	data = copy.copy(plotme)
	538	if nrows > maxveclen:
	539	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	540	# oversample part of the distribution
	541	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
	542	skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
	543	samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
	544	# always oversample first sorted (here lowest) values
	545	yvec = [data[i] for i in samplei] # always get first and last
	546	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
	547	else:
	548	yvec = data
	549	maint='Log QQ Plot(n=%d)' % (nrows)
	550	n = 1000
	551	ylab = '%s' % xlabname
	552	xlab = 'Normal'
	553	# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
	554	#rpy.r.pdf( outfname, h , w )
	555	#rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
	556	#rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
	557	#rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
	558	#rpy.r.dev_off()
	559	y = ['%f' % x for x in yvec]
	560	R.append('yvec = c(%s)' % ','.join(y))
	561	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
	562	R.append("par(lab=c(10,10,10))")
	563	R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
	564	R.append('grid(col="lightgray",lty="dotted")')
	565	R.append('dev.off()')
	566	return R
	567
	568	def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
	569	"""
	570	layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
	571	like the GAIN qc tools
	572	y is data for a qq plot and ends up on the x axis go figure
	573	if sampling, oversample low values - all the top 1% ?
	574	"""
	575	rangeunif = len(plotme)
	576	nunif = 1000
	577	fn = float(rangeunif)
	578	xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
	579	skip = max(int(rangeunif/fn),1)
	580	# force include last points
	581	mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
	582	maxveclen = 2000
	583	nrows = len(plotme)
	584	data = copy.copy(plotme)
	585	data.sort() # low to high - oversample low values
	586	if nrows > maxveclen:
	587	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	588	# oversample part of the distribution
	589	always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
	590	skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
	591	samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
	592	# always oversample first sorted (here lowest) values
	593	yvec = [data[i] for i in samplei] # always get first and last
	594	xvec = [xvec[i] for i in samplei] # and sample xvec same way
	595	maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
	596	else:
	597	yvec = data
	598	maint='Log QQ Plot(n=%d)' % (nrows)
	599	n = 1000
	600	mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
	601	ylab = '%s' % xlabname
	602	xlab = '-log10(Uniform 0-1)'
	603	R.append('mx = c(0,%f)' % (math.log10(fn)))
	604	x = ['%f' % x for x in xvec]
	605	R.append('xvec = c(%s)' % ','.join(x))
	606	y = ['%f' % x for x in yvec]
	607	R.append('yvec = c(%s)' % ','.join(y))
	608	R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
	609	R.append("par(lab=c(10,10,10))")
	610	R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
	611	R.append('points(mx,mx,type="l")')
	612	R.append('grid(col="lightgray",lty="dotted")')
	613	R.append('dev.off()')
	614
	615
	616	shead = subjects.pop(0) # get rid of head
	617	mhead = markers.pop(0)
	618	maf = mhead.index('maf')
	619	missfrac = mhead.index('missfrac')
	620	logphweall = mhead.index('logp_hwe_all')
	621	logphweunaff = mhead.index('logp_hwe_unaff')
	622	# check for at least some unaffected rml june 2009
	623	m_mendel = mhead.index('N_Mendel')
	624	fracmiss = shead.index('FracMiss')
	625	s_mendel = shead.index('Mendel_errors')
	626	s_het = shead.index('F_Stat')
	627	params = {}
	628	h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
	629	and x[logphweunaff].upper() <> 'NA']
	630	if len(h) <> 0:
	631	xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
	632	# plot for each of these cols
	633	else: # try hwe all instead - maybe no affection status available
	634	xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
	635	ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
	636	oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
	637	qqplotme = [1,0,0,0,0,0,0] #
	638	qnplotme = [0,0,0,0,0,0,1] #
	639	nplots = len(xs)
	640	xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
	641	'Marker Mendel Error Count', 'Missing Rate: Subjects',
	642	'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
	643	plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
	644	ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
	645	ordplotnames = ['%s_cum' % x for x in plotnames]
	646	ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
	647	outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
	648	ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
	649	datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
	650	titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
	651	"Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
	652	html = []
	653	pdflist = []
	654	R = []
	655	for n,column in enumerate(xs):
	656	dfn = '%d_%s.txt' % (n,titles[n])
	657	dfilepath = os.path.join(newfpath,dfn)
	658	dat = [float(x[column]) for x in datasources[n] if len(x) >= column
	659	and x[column][:2].upper() <> 'NA'] # plink gives both!
	660	if sum(dat) <> 0: # eg nada for mendel if case control?
	661	plotme = file(dfilepath,'w')
	662	plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
	663	tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
	664	title=titles[n],basename=basename,nbreaks=nbreaks)
	665	R += tR
	666	row = [titles[n],ploturls[n],outfnames[n] ]
	667	html.append(row)
	668	pdflist.append(outfnames[n])
	669	if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
	670	otitle = 'Ranked %s' % titles[n]
	671	dat.sort()
	672	if oreverseme[n]:
	673	dat.reverse()
	674	ndat = len(dat)
	675	xvec = range(ndat)
	676	xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
	677	# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
	678	maxveclen = 1000.0 # for reasonable pdf sizes!
	679	if ndat > maxveclen: # oversample part of the distribution
	680	always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
	681	skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
	682	samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
	683	yvec = [yvec[i] for i in samplei] # always get first and last
	684	xvec = [xvec[i] for i in samplei] # always get first and last
	685	plotme = file(dfilepath,'w')
	686	plotme.write('xvec\tyvec\n')
	687	plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
	688	tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
	689	title=otitle,basename=basename,nbreaks=nbreaks)
	690	R += tR
	691	row = [otitle,ordploturls[n],ordoutfnames[n]]
	692	html.append(row)
	693	pdflist.append(ordoutfnames[n])
	694	if qqplotme[n]: #
	695	otitle = 'LogQQ plot %s' % titles[n]
	696	dat.sort()
	697	dat.reverse()
	698	ofn = os.path.split(ordoutfnames[n])
	699	ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
	700	ofu = os.path.split(ordploturls[n])
	701	ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
	702	tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
	703	title=otitle,basename=basename)
	704	R += tR
	705	row = [otitle,ofu,ofn]
	706	html.append(row)
	707	pdflist.append(ofn)
	708	elif qnplotme[n]:
	709	otitle = 'F Statistic %s' % titles[n]
	710	dat.sort()
	711	dat.reverse()
	712	ofn = os.path.split(ordoutfnames[n])
	713	ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
	714	ofu = os.path.split(ordploturls[n])
	715	ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
	716	tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
	717	title=otitle,basename=basename)
	718	R += tR
	719	row = [otitle,ofu,ofn]
	720	html.append(row)
	721	pdflist.append(ofn)
	722	else:
	723	print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
	724	rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
	725	if nup>0:
	726	# pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
	727	# pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
	728	filestojoin = ' '.join(pdflist) # all the file names so far
	729	afname = '%s_All_Paged.pdf' % (basename)
	730	fullafname = os.path.join(newfpath,afname)
	731	expl = 'All %s QC Plots joined into a single pdf' % basename
	732	vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
	733	# make single page pdf
	734	x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
	735	retval = x.wait()
	736	row = [expl,afname,fullafname]
	737	html.insert(0,row) # last rather than second
	738	nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
	739	fullnfname = os.path.join(newfpath,nfname)
	740	expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
	741	vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
	742	# make thumbnail images
	743	x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
	744	retval = x.wait()
	745	row = [expl,nfname,fullnfname]
	746	html.insert(1,row) # this goes second
	747	vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
	748	# make thumbnail images
	749	x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
	750	retval = x.wait()
	751	return html # elements for an ordered list of urls or whatever..
	752
	753	def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
	754	"""
	755	NO LONGER USED - historical interest
	756	count het loci for each subject to look for outliers = ? contamination
	757	assume ped file is linkage format
	758	need to make a ped file from the bed file we were passed
	759	"""
	760	vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
	761	p=subprocess.Popen(' '.join(vcl),shell=True)
	762	retval = p.wait()
	763	f = open('%s_recode.recode.ped' % froot,'r')
	764	if not linkageped:
	765	head = f.next() # throw away header
	766	hets = [] # simple count of het loci per subject. Expect poisson?
	767	n = 1
	768	for l in f:
	769	n += 1
	770	ll = l.strip().split()
	771	if len(ll) > 6:
	772	iid = idjoiner.join(ll[:2]) # fam_iid
	773	gender = ll[4]
	774	alleles = ll[6:]
	775	nallele = len(alleles)
	776	nhet = 0
	777	for i in range(nallele/2):
	778	a1=alleles[2*i]
	779	a2=alleles[2*i+1]
	780	if alleles[2i] <> alleles[2i+1]: # must be het
	781	if not missvals.get(a1,None) and not missvals.get(a2,None):
	782	nhet += 1
	783	hets.append((nhet,iid,gender)) # for sorting later
	784	f.close()
	785	hets.sort()
	786	hets.reverse() # biggest nhet now on top
	787	f = open(outfname ,'w')
	788	res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
	789	res.insert(0,'nhetloci\tfamid_iid\tgender')
	790	res.append('')
	791	f.write('\n'.join(res))
	792	f.close()
	793
	794
	795
	796	def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
	797	"""by subject (missingness = .imiss, mendel = .imendel)
	798	assume replicates have an underscore in family id for
	799	hapmap testing
	800	For sorting, we need floats and integers
	801	"""
	802	isexfile = '%s.sexcheck' % froot
	803	imissfile = '%s.imiss' % froot
	804	imendfile = '%s.imendel' % froot
	805	ihetfile = '%s.het' % froot
	806	logf.write('## subject reports starting at %s\n' % timenow())
	807	outfile = os.path.join(newfpath,outfname)
	808	idlist = []
	809	imissdict = {}
	810	imenddict = {}
	811	isexdict = {}
	812	ihetdict = {}
	813	Tops = {}
	814	Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
	815	'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
	816	Tsorts = [2,3,6,8]
	817	Treverse = [True,True,True,False] # so first values are worser
	818	#rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
	819	## FID IID MISS_PHENO N_MISS N_GENO F_MISS
	820	## 1552042370_A 1552042370_A N 5480 549883 0.009966
	821	## 1552042410_A 1552042410_A N 1638 549883 0.002979
	822
	823	# ------------------missing--------------------
	824	# imiss has FID IID MISS_PHENO N_MISS F_MISS
	825	# we want F_MISS
	826	try:
	827	f = open(imissfile,'r')
	828	except:
	829	logf.write('# file %s is missing - talk about irony\n' % imissfile)
	830	f = None
	831	if f:
	832	for n,line in enumerate(f):
	833	ll = line.strip().split()
	834	if n == 0:
	835	head = [x.upper() for x in ll] # expect above
	836	fidpos = head.index('FID')
	837	iidpos = head.index('IID')
	838	fpos = head.index('F_MISS')
	839	elif len(ll) >= fpos: # full line
	840	fid = ll[fidpos]
	841	#if fid.find('_') == -1: # not replicate! - icondb ids have these...
	842	iid = ll[iidpos]
	843	fmiss = ll[fpos]
	844	id = '%s%s%s' % (fid,idjoiner,iid)
	845	imissdict[id] = fmiss
	846	idlist.append(id)
	847	f.close()
	848	logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
	849	# ------------------mend-------------------
	850	# *.imendel has FID IID N
	851	# we want N
	852	gotmend = True
	853	try:
	854	f = open(imendfile,'r')
	855	except:
	856	gotmend = False
	857	for id in idlist:
	858	imenddict[id] = '0'
	859	if gotmend:
	860	for n,line in enumerate(f):
	861	ll = line.strip().split()
	862	if n == 0:
	863	head = [x.upper() for x in ll] # expect above
	864	npos = head.index('N')
	865	fidpos = head.index('FID')
	866	iidpos = head.index('IID')
	867	elif len(ll) >= npos: # full line
	868	fid = ll[fidpos]
	869	iid = ll[iidpos]
	870	id = '%s%s%s' % (fid,idjoiner,iid)
	871	nmend = ll[npos]
	872	imenddict[id] = nmend
	873	f.close()
	874	else:
	875	logf.write('## error No %s file - assuming not family data\n' % imendfile)
	876	# ------------------sex check------------------
	877	#[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
	878	# sexcheck has FID IID PEDSEX SNPSEX STATUS F
	879	##
	880	## FID Family ID
	881	## IID Individual ID
	882	## PEDSEX Sex as determined in pedigree file (1=male, 2=female)
	883	## SNPSEX Sex as determined by X chromosome
	884	## STATUS Displays "PROBLEM" or "OK" for each individual
	885	## F The actual X chromosome inbreeding (homozygosity) estimate
	886	##
	887	## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
	888	## ambiguous with regard to sex.
	889	## A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2.
	890	try:
	891	f = open(isexfile,'r')
	892	got_sexcheck = 1
	893	except:
	894	got_sexcheck = 0
	895	if got_sexcheck:
	896	for n,line in enumerate(f):
	897	ll = line.strip().split()
	898	if n == 0:
	899	head = [x.upper() for x in ll] # expect above
	900	fidpos = head.index('FID')
	901	iidpos = head.index('IID')
	902	pedsexpos = head.index('PEDSEX')
	903	snpsexpos = head.index('SNPSEX')
	904	statuspos = head.index('STATUS')
	905	fpos = head.index('F')
	906	elif len(ll) >= fpos: # full line
	907	fid = ll[fidpos]
	908	iid = ll[iidpos]
	909	fest = ll[fpos]
	910	pedsex = ll[pedsexpos]
	911	snpsex = ll[snpsexpos]
	912	stat = ll[statuspos]
	913	id = '%s%s%s' % (fid,idjoiner,iid)
	914	isexdict[id] = (pedsex,snpsex,stat,fest)
	915	f.close()
	916	else:
	917	# this only happens if there are no subjects!
	918	logf.write('No %s file - assuming no sex errors' % isexfile)
	919	##
	920	## FID IID O(HOM) E(HOM) N(NM) F
	921	## 457 2 490665 4.928e+05 722154 -0.009096
	922	## 457 3 464519 4.85e+05 710986 -0.0908
	923	## 1037 2 461632 4.856e+05 712025 -0.106
	924	## 1037 1 491845 4.906e+05 719353 0.005577
	925	try:
	926	f = open(ihetfile,'r')
	927	except:
	928	f = None
	929	logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
	930	if f:
	931	for i,line in enumerate(f):
	932	ll = line.strip().split()
	933	if i == 0:
	934	head = [x.upper() for x in ll] # expect above
	935	fidpos = head.index('FID')
	936	iidpos = head.index('IID')
	937	fpos = head.index('F')
	938	n = 0
	939	elif len(ll) >= fpos: # full line
	940	fid = ll[fidpos]
	941	iid = ll[iidpos]
	942	fhet = ll[fpos]
	943	id = '%s%s%s' % (fid,idjoiner,iid)
	944	ihetdict[id] = fhet
	945	f.close() # now assemble and output result list
	946	rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
	947	res = []
	948	fres = [] # floats for sorting
	949	for id in idlist: # for each snp in found order
	950	fid,iid = id.split(idjoiner) # recover keys
	951	f_missing = imissdict.get(id,'0.0')
	952	nmend = imenddict.get(id,'0')
	953	(pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
	954	fhet = ihetdict.get(id,'0.0')
	955	res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
	956	try:
	957	ff_missing = float(f_missing)
	958	except:
	959	ff_missing = 0.0
	960	try:
	961	inmend = int(nmend)
	962	except:
	963	inmend = 0
	964	try:
	965	ffest = float(fest)
	966	except:
	967	fest = 0.0
	968	try:
	969	ffhet = float(fhet)
	970	except:
	971	ffhet = 0.0
	972	fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
	973	ntokeep = max(20,len(res)/keepfrac)
	974	for i,col in enumerate(Tsorts):
	975	fres.sort(key=operator.itemgetter(col))
	976	if Treverse[i]:
	977	fres.reverse()
	978	repname = Tnames[i]
	979	Tops[repname] = fres[0:ntokeep]
	980	Tops[repname] = [map(str,x) for x in Tops[repname]]
	981	Tops[repname].insert(0,rhead)
	982	res.sort()
	983	res.insert(0,rhead)
	984	logf.write('### writing %s report with %s' % ( outfile,res[0]))
	985	f = open(outfile,'w')
	986	f.write('\n'.join(['\t'.join(x) for x in res]))
	987	f.write('\n')
	988	f.close()
	989	return res,Tops
	990
	991	def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
	992	"""by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
	993	keep a list of marker order but keep all stats in dicts
	994	write out a fake xls file for R or SAS etc
	995	kinda clunky, but..
	996	TODO: ensure stable if any file not found?
	997	"""
	998	mapdict = {}
	999	if maplist <> None:
	1000	rslist = [x[1] for x in maplist]
	1001	offset = [(x[0],x[3]) for x in maplist]
	1002	mapdict = dict(zip(rslist,offset))
	1003	hwefile = '%s.hwe' % froot
	1004	lmissfile = '%s.lmiss' % froot
	1005	freqfile = '%s.frq' % froot
	1006	lmendfile = '%s.lmendel' % froot
	1007	outfile = os.path.join(newfpath,outfname)
	1008	markerlist = []
	1009	chromlist = []
	1010	hwedict = {}
	1011	lmissdict = {}
	1012	freqdict = {}
	1013	lmenddict = {}
	1014	Tops = {}
	1015	Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
	1016	Tsorts = [3,6,10,11]
	1017	Treverse = [False,True,True,True] # so first values are worse(r)
	1018	#res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
	1019	#rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
	1020	# -------------------hwe--------------------------
	1021	# hwe has SNP TEST GENO O(HET) E(HET) P_HWD
	1022	# we want all hwe where P_HWD <> NA
	1023	# ah changed in 1.04 to
	1024	## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P
	1025	## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644
	1026	## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA
	1027	## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644
	1028	## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975
	1029	## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA
	1030	## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975
	1031	## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067
	1032	## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA
	1033	## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067
	1034	logf.write('## marker reports starting at %s\n' % timenow())
	1035	try:
	1036	f = open(hwefile,'r')
	1037	except:
	1038	f = None
	1039	logf.write('## error - no hwefile %s found\n' % hwefile)
	1040	if f:
	1041	for i,line in enumerate(f):
	1042	ll = line.strip().split()
	1043	if i == 0: # head
	1044	head = [x.upper() for x in ll] # expect above
	1045	try:
	1046	testpos = head.index('TEST')
	1047	except:
	1048	testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST')
	1049	try:
	1050	ppos = head.index('P')
	1051	except:
	1052	ppos = 8 # patch - for head.index('P')
	1053	snppos = head.index('SNP')
	1054	chrpos = head.index('CHR')
	1055	logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos))
	1056	elif len(ll) >= ppos: # full line
	1057	ps = ll[ppos].upper()
	1058	rs = ll[snppos]
	1059	chrom = ll[chrpos]
	1060	test = ll[testpos]
	1061	if not hwedict.get(rs,None):
	1062	hwedict[rs] = {}
	1063	markerlist.append(rs)
	1064	chromlist.append(chrom) # one place to find it?
	1065	lpvals = 0
	1066	if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping
	1067	lpvals = '0'
	1068	if ps <> '1':
	1069	try:
	1070	pval = float(ps)
	1071	lpvals = '%f' % -math.log10(pval)
	1072	except:
	1073	pass
	1074	hwedict[rs][test] = (ps,lpvals)
	1075	else:
	1076	logf.write('short line #%d = %s\n' % (i,ll))
	1077	f.close()
	1078	# ------------------missing--------------------
	1079	"""lmiss has
	1080	CHR SNP N_MISS N_GENO F_MISS
	1081	1 rs12354060 0 73 0
	1082	1 rs4345758 1 73 0.0137
	1083	1 rs2691310 73 73 1
	1084	1 rs2531266 73 73 1
	1085	# we want F_MISS"""
	1086	try:
	1087	f = open(lmissfile,'r')
	1088	except:
	1089	f = None
	1090	if f:
	1091	for i,line in enumerate(f):
	1092	ll = line.strip().split()
	1093	if i == 0:
	1094	head = [x.upper() for x in ll] # expect above
	1095	fracpos = head.index('F_MISS')
	1096	npos = head.index('N_MISS')
	1097	snppos = head.index('SNP')
	1098	elif len(ll) >= fracpos: # full line
	1099	rs = ll[snppos]
	1100	fracval = ll[fracpos]
	1101	lmissdict[rs] = fracval # for now, just that?
	1102	else:
	1103	lmissdict[rs] = 'NA'
	1104	f.close()
	1105	# ------------------freq-------------------
	1106	# frq has CHR SNP A1 A2 MAF NM
	1107	# we want maf
	1108	try:
	1109	f = open(freqfile,'r')
	1110	except:
	1111	f = None
	1112	if f:
	1113	for i,line in enumerate(f):
	1114	ll = line.strip().split()
	1115	if i == 0:
	1116	head = [x.upper() for x in ll] # expect above
	1117	mafpos = head.index('MAF')
	1118	a1pos = head.index('A1')
	1119	a2pos = head.index('A2')
	1120	snppos = head.index('SNP')
	1121	elif len(ll) >= mafpos: # full line
	1122	rs = ll[snppos]
	1123	a1 = ll[a1pos]
	1124	a2 = ll[a2pos]
	1125	maf = ll[mafpos]
	1126	freqdict[rs] = (maf,a1,a2)
	1127	f.close()
	1128	# ------------------mend-------------------
	1129	# lmend has CHR SNP N
	1130	# we want N
	1131	gotmend = True
	1132	try:
	1133	f = open(lmendfile,'r')
	1134	except:
	1135	gotmend = False
	1136	for rs in markerlist:
	1137	lmenddict[rs] = '0'
	1138	if gotmend:
	1139	for i,line in enumerate(f):
	1140	ll = line.strip().split()
	1141	if i == 0:
	1142	head = [x.upper() for x in ll] # expect above
	1143	npos = head.index('N')
	1144	snppos = head.index('SNP')
	1145	elif len(ll) >= npos: # full line
	1146	rs = ll[snppos]
	1147	nmend = ll[npos]
	1148	lmenddict[rs] = nmend
	1149	f.close()
	1150	else:
	1151	logf.write('No %s file - assuming not family data\n' % lmendfile)
	1152	# now assemble result list
	1153	rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
	1154	res = []
	1155	fres = []
	1156	for rs in markerlist: # for each snp in found order
	1157	f_missing = lmissdict.get(rs,'NA')
	1158	maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
	1159	hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
	1160	hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
	1161	nmend = lmenddict.get(rs,'NA')
	1162	(chrom,offset)=mapdict.get(rs,('?','0'))
	1163	res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
	1164	ntokeep = max(10,len(res)/keepfrac)
	1165
	1166	def msortk(item=None):
	1167	"""
	1168	deal with non numeric sorting
	1169	"""
	1170	try:
	1171	return float(item)
	1172	except:
	1173	return item
	1174
	1175	for i,col in enumerate(Tsorts):
	1176	res.sort(key=msortk(lambda x:x[col]))
	1177	if Treverse[i]:
	1178	res.reverse()
	1179	repname = Tnames[i]
	1180	Tops[repname] = res[0:ntokeep]
	1181	Tops[repname].insert(0,rhead)
	1182	res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order
	1183	res.insert(0,rhead)
	1184	f = open(outfile,'w')
	1185	f.write('\n'.join(['\t'.join(x) for x in res]))
	1186	f.close()
	1187	return res,Tops
	1188
	1189
	1190
	1191
	1192	def getfSize(fpath,outpath):
	1193	"""
	1194	format a nice file size string
	1195	"""
	1196	size = ''
	1197	fp = os.path.join(outpath,fpath)
	1198	if os.path.isfile(fp):
	1199	n = float(os.path.getsize(fp))
	1200	if n > 2**20:
	1201	size = ' (%1.1f MB)' % (n/2**20)
	1202	elif n > 2**10:
	1203	size = ' (%1.1f KB)' % (n/2**10)
	1204	elif n > 0:
	1205	size = ' (%d B)' % (int(n))
	1206	return size
	1207
	1208
	1209	if __name__ == "__main__":
	1210	u = """ called in xml as
	1211	<command interpreter="python">
	1212	rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
	1213	-s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
	1214	-r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
	1215	</command>
	1216
	1217	Prepare a qc report - eg:
	1218	print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)
	1219
	1220	"""
	1221	progname = os.path.basename(sys.argv[0])
	1222	if len(sys.argv) < 9:
	1223	print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv)
	1224	sys.exit(1)
	1225	parser = OptionParser(usage=u, version="%prog 0.01")
	1226	a = parser.add_option
	1227	a("-i","--infile",dest="infile")
	1228	a("-o","--oprefix",dest="opref")
	1229	a("-l","--plinkexe",dest="plinke", default=plinke)
	1230	a("-r","--rexe",dest="rexe", default=rexe)
	1231	a("-s","--snps",dest="htmlf")
	1232	#a("-m","--markerRaw",dest="markf")
	1233	#a("-r","--rawsubject",dest="subjf")
	1234	a("-p","--patho",dest="newfpath")
	1235	(options,args) = parser.parse_args()
	1236	basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
	1237	killme = string.punctuation + string.whitespace
	1238	trantab = string.maketrans(killme,'_'*len(killme))
	1239	opref = options.opref.translate(trantab)
	1240	alogh,alog = tempfile.mkstemp(suffix='.txt')
	1241	plogh,plog = tempfile.mkstemp(suffix='.txt')
	1242	alogf = open(alog,'w')
	1243	plogf = open(plog,'w')
	1244	ahtmlf = options.htmlf
	1245	amarkf = 'MarkerDetails_%s.xls' % opref
	1246	asubjf = 'SubjectDetails_%s.xls' % opref
	1247	newfpath = options.newfpath
	1248	newfpath = os.path.realpath(newfpath)
	1249	try:
	1250	os.makedirs(newfpath)
	1251	except:
	1252	pass
	1253	ofn = basename
	1254	bfn = options.infile
	1255	try:
	1256	mapf = '%s.bim' % bfn
	1257	maplist = file(mapf,'r').readlines()
	1258	maplist = [x.split() for x in maplist]
	1259	except:
	1260	maplist = None
	1261	alogf.write('## error - cannot open %s to read map - no offsets will be available for output files')
	1262	#rerla@beast galaxy]$ head test-data/tinywga.bim
	1263	#22 rs2283802 0 21784722 4 2
	1264	#22 rs2267000 0 21785366 4 2
	1265	rgbin = os.path.split(rexe)[0] # get our rg bin path
	1266	#plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
	1267	# if we could, do all at once? Nope. Probably never.
	1268	plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'],
	1269	['--mendel',],['--check-sex',]]
	1270	vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
	1271	runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
	1272	plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
	1273	['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
	1274	['--bfile ldp_%s --het --out %s' % (basename,basename)]]
	1275	# subset of ld independent markers for eigenstrat and other requirements
	1276	vclbase = [options.plinke,'--noweb']
	1277	plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
	1278	plogf.write('\n'.join(plogout))
	1279	plogf.write('\n')
	1280	repout = os.path.join(newfpath,basename)
	1281	subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
	1282	logf=alogf) # writes the subject_froot.xls file
	1283	markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
	1284	logf=alogf,maplist=maplist) # marker_froot.xls
	1285	nbreaks = 100
	1286	s = '## starting plotpage, newfpath=%s,m=%s,s=%s/n' % (newfpath,markers[:2],subjects[:2])
	1287	alogf.write(s)
	1288	print s
	1289	plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
	1290	basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
	1291	#plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)
	1292
	1293	# [titles[n],plotnames[n],outfnames[n] ]
	1294	html = []
	1295	html.append('<table cellpadding="5" border="0">')
	1296	size = getfSize(amarkf,newfpath)
	1297	html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
	1298	(amarkf,'Click here to download the Marker QC Detail report file',size))
	1299	size = getfSize(asubjf,newfpath)
	1300	html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
	1301	(asubjf,'Click here to download the Subject QC Detail report file',size))
	1302	for (title,url,ofname) in plotpage:
	1303	ttitle = 'Ranked %s' % title
	1304	dat = subjectTops.get(ttitle,None)
	1305	if not dat:
	1306	dat = markerTops.get(ttitle,None)
	1307	imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
	1308	thumbnail = os.path.join(newfpath,imghref)
	1309	if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
	1310	imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
	1311	thumbnail = os.path.join(newfpath,imghref)
	1312	if not os.path.exists(thumbnail):
	1313	html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title))
	1314	else:
	1315	html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \
	1316	% (url,imghref,title))
	1317	if dat: # one or the other - write as an extra file and make a link here
	1318	t = '%s.xls' % (ttitle.replace(' ','_'))
	1319	fname = os.path.join(newfpath,t)
	1320	f = file(fname,'w')
	1321	f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
	1322	size = getfSize(t,newfpath)
	1323	html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size))
	1324	else:
	1325	html.append('</a></td><td>%s</td><td> </td></tr>' % (title))
	1326	html.append('</table><hr><h3>All output files from the QC run are available below</h3>')
	1327	html.append('<table cellpadding="5" border="0">\n')
	1328	flist = os.listdir(newfpath) # we want to catch 'em all
	1329	flist.sort()
	1330	for f in flist:
	1331	fname = os.path.split(f)[-1]
	1332	size = getfSize(fname,newfpath)
	1333	html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size))
	1334	html.append('</table>')
	1335	alogf.close()
	1336	plogf.close()
	1337	llog = open(alog,'r').readlines()
	1338	plogfile = open(plog,'r').readlines()
	1339	os.unlink(alog)
	1340	os.unlink(plog)
	1341	llog += plogfile # add lines from pruneld log
	1342	lf = file(ahtmlf,'w') # galaxy will show this as the default view
	1343	lf.write(galhtmlprefix % progname)
	1344	s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow())
	1345	lf.write('<h4>%s</h4>\n' % s)
	1346	lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n')
	1347	lf.write('\n'.join(html))
	1348	lf.write('<h4>QC run log contents</h4>')
	1349	lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs
	1350	if len(cruft) > 0:
	1351	lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup
	1352	lf.write('%s\n<hr>\n' % galhtmlpostfix)
	1353	lf.close()
	1354

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/rgQC.py @ 2

異なるフォーマットでダウンロード: