# oct 15 rpy replaced - temp fix until we get gnuplot working
# rpy deprecated - replace with RRun
# fixes to run functional test! oct1 2009
# needed to expand our path with os.path.realpath to get newpath working with
# all the fancy pdfnup stuff
# and a fix to pruneld to write output to where it should be
# smallish data in test-data/smallwga in various forms
# python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
# child files are deprecated and broken as at july 19 2009
# need to move them to the html file extrafiles path
# found lots of corner cases with some illumina data where cnv markers were
# included
# and where affection status was all missing !
# added links to tab files showing worst 1/keepfrac markers and subjects
# ross lazarus january 2008
#
# added named parameters
# to ensure no silly slippages if non required parameter in the most general case
# some potentially useful things here reusable in complex scripts
# with lots'o'html (TM)
# aug 17 2007 rml
#
# added marker and subject and parenting april 14 rml
# took a while to get the absolute paths right for all the file munging
# as of april 16 seems to work..
# getting galaxy to serve images in html reports is a little tricky
# we don't want QC reports to be dozens of individual files, so need
# to use the url /static/rg/... since galaxy's web server will happily serve images
# from there
# galaxy passes output files as relative paths
# these have to be munged by rgQC.py before calling this
# galaxy will pass in 2 file names - one for the log
# and one for the final html report
# of the form './database/files/dataset_66.dat'
# we need to be working in that directory so our plink output files are there
# so these have to be munged by rgQC.py before calling this
# note no ped file passed so had to remove the -l option
# for plinkParse.py that makes a heterozygosity report from the ped
# file - needs fixing...
# new: importing manhattan/qqplot plotter
# def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
# """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
# contains some R scripts as text strings - we substitute defaults into the calls
# to make them do our bidding - and save the resulting code for posterity
# this can be called externally, I guess...for QC eg?
# """
#
# rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
# rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
# return rlog,flist
from optparse import OptionParser
import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
from os.path import abspath
from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
import rgManQQ
prog = os.path.split(sys.argv[0])[1]
vers = '0.4 april 2009 rml'
idjoiner = '_~_~_' # need something improbable..
# many of these may need fixing for a new install
myversion = vers
keepfrac = 20 # fraction to keep after sorting by each interesting value
missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
mogresize = "x300" # this controls the width for jpeg thumbnails
def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
"""
marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
"""
def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
""" rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
# generic histogram and vertical boxplot in a 3:1 layout
# returns the graphic file name for inclusion in the web page
# broken out here for reuse
# ross lazarus march 2007
"""
screenmat = (1,2,1,2) # create a 2x2 cabvas
widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
rpy.r.pdf( outfname, height , width )
#rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
# in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
maint = 'QC for %s - %s' % (basename,title)
rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
rpy.r.dev_off()
def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
"""
Useful to see what various cutoffs yield - plot percentiles
"""
n = len(plotme)
maxveclen = 1000.0 # for reasonable pdf sizes!
yvec = copy.copy(plotme)
# arrives already in decending order of importance missingness or mendel count by subj or marker
xvec = range(n)
xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
if n > maxveclen: # oversample part of the distribution
always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
yvec = [yvec[i] for i in samplei] # always get first and last
xvec = [xvec[i] for i in samplei] # always get first and last
# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
rpy.r.pdf( outfname, height , width )
maint = 'QC for %s - %s' % (basename,title)
rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
rpy.r.dev_off()
def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
"""
y is data for a qq plot and ends up on the x axis go figure
if sampling, oversample low values - all the top 1% ?
this version called with -log10 transformed hwe
"""
nrows = len(plotme)
fn = float(nrows)
xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
maxveclen = 3000
yvec = copy.copy(plotme)
if nrows > maxveclen:
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
# oversample part of the distribution
always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
# always oversample first sorted (here lowest) values
yvec = [yvec[i] for i in samplei] # always get first and last
xvec = [xvec[i] for i in samplei] # and sample xvec same way
maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
else:
maint='Log QQ Plot(n=%d)' % (nrows)
mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
ylab = '%s' % xlabname
xlab = '-log10(Uniform 0-1)'
# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
rpy.r.pdf( outfname, height , width )
rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
rpy.r.points(mx,mx,type='l')
rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
rpy.r.dev_off()
def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
"""
data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
grid of qq plots to show departure from null at extremes of data quality
Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
and ends up on the x axis go figure
if sampling, oversample low values - all the top 1% ?
"""
data = copy.copy(plotme)
nvals = len(data)
stepsize = nvals/nsplits
logstep = math.log10(stepsize) # so is 3 for steps of 1000
quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
data.sort(key=itertools.itemgetter(1)) # into x order
rpy.r.pdf( outfname, height , width )
rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
yvec = [-math.log10(random.random()) for x in range(stepsize)]
yvec.sort() # size of each step is expected range for xvec under null?!
for rowstart in quints:
rowend = rowstart + stepsize
if nvals - rowend < stepsize: # finish last split
rowend = nvals
row = data[rowstart:rowend]
row.sort(key=itertools.itemgetter(2)) # into y order
for colstart in quints:
colend = colstart + stepsize
if nvals - colend < stepsize: # finish last split
colend = nvals
cell = row[colstart:colend]
xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
rpy.r.points(c(0,logstep),c(0,logstep),type='l')
rpy.r.dev_off()
def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
"""
y is data for a qqnorm plot
if sampling, oversample low values - all the top 1% ?
"""
rangeunif = len(plotme)
nunif = 1000
maxveclen = 3000
nrows = len(plotme)
data = copy.copy(plotme)
if nrows > maxveclen:
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
# oversample part of the distribution
always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
# always oversample first sorted (here lowest) values
yvec = [data[i] for i in samplei] # always get first and last
maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
else:
yvec = data
maint='Log QQ Plot(n=%d)' % (nrows)
n = 1000
ylab = '%s' % xlabname
xlab = 'Normal'
# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
rpy.r.pdf( outfname, height , width )
rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
rpy.r.dev_off()
def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
"""
layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
like the GAIN qc tools
y is data for a qq plot and ends up on the x axis go figure
if sampling, oversample low values - all the top 1% ?
"""
rangeunif = len(plotme)
nunif = 1000
fn = float(rangeunif)
xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
skip = max(int(rangeunif/fn),1)
# force include last points
mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
maxveclen = 2000
nrows = len(plotme)
data = copy.copy(plotme)
data.sort() # low to high - oversample low values
if nrows > maxveclen:
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
# oversample part of the distribution
always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
# always oversample first sorted (here lowest) values
yvec = [data[i] for i in samplei] # always get first and last
xvec = [xvec[i] for i in samplei] # and sample xvec same way
maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
else:
yvec = data
maint='Log QQ Plot(n=%d)' % (nrows)
n = 1000
mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
ylab = '%s' % xlabname
xlab = '-log10(Uniform 0-1)'
# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
rpy.r.pdf( outfname, height , width )
rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
rpy.r.points(mx,mx,type='l')
rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
rpy.r.dev_off()
fdsto,stofile = tempfile.mkstemp()
sto = open(stofile,'w')
import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
mog = 'mogrify'
pdfnup = 'pdfnup'
pdfjoin = 'pdfjoin'
shead = subjects.pop(0) # get rid of head
mhead = markers.pop(0)
maf = mhead.index('maf')
missfrac = mhead.index('missfrac')
logphweall = mhead.index('logp_hwe_all')
logphweunaff = mhead.index('logp_hwe_unaff')
# check for at least some unaffected rml june 2009
m_mendel = mhead.index('N_Mendel')
fracmiss = shead.index('FracMiss')
s_mendel = shead.index('Mendel_errors')
s_het = shead.index('F_Stat')
params = {}
hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
and x[logphweunaff].upper() <> 'NA']
if len(hweres) <> 0:
xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
# plot for each of these cols
else: # try hwe all instead - maybe no affection status available
xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
qqplotme = [1,0,0,0,0,0,0] #
qnplotme = [0,0,0,0,0,0,1] #
nplots = len(xs)
xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
'Marker Mendel Error Count', 'Missing Rate: Subjects',
'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
ordplotnames = ['%s_cum' % x for x in plotnames]
ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
"Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
html = []
pdflist = []
for n,column in enumerate(xs):
dat = [float(x[column]) for x in datasources[n] if len(x) >= column
and x[column][:2].upper() <> 'NA'] # plink gives both!
if sum(dat) <> 0: # eg nada for mendel if case control?
rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
title=titles[n],basename=basename,nbreaks=nbreaks)
row = [titles[n],ploturls[n],outfnames[n] ]
html.append(row)
pdflist.append(outfnames[n])
if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
otitle = 'Ranked %s' % titles[n]
dat.sort()
if oreverseme[n]:
dat.reverse()
rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
title=otitle,basename=basename,nbreaks=1000)
row = [otitle,ordploturls[n],ordoutfnames[n]]
html.append(row)
pdflist.append(ordoutfnames[n])
if qqplotme[n]: #
otitle = 'LogQQ plot %s' % titles[n]
dat.sort()
dat.reverse()
ofn = os.path.split(ordoutfnames[n])
ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
ofu = os.path.split(ordploturls[n])
ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
title=otitle,basename=basename)
row = [otitle,ofu,ofn]
html.append(row)
pdflist.append(ofn)
elif qnplotme[n]:
otitle = 'F Statistic %s' % titles[n]
dat.sort()
dat.reverse()
ofn = os.path.split(ordoutfnames[n])
ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
ofu = os.path.split(ordploturls[n])
ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
title=otitle,basename=basename)
row = [otitle,ofu,ofn]
html.append(row)
pdflist.append(ofn)
else:
print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
if nup>0:
# pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
# pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
filestojoin = ' '.join(pdflist) # all the file names so far
afname = '%s_All_Paged.pdf' % (basename)
fullafname = os.path.join(newfpath,afname)
expl = 'All %s QC Plots joined into a single pdf' % basename
vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
# make single page pdf
x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
retval = x.wait()
row = [expl,afname,fullafname]
html.insert(0,row) # last rather than second
nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
fullnfname = os.path.join(newfpath,nfname)
expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
# make thumbnail images
x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
retval = x.wait()
row = [expl,nfname,fullnfname]
html.insert(1,row) # this goes second
vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
# make thumbnail images
x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
retval = x.wait()
sto.close()
cruft = open(stofile,'r').readlines()
return html,cruft # elements for an ordered list of urls or whatever..
def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
"""
nice try but the R scripts are huge and take forever to run if there's a lot of data
marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
"""
colour = "maroon"
def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
""" rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
# generic histogram and vertical boxplot in a 3:1 layout
# returns the graphic file name for inclusion in the web page
# broken out here for reuse
# ross lazarus march 2007
"""
R = []
maint = 'QC for %s - %s' % (basename,title)
screenmat = (1,2,1,2) # create a 2x2 canvas
widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
R.append('dev.off()')
return R
def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
"""
Useful to see what various cutoffs yield - plot percentiles
"""
R = []
n = len(plotme)
R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
# arrives already in decending order of importance missingness or mendel count by subj or marker
maint = 'QC for %s - %s' % (basename,title)
R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
R.append("par(lab=c(10,10,10))")
R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
R.append('dev.off()')
return R
def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
"""
y is data for a qq plot and ends up on the x axis go figure
if sampling, oversample low values - all the top 1% ?
this version called with -log10 transformed hwe
"""
R = []
nrows = len(plotme)
fn = float(nrows)
xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
#mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
maxveclen = 3000
yvec = copy.copy(plotme)
if nrows > maxveclen:
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
# oversample part of the distribution
always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
if skip < 2:
skip = 2
samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
# always oversample first sorted (here lowest) values
yvec = [yvec[i] for i in samplei] # always get first and last
xvec = [xvec[i] for i in samplei] # and sample xvec same way
maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
else:
maint='Log QQ Plot(n=%d)' % (nrows)
mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
ylab = '%s' % xlabname
xlab = '-log10(Uniform 0-1)'
# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
x = ['%f' % x for x in xvec]
R.append('xvec = c(%s)' % ','.join(x))
y = ['%f' % x for x in yvec]
R.append('yvec = c(%s)' % ','.join(y))
R.append('mx = c(0,%f)' % (math.log10(fn)))
R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
R.append("par(lab=c(10,10,10))")
R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
R.append('points(mx,mx,type="l")')
R.append('grid(col="lightgray",lty="dotted")')
R.append('dev.off()')
return R
def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
"""
data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
grid of qq plots to show departure from null at extremes of data quality
Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
and ends up on the x axis go figure
if sampling, oversample low values - all the top 1% ?
"""
data = copy.copy(plotme)
nvals = len(data)
stepsize = nvals/nsplits
logstep = math.log10(stepsize) # so is 3 for steps of 1000
R.append('mx = c(0,%f)' % logstep)
quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
data.sort(key=itertools.itemgetter(1)) # into x order
#rpy.r.pdf( outfname, h , w )
#rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
yvec = [-math.log10(random.random()) for x in range(stepsize)]
yvec.sort() # size of each step is expected range for xvec under null?!
y = ['%f' % x for x in yvec]
R.append('yvec = c(%s)' % ','.join(y))
for rowstart in quints:
rowend = rowstart + stepsize
if nvals - rowend < stepsize: # finish last split
rowend = nvals
row = data[rowstart:rowend]
row.sort(key=itertools.itemgetter(2)) # into y order
for colstart in quints:
colend = colstart + stepsize
if nvals - colend < stepsize: # finish last split
colend = nvals
cell = row[colstart:colend]
xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
x = ['%f' % x for x in xvec]
R.append('xvec = c(%s)' % ','.join(x))
R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
R.append('points(mx,mx,type="l")')
R.append('grid(col="lightgray",lty="dotted")')
#rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
#rpy.r.points(c(0,logstep),c(0,logstep),type='l')
R.append('dev.off()')
#rpy.r.dev_off()
return R
def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
"""
y is data for a qqnorm plot
if sampling, oversample low values - all the top 1% ?
"""
rangeunif = len(plotme)
nunif = 1000
maxveclen = 3000
nrows = len(plotme)
data = copy.copy(plotme)
if nrows > maxveclen:
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
# oversample part of the distribution
always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
# always oversample first sorted (here lowest) values
yvec = [data[i] for i in samplei] # always get first and last
maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
else:
yvec = data
maint='Log QQ Plot(n=%d)' % (nrows)
n = 1000
ylab = '%s' % xlabname
xlab = 'Normal'
# need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
#rpy.r.pdf( outfname, h , w )
#rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
#rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
#rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
#rpy.r.dev_off()
y = ['%f' % x for x in yvec]
R.append('yvec = c(%s)' % ','.join(y))
R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
R.append("par(lab=c(10,10,10))")
R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
R.append('grid(col="lightgray",lty="dotted")')
R.append('dev.off()')
return R
def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
"""
layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
like the GAIN qc tools
y is data for a qq plot and ends up on the x axis go figure
if sampling, oversample low values - all the top 1% ?
"""
rangeunif = len(plotme)
nunif = 1000
fn = float(rangeunif)
xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
skip = max(int(rangeunif/fn),1)
# force include last points
mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
maxveclen = 2000
nrows = len(plotme)
data = copy.copy(plotme)
data.sort() # low to high - oversample low values
if nrows > maxveclen:
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
# oversample part of the distribution
always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
# always oversample first sorted (here lowest) values
yvec = [data[i] for i in samplei] # always get first and last
xvec = [xvec[i] for i in samplei] # and sample xvec same way
maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
else:
yvec = data
maint='Log QQ Plot(n=%d)' % (nrows)
n = 1000
mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
ylab = '%s' % xlabname
xlab = '-log10(Uniform 0-1)'
R.append('mx = c(0,%f)' % (math.log10(fn)))
x = ['%f' % x for x in xvec]
R.append('xvec = c(%s)' % ','.join(x))
y = ['%f' % x for x in yvec]
R.append('yvec = c(%s)' % ','.join(y))
R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
R.append("par(lab=c(10,10,10))")
R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
R.append('points(mx,mx,type="l")')
R.append('grid(col="lightgray",lty="dotted")')
R.append('dev.off()')
shead = subjects.pop(0) # get rid of head
mhead = markers.pop(0)
maf = mhead.index('maf')
missfrac = mhead.index('missfrac')
logphweall = mhead.index('logp_hwe_all')
logphweunaff = mhead.index('logp_hwe_unaff')
# check for at least some unaffected rml june 2009
m_mendel = mhead.index('N_Mendel')
fracmiss = shead.index('FracMiss')
s_mendel = shead.index('Mendel_errors')
s_het = shead.index('F_Stat')
params = {}
h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
and x[logphweunaff].upper() <> 'NA']
if len(h) <> 0:
xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
# plot for each of these cols
else: # try hwe all instead - maybe no affection status available
xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
qqplotme = [1,0,0,0,0,0,0] #
qnplotme = [0,0,0,0,0,0,1] #
nplots = len(xs)
xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
'Marker Mendel Error Count', 'Missing Rate: Subjects',
'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
ordplotnames = ['%s_cum' % x for x in plotnames]
ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
"Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
html = []
pdflist = []
R = []
for n,column in enumerate(xs):
dfn = '%d_%s.txt' % (n,titles[n])
dfilepath = os.path.join(newfpath,dfn)
dat = [float(x[column]) for x in datasources[n] if len(x) >= column
and x[column][:2].upper() <> 'NA'] # plink gives both!
if sum(dat) <> 0: # eg nada for mendel if case control?
plotme = file(dfilepath,'w')
plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
title=titles[n],basename=basename,nbreaks=nbreaks)
R += tR
row = [titles[n],ploturls[n],outfnames[n] ]
html.append(row)
pdflist.append(outfnames[n])
if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
otitle = 'Ranked %s' % titles[n]
dat.sort()
if oreverseme[n]:
dat.reverse()
ndat = len(dat)
xvec = range(ndat)
xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
# now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
maxveclen = 1000.0 # for reasonable pdf sizes!
if ndat > maxveclen: # oversample part of the distribution
always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
yvec = [yvec[i] for i in samplei] # always get first and last
xvec = [xvec[i] for i in samplei] # always get first and last
plotme = file(dfilepath,'w')
plotme.write('xvec\tyvec\n')
plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
title=otitle,basename=basename,nbreaks=nbreaks)
R += tR
row = [otitle,ordploturls[n],ordoutfnames[n]]
html.append(row)
pdflist.append(ordoutfnames[n])
if qqplotme[n]: #
otitle = 'LogQQ plot %s' % titles[n]
dat.sort()
dat.reverse()
ofn = os.path.split(ordoutfnames[n])
ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
ofu = os.path.split(ordploturls[n])
ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
title=otitle,basename=basename)
R += tR
row = [otitle,ofu,ofn]
html.append(row)
pdflist.append(ofn)
elif qnplotme[n]:
otitle = 'F Statistic %s' % titles[n]
dat.sort()
dat.reverse()
ofn = os.path.split(ordoutfnames[n])
ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
ofu = os.path.split(ordploturls[n])
ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
title=otitle,basename=basename)
R += tR
row = [otitle,ofu,ofn]
html.append(row)
pdflist.append(ofn)
else:
print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
if nup>0:
# pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
# pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
filestojoin = ' '.join(pdflist) # all the file names so far
afname = '%s_All_Paged.pdf' % (basename)
fullafname = os.path.join(newfpath,afname)
expl = 'All %s QC Plots joined into a single pdf' % basename
vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
# make single page pdf
x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
retval = x.wait()
row = [expl,afname,fullafname]
html.insert(0,row) # last rather than second
nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
fullnfname = os.path.join(newfpath,nfname)
expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
# make thumbnail images
x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
retval = x.wait()
row = [expl,nfname,fullnfname]
html.insert(1,row) # this goes second
vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
# make thumbnail images
x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
retval = x.wait()
return html # elements for an ordered list of urls or whatever..
def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
"""
NO LONGER USED - historical interest
count het loci for each subject to look for outliers = ? contamination
assume ped file is linkage format
need to make a ped file from the bed file we were passed
"""
vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
p=subprocess.Popen(' '.join(vcl),shell=True)
retval = p.wait()
f = open('%s_recode.recode.ped' % froot,'r')
if not linkageped:
head = f.next() # throw away header
hets = [] # simple count of het loci per subject. Expect poisson?
n = 1
for l in f:
n += 1
ll = l.strip().split()
if len(ll) > 6:
iid = idjoiner.join(ll[:2]) # fam_iid
gender = ll[4]
alleles = ll[6:]
nallele = len(alleles)
nhet = 0
for i in range(nallele/2):
a1=alleles[2*i]
a2=alleles[2*i+1]
if alleles[2*i] <> alleles[2*i+1]: # must be het
if not missvals.get(a1,None) and not missvals.get(a2,None):
nhet += 1
hets.append((nhet,iid,gender)) # for sorting later
f.close()
hets.sort()
hets.reverse() # biggest nhet now on top
f = open(outfname ,'w')
res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
res.insert(0,'nhetloci\tfamid_iid\tgender')
res.append('')
f.write('\n'.join(res))
f.close()
def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
"""by subject (missingness = .imiss, mendel = .imendel)
assume replicates have an underscore in family id for
hapmap testing
For sorting, we need floats and integers
"""
isexfile = '%s.sexcheck' % froot
imissfile = '%s.imiss' % froot
imendfile = '%s.imendel' % froot
ihetfile = '%s.het' % froot
logf.write('## subject reports starting at %s\n' % timenow())
outfile = os.path.join(newfpath,outfname)
idlist = []
imissdict = {}
imenddict = {}
isexdict = {}
ihetdict = {}
Tops = {}
Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
Tsorts = [2,3,6,8]
Treverse = [True,True,True,False] # so first values are worser
#rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
## FID IID MISS_PHENO N_MISS N_GENO F_MISS
## 1552042370_A 1552042370_A N 5480 549883 0.009966
## 1552042410_A 1552042410_A N 1638 549883 0.002979
# ------------------missing--------------------
# imiss has FID IID MISS_PHENO N_MISS F_MISS
# we want F_MISS
try:
f = open(imissfile,'r')
except:
logf.write('# file %s is missing - talk about irony\n' % imissfile)
f = None
if f:
for n,line in enumerate(f):
ll = line.strip().split()
if n == 0:
head = [x.upper() for x in ll] # expect above
fidpos = head.index('FID')
iidpos = head.index('IID')
fpos = head.index('F_MISS')
elif len(ll) >= fpos: # full line
fid = ll[fidpos]
#if fid.find('_') == -1: # not replicate! - icondb ids have these...
iid = ll[iidpos]
fmiss = ll[fpos]
id = '%s%s%s' % (fid,idjoiner,iid)
imissdict[id] = fmiss
idlist.append(id)
f.close()
logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
# ------------------mend-------------------
# *.imendel has FID IID N
# we want N
gotmend = True
try:
f = open(imendfile,'r')
except:
gotmend = False
for id in idlist:
imenddict[id] = '0'
if gotmend:
for n,line in enumerate(f):
ll = line.strip().split()
if n == 0:
head = [x.upper() for x in ll] # expect above
npos = head.index('N')
fidpos = head.index('FID')
iidpos = head.index('IID')
elif len(ll) >= npos: # full line
fid = ll[fidpos]
iid = ll[iidpos]
id = '%s%s%s' % (fid,idjoiner,iid)
nmend = ll[npos]
imenddict[id] = nmend
f.close()
else:
logf.write('## error No %s file - assuming not family data\n' % imendfile)
# ------------------sex check------------------
#[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
# sexcheck has FID IID PEDSEX SNPSEX STATUS F
##
## FID Family ID
## IID Individual ID
## PEDSEX Sex as determined in pedigree file (1=male, 2=female)
## SNPSEX Sex as determined by X chromosome
## STATUS Displays "PROBLEM" or "OK" for each individual
## F The actual X chromosome inbreeding (homozygosity) estimate
##
## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
## ambiguous with regard to sex.
## A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2.
try:
f = open(isexfile,'r')
got_sexcheck = 1
except:
got_sexcheck = 0
if got_sexcheck:
for n,line in enumerate(f):
ll = line.strip().split()
if n == 0:
head = [x.upper() for x in ll] # expect above
fidpos = head.index('FID')
iidpos = head.index('IID')
pedsexpos = head.index('PEDSEX')
snpsexpos = head.index('SNPSEX')
statuspos = head.index('STATUS')
fpos = head.index('F')
elif len(ll) >= fpos: # full line
fid = ll[fidpos]
iid = ll[iidpos]
fest = ll[fpos]
pedsex = ll[pedsexpos]
snpsex = ll[snpsexpos]
stat = ll[statuspos]
id = '%s%s%s' % (fid,idjoiner,iid)
isexdict[id] = (pedsex,snpsex,stat,fest)
f.close()
else:
# this only happens if there are no subjects!
logf.write('No %s file - assuming no sex errors' % isexfile)
##
## FID IID O(HOM) E(HOM) N(NM) F
## 457 2 490665 4.928e+05 722154 -0.009096
## 457 3 464519 4.85e+05 710986 -0.0908
## 1037 2 461632 4.856e+05 712025 -0.106
## 1037 1 491845 4.906e+05 719353 0.005577
try:
f = open(ihetfile,'r')
except:
f = None
logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
if f:
for i,line in enumerate(f):
ll = line.strip().split()
if i == 0:
head = [x.upper() for x in ll] # expect above
fidpos = head.index('FID')
iidpos = head.index('IID')
fpos = head.index('F')
n = 0
elif len(ll) >= fpos: # full line
fid = ll[fidpos]
iid = ll[iidpos]
fhet = ll[fpos]
id = '%s%s%s' % (fid,idjoiner,iid)
ihetdict[id] = fhet
f.close() # now assemble and output result list
rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
res = []
fres = [] # floats for sorting
for id in idlist: # for each snp in found order
fid,iid = id.split(idjoiner) # recover keys
f_missing = imissdict.get(id,'0.0')
nmend = imenddict.get(id,'0')
(pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
fhet = ihetdict.get(id,'0.0')
res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
try:
ff_missing = float(f_missing)
except:
ff_missing = 0.0
try:
inmend = int(nmend)
except:
inmend = 0
try:
ffest = float(fest)
except:
fest = 0.0
try:
ffhet = float(fhet)
except:
ffhet = 0.0
fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
ntokeep = max(20,len(res)/keepfrac)
for i,col in enumerate(Tsorts):
fres.sort(key=operator.itemgetter(col))
if Treverse[i]:
fres.reverse()
repname = Tnames[i]
Tops[repname] = fres[0:ntokeep]
Tops[repname] = [map(str,x) for x in Tops[repname]]
Tops[repname].insert(0,rhead)
res.sort()
res.insert(0,rhead)
logf.write('### writing %s report with %s' % ( outfile,res[0]))
f = open(outfile,'w')
f.write('\n'.join(['\t'.join(x) for x in res]))
f.write('\n')
f.close()
return res,Tops
def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
"""by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
keep a list of marker order but keep all stats in dicts
write out a fake xls file for R or SAS etc
kinda clunky, but..
TODO: ensure stable if any file not found?
"""
mapdict = {}
if maplist <> None:
rslist = [x[1] for x in maplist]
offset = [(x[0],x[3]) for x in maplist]
mapdict = dict(zip(rslist,offset))
hwefile = '%s.hwe' % froot
lmissfile = '%s.lmiss' % froot
freqfile = '%s.frq' % froot
lmendfile = '%s.lmendel' % froot
outfile = os.path.join(newfpath,outfname)
markerlist = []
chromlist = []
hwedict = {}
lmissdict = {}
freqdict = {}
lmenddict = {}
Tops = {}
Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
Tsorts = [3,6,10,11]
Treverse = [False,True,True,True] # so first values are worse(r)
#res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
#rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
# -------------------hwe--------------------------
# hwe has SNP TEST GENO O(HET) E(HET) P_HWD
# we want all hwe where P_HWD <> NA
# ah changed in 1.04 to
## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P
## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644
## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA
## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644
## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975
## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA
## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975
## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067
## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA
## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067
logf.write('## marker reports starting at %s\n' % timenow())
try:
f = open(hwefile,'r')
except:
f = None
logf.write('## error - no hwefile %s found\n' % hwefile)
if f:
for i,line in enumerate(f):
ll = line.strip().split()
if i == 0: # head
head = [x.upper() for x in ll] # expect above
try:
testpos = head.index('TEST')
except:
testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST')
try:
ppos = head.index('P')
except:
ppos = 8 # patch - for head.index('P')
snppos = head.index('SNP')
chrpos = head.index('CHR')
logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos))
elif len(ll) >= ppos: # full line
ps = ll[ppos].upper()
rs = ll[snppos]
chrom = ll[chrpos]
test = ll[testpos]
if not hwedict.get(rs,None):
hwedict[rs] = {}
markerlist.append(rs)
chromlist.append(chrom) # one place to find it?
lpvals = 0
if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping
lpvals = '0'
if ps <> '1':
try:
pval = float(ps)
lpvals = '%f' % -math.log10(pval)
except:
pass
hwedict[rs][test] = (ps,lpvals)
else:
logf.write('short line #%d = %s\n' % (i,ll))
f.close()
# ------------------missing--------------------
"""lmiss has
CHR SNP N_MISS N_GENO F_MISS
1 rs12354060 0 73 0
1 rs4345758 1 73 0.0137
1 rs2691310 73 73 1
1 rs2531266 73 73 1
# we want F_MISS"""
try:
f = open(lmissfile,'r')
except:
f = None
if f:
for i,line in enumerate(f):
ll = line.strip().split()
if i == 0:
head = [x.upper() for x in ll] # expect above
fracpos = head.index('F_MISS')
npos = head.index('N_MISS')
snppos = head.index('SNP')
elif len(ll) >= fracpos: # full line
rs = ll[snppos]
fracval = ll[fracpos]
lmissdict[rs] = fracval # for now, just that?
else:
lmissdict[rs] = 'NA'
f.close()
# ------------------freq-------------------
# frq has CHR SNP A1 A2 MAF NM
# we want maf
try:
f = open(freqfile,'r')
except:
f = None
if f:
for i,line in enumerate(f):
ll = line.strip().split()
if i == 0:
head = [x.upper() for x in ll] # expect above
mafpos = head.index('MAF')
a1pos = head.index('A1')
a2pos = head.index('A2')
snppos = head.index('SNP')
elif len(ll) >= mafpos: # full line
rs = ll[snppos]
a1 = ll[a1pos]
a2 = ll[a2pos]
maf = ll[mafpos]
freqdict[rs] = (maf,a1,a2)
f.close()
# ------------------mend-------------------
# lmend has CHR SNP N
# we want N
gotmend = True
try:
f = open(lmendfile,'r')
except:
gotmend = False
for rs in markerlist:
lmenddict[rs] = '0'
if gotmend:
for i,line in enumerate(f):
ll = line.strip().split()
if i == 0:
head = [x.upper() for x in ll] # expect above
npos = head.index('N')
snppos = head.index('SNP')
elif len(ll) >= npos: # full line
rs = ll[snppos]
nmend = ll[npos]
lmenddict[rs] = nmend
f.close()
else:
logf.write('No %s file - assuming not family data\n' % lmendfile)
# now assemble result list
rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
res = []
fres = []
for rs in markerlist: # for each snp in found order
f_missing = lmissdict.get(rs,'NA')
maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
nmend = lmenddict.get(rs,'NA')
(chrom,offset)=mapdict.get(rs,('?','0'))
res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
ntokeep = max(10,len(res)/keepfrac)
def msortk(item=None):
"""
deal with non numeric sorting
"""
try:
return float(item)
except:
return item
for i,col in enumerate(Tsorts):
res.sort(key=msortk(lambda x:x[col]))
if Treverse[i]:
res.reverse()
repname = Tnames[i]
Tops[repname] = res[0:ntokeep]
Tops[repname].insert(0,rhead)
res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order
res.insert(0,rhead)
f = open(outfile,'w')
f.write('\n'.join(['\t'.join(x) for x in res]))
f.close()
return res,Tops
def getfSize(fpath,outpath):
"""
format a nice file size string
"""
size = ''
fp = os.path.join(outpath,fpath)
if os.path.isfile(fp):
n = float(os.path.getsize(fp))
if n > 2**20:
size = ' (%1.1f MB)' % (n/2**20)
elif n > 2**10:
size = ' (%1.1f KB)' % (n/2**10)
elif n > 0:
size = ' (%d B)' % (int(n))
return size
if __name__ == "__main__":
u = """ called in xml as
rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
-s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
-r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
Prepare a qc report - eg:
print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)
"""
progname = os.path.basename(sys.argv[0])
if len(sys.argv) < 9:
print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv)
sys.exit(1)
parser = OptionParser(usage=u, version="%prog 0.01")
a = parser.add_option
a("-i","--infile",dest="infile")
a("-o","--oprefix",dest="opref")
a("-l","--plinkexe",dest="plinke", default=plinke)
a("-r","--rexe",dest="rexe", default=rexe)
a("-s","--snps",dest="htmlf")
#a("-m","--markerRaw",dest="markf")
#a("-r","--rawsubject",dest="subjf")
a("-p","--patho",dest="newfpath")
(options,args) = parser.parse_args()
basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
killme = string.punctuation + string.whitespace
trantab = string.maketrans(killme,'_'*len(killme))
opref = options.opref.translate(trantab)
alogh,alog = tempfile.mkstemp(suffix='.txt')
plogh,plog = tempfile.mkstemp(suffix='.txt')
alogf = open(alog,'w')
plogf = open(plog,'w')
ahtmlf = options.htmlf
amarkf = 'MarkerDetails_%s.xls' % opref
asubjf = 'SubjectDetails_%s.xls' % opref
newfpath = options.newfpath
newfpath = os.path.realpath(newfpath)
try:
os.makedirs(newfpath)
except:
pass
ofn = basename
bfn = options.infile
try:
mapf = '%s.bim' % bfn
maplist = file(mapf,'r').readlines()
maplist = [x.split() for x in maplist]
except:
maplist = None
alogf.write('## error - cannot open %s to read map - no offsets will be available for output files')
#rerla@beast galaxy]$ head test-data/tinywga.bim
#22 rs2283802 0 21784722 4 2
#22 rs2267000 0 21785366 4 2
rgbin = os.path.split(rexe)[0] # get our rg bin path
#plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
# if we could, do all at once? Nope. Probably never.
plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'],
['--mendel',],['--check-sex',]]
vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
['--bfile ldp_%s --het --out %s' % (basename,basename)]]
# subset of ld independent markers for eigenstrat and other requirements
vclbase = [options.plinke,'--noweb']
plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
plogf.write('\n'.join(plogout))
plogf.write('\n')
repout = os.path.join(newfpath,basename)
subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
logf=alogf) # writes the subject_froot.xls file
markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
logf=alogf,maplist=maplist) # marker_froot.xls
nbreaks = 100
s = '## starting plotpage, newfpath=%s,m=%s,s=%s/n' % (newfpath,markers[:2],subjects[:2])
alogf.write(s)
print s
plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
#plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)
# [titles[n],plotnames[n],outfnames[n] ]
html = []
html.append('
')
size = getfSize(amarkf,newfpath)
html.append('%s%s tab delimited |
' % \
(amarkf,'Click here to download the Marker QC Detail report file',size))
size = getfSize(asubjf,newfpath)
html.append('%s%s tab delimited |
' % \
(asubjf,'Click here to download the Subject QC Detail report file',size))
for (title,url,ofname) in plotpage:
ttitle = 'Ranked %s' % title
dat = subjectTops.get(ttitle,None)
if not dat:
dat = markerTops.get(ttitle,None)
imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
thumbnail = os.path.join(newfpath,imghref)
if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
thumbnail = os.path.join(newfpath,imghref)
if not os.path.exists(thumbnail):
html.append('%s |
' % (url,title))
else:
html.append('' \
% (url,imghref,title))
if dat: # one or the other - write as an extra file and make a link here
t = '%s.xls' % (ttitle.replace(' ','_'))
fname = os.path.join(newfpath,t)
f = file(fname,'w')
f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
size = getfSize(t,newfpath)
html.append(' | %s | Worst data%s |
' % (title,t,size))
else:
html.append('%s | | ' % (title))
html.append('
All output files from the QC run are available below
')
html.append('\n')
flist = os.listdir(newfpath) # we want to catch 'em all
flist.sort()
for f in flist:
fname = os.path.split(f)[-1]
size = getfSize(fname,newfpath)
html.append('%s%s |
' % (fname,fname,size))
html.append('
')
alogf.close()
plogf.close()
llog = open(alog,'r').readlines()
plogfile = open(plog,'r').readlines()
os.unlink(alog)
os.unlink(plog)
llog += plogfile # add lines from pruneld log
lf = file(ahtmlf,'w') # galaxy will show this as the default view
lf.write(galhtmlprefix % progname)
s = '\nOutput from Rgenetics QC report tool run at %s
\n' % (timenow())
lf.write('
%s
\n' % s)
lf.write('(Click any preview image to download a full sized PDF version)
\n')
lf.write('\n'.join(html))
lf.write('QC run log contents
')
lf.write('%s
' % (''.join(llog))) # plink logs
if len(cruft) > 0:
lf.write('Blather from pdfnup follows:
%s
' % (''.join(cruft))) # pdfnup
lf.write('%s\n
\n' % galhtmlpostfix)
lf.close()