root/galaxy-central/tools/rgenetics/rgQC.py @ 2

リビジョン 2, 62.8 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1# oct 15 rpy replaced - temp fix until we get gnuplot working
2# rpy deprecated - replace with RRun
3# fixes to run functional test! oct1 2009
4# needed to expand our path with os.path.realpath to get newpath working with
5# all the fancy pdfnup stuff
6# and a fix to pruneld to write output to where it should be
7# smallish data in test-data/smallwga in various forms
8# python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
9# child files are deprecated and broken as at july 19 2009
10# need to move them to the html file extrafiles path
11# found lots of corner cases with some illumina data where cnv markers were
12# included
13# and where affection status was all missing !
14# added links to tab files showing worst 1/keepfrac markers and subjects
15# ross lazarus january 2008
16#
17# added named parameters
18# to ensure no silly slippages if non required parameter in the most general case
19# some potentially useful things here reusable in complex scripts
20# with lots'o'html (TM)
21# aug 17 2007 rml
22#
23# added marker and subject and parenting april 14 rml
24# took a while to get the absolute paths right for all the file munging
25# as of april 16 seems to work..
26# getting galaxy to serve images in html reports is a little tricky
27# we don't want QC reports to be dozens of individual files, so need
28# to use the url /static/rg/... since galaxy's web server will happily serve images
29# from there
30# galaxy passes output files as relative paths
31# these have to be munged by rgQC.py before calling this
32# galaxy will pass in 2 file names - one for the log
33# and one for the final html report
34# of the form './database/files/dataset_66.dat'
35# we need to be working in that directory so our plink output files are there
36# so these have to be munged by rgQC.py before calling this
37# note no ped file passed so had to remove the -l option
38# for plinkParse.py that makes a heterozygosity report from the ped
39# file - needs fixing...
40# new: importing manhattan/qqplot plotter
41# def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
42#    """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
43#    contains some R scripts as text strings - we substitute defaults into the calls
44#    to make them do our bidding - and save the resulting code for posterity
45#    this can be called externally, I guess...for QC eg?
46#    """
47#
48#    rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
49#    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
50#    return rlog,flist
51 
52
53from optparse import OptionParser
54
55import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
56from os.path import abspath
57from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
58import rgManQQ
59
60prog = os.path.split(sys.argv[0])[1]
61vers = '0.4 april 2009 rml'
62idjoiner = '_~_~_' # need something improbable..
63# many of these may need fixing for a new install
64
65myversion = vers
66keepfrac = 20 # fraction to keep after sorting by each interesting value
67
68missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
69
70mogresize = "x300" # this controls the width for jpeg thumbnails
71
72
73
74           
75def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
76    """
77    marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
78    'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
79    subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
80    """
81
82       
83    def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
84        """   rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
85        # generic histogram and vertical boxplot in a 3:1 layout
86        # returns the graphic file name for inclusion in the web page
87        # broken out here for reuse
88        # ross lazarus march 2007
89        """
90        screenmat = (1,2,1,2) # create a 2x2 cabvas
91        widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
92        rpy.r.pdf( outfname, height , width  )
93        #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
94        m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
95        # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
96        rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
97        maint = 'QC for %s - %s' % (basename,title)
98        rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
99        rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
100        rpy.r.dev_off()
101
102    def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
103        """
104        Useful to see what various cutoffs yield - plot percentiles
105        """
106        n = len(plotme)
107        maxveclen = 1000.0 # for reasonable pdf sizes!
108        yvec = copy.copy(plotme)
109        # arrives already in decending order of importance missingness or mendel count by subj or marker
110        xvec = range(n)
111        xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
112        # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
113        if n > maxveclen: # oversample part of the distribution
114            always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
115            skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
116            samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
117            yvec = [yvec[i] for i in samplei] # always get first and last
118            xvec = [xvec[i] for i in samplei] # always get first and last
119        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
120        rpy.r.pdf( outfname, height , width  )
121        maint = 'QC for %s - %s' % (basename,title)
122        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
123        rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
124        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
125        rpy.r.dev_off()
126
127    def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
128        """
129        y is data for a qq plot and ends up on the x axis go figure
130        if sampling, oversample low values - all the top 1% ?
131        this version called with -log10 transformed hwe
132        """
133        nrows = len(plotme)
134        fn = float(nrows)
135        xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
136        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
137        maxveclen = 3000
138        yvec = copy.copy(plotme)
139        if nrows > maxveclen:
140            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
141            # oversample part of the distribution
142            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
143            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
144            samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
145            # always oversample first sorted (here lowest) values
146            yvec = [yvec[i] for i in samplei] # always get first and last
147            xvec = [xvec[i] for i in samplei] # and sample xvec same way
148            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
149        else:
150            maint='Log QQ Plot(n=%d)' % (nrows)
151        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
152        ylab = '%s' % xlabname
153        xlab = '-log10(Uniform 0-1)'
154        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
155        rpy.r.pdf( outfname, height , width  )
156        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
157        rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
158        rpy.r.points(mx,mx,type='l')
159        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
160        rpy.r.dev_off()
161
162    def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
163        """
164        data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
165        grid of qq plots to show departure from null at extremes of data quality
166        Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
167        and ends up on the x axis go figure
168        if sampling, oversample low values - all the top 1% ?
169        """
170        data = copy.copy(plotme)
171        nvals = len(data)
172        stepsize = nvals/nsplits
173        logstep = math.log10(stepsize) # so is 3 for steps of 1000
174        quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
175        data.sort(key=itertools.itemgetter(1)) # into x order
176        rpy.r.pdf( outfname, height , width  )
177        rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
178        yvec = [-math.log10(random.random()) for x in range(stepsize)]
179        yvec.sort() # size of each step is expected range for xvec under null?!
180        for rowstart in quints:
181            rowend = rowstart + stepsize
182            if nvals - rowend < stepsize: # finish last split
183                rowend = nvals
184            row = data[rowstart:rowend]
185            row.sort(key=itertools.itemgetter(2)) # into y order
186            for colstart in quints:
187                colend = colstart + stepsize
188                if nvals - colend < stepsize: # finish last split
189                    colend = nvals
190                cell = row[colstart:colend]
191                xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
192                rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
193                rpy.r.points(c(0,logstep),c(0,logstep),type='l')
194        rpy.r.dev_off()
195
196
197    def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
198        """
199        y is data for a qqnorm plot
200        if sampling, oversample low values - all the top 1% ?
201        """
202        rangeunif = len(plotme)
203        nunif = 1000
204        maxveclen = 3000
205        nrows = len(plotme)
206        data = copy.copy(plotme)
207        if nrows > maxveclen:
208            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
209            # oversample part of the distribution
210            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
211            skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
212            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
213            # always oversample first sorted (here lowest) values
214            yvec = [data[i] for i in samplei] # always get first and last
215            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
216        else:
217            yvec = data
218            maint='Log QQ Plot(n=%d)' % (nrows)
219        n = 1000
220        ylab = '%s' % xlabname
221        xlab = 'Normal'
222        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
223        rpy.r.pdf( outfname, height , width  )
224        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
225        rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
226        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
227        rpy.r.dev_off()
228
229    def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
230        """
231        layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
232        like the GAIN qc tools
233        y is data for a qq plot and ends up on the x axis go figure
234        if sampling, oversample low values - all the top 1% ?
235        """
236        rangeunif = len(plotme)
237        nunif = 1000
238        fn = float(rangeunif)
239        xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
240        skip = max(int(rangeunif/fn),1)
241        # force include last points
242        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
243        maxveclen = 2000
244        nrows = len(plotme)
245        data = copy.copy(plotme)
246        data.sort() # low to high - oversample low values
247        if nrows > maxveclen:
248            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
249            # oversample part of the distribution
250            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
251            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
252            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
253            # always oversample first sorted (here lowest) values
254            yvec = [data[i] for i in samplei] # always get first and last
255            xvec = [xvec[i] for i in samplei] # and sample xvec same way
256            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
257        else:
258            yvec = data
259            maint='Log QQ Plot(n=%d)' % (nrows)
260        n = 1000
261        mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
262        ylab = '%s' % xlabname
263        xlab = '-log10(Uniform 0-1)'
264        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
265        rpy.r.pdf( outfname, height , width  )
266        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
267        rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
268        rpy.r.points(mx,mx,type='l')
269        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
270        rpy.r.dev_off()
271
272
273    fdsto,stofile = tempfile.mkstemp()
274    sto = open(stofile,'w')
275    import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
276    mog = 'mogrify'
277    pdfnup = 'pdfnup'
278    pdfjoin = 'pdfjoin'
279    shead = subjects.pop(0) # get rid of head
280    mhead = markers.pop(0)
281    maf = mhead.index('maf')
282    missfrac = mhead.index('missfrac')
283    logphweall = mhead.index('logp_hwe_all')
284    logphweunaff = mhead.index('logp_hwe_unaff')
285    # check for at least some unaffected rml june 2009
286    m_mendel = mhead.index('N_Mendel')
287    fracmiss = shead.index('FracMiss')
288    s_mendel = shead.index('Mendel_errors')
289    s_het = shead.index('F_Stat')
290    params = {}
291    hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
292         and x[logphweunaff].upper() <> 'NA']
293    if len(hweres) <> 0:
294        xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
295        # plot for each of these cols
296    else: # try hwe all instead - maybe no affection status available
297        xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
298    ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
299    oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
300    qqplotme = [1,0,0,0,0,0,0] #
301    qnplotme = [0,0,0,0,0,0,1] #
302    nplots = len(xs)
303    xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
304                 'Marker Mendel Error Count', 'Missing Rate: Subjects',
305                 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
306    plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
307    ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
308    ordplotnames = ['%s_cum' % x for x in plotnames]
309    ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
310    outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
311    ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
312    datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
313    titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
314        "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
315    html = []
316    pdflist = []
317    for n,column in enumerate(xs):
318        dat = [float(x[column]) for x in datasources[n] if len(x) >= column
319               and x[column][:2].upper() <> 'NA'] # plink gives both!
320        if sum(dat) <> 0: # eg nada for mendel if case control?
321            rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
322              title=titles[n],basename=basename,nbreaks=nbreaks)
323            row = [titles[n],ploturls[n],outfnames[n] ]
324            html.append(row)
325            pdflist.append(outfnames[n])
326            if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
327                otitle = 'Ranked %s' % titles[n]
328                dat.sort()
329                if oreverseme[n]:
330                    dat.reverse()
331                rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
332                  title=otitle,basename=basename,nbreaks=1000)
333                row = [otitle,ordploturls[n],ordoutfnames[n]]
334                html.append(row)
335                pdflist.append(ordoutfnames[n])
336            if qqplotme[n]: #
337                otitle = 'LogQQ plot %s' % titles[n]
338                dat.sort()
339                dat.reverse()
340                ofn = os.path.split(ordoutfnames[n])
341                ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
342                ofu = os.path.split(ordploturls[n])
343                ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
344                rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
345                  title=otitle,basename=basename)
346                row = [otitle,ofu,ofn]
347                html.append(row)
348                pdflist.append(ofn)
349            elif qnplotme[n]:
350                otitle = 'F Statistic %s' % titles[n]
351                dat.sort()
352                dat.reverse()
353                ofn = os.path.split(ordoutfnames[n])
354                ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
355                ofu = os.path.split(ordploturls[n])
356                ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
357                rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
358                  title=otitle,basename=basename)
359                row = [otitle,ofu,ofn]
360                html.append(row)
361                pdflist.append(ofn)
362        else:
363            print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
364    if nup>0:
365        # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
366        # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
367        filestojoin = ' '.join(pdflist) # all the file names so far
368        afname = '%s_All_Paged.pdf' % (basename)
369        fullafname = os.path.join(newfpath,afname)
370        expl = 'All %s QC Plots joined into a single pdf' % basename
371        vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
372        # make single page pdf
373        x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
374        retval = x.wait()
375        row = [expl,afname,fullafname]
376        html.insert(0,row) # last rather than second
377        nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
378        fullnfname = os.path.join(newfpath,nfname)
379        expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
380        vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
381        # make thumbnail images
382        x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
383        retval = x.wait()
384        row = [expl,nfname,fullnfname]
385        html.insert(1,row) # this goes second
386    vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
387    # make thumbnail images
388    x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
389    retval = x.wait()
390    sto.close()
391    cruft = open(stofile,'r').readlines()
392    return html,cruft # elements for an ordered list of urls or whatever.. 
393
394
395def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
396    """
397    nice try but the R scripts are huge and take forever to run if there's a lot of data
398    marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
399    'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
400    subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
401    """
402    colour = "maroon"
403       
404    def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
405        """   rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
406        # generic histogram and vertical boxplot in a 3:1 layout
407        # returns the graphic file name for inclusion in the web page
408        # broken out here for reuse
409        # ross lazarus march 2007
410        """
411        R = []
412        maint = 'QC for %s - %s' % (basename,title)
413        screenmat = (1,2,1,2) # create a 2x2 canvas
414        widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
415        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
416        R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
417        R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
418        R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
419        R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
420        R.append('dev.off()')
421        return R
422       
423    def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
424        """
425        Useful to see what various cutoffs yield - plot percentiles
426        """
427        R = []
428        n = len(plotme)
429        R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
430        # arrives already in decending order of importance missingness or mendel count by subj or marker
431        maint = 'QC for %s - %s' % (basename,title)
432        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
433        R.append("par(lab=c(10,10,10))")
434        R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
435        R.append('dev.off()')
436        return R
437
438    def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
439        """
440        y is data for a qq plot and ends up on the x axis go figure
441        if sampling, oversample low values - all the top 1% ?
442        this version called with -log10 transformed hwe
443        """
444        R = []
445        nrows = len(plotme)
446        fn = float(nrows)
447        xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
448        #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
449        maxveclen = 3000
450        yvec = copy.copy(plotme)
451        if nrows > maxveclen:
452            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
453            # oversample part of the distribution
454            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
455            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
456            if skip < 2:
457                skip = 2
458            samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
459            # always oversample first sorted (here lowest) values
460            yvec = [yvec[i] for i in samplei] # always get first and last
461            xvec = [xvec[i] for i in samplei] # and sample xvec same way
462            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
463        else:
464            maint='Log QQ Plot(n=%d)' % (nrows)
465        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
466        ylab = '%s' % xlabname
467        xlab = '-log10(Uniform 0-1)'
468        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
469        x = ['%f' % x for x in xvec]
470        R.append('xvec = c(%s)' % ','.join(x))
471        y = ['%f' % x for x in yvec]
472        R.append('yvec = c(%s)' % ','.join(y))
473        R.append('mx = c(0,%f)' % (math.log10(fn)))
474        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
475        R.append("par(lab=c(10,10,10))")
476        R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
477        R.append('points(mx,mx,type="l")')
478        R.append('grid(col="lightgray",lty="dotted")')
479        R.append('dev.off()')
480        return R
481
482    def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
483        """
484        data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
485        grid of qq plots to show departure from null at extremes of data quality
486        Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
487        and ends up on the x axis go figure
488        if sampling, oversample low values - all the top 1% ?
489        """
490        data = copy.copy(plotme)
491        nvals = len(data)
492        stepsize = nvals/nsplits
493        logstep = math.log10(stepsize) # so is 3 for steps of 1000
494        R.append('mx = c(0,%f)' % logstep)
495        quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
496        data.sort(key=itertools.itemgetter(1)) # into x order
497        #rpy.r.pdf( outfname, h , w  )
498        #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
499        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
500        yvec = [-math.log10(random.random()) for x in range(stepsize)]
501        yvec.sort() # size of each step is expected range for xvec under null?!
502        y = ['%f' % x for x in yvec]
503        R.append('yvec = c(%s)' % ','.join(y))
504        for rowstart in quints:
505            rowend = rowstart + stepsize
506            if nvals - rowend < stepsize: # finish last split
507                rowend = nvals
508            row = data[rowstart:rowend]
509            row.sort(key=itertools.itemgetter(2)) # into y order
510            for colstart in quints:
511                colend = colstart + stepsize
512                if nvals - colend < stepsize: # finish last split
513                    colend = nvals
514                cell = row[colstart:colend]
515                xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
516                x = ['%f' % x for x in xvec]
517                R.append('xvec = c(%s)' % ','.join(x))
518                R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
519                R.append('points(mx,mx,type="l")')
520                R.append('grid(col="lightgray",lty="dotted")')
521                #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
522                #rpy.r.points(c(0,logstep),c(0,logstep),type='l')
523        R.append('dev.off()')
524        #rpy.r.dev_off()
525        return R
526
527
528    def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
529        """
530        y is data for a qqnorm plot
531        if sampling, oversample low values - all the top 1% ?
532        """
533        rangeunif = len(plotme)
534        nunif = 1000
535        maxveclen = 3000
536        nrows = len(plotme)
537        data = copy.copy(plotme)
538        if nrows > maxveclen:
539            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
540            # oversample part of the distribution
541            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
542            skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
543            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
544            # always oversample first sorted (here lowest) values
545            yvec = [data[i] for i in samplei] # always get first and last
546            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
547        else:
548            yvec = data
549            maint='Log QQ Plot(n=%d)' % (nrows)
550        n = 1000
551        ylab = '%s' % xlabname
552        xlab = 'Normal'
553        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
554        #rpy.r.pdf( outfname, h , w  )
555        #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
556        #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
557        #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
558        #rpy.r.dev_off()
559        y = ['%f' % x for x in yvec]
560        R.append('yvec = c(%s)' % ','.join(y))
561        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
562        R.append("par(lab=c(10,10,10))")
563        R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
564        R.append('grid(col="lightgray",lty="dotted")')
565        R.append('dev.off()')
566        return R
567
568    def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
569        """
570        layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
571        like the GAIN qc tools
572        y is data for a qq plot and ends up on the x axis go figure
573        if sampling, oversample low values - all the top 1% ?
574        """
575        rangeunif = len(plotme)
576        nunif = 1000
577        fn = float(rangeunif)
578        xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
579        skip = max(int(rangeunif/fn),1)
580        # force include last points
581        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
582        maxveclen = 2000
583        nrows = len(plotme)
584        data = copy.copy(plotme)
585        data.sort() # low to high - oversample low values
586        if nrows > maxveclen:
587            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
588            # oversample part of the distribution
589            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
590            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
591            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
592            # always oversample first sorted (here lowest) values
593            yvec = [data[i] for i in samplei] # always get first and last
594            xvec = [xvec[i] for i in samplei] # and sample xvec same way
595            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
596        else:
597            yvec = data
598            maint='Log QQ Plot(n=%d)' % (nrows)
599        n = 1000
600        mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
601        ylab = '%s' % xlabname
602        xlab = '-log10(Uniform 0-1)'
603        R.append('mx = c(0,%f)' % (math.log10(fn)))
604        x = ['%f' % x for x in xvec]
605        R.append('xvec = c(%s)' % ','.join(x))
606        y = ['%f' % x for x in yvec]
607        R.append('yvec = c(%s)' % ','.join(y))
608        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
609        R.append("par(lab=c(10,10,10))")
610        R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
611        R.append('points(mx,mx,type="l")')
612        R.append('grid(col="lightgray",lty="dotted")')
613        R.append('dev.off()')
614
615
616    shead = subjects.pop(0) # get rid of head
617    mhead = markers.pop(0)
618    maf = mhead.index('maf')
619    missfrac = mhead.index('missfrac')
620    logphweall = mhead.index('logp_hwe_all')
621    logphweunaff = mhead.index('logp_hwe_unaff')
622    # check for at least some unaffected rml june 2009
623    m_mendel = mhead.index('N_Mendel')
624    fracmiss = shead.index('FracMiss')
625    s_mendel = shead.index('Mendel_errors')
626    s_het = shead.index('F_Stat')
627    params = {}
628    h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
629         and x[logphweunaff].upper() <> 'NA']
630    if len(h) <> 0:
631        xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
632        # plot for each of these cols
633    else: # try hwe all instead - maybe no affection status available
634        xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
635    ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
636    oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
637    qqplotme = [1,0,0,0,0,0,0] #
638    qnplotme = [0,0,0,0,0,0,1] #
639    nplots = len(xs)
640    xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
641                 'Marker Mendel Error Count', 'Missing Rate: Subjects',
642                 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
643    plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
644    ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
645    ordplotnames = ['%s_cum' % x for x in plotnames]
646    ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
647    outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
648    ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
649    datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
650    titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
651        "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
652    html = []
653    pdflist = []
654    R = []
655    for n,column in enumerate(xs):
656        dfn = '%d_%s.txt' % (n,titles[n])
657        dfilepath = os.path.join(newfpath,dfn)
658        dat = [float(x[column]) for x in datasources[n] if len(x) >= column
659               and x[column][:2].upper() <> 'NA'] # plink gives both!
660        if sum(dat) <> 0: # eg nada for mendel if case control?
661            plotme = file(dfilepath,'w')
662            plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
663            tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
664              title=titles[n],basename=basename,nbreaks=nbreaks)
665            R += tR
666            row = [titles[n],ploturls[n],outfnames[n] ]
667            html.append(row)
668            pdflist.append(outfnames[n])
669            if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
670                otitle = 'Ranked %s' % titles[n]
671                dat.sort()
672                if oreverseme[n]:
673                    dat.reverse()
674                    ndat = len(dat)
675                    xvec = range(ndat)
676                    xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
677                    # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
678                    maxveclen = 1000.0 # for reasonable pdf sizes!
679                    if ndat > maxveclen: # oversample part of the distribution
680                        always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
681                        skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
682                        samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
683                        yvec = [yvec[i] for i in samplei] # always get first and last
684                        xvec = [xvec[i] for i in samplei] # always get first and last
685                        plotme = file(dfilepath,'w')
686                        plotme.write('xvec\tyvec\n')
687                        plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
688                tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
689                  title=otitle,basename=basename,nbreaks=nbreaks)
690                R += tR
691                row = [otitle,ordploturls[n],ordoutfnames[n]]
692                html.append(row)
693                pdflist.append(ordoutfnames[n])
694            if qqplotme[n]: #
695                otitle = 'LogQQ plot %s' % titles[n]
696                dat.sort()
697                dat.reverse()
698                ofn = os.path.split(ordoutfnames[n])
699                ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
700                ofu = os.path.split(ordploturls[n])
701                ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
702                tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
703                  title=otitle,basename=basename)
704                R += tR
705                row = [otitle,ofu,ofn]
706                html.append(row)
707                pdflist.append(ofn)
708            elif qnplotme[n]:
709                otitle = 'F Statistic %s' % titles[n]
710                dat.sort()
711                dat.reverse()
712                ofn = os.path.split(ordoutfnames[n])
713                ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
714                ofu = os.path.split(ordploturls[n])
715                ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
716                tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
717                  title=otitle,basename=basename)
718                R += tR
719                row = [otitle,ofu,ofn]
720                html.append(row)
721                pdflist.append(ofn)
722        else:
723            print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
724    rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
725    if nup>0:
726        # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
727        # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
728        filestojoin = ' '.join(pdflist) # all the file names so far
729        afname = '%s_All_Paged.pdf' % (basename)
730        fullafname = os.path.join(newfpath,afname)
731        expl = 'All %s QC Plots joined into a single pdf' % basename
732        vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
733        # make single page pdf
734        x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
735        retval = x.wait()
736        row = [expl,afname,fullafname]
737        html.insert(0,row) # last rather than second
738        nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
739        fullnfname = os.path.join(newfpath,nfname)
740        expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
741        vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
742        # make thumbnail images
743        x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
744        retval = x.wait()
745        row = [expl,nfname,fullnfname]
746        html.insert(1,row) # this goes second
747    vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
748    # make thumbnail images
749    x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
750    retval = x.wait()
751    return html # elements for an ordered list of urls or whatever..
752
753def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
754    """
755    NO LONGER USED - historical interest
756    count het loci for each subject to look for outliers = ? contamination
757    assume ped file is linkage format
758    need to make a ped file from the bed file we were passed
759    """
760    vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
761    p=subprocess.Popen(' '.join(vcl),shell=True)
762    retval = p.wait()
763    f = open('%s_recode.recode.ped' % froot,'r')
764    if not linkageped:
765        head = f.next() # throw away header
766    hets = [] # simple count of het loci per subject. Expect poisson?
767    n = 1
768    for l in f:
769        n += 1
770        ll = l.strip().split()
771        if len(ll) > 6:
772            iid = idjoiner.join(ll[:2]) # fam_iid
773            gender = ll[4]
774            alleles = ll[6:]
775            nallele = len(alleles)
776            nhet = 0
777            for i in range(nallele/2):
778                a1=alleles[2*i]
779                a2=alleles[2*i+1]
780                if alleles[2*i] <> alleles[2*i+1]: # must be het
781                    if not missvals.get(a1,None) and not missvals.get(a2,None):
782                        nhet += 1
783            hets.append((nhet,iid,gender)) # for sorting later
784    f.close()
785    hets.sort()
786    hets.reverse() # biggest nhet now on top
787    f = open(outfname ,'w')
788    res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
789    res.insert(0,'nhetloci\tfamid_iid\tgender')
790    res.append('')
791    f.write('\n'.join(res))
792    f.close()
793
794
795
796def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
797    """by subject (missingness = .imiss, mendel = .imendel)
798    assume replicates have an underscore in family id for
799    hapmap testing
800    For sorting, we need floats and integers
801    """
802    isexfile = '%s.sexcheck' % froot
803    imissfile = '%s.imiss' % froot
804    imendfile = '%s.imendel' % froot
805    ihetfile = '%s.het' % froot
806    logf.write('## subject reports starting at %s\n' % timenow())
807    outfile = os.path.join(newfpath,outfname)
808    idlist = []
809    imissdict = {}
810    imenddict = {}
811    isexdict = {}
812    ihetdict = {}
813    Tops = {}
814    Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
815              'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
816    Tsorts = [2,3,6,8]
817    Treverse = [True,True,True,False] # so first values are worser
818    #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
819    ##              FID            IID MISS_PHENO   N_MISS   N_GENO   F_MISS
820    ##  1552042370_A   1552042370_A          N     5480   549883 0.009966
821    ##  1552042410_A   1552042410_A          N     1638   549883 0.002979
822 
823    # ------------------missing--------------------
824    # imiss has FID  IID MISS_PHENO N_MISS  F_MISS
825    # we want F_MISS
826    try:
827        f = open(imissfile,'r')
828    except:
829        logf.write('# file %s is missing - talk about irony\n' % imissfile)
830        f = None
831    if f:
832        for n,line in enumerate(f):
833            ll = line.strip().split()
834            if n == 0:
835                head = [x.upper() for x in ll] # expect above               
836                fidpos = head.index('FID')
837                iidpos = head.index('IID')
838                fpos = head.index('F_MISS')
839            elif len(ll) >= fpos: # full line
840                fid = ll[fidpos]
841                #if fid.find('_') == -1: # not replicate! - icondb ids have these...
842                iid = ll[iidpos]
843                fmiss = ll[fpos]
844                id = '%s%s%s' % (fid,idjoiner,iid)
845                imissdict[id] = fmiss
846                idlist.append(id)
847        f.close()
848    logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
849    # ------------------mend-------------------
850    # *.imendel has FID  IID   N
851    # we want N
852    gotmend = True
853    try:
854        f = open(imendfile,'r')
855    except:
856        gotmend = False
857        for id in idlist:
858            imenddict[id] = '0'
859    if gotmend:
860        for n,line in enumerate(f):
861            ll = line.strip().split()
862            if n == 0:
863                head = [x.upper() for x in ll] # expect above               
864                npos = head.index('N')
865                fidpos = head.index('FID')
866                iidpos = head.index('IID')
867            elif len(ll) >= npos: # full line
868                fid = ll[fidpos]
869                iid = ll[iidpos]
870                id = '%s%s%s' % (fid,idjoiner,iid)
871                nmend = ll[npos]
872                imenddict[id] = nmend
873        f.close()
874    else:
875        logf.write('## error No %s file - assuming not family data\n' % imendfile)
876    # ------------------sex check------------------
877    #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
878    # sexcheck has FID IID PEDSEX SNPSEX STATUS F
879    ##
880    ##     FID     Family ID
881    ##     IID     Individual ID
882    ##     PEDSEX  Sex as determined in pedigree file (1=male, 2=female)
883    ##     SNPSEX  Sex as determined by X chromosome
884    ##     STATUS  Displays "PROBLEM" or "OK" for each individual
885    ##     F       The actual X chromosome inbreeding (homozygosity) estimate
886    ##
887    ##    A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
888    ##    ambiguous with regard to sex.
889    ##    A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2.
890    try:
891        f = open(isexfile,'r')
892        got_sexcheck = 1
893    except:
894        got_sexcheck = 0
895    if got_sexcheck:
896        for n,line in enumerate(f):
897            ll = line.strip().split()
898            if n == 0:
899                head = [x.upper() for x in ll] # expect above               
900                fidpos = head.index('FID')
901                iidpos = head.index('IID')
902                pedsexpos = head.index('PEDSEX')
903                snpsexpos = head.index('SNPSEX')
904                statuspos = head.index('STATUS')
905                fpos = head.index('F')
906            elif len(ll) >= fpos: # full line
907                fid = ll[fidpos]
908                iid = ll[iidpos]
909                fest = ll[fpos]
910                pedsex = ll[pedsexpos]
911                snpsex = ll[snpsexpos]
912                stat = ll[statuspos]
913                id = '%s%s%s' % (fid,idjoiner,iid)
914                isexdict[id] = (pedsex,snpsex,stat,fest)
915        f.close()
916    else:
917        # this only happens if there are no subjects!
918        logf.write('No %s file - assuming no sex errors' % isexfile)
919    ##
920    ##    FID  IID       O(HOM)       E(HOM)        N(NM)            F
921    ##    457    2       490665    4.928e+05       722154    -0.009096
922    ##    457    3       464519     4.85e+05       710986      -0.0908
923    ##   1037    2       461632    4.856e+05       712025       -0.106
924    ##   1037    1       491845    4.906e+05       719353     0.005577
925    try:
926        f = open(ihetfile,'r')
927    except:
928        f = None
929        logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
930    if f:
931        for i,line in enumerate(f):
932            ll = line.strip().split()
933            if i == 0:
934                head = [x.upper() for x in ll] # expect above               
935                fidpos = head.index('FID')
936                iidpos = head.index('IID')
937                fpos = head.index('F')
938                n = 0
939            elif len(ll) >= fpos: # full line
940                fid = ll[fidpos]           
941                iid = ll[iidpos]
942                fhet = ll[fpos]
943                id = '%s%s%s' % (fid,idjoiner,iid)
944                ihetdict[id] = fhet
945        f.close()      # now assemble and output result list
946    rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
947    res = []
948    fres = [] # floats for sorting
949    for id in idlist: # for each snp in found order
950        fid,iid = id.split(idjoiner) # recover keys
951        f_missing = imissdict.get(id,'0.0')
952        nmend = imenddict.get(id,'0')
953        (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
954        fhet = ihetdict.get(id,'0.0')
955        res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
956        try:
957            ff_missing = float(f_missing)
958        except:
959            ff_missing = 0.0
960        try:
961            inmend = int(nmend)
962        except:
963            inmend = 0
964        try:
965            ffest = float(fest)
966        except:
967            fest = 0.0
968        try:
969            ffhet = float(fhet)
970        except:
971            ffhet = 0.0
972        fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
973    ntokeep = max(20,len(res)/keepfrac)
974    for i,col in enumerate(Tsorts):
975        fres.sort(key=operator.itemgetter(col))
976        if Treverse[i]:
977            fres.reverse()
978        repname = Tnames[i]
979        Tops[repname] = fres[0:ntokeep]
980        Tops[repname] = [map(str,x) for x in Tops[repname]]
981        Tops[repname].insert(0,rhead)
982    res.sort()
983    res.insert(0,rhead)
984    logf.write('### writing %s report with %s' % ( outfile,res[0])) 
985    f = open(outfile,'w')
986    f.write('\n'.join(['\t'.join(x) for x in res]))
987    f.write('\n')
988    f.close()
989    return res,Tops
990
991def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
992    """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
993    keep a list of marker order but keep all stats in dicts
994    write out a fake xls file for R or SAS etc
995    kinda clunky, but..
996    TODO: ensure stable if any file not found?
997    """
998    mapdict = {}
999    if maplist <> None:
1000       rslist = [x[1] for x in maplist]
1001       offset = [(x[0],x[3]) for x in maplist]
1002       mapdict = dict(zip(rslist,offset))
1003    hwefile = '%s.hwe' % froot
1004    lmissfile = '%s.lmiss' % froot
1005    freqfile = '%s.frq' % froot
1006    lmendfile = '%s.lmendel' % froot
1007    outfile = os.path.join(newfpath,outfname)
1008    markerlist = []
1009    chromlist = []
1010    hwedict = {}
1011    lmissdict = {}
1012    freqdict = {}
1013    lmenddict = {}
1014    Tops = {}
1015    Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
1016    Tsorts = [3,6,10,11]
1017    Treverse = [False,True,True,True] # so first values are worse(r)
1018    #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
1019    #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
1020    # -------------------hwe--------------------------
1021    #    hwe has SNP TEST  GENO   O(HET)   E(HET) P_HWD
1022    # we want all hwe where P_HWD <> NA
1023    # ah changed in 1.04 to
1024    ##  CHR         SNP     TEST   A1   A2                 GENO   O(HET)   E(HET)            P
1025    ##   1   rs6671164      ALL    2    3           34/276/613    0.299   0.3032       0.6644
1026    ##   1   rs6671164      AFF    2    3                0/0/0      nan      nan           NA
1027    ##   1   rs6671164    UNAFF    2    3           34/276/613    0.299   0.3032       0.6644
1028    ##   1   rs4448553      ALL    2    3            8/176/748   0.1888   0.1848       0.5975
1029    ##   1   rs4448553      AFF    2    3                0/0/0      nan      nan           NA
1030    ##   1   rs4448553    UNAFF    2    3            8/176/748   0.1888   0.1848       0.5975
1031    ##   1   rs1990150      ALL    1    3           54/303/569   0.3272   0.3453       0.1067
1032    ##   1   rs1990150      AFF    1    3                0/0/0      nan      nan           NA
1033    ##   1   rs1990150    UNAFF    1    3           54/303/569   0.3272   0.3453       0.1067
1034    logf.write('## marker reports starting at %s\n' % timenow())
1035    try:
1036        f = open(hwefile,'r')
1037    except:
1038        f = None
1039        logf.write('## error - no hwefile %s found\n' % hwefile)
1040    if f:
1041        for i,line in enumerate(f):
1042            ll = line.strip().split()
1043            if i == 0: # head
1044                head = [x.upper() for x in ll] # expect above               
1045                try:
1046                    testpos = head.index('TEST')
1047                except:
1048                    testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST')
1049                try:
1050                    ppos = head.index('P')
1051                except:
1052                    ppos = 8 # patch - for head.index('P')
1053                snppos = head.index('SNP')
1054                chrpos = head.index('CHR')
1055                logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos))
1056            elif len(ll) >= ppos: # full line
1057                ps = ll[ppos].upper()
1058                rs = ll[snppos]
1059                chrom = ll[chrpos]
1060                test = ll[testpos]
1061                if not hwedict.get(rs,None):
1062                    hwedict[rs] = {}
1063                    markerlist.append(rs)
1064                chromlist.append(chrom) # one place to find it?
1065                lpvals = 0
1066                if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping
1067                    lpvals = '0'
1068                    if ps <> '1':
1069                        try:
1070                            pval = float(ps)
1071                            lpvals = '%f' % -math.log10(pval)
1072                        except:
1073                            pass
1074                    hwedict[rs][test] = (ps,lpvals)
1075            else:
1076                logf.write('short line #%d = %s\n' % (i,ll))
1077        f.close()
1078    # ------------------missing--------------------
1079    """lmiss has 
1080    CHR          SNP   N_MISS   N_GENO   F_MISS
1081   1   rs12354060        0       73        0
1082   1    rs4345758        1       73   0.0137
1083   1    rs2691310       73       73        1
1084   1    rs2531266       73       73        1
1085    # we want F_MISS"""
1086    try:
1087        f = open(lmissfile,'r')
1088    except:
1089        f = None
1090    if f:
1091        for i,line in enumerate(f):
1092            ll = line.strip().split()
1093            if i == 0:
1094                head = [x.upper() for x in ll] # expect above               
1095                fracpos = head.index('F_MISS')
1096                npos = head.index('N_MISS')
1097                snppos = head.index('SNP')
1098            elif len(ll) >= fracpos: # full line
1099                rs = ll[snppos]
1100                fracval = ll[fracpos]               
1101                lmissdict[rs] = fracval # for now, just that?
1102            else:
1103                lmissdict[rs] = 'NA'
1104        f.close()
1105    # ------------------freq-------------------
1106    # frq has CHR          SNP   A1   A2          MAF       NM
1107    # we want maf
1108    try:
1109        f = open(freqfile,'r')
1110    except:
1111        f = None
1112    if f:
1113        for i,line in enumerate(f):
1114            ll = line.strip().split()
1115            if i == 0:
1116                head = [x.upper() for x in ll] # expect above               
1117                mafpos = head.index('MAF')
1118                a1pos = head.index('A1')
1119                a2pos = head.index('A2')
1120                snppos = head.index('SNP')
1121            elif len(ll) >= mafpos: # full line
1122                rs = ll[snppos]
1123                a1 = ll[a1pos]
1124                a2 = ll[a2pos]
1125                maf = ll[mafpos]
1126                freqdict[rs] = (maf,a1,a2)
1127        f.close()
1128    # ------------------mend-------------------
1129    # lmend has CHR SNP   N
1130    # we want N
1131    gotmend = True
1132    try:
1133        f = open(lmendfile,'r')
1134    except:
1135        gotmend = False
1136        for rs in markerlist:
1137            lmenddict[rs] = '0'
1138    if gotmend:
1139        for i,line in enumerate(f):
1140            ll = line.strip().split()
1141            if i == 0:
1142                head = [x.upper() for x in ll] # expect above               
1143                npos = head.index('N')
1144                snppos = head.index('SNP')
1145            elif len(ll) >= npos: # full line
1146                rs = ll[snppos]
1147                nmend = ll[npos]
1148                lmenddict[rs] = nmend
1149        f.close()
1150    else:
1151        logf.write('No %s file - assuming not family data\n' % lmendfile)
1152    # now assemble result list
1153    rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
1154    res = []
1155    fres = []
1156    for rs in markerlist: # for each snp in found order
1157        f_missing = lmissdict.get(rs,'NA')
1158        maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
1159        hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
1160        hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
1161        nmend = lmenddict.get(rs,'NA')
1162        (chrom,offset)=mapdict.get(rs,('?','0'))
1163        res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
1164    ntokeep = max(10,len(res)/keepfrac)
1165
1166    def msortk(item=None):
1167        """
1168        deal with non numeric sorting
1169        """
1170        try:
1171           return float(item)
1172        except:
1173           return item
1174
1175    for i,col in enumerate(Tsorts):
1176        res.sort(key=msortk(lambda x:x[col]))
1177        if Treverse[i]:
1178            res.reverse()
1179        repname = Tnames[i]
1180        Tops[repname] = res[0:ntokeep]
1181        Tops[repname].insert(0,rhead)
1182    res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order
1183    res.insert(0,rhead)
1184    f = open(outfile,'w')
1185    f.write('\n'.join(['\t'.join(x) for x in res]))
1186    f.close()
1187    return res,Tops
1188
1189
1190
1191 
1192def getfSize(fpath,outpath):
1193    """
1194    format a nice file size string
1195    """
1196    size = ''
1197    fp = os.path.join(outpath,fpath)
1198    if os.path.isfile(fp):
1199        n = float(os.path.getsize(fp))
1200        if n > 2**20:
1201            size = ' (%1.1f MB)' % (n/2**20)
1202        elif n > 2**10:
1203            size = ' (%1.1f KB)' % (n/2**10)
1204        elif n > 0:
1205            size = ' (%d B)' % (int(n))
1206    return size
1207
1208
1209if __name__ == "__main__":
1210    u = """ called in xml as
1211     <command interpreter="python">
1212        rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
1213        -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
1214        -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
1215    </command>
1216
1217        Prepare a qc report - eg:
1218    print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)
1219
1220    """
1221    progname = os.path.basename(sys.argv[0])
1222    if len(sys.argv) < 9:
1223        print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv)
1224        sys.exit(1)
1225    parser = OptionParser(usage=u, version="%prog 0.01")
1226    a = parser.add_option
1227    a("-i","--infile",dest="infile")
1228    a("-o","--oprefix",dest="opref")
1229    a("-l","--plinkexe",dest="plinke", default=plinke)
1230    a("-r","--rexe",dest="rexe", default=rexe)
1231    a("-s","--snps",dest="htmlf")
1232    #a("-m","--markerRaw",dest="markf")
1233    #a("-r","--rawsubject",dest="subjf")
1234    a("-p","--patho",dest="newfpath")
1235    (options,args) = parser.parse_args()
1236    basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
1237    killme = string.punctuation + string.whitespace
1238    trantab = string.maketrans(killme,'_'*len(killme))
1239    opref = options.opref.translate(trantab)
1240    alogh,alog = tempfile.mkstemp(suffix='.txt')
1241    plogh,plog = tempfile.mkstemp(suffix='.txt')
1242    alogf = open(alog,'w')
1243    plogf = open(plog,'w')
1244    ahtmlf = options.htmlf
1245    amarkf = 'MarkerDetails_%s.xls' % opref
1246    asubjf = 'SubjectDetails_%s.xls' % opref
1247    newfpath = options.newfpath
1248    newfpath = os.path.realpath(newfpath)
1249    try:
1250       os.makedirs(newfpath)
1251    except:
1252       pass
1253    ofn = basename
1254    bfn = options.infile
1255    try:
1256       mapf = '%s.bim' % bfn
1257       maplist = file(mapf,'r').readlines()
1258       maplist = [x.split() for x in maplist]
1259    except:
1260       maplist = None
1261       alogf.write('## error - cannot open %s to read map - no offsets will be available for output files')
1262    #rerla@beast galaxy]$ head test-data/tinywga.bim
1263    #22      rs2283802       0       21784722        4       2
1264    #22      rs2267000       0       21785366        4       2
1265    rgbin = os.path.split(rexe)[0] # get our rg bin path
1266    #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
1267    # if we could, do all at once? Nope. Probably never.
1268    plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'],
1269    ['--mendel',],['--check-sex',]]
1270    vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
1271    runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
1272    plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
1273    ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
1274                  ['--bfile ldp_%s --het --out %s' % (basename,basename)]]
1275    # subset of ld independent markers for eigenstrat and other requirements
1276    vclbase = [options.plinke,'--noweb']
1277    plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
1278    plogf.write('\n'.join(plogout))
1279    plogf.write('\n')
1280    repout = os.path.join(newfpath,basename)
1281    subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
1282                logf=alogf) # writes the subject_froot.xls file
1283    markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
1284                logf=alogf,maplist=maplist) # marker_froot.xls
1285    nbreaks = 100
1286    s = '## starting plotpage, newfpath=%s,m=%s,s=%s/n' % (newfpath,markers[:2],subjects[:2])
1287    alogf.write(s)
1288    print s
1289    plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
1290                         basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
1291    #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)
1292
1293    # [titles[n],plotnames[n],outfnames[n] ]
1294    html = []
1295    html.append('<table cellpadding="5" border="0">')
1296    size = getfSize(amarkf,newfpath)
1297    html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
1298                (amarkf,'Click here to download the Marker QC Detail report file',size))
1299    size = getfSize(asubjf,newfpath)
1300    html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
1301                (asubjf,'Click here to download the Subject QC Detail report file',size))
1302    for (title,url,ofname) in plotpage:
1303        ttitle = 'Ranked %s' % title
1304        dat = subjectTops.get(ttitle,None)
1305        if not dat:
1306            dat = markerTops.get(ttitle,None)
1307        imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
1308        thumbnail = os.path.join(newfpath,imghref)
1309        if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
1310            imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
1311            thumbnail = os.path.join(newfpath,imghref)
1312        if not os.path.exists(thumbnail):
1313            html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title))
1314        else:
1315            html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \
1316                    % (url,imghref,title))
1317            if dat: # one or the other - write as an extra file and make a link here
1318                t = '%s.xls' % (ttitle.replace(' ','_'))
1319                fname = os.path.join(newfpath,t)
1320                f = file(fname,'w')
1321                f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
1322                size = getfSize(t,newfpath)
1323                html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size))
1324            else:
1325                html.append('</a></td><td>%s</td><td>&nbsp;</td></tr>' % (title))
1326    html.append('</table><hr><h3>All output files from the QC run are available below</h3>')
1327    html.append('<table cellpadding="5" border="0">\n')
1328    flist = os.listdir(newfpath) # we want to catch 'em all
1329    flist.sort()
1330    for f in flist:
1331        fname = os.path.split(f)[-1]
1332        size = getfSize(fname,newfpath)
1333        html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size))
1334    html.append('</table>')
1335    alogf.close()
1336    plogf.close()
1337    llog = open(alog,'r').readlines()
1338    plogfile = open(plog,'r').readlines()
1339    os.unlink(alog)
1340    os.unlink(plog)
1341    llog += plogfile # add lines from pruneld log
1342    lf = file(ahtmlf,'w') # galaxy will show this as the default view
1343    lf.write(galhtmlprefix % progname)
1344    s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow())
1345    lf.write('<h4>%s</h4>\n' % s)
1346    lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n')
1347    lf.write('\n'.join(html))
1348    lf.write('<h4>QC run log contents</h4>')
1349    lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs
1350    if len(cruft) > 0:
1351        lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup
1352    lf.write('%s\n<hr>\n' % galhtmlpostfix)
1353    lf.close()
1354
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。