[2] | 1 | # oct 15 rpy replaced - temp fix until we get gnuplot working |
---|
| 2 | # rpy deprecated - replace with RRun |
---|
| 3 | # fixes to run functional test! oct1 2009 |
---|
| 4 | # needed to expand our path with os.path.realpath to get newpath working with |
---|
| 5 | # all the fancy pdfnup stuff |
---|
| 6 | # and a fix to pruneld to write output to where it should be |
---|
| 7 | # smallish data in test-data/smallwga in various forms |
---|
| 8 | # python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga |
---|
| 9 | # child files are deprecated and broken as at july 19 2009 |
---|
| 10 | # need to move them to the html file extrafiles path |
---|
| 11 | # found lots of corner cases with some illumina data where cnv markers were |
---|
| 12 | # included |
---|
| 13 | # and where affection status was all missing ! |
---|
| 14 | # added links to tab files showing worst 1/keepfrac markers and subjects |
---|
| 15 | # ross lazarus january 2008 |
---|
| 16 | # |
---|
| 17 | # added named parameters |
---|
| 18 | # to ensure no silly slippages if non required parameter in the most general case |
---|
| 19 | # some potentially useful things here reusable in complex scripts |
---|
| 20 | # with lots'o'html (TM) |
---|
| 21 | # aug 17 2007 rml |
---|
| 22 | # |
---|
| 23 | # added marker and subject and parenting april 14 rml |
---|
| 24 | # took a while to get the absolute paths right for all the file munging |
---|
| 25 | # as of april 16 seems to work.. |
---|
| 26 | # getting galaxy to serve images in html reports is a little tricky |
---|
| 27 | # we don't want QC reports to be dozens of individual files, so need |
---|
| 28 | # to use the url /static/rg/... since galaxy's web server will happily serve images |
---|
| 29 | # from there |
---|
| 30 | # galaxy passes output files as relative paths |
---|
| 31 | # these have to be munged by rgQC.py before calling this |
---|
| 32 | # galaxy will pass in 2 file names - one for the log |
---|
| 33 | # and one for the final html report |
---|
| 34 | # of the form './database/files/dataset_66.dat' |
---|
| 35 | # we need to be working in that directory so our plink output files are there |
---|
| 36 | # so these have to be munged by rgQC.py before calling this |
---|
| 37 | # note no ped file passed so had to remove the -l option |
---|
| 38 | # for plinkParse.py that makes a heterozygosity report from the ped |
---|
| 39 | # file - needs fixing... |
---|
| 40 | # new: importing manhattan/qqplot plotter |
---|
| 41 | # def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir): |
---|
| 42 | # """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0 |
---|
| 43 | # contains some R scripts as text strings - we substitute defaults into the calls |
---|
| 44 | # to make them do our bidding - and save the resulting code for posterity |
---|
| 45 | # this can be called externally, I guess...for QC eg? |
---|
| 46 | # """ |
---|
| 47 | # |
---|
| 48 | # rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey)) |
---|
| 49 | # rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir) |
---|
| 50 | # return rlog,flist |
---|
| 51 | |
---|
| 52 | |
---|
| 53 | from optparse import OptionParser |
---|
| 54 | |
---|
| 55 | import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string |
---|
| 56 | from os.path import abspath |
---|
| 57 | from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD |
---|
| 58 | import rgManQQ |
---|
| 59 | |
---|
| 60 | prog = os.path.split(sys.argv[0])[1] |
---|
| 61 | vers = '0.4 april 2009 rml' |
---|
| 62 | idjoiner = '_~_~_' # need something improbable.. |
---|
| 63 | # many of these may need fixing for a new install |
---|
| 64 | |
---|
| 65 | myversion = vers |
---|
| 66 | keepfrac = 20 # fraction to keep after sorting by each interesting value |
---|
| 67 | |
---|
| 68 | missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change! |
---|
| 69 | |
---|
| 70 | mogresize = "x300" # this controls the width for jpeg thumbnails |
---|
| 71 | |
---|
| 72 | |
---|
| 73 | |
---|
| 74 | |
---|
| 75 | def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''): |
---|
| 76 | """ |
---|
| 77 | marker rhead = ['snp','chrom','maf','a1','a2','missfrac', |
---|
| 78 | 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] |
---|
| 79 | subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest'] |
---|
| 80 | """ |
---|
| 81 | |
---|
| 82 | |
---|
| 83 | def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50): |
---|
| 84 | """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50) |
---|
| 85 | # generic histogram and vertical boxplot in a 3:1 layout |
---|
| 86 | # returns the graphic file name for inclusion in the web page |
---|
| 87 | # broken out here for reuse |
---|
| 88 | # ross lazarus march 2007 |
---|
| 89 | """ |
---|
| 90 | screenmat = (1,2,1,2) # create a 2x2 cabvas |
---|
| 91 | widthlist = (80,20) # change to 4:1 ratio for histo and boxplot |
---|
| 92 | rpy.r.pdf( outfname, height , width ) |
---|
| 93 | #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot |
---|
| 94 | m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True) |
---|
| 95 | # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T) |
---|
| 96 | rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot |
---|
| 97 | maint = 'QC for %s - %s' % (basename,title) |
---|
| 98 | rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8) |
---|
| 99 | rpy.r.boxplot(plotme,main='',col="maroon",outline=False) |
---|
| 100 | rpy.r.dev_off() |
---|
| 101 | |
---|
| 102 | def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100): |
---|
| 103 | """ |
---|
| 104 | Useful to see what various cutoffs yield - plot percentiles |
---|
| 105 | """ |
---|
| 106 | n = len(plotme) |
---|
| 107 | maxveclen = 1000.0 # for reasonable pdf sizes! |
---|
| 108 | yvec = copy.copy(plotme) |
---|
| 109 | # arrives already in decending order of importance missingness or mendel count by subj or marker |
---|
| 110 | xvec = range(n) |
---|
| 111 | xvec = [100.0*(n-x)/n for x in xvec] # convert to centile |
---|
| 112 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 113 | if n > maxveclen: # oversample part of the distribution |
---|
| 114 | always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 115 | skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points |
---|
| 116 | samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values |
---|
| 117 | yvec = [yvec[i] for i in samplei] # always get first and last |
---|
| 118 | xvec = [xvec[i] for i in samplei] # always get first and last |
---|
| 119 | # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure |
---|
| 120 | rpy.r.pdf( outfname, height , width ) |
---|
| 121 | maint = 'QC for %s - %s' % (basename,title) |
---|
| 122 | rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 |
---|
| 123 | rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8) |
---|
| 124 | rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") |
---|
| 125 | rpy.r.dev_off() |
---|
| 126 | |
---|
| 127 | def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 128 | """ |
---|
| 129 | y is data for a qq plot and ends up on the x axis go figure |
---|
| 130 | if sampling, oversample low values - all the top 1% ? |
---|
| 131 | this version called with -log10 transformed hwe |
---|
| 132 | """ |
---|
| 133 | nrows = len(plotme) |
---|
| 134 | fn = float(nrows) |
---|
| 135 | xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))] |
---|
| 136 | mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 137 | maxveclen = 3000 |
---|
| 138 | yvec = copy.copy(plotme) |
---|
| 139 | if nrows > maxveclen: |
---|
| 140 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 141 | # oversample part of the distribution |
---|
| 142 | always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 143 | skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points |
---|
| 144 | samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)] |
---|
| 145 | # always oversample first sorted (here lowest) values |
---|
| 146 | yvec = [yvec[i] for i in samplei] # always get first and last |
---|
| 147 | xvec = [xvec[i] for i in samplei] # and sample xvec same way |
---|
| 148 | maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) |
---|
| 149 | else: |
---|
| 150 | maint='Log QQ Plot(n=%d)' % (nrows) |
---|
| 151 | mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 152 | ylab = '%s' % xlabname |
---|
| 153 | xlab = '-log10(Uniform 0-1)' |
---|
| 154 | # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure |
---|
| 155 | rpy.r.pdf( outfname, height , width ) |
---|
| 156 | rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 |
---|
| 157 | rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) |
---|
| 158 | rpy.r.points(mx,mx,type='l') |
---|
| 159 | rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") |
---|
| 160 | rpy.r.dev_off() |
---|
| 161 | |
---|
| 162 | def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 163 | """ |
---|
| 164 | data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a |
---|
| 165 | grid of qq plots to show departure from null at extremes of data quality |
---|
| 166 | Need to plot qqplot(p,unif) where the p's come from one x and one y quantile |
---|
| 167 | and ends up on the x axis go figure |
---|
| 168 | if sampling, oversample low values - all the top 1% ? |
---|
| 169 | """ |
---|
| 170 | data = copy.copy(plotme) |
---|
| 171 | nvals = len(data) |
---|
| 172 | stepsize = nvals/nsplits |
---|
| 173 | logstep = math.log10(stepsize) # so is 3 for steps of 1000 |
---|
| 174 | quints = range(0,nvals,stepsize) # quintile cutpoints for each layer |
---|
| 175 | data.sort(key=itertools.itemgetter(1)) # into x order |
---|
| 176 | rpy.r.pdf( outfname, height , width ) |
---|
| 177 | rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits)) |
---|
| 178 | yvec = [-math.log10(random.random()) for x in range(stepsize)] |
---|
| 179 | yvec.sort() # size of each step is expected range for xvec under null?! |
---|
| 180 | for rowstart in quints: |
---|
| 181 | rowend = rowstart + stepsize |
---|
| 182 | if nvals - rowend < stepsize: # finish last split |
---|
| 183 | rowend = nvals |
---|
| 184 | row = data[rowstart:rowend] |
---|
| 185 | row.sort(key=itertools.itemgetter(2)) # into y order |
---|
| 186 | for colstart in quints: |
---|
| 187 | colend = colstart + stepsize |
---|
| 188 | if nvals - colend < stepsize: # finish last split |
---|
| 189 | colend = nvals |
---|
| 190 | cell = row[colstart:colend] |
---|
| 191 | xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell |
---|
| 192 | rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8) |
---|
| 193 | rpy.r.points(c(0,logstep),c(0,logstep),type='l') |
---|
| 194 | rpy.r.dev_off() |
---|
| 195 | |
---|
| 196 | |
---|
| 197 | def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 198 | """ |
---|
| 199 | y is data for a qqnorm plot |
---|
| 200 | if sampling, oversample low values - all the top 1% ? |
---|
| 201 | """ |
---|
| 202 | rangeunif = len(plotme) |
---|
| 203 | nunif = 1000 |
---|
| 204 | maxveclen = 3000 |
---|
| 205 | nrows = len(plotme) |
---|
| 206 | data = copy.copy(plotme) |
---|
| 207 | if nrows > maxveclen: |
---|
| 208 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 209 | # oversample part of the distribution |
---|
| 210 | always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 211 | skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points |
---|
| 212 | samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] |
---|
| 213 | # always oversample first sorted (here lowest) values |
---|
| 214 | yvec = [data[i] for i in samplei] # always get first and last |
---|
| 215 | maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) |
---|
| 216 | else: |
---|
| 217 | yvec = data |
---|
| 218 | maint='Log QQ Plot(n=%d)' % (nrows) |
---|
| 219 | n = 1000 |
---|
| 220 | ylab = '%s' % xlabname |
---|
| 221 | xlab = 'Normal' |
---|
| 222 | # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure |
---|
| 223 | rpy.r.pdf( outfname, height , width ) |
---|
| 224 | rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 |
---|
| 225 | rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) |
---|
| 226 | rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") |
---|
| 227 | rpy.r.dev_off() |
---|
| 228 | |
---|
| 229 | def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 230 | """ |
---|
| 231 | layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness |
---|
| 232 | like the GAIN qc tools |
---|
| 233 | y is data for a qq plot and ends up on the x axis go figure |
---|
| 234 | if sampling, oversample low values - all the top 1% ? |
---|
| 235 | """ |
---|
| 236 | rangeunif = len(plotme) |
---|
| 237 | nunif = 1000 |
---|
| 238 | fn = float(rangeunif) |
---|
| 239 | xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))] |
---|
| 240 | skip = max(int(rangeunif/fn),1) |
---|
| 241 | # force include last points |
---|
| 242 | mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 243 | maxveclen = 2000 |
---|
| 244 | nrows = len(plotme) |
---|
| 245 | data = copy.copy(plotme) |
---|
| 246 | data.sort() # low to high - oversample low values |
---|
| 247 | if nrows > maxveclen: |
---|
| 248 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 249 | # oversample part of the distribution |
---|
| 250 | always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 251 | skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points |
---|
| 252 | samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] |
---|
| 253 | # always oversample first sorted (here lowest) values |
---|
| 254 | yvec = [data[i] for i in samplei] # always get first and last |
---|
| 255 | xvec = [xvec[i] for i in samplei] # and sample xvec same way |
---|
| 256 | maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) |
---|
| 257 | else: |
---|
| 258 | yvec = data |
---|
| 259 | maint='Log QQ Plot(n=%d)' % (nrows) |
---|
| 260 | n = 1000 |
---|
| 261 | mx = [0,log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 262 | ylab = '%s' % xlabname |
---|
| 263 | xlab = '-log10(Uniform 0-1)' |
---|
| 264 | # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure |
---|
| 265 | rpy.r.pdf( outfname, height , width ) |
---|
| 266 | rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 |
---|
| 267 | rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) |
---|
| 268 | rpy.r.points(mx,mx,type='l') |
---|
| 269 | rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") |
---|
| 270 | rpy.r.dev_off() |
---|
| 271 | |
---|
| 272 | |
---|
| 273 | fdsto,stofile = tempfile.mkstemp() |
---|
| 274 | sto = open(stofile,'w') |
---|
| 275 | import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb |
---|
| 276 | mog = 'mogrify' |
---|
| 277 | pdfnup = 'pdfnup' |
---|
| 278 | pdfjoin = 'pdfjoin' |
---|
| 279 | shead = subjects.pop(0) # get rid of head |
---|
| 280 | mhead = markers.pop(0) |
---|
| 281 | maf = mhead.index('maf') |
---|
| 282 | missfrac = mhead.index('missfrac') |
---|
| 283 | logphweall = mhead.index('logp_hwe_all') |
---|
| 284 | logphweunaff = mhead.index('logp_hwe_unaff') |
---|
| 285 | # check for at least some unaffected rml june 2009 |
---|
| 286 | m_mendel = mhead.index('N_Mendel') |
---|
| 287 | fracmiss = shead.index('FracMiss') |
---|
| 288 | s_mendel = shead.index('Mendel_errors') |
---|
| 289 | s_het = shead.index('F_Stat') |
---|
| 290 | params = {} |
---|
| 291 | hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff |
---|
| 292 | and x[logphweunaff].upper() <> 'NA'] |
---|
| 293 | if len(hweres) <> 0: |
---|
| 294 | xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] |
---|
| 295 | # plot for each of these cols |
---|
| 296 | else: # try hwe all instead - maybe no affection status available |
---|
| 297 | xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] |
---|
| 298 | ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything! |
---|
| 299 | oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled |
---|
| 300 | qqplotme = [1,0,0,0,0,0,0] # |
---|
| 301 | qnplotme = [0,0,0,0,0,0,1] # |
---|
| 302 | nplots = len(xs) |
---|
| 303 | xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency', |
---|
| 304 | 'Marker Mendel Error Count', 'Missing Rate: Subjects', |
---|
| 305 | 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic'] |
---|
| 306 | plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het'] |
---|
| 307 | ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames |
---|
| 308 | ordplotnames = ['%s_cum' % x for x in plotnames] |
---|
| 309 | ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames |
---|
| 310 | outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)] |
---|
| 311 | ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)] |
---|
| 312 | datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table |
---|
| 313 | titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel", |
---|
| 314 | "Subject Missing Genotype","Subject Mendel",'Subject F Statistic'] |
---|
| 315 | html = [] |
---|
| 316 | pdflist = [] |
---|
| 317 | for n,column in enumerate(xs): |
---|
| 318 | dat = [float(x[column]) for x in datasources[n] if len(x) >= column |
---|
| 319 | and x[column][:2].upper() <> 'NA'] # plink gives both! |
---|
| 320 | if sum(dat) <> 0: # eg nada for mendel if case control? |
---|
| 321 | rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n], |
---|
| 322 | title=titles[n],basename=basename,nbreaks=nbreaks) |
---|
| 323 | row = [titles[n],ploturls[n],outfnames[n] ] |
---|
| 324 | html.append(row) |
---|
| 325 | pdflist.append(outfnames[n]) |
---|
| 326 | if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up |
---|
| 327 | otitle = 'Ranked %s' % titles[n] |
---|
| 328 | dat.sort() |
---|
| 329 | if oreverseme[n]: |
---|
| 330 | dat.reverse() |
---|
| 331 | rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n], |
---|
| 332 | title=otitle,basename=basename,nbreaks=1000) |
---|
| 333 | row = [otitle,ordploturls[n],ordoutfnames[n]] |
---|
| 334 | html.append(row) |
---|
| 335 | pdflist.append(ordoutfnames[n]) |
---|
| 336 | if qqplotme[n]: # |
---|
| 337 | otitle = 'LogQQ plot %s' % titles[n] |
---|
| 338 | dat.sort() |
---|
| 339 | dat.reverse() |
---|
| 340 | ofn = os.path.split(ordoutfnames[n]) |
---|
| 341 | ofn = os.path.join(ofn[0],'QQ%s' % ofn[1]) |
---|
| 342 | ofu = os.path.split(ordploturls[n]) |
---|
| 343 | ofu = os.path.join(ofu[0],'QQ%s' % ofu[1]) |
---|
| 344 | rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n], |
---|
| 345 | title=otitle,basename=basename) |
---|
| 346 | row = [otitle,ofu,ofn] |
---|
| 347 | html.append(row) |
---|
| 348 | pdflist.append(ofn) |
---|
| 349 | elif qnplotme[n]: |
---|
| 350 | otitle = 'F Statistic %s' % titles[n] |
---|
| 351 | dat.sort() |
---|
| 352 | dat.reverse() |
---|
| 353 | ofn = os.path.split(ordoutfnames[n]) |
---|
| 354 | ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1]) |
---|
| 355 | ofu = os.path.split(ordploturls[n]) |
---|
| 356 | ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1]) |
---|
| 357 | rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n], |
---|
| 358 | title=otitle,basename=basename) |
---|
| 359 | row = [otitle,ofu,ofn] |
---|
| 360 | html.append(row) |
---|
| 361 | pdflist.append(ofn) |
---|
| 362 | else: |
---|
| 363 | print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10]) |
---|
| 364 | if nup>0: |
---|
| 365 | # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf` |
---|
| 366 | # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf |
---|
| 367 | filestojoin = ' '.join(pdflist) # all the file names so far |
---|
| 368 | afname = '%s_All_Paged.pdf' % (basename) |
---|
| 369 | fullafname = os.path.join(newfpath,afname) |
---|
| 370 | expl = 'All %s QC Plots joined into a single pdf' % basename |
---|
| 371 | vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname) |
---|
| 372 | # make single page pdf |
---|
| 373 | x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto) |
---|
| 374 | retval = x.wait() |
---|
| 375 | row = [expl,afname,fullafname] |
---|
| 376 | html.insert(0,row) # last rather than second |
---|
| 377 | nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup) |
---|
| 378 | fullnfname = os.path.join(newfpath,nfname) |
---|
| 379 | expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup) |
---|
| 380 | vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname) |
---|
| 381 | # make thumbnail images |
---|
| 382 | x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto) |
---|
| 383 | retval = x.wait() |
---|
| 384 | row = [expl,nfname,fullnfname] |
---|
| 385 | html.insert(1,row) # this goes second |
---|
| 386 | vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf')) |
---|
| 387 | # make thumbnail images |
---|
| 388 | x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto) |
---|
| 389 | retval = x.wait() |
---|
| 390 | sto.close() |
---|
| 391 | cruft = open(stofile,'r').readlines() |
---|
| 392 | return html,cruft # elements for an ordered list of urls or whatever.. |
---|
| 393 | |
---|
| 394 | |
---|
| 395 | def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''): |
---|
| 396 | """ |
---|
| 397 | nice try but the R scripts are huge and take forever to run if there's a lot of data |
---|
| 398 | marker rhead = ['snp','chrom','maf','a1','a2','missfrac', |
---|
| 399 | 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] |
---|
| 400 | subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest'] |
---|
| 401 | """ |
---|
| 402 | colour = "maroon" |
---|
| 403 | |
---|
| 404 | def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks): |
---|
| 405 | """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50) |
---|
| 406 | # generic histogram and vertical boxplot in a 3:1 layout |
---|
| 407 | # returns the graphic file name for inclusion in the web page |
---|
| 408 | # broken out here for reuse |
---|
| 409 | # ross lazarus march 2007 |
---|
| 410 | """ |
---|
| 411 | R = [] |
---|
| 412 | maint = 'QC for %s - %s' % (basename,title) |
---|
| 413 | screenmat = (1,2,1,2) # create a 2x2 canvas |
---|
| 414 | widthlist = (80,20) # change to 4:1 ratio for histo and boxplot |
---|
| 415 | R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) |
---|
| 416 | R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") |
---|
| 417 | R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme) |
---|
| 418 | R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour)) |
---|
| 419 | R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) ) |
---|
| 420 | R.append('dev.off()') |
---|
| 421 | return R |
---|
| 422 | |
---|
| 423 | def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100): |
---|
| 424 | """ |
---|
| 425 | Useful to see what various cutoffs yield - plot percentiles |
---|
| 426 | """ |
---|
| 427 | R = [] |
---|
| 428 | n = len(plotme) |
---|
| 429 | R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme) |
---|
| 430 | # arrives already in decending order of importance missingness or mendel count by subj or marker |
---|
| 431 | maint = 'QC for %s - %s' % (basename,title) |
---|
| 432 | R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) |
---|
| 433 | R.append("par(lab=c(10,10,10))") |
---|
| 434 | R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour)) |
---|
| 435 | R.append('dev.off()') |
---|
| 436 | return R |
---|
| 437 | |
---|
| 438 | def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 439 | """ |
---|
| 440 | y is data for a qq plot and ends up on the x axis go figure |
---|
| 441 | if sampling, oversample low values - all the top 1% ? |
---|
| 442 | this version called with -log10 transformed hwe |
---|
| 443 | """ |
---|
| 444 | R = [] |
---|
| 445 | nrows = len(plotme) |
---|
| 446 | fn = float(nrows) |
---|
| 447 | xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))] |
---|
| 448 | #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 449 | maxveclen = 3000 |
---|
| 450 | yvec = copy.copy(plotme) |
---|
| 451 | if nrows > maxveclen: |
---|
| 452 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 453 | # oversample part of the distribution |
---|
| 454 | always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 455 | skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points |
---|
| 456 | if skip < 2: |
---|
| 457 | skip = 2 |
---|
| 458 | samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)] |
---|
| 459 | # always oversample first sorted (here lowest) values |
---|
| 460 | yvec = [yvec[i] for i in samplei] # always get first and last |
---|
| 461 | xvec = [xvec[i] for i in samplei] # and sample xvec same way |
---|
| 462 | maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) |
---|
| 463 | else: |
---|
| 464 | maint='Log QQ Plot(n=%d)' % (nrows) |
---|
| 465 | mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 466 | ylab = '%s' % xlabname |
---|
| 467 | xlab = '-log10(Uniform 0-1)' |
---|
| 468 | # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure |
---|
| 469 | x = ['%f' % x for x in xvec] |
---|
| 470 | R.append('xvec = c(%s)' % ','.join(x)) |
---|
| 471 | y = ['%f' % x for x in yvec] |
---|
| 472 | R.append('yvec = c(%s)' % ','.join(y)) |
---|
| 473 | R.append('mx = c(0,%f)' % (math.log10(fn))) |
---|
| 474 | R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) |
---|
| 475 | R.append("par(lab=c(10,10,10))") |
---|
| 476 | R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) |
---|
| 477 | R.append('points(mx,mx,type="l")') |
---|
| 478 | R.append('grid(col="lightgray",lty="dotted")') |
---|
| 479 | R.append('dev.off()') |
---|
| 480 | return R |
---|
| 481 | |
---|
| 482 | def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 483 | """ |
---|
| 484 | data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a |
---|
| 485 | grid of qq plots to show departure from null at extremes of data quality |
---|
| 486 | Need to plot qqplot(p,unif) where the p's come from one x and one y quantile |
---|
| 487 | and ends up on the x axis go figure |
---|
| 488 | if sampling, oversample low values - all the top 1% ? |
---|
| 489 | """ |
---|
| 490 | data = copy.copy(plotme) |
---|
| 491 | nvals = len(data) |
---|
| 492 | stepsize = nvals/nsplits |
---|
| 493 | logstep = math.log10(stepsize) # so is 3 for steps of 1000 |
---|
| 494 | R.append('mx = c(0,%f)' % logstep) |
---|
| 495 | quints = range(0,nvals,stepsize) # quintile cutpoints for each layer |
---|
| 496 | data.sort(key=itertools.itemgetter(1)) # into x order |
---|
| 497 | #rpy.r.pdf( outfname, h , w ) |
---|
| 498 | #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits)) |
---|
| 499 | R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) |
---|
| 500 | yvec = [-math.log10(random.random()) for x in range(stepsize)] |
---|
| 501 | yvec.sort() # size of each step is expected range for xvec under null?! |
---|
| 502 | y = ['%f' % x for x in yvec] |
---|
| 503 | R.append('yvec = c(%s)' % ','.join(y)) |
---|
| 504 | for rowstart in quints: |
---|
| 505 | rowend = rowstart + stepsize |
---|
| 506 | if nvals - rowend < stepsize: # finish last split |
---|
| 507 | rowend = nvals |
---|
| 508 | row = data[rowstart:rowend] |
---|
| 509 | row.sort(key=itertools.itemgetter(2)) # into y order |
---|
| 510 | for colstart in quints: |
---|
| 511 | colend = colstart + stepsize |
---|
| 512 | if nvals - colend < stepsize: # finish last split |
---|
| 513 | colend = nvals |
---|
| 514 | cell = row[colstart:colend] |
---|
| 515 | xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell |
---|
| 516 | x = ['%f' % x for x in xvec] |
---|
| 517 | R.append('xvec = c(%s)' % ','.join(x)) |
---|
| 518 | R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) |
---|
| 519 | R.append('points(mx,mx,type="l")') |
---|
| 520 | R.append('grid(col="lightgray",lty="dotted")') |
---|
| 521 | #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8) |
---|
| 522 | #rpy.r.points(c(0,logstep),c(0,logstep),type='l') |
---|
| 523 | R.append('dev.off()') |
---|
| 524 | #rpy.r.dev_off() |
---|
| 525 | return R |
---|
| 526 | |
---|
| 527 | |
---|
| 528 | def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 529 | """ |
---|
| 530 | y is data for a qqnorm plot |
---|
| 531 | if sampling, oversample low values - all the top 1% ? |
---|
| 532 | """ |
---|
| 533 | rangeunif = len(plotme) |
---|
| 534 | nunif = 1000 |
---|
| 535 | maxveclen = 3000 |
---|
| 536 | nrows = len(plotme) |
---|
| 537 | data = copy.copy(plotme) |
---|
| 538 | if nrows > maxveclen: |
---|
| 539 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 540 | # oversample part of the distribution |
---|
| 541 | always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 542 | skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points |
---|
| 543 | samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] |
---|
| 544 | # always oversample first sorted (here lowest) values |
---|
| 545 | yvec = [data[i] for i in samplei] # always get first and last |
---|
| 546 | maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) |
---|
| 547 | else: |
---|
| 548 | yvec = data |
---|
| 549 | maint='Log QQ Plot(n=%d)' % (nrows) |
---|
| 550 | n = 1000 |
---|
| 551 | ylab = '%s' % xlabname |
---|
| 552 | xlab = 'Normal' |
---|
| 553 | # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure |
---|
| 554 | #rpy.r.pdf( outfname, h , w ) |
---|
| 555 | #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 |
---|
| 556 | #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) |
---|
| 557 | #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") |
---|
| 558 | #rpy.r.dev_off() |
---|
| 559 | y = ['%f' % x for x in yvec] |
---|
| 560 | R.append('yvec = c(%s)' % ','.join(y)) |
---|
| 561 | R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) |
---|
| 562 | R.append("par(lab=c(10,10,10))") |
---|
| 563 | R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) |
---|
| 564 | R.append('grid(col="lightgray",lty="dotted")') |
---|
| 565 | R.append('dev.off()') |
---|
| 566 | return R |
---|
| 567 | |
---|
| 568 | def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): |
---|
| 569 | """ |
---|
| 570 | layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness |
---|
| 571 | like the GAIN qc tools |
---|
| 572 | y is data for a qq plot and ends up on the x axis go figure |
---|
| 573 | if sampling, oversample low values - all the top 1% ? |
---|
| 574 | """ |
---|
| 575 | rangeunif = len(plotme) |
---|
| 576 | nunif = 1000 |
---|
| 577 | fn = float(rangeunif) |
---|
| 578 | xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))] |
---|
| 579 | skip = max(int(rangeunif/fn),1) |
---|
| 580 | # force include last points |
---|
| 581 | mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 582 | maxveclen = 2000 |
---|
| 583 | nrows = len(plotme) |
---|
| 584 | data = copy.copy(plotme) |
---|
| 585 | data.sort() # low to high - oversample low values |
---|
| 586 | if nrows > maxveclen: |
---|
| 587 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 588 | # oversample part of the distribution |
---|
| 589 | always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 590 | skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points |
---|
| 591 | samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] |
---|
| 592 | # always oversample first sorted (here lowest) values |
---|
| 593 | yvec = [data[i] for i in samplei] # always get first and last |
---|
| 594 | xvec = [xvec[i] for i in samplei] # and sample xvec same way |
---|
| 595 | maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) |
---|
| 596 | else: |
---|
| 597 | yvec = data |
---|
| 598 | maint='Log QQ Plot(n=%d)' % (nrows) |
---|
| 599 | n = 1000 |
---|
| 600 | mx = [0,log10(fn)] # if 1000, becomes 3 for the null line |
---|
| 601 | ylab = '%s' % xlabname |
---|
| 602 | xlab = '-log10(Uniform 0-1)' |
---|
| 603 | R.append('mx = c(0,%f)' % (math.log10(fn))) |
---|
| 604 | x = ['%f' % x for x in xvec] |
---|
| 605 | R.append('xvec = c(%s)' % ','.join(x)) |
---|
| 606 | y = ['%f' % x for x in yvec] |
---|
| 607 | R.append('yvec = c(%s)' % ','.join(y)) |
---|
| 608 | R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) |
---|
| 609 | R.append("par(lab=c(10,10,10))") |
---|
| 610 | R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) |
---|
| 611 | R.append('points(mx,mx,type="l")') |
---|
| 612 | R.append('grid(col="lightgray",lty="dotted")') |
---|
| 613 | R.append('dev.off()') |
---|
| 614 | |
---|
| 615 | |
---|
| 616 | shead = subjects.pop(0) # get rid of head |
---|
| 617 | mhead = markers.pop(0) |
---|
| 618 | maf = mhead.index('maf') |
---|
| 619 | missfrac = mhead.index('missfrac') |
---|
| 620 | logphweall = mhead.index('logp_hwe_all') |
---|
| 621 | logphweunaff = mhead.index('logp_hwe_unaff') |
---|
| 622 | # check for at least some unaffected rml june 2009 |
---|
| 623 | m_mendel = mhead.index('N_Mendel') |
---|
| 624 | fracmiss = shead.index('FracMiss') |
---|
| 625 | s_mendel = shead.index('Mendel_errors') |
---|
| 626 | s_het = shead.index('F_Stat') |
---|
| 627 | params = {} |
---|
| 628 | h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff |
---|
| 629 | and x[logphweunaff].upper() <> 'NA'] |
---|
| 630 | if len(h) <> 0: |
---|
| 631 | xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] |
---|
| 632 | # plot for each of these cols |
---|
| 633 | else: # try hwe all instead - maybe no affection status available |
---|
| 634 | xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] |
---|
| 635 | ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything! |
---|
| 636 | oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled |
---|
| 637 | qqplotme = [1,0,0,0,0,0,0] # |
---|
| 638 | qnplotme = [0,0,0,0,0,0,1] # |
---|
| 639 | nplots = len(xs) |
---|
| 640 | xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency', |
---|
| 641 | 'Marker Mendel Error Count', 'Missing Rate: Subjects', |
---|
| 642 | 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic'] |
---|
| 643 | plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het'] |
---|
| 644 | ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames |
---|
| 645 | ordplotnames = ['%s_cum' % x for x in plotnames] |
---|
| 646 | ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames |
---|
| 647 | outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)] |
---|
| 648 | ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)] |
---|
| 649 | datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table |
---|
| 650 | titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel", |
---|
| 651 | "Subject Missing Genotype","Subject Mendel",'Subject F Statistic'] |
---|
| 652 | html = [] |
---|
| 653 | pdflist = [] |
---|
| 654 | R = [] |
---|
| 655 | for n,column in enumerate(xs): |
---|
| 656 | dfn = '%d_%s.txt' % (n,titles[n]) |
---|
| 657 | dfilepath = os.path.join(newfpath,dfn) |
---|
| 658 | dat = [float(x[column]) for x in datasources[n] if len(x) >= column |
---|
| 659 | and x[column][:2].upper() <> 'NA'] # plink gives both! |
---|
| 660 | if sum(dat) <> 0: # eg nada for mendel if case control? |
---|
| 661 | plotme = file(dfilepath,'w') |
---|
| 662 | plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout! |
---|
| 663 | tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n], |
---|
| 664 | title=titles[n],basename=basename,nbreaks=nbreaks) |
---|
| 665 | R += tR |
---|
| 666 | row = [titles[n],ploturls[n],outfnames[n] ] |
---|
| 667 | html.append(row) |
---|
| 668 | pdflist.append(outfnames[n]) |
---|
| 669 | if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up |
---|
| 670 | otitle = 'Ranked %s' % titles[n] |
---|
| 671 | dat.sort() |
---|
| 672 | if oreverseme[n]: |
---|
| 673 | dat.reverse() |
---|
| 674 | ndat = len(dat) |
---|
| 675 | xvec = range(ndat) |
---|
| 676 | xvec = [100.0*(n-x)/n for x in xvec] # convert to centile |
---|
| 677 | # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points |
---|
| 678 | maxveclen = 1000.0 # for reasonable pdf sizes! |
---|
| 679 | if ndat > maxveclen: # oversample part of the distribution |
---|
| 680 | always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5% |
---|
| 681 | skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points |
---|
| 682 | samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values |
---|
| 683 | yvec = [yvec[i] for i in samplei] # always get first and last |
---|
| 684 | xvec = [xvec[i] for i in samplei] # always get first and last |
---|
| 685 | plotme = file(dfilepath,'w') |
---|
| 686 | plotme.write('xvec\tyvec\n') |
---|
| 687 | plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout! |
---|
| 688 | tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n], |
---|
| 689 | title=otitle,basename=basename,nbreaks=nbreaks) |
---|
| 690 | R += tR |
---|
| 691 | row = [otitle,ordploturls[n],ordoutfnames[n]] |
---|
| 692 | html.append(row) |
---|
| 693 | pdflist.append(ordoutfnames[n]) |
---|
| 694 | if qqplotme[n]: # |
---|
| 695 | otitle = 'LogQQ plot %s' % titles[n] |
---|
| 696 | dat.sort() |
---|
| 697 | dat.reverse() |
---|
| 698 | ofn = os.path.split(ordoutfnames[n]) |
---|
| 699 | ofn = os.path.join(ofn[0],'QQ%s' % ofn[1]) |
---|
| 700 | ofu = os.path.split(ordploturls[n]) |
---|
| 701 | ofu = os.path.join(ofu[0],'QQ%s' % ofu[1]) |
---|
| 702 | tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n], |
---|
| 703 | title=otitle,basename=basename) |
---|
| 704 | R += tR |
---|
| 705 | row = [otitle,ofu,ofn] |
---|
| 706 | html.append(row) |
---|
| 707 | pdflist.append(ofn) |
---|
| 708 | elif qnplotme[n]: |
---|
| 709 | otitle = 'F Statistic %s' % titles[n] |
---|
| 710 | dat.sort() |
---|
| 711 | dat.reverse() |
---|
| 712 | ofn = os.path.split(ordoutfnames[n]) |
---|
| 713 | ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1]) |
---|
| 714 | ofu = os.path.split(ordploturls[n]) |
---|
| 715 | ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1]) |
---|
| 716 | tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n], |
---|
| 717 | title=otitle,basename=basename) |
---|
| 718 | R += tR |
---|
| 719 | row = [otitle,ofu,ofn] |
---|
| 720 | html.append(row) |
---|
| 721 | pdflist.append(ofn) |
---|
| 722 | else: |
---|
| 723 | print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10]) |
---|
| 724 | rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath) |
---|
| 725 | if nup>0: |
---|
| 726 | # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf` |
---|
| 727 | # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf |
---|
| 728 | filestojoin = ' '.join(pdflist) # all the file names so far |
---|
| 729 | afname = '%s_All_Paged.pdf' % (basename) |
---|
| 730 | fullafname = os.path.join(newfpath,afname) |
---|
| 731 | expl = 'All %s QC Plots joined into a single pdf' % basename |
---|
| 732 | vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname) |
---|
| 733 | # make single page pdf |
---|
| 734 | x=subprocess.Popen(vcl,shell=True,cwd=newfpath) |
---|
| 735 | retval = x.wait() |
---|
| 736 | row = [expl,afname,fullafname] |
---|
| 737 | html.insert(0,row) # last rather than second |
---|
| 738 | nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup) |
---|
| 739 | fullnfname = os.path.join(newfpath,nfname) |
---|
| 740 | expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup) |
---|
| 741 | vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname) |
---|
| 742 | # make thumbnail images |
---|
| 743 | x=subprocess.Popen(vcl,shell=True,cwd=newfpath) |
---|
| 744 | retval = x.wait() |
---|
| 745 | row = [expl,nfname,fullnfname] |
---|
| 746 | html.insert(1,row) # this goes second |
---|
| 747 | vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf')) |
---|
| 748 | # make thumbnail images |
---|
| 749 | x=subprocess.Popen(vcl,shell=True,cwd=newfpath) |
---|
| 750 | retval = x.wait() |
---|
| 751 | return html # elements for an ordered list of urls or whatever.. |
---|
| 752 | |
---|
| 753 | def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'): |
---|
| 754 | """ |
---|
| 755 | NO LONGER USED - historical interest |
---|
| 756 | count het loci for each subject to look for outliers = ? contamination |
---|
| 757 | assume ped file is linkage format |
---|
| 758 | need to make a ped file from the bed file we were passed |
---|
| 759 | """ |
---|
| 760 | vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file |
---|
| 761 | p=subprocess.Popen(' '.join(vcl),shell=True) |
---|
| 762 | retval = p.wait() |
---|
| 763 | f = open('%s_recode.recode.ped' % froot,'r') |
---|
| 764 | if not linkageped: |
---|
| 765 | head = f.next() # throw away header |
---|
| 766 | hets = [] # simple count of het loci per subject. Expect poisson? |
---|
| 767 | n = 1 |
---|
| 768 | for l in f: |
---|
| 769 | n += 1 |
---|
| 770 | ll = l.strip().split() |
---|
| 771 | if len(ll) > 6: |
---|
| 772 | iid = idjoiner.join(ll[:2]) # fam_iid |
---|
| 773 | gender = ll[4] |
---|
| 774 | alleles = ll[6:] |
---|
| 775 | nallele = len(alleles) |
---|
| 776 | nhet = 0 |
---|
| 777 | for i in range(nallele/2): |
---|
| 778 | a1=alleles[2*i] |
---|
| 779 | a2=alleles[2*i+1] |
---|
| 780 | if alleles[2*i] <> alleles[2*i+1]: # must be het |
---|
| 781 | if not missvals.get(a1,None) and not missvals.get(a2,None): |
---|
| 782 | nhet += 1 |
---|
| 783 | hets.append((nhet,iid,gender)) # for sorting later |
---|
| 784 | f.close() |
---|
| 785 | hets.sort() |
---|
| 786 | hets.reverse() # biggest nhet now on top |
---|
| 787 | f = open(outfname ,'w') |
---|
| 788 | res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions |
---|
| 789 | res.insert(0,'nhetloci\tfamid_iid\tgender') |
---|
| 790 | res.append('') |
---|
| 791 | f.write('\n'.join(res)) |
---|
| 792 | f.close() |
---|
| 793 | |
---|
| 794 | |
---|
| 795 | |
---|
| 796 | def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None): |
---|
| 797 | """by subject (missingness = .imiss, mendel = .imendel) |
---|
| 798 | assume replicates have an underscore in family id for |
---|
| 799 | hapmap testing |
---|
| 800 | For sorting, we need floats and integers |
---|
| 801 | """ |
---|
| 802 | isexfile = '%s.sexcheck' % froot |
---|
| 803 | imissfile = '%s.imiss' % froot |
---|
| 804 | imendfile = '%s.imendel' % froot |
---|
| 805 | ihetfile = '%s.het' % froot |
---|
| 806 | logf.write('## subject reports starting at %s\n' % timenow()) |
---|
| 807 | outfile = os.path.join(newfpath,outfname) |
---|
| 808 | idlist = [] |
---|
| 809 | imissdict = {} |
---|
| 810 | imenddict = {} |
---|
| 811 | isexdict = {} |
---|
| 812 | ihetdict = {} |
---|
| 813 | Tops = {} |
---|
| 814 | Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel', |
---|
| 815 | 'Ranked Sex check', 'Ranked Inbreeding (het) F statistic'] |
---|
| 816 | Tsorts = [2,3,6,8] |
---|
| 817 | Treverse = [True,True,True,False] # so first values are worser |
---|
| 818 | #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest'] |
---|
| 819 | ## FID IID MISS_PHENO N_MISS N_GENO F_MISS |
---|
| 820 | ## 1552042370_A 1552042370_A N 5480 549883 0.009966 |
---|
| 821 | ## 1552042410_A 1552042410_A N 1638 549883 0.002979 |
---|
| 822 | |
---|
| 823 | # ------------------missing-------------------- |
---|
| 824 | # imiss has FID IID MISS_PHENO N_MISS F_MISS |
---|
| 825 | # we want F_MISS |
---|
| 826 | try: |
---|
| 827 | f = open(imissfile,'r') |
---|
| 828 | except: |
---|
| 829 | logf.write('# file %s is missing - talk about irony\n' % imissfile) |
---|
| 830 | f = None |
---|
| 831 | if f: |
---|
| 832 | for n,line in enumerate(f): |
---|
| 833 | ll = line.strip().split() |
---|
| 834 | if n == 0: |
---|
| 835 | head = [x.upper() for x in ll] # expect above |
---|
| 836 | fidpos = head.index('FID') |
---|
| 837 | iidpos = head.index('IID') |
---|
| 838 | fpos = head.index('F_MISS') |
---|
| 839 | elif len(ll) >= fpos: # full line |
---|
| 840 | fid = ll[fidpos] |
---|
| 841 | #if fid.find('_') == -1: # not replicate! - icondb ids have these... |
---|
| 842 | iid = ll[iidpos] |
---|
| 843 | fmiss = ll[fpos] |
---|
| 844 | id = '%s%s%s' % (fid,idjoiner,iid) |
---|
| 845 | imissdict[id] = fmiss |
---|
| 846 | idlist.append(id) |
---|
| 847 | f.close() |
---|
| 848 | logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist))) |
---|
| 849 | # ------------------mend------------------- |
---|
| 850 | # *.imendel has FID IID N |
---|
| 851 | # we want N |
---|
| 852 | gotmend = True |
---|
| 853 | try: |
---|
| 854 | f = open(imendfile,'r') |
---|
| 855 | except: |
---|
| 856 | gotmend = False |
---|
| 857 | for id in idlist: |
---|
| 858 | imenddict[id] = '0' |
---|
| 859 | if gotmend: |
---|
| 860 | for n,line in enumerate(f): |
---|
| 861 | ll = line.strip().split() |
---|
| 862 | if n == 0: |
---|
| 863 | head = [x.upper() for x in ll] # expect above |
---|
| 864 | npos = head.index('N') |
---|
| 865 | fidpos = head.index('FID') |
---|
| 866 | iidpos = head.index('IID') |
---|
| 867 | elif len(ll) >= npos: # full line |
---|
| 868 | fid = ll[fidpos] |
---|
| 869 | iid = ll[iidpos] |
---|
| 870 | id = '%s%s%s' % (fid,idjoiner,iid) |
---|
| 871 | nmend = ll[npos] |
---|
| 872 | imenddict[id] = nmend |
---|
| 873 | f.close() |
---|
| 874 | else: |
---|
| 875 | logf.write('## error No %s file - assuming not family data\n' % imendfile) |
---|
| 876 | # ------------------sex check------------------ |
---|
| 877 | #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck |
---|
| 878 | # sexcheck has FID IID PEDSEX SNPSEX STATUS F |
---|
| 879 | ## |
---|
| 880 | ## FID Family ID |
---|
| 881 | ## IID Individual ID |
---|
| 882 | ## PEDSEX Sex as determined in pedigree file (1=male, 2=female) |
---|
| 883 | ## SNPSEX Sex as determined by X chromosome |
---|
| 884 | ## STATUS Displays "PROBLEM" or "OK" for each individual |
---|
| 885 | ## F The actual X chromosome inbreeding (homozygosity) estimate |
---|
| 886 | ## |
---|
| 887 | ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are |
---|
| 888 | ## ambiguous with regard to sex. |
---|
| 889 | ## A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2. |
---|
| 890 | try: |
---|
| 891 | f = open(isexfile,'r') |
---|
| 892 | got_sexcheck = 1 |
---|
| 893 | except: |
---|
| 894 | got_sexcheck = 0 |
---|
| 895 | if got_sexcheck: |
---|
| 896 | for n,line in enumerate(f): |
---|
| 897 | ll = line.strip().split() |
---|
| 898 | if n == 0: |
---|
| 899 | head = [x.upper() for x in ll] # expect above |
---|
| 900 | fidpos = head.index('FID') |
---|
| 901 | iidpos = head.index('IID') |
---|
| 902 | pedsexpos = head.index('PEDSEX') |
---|
| 903 | snpsexpos = head.index('SNPSEX') |
---|
| 904 | statuspos = head.index('STATUS') |
---|
| 905 | fpos = head.index('F') |
---|
| 906 | elif len(ll) >= fpos: # full line |
---|
| 907 | fid = ll[fidpos] |
---|
| 908 | iid = ll[iidpos] |
---|
| 909 | fest = ll[fpos] |
---|
| 910 | pedsex = ll[pedsexpos] |
---|
| 911 | snpsex = ll[snpsexpos] |
---|
| 912 | stat = ll[statuspos] |
---|
| 913 | id = '%s%s%s' % (fid,idjoiner,iid) |
---|
| 914 | isexdict[id] = (pedsex,snpsex,stat,fest) |
---|
| 915 | f.close() |
---|
| 916 | else: |
---|
| 917 | # this only happens if there are no subjects! |
---|
| 918 | logf.write('No %s file - assuming no sex errors' % isexfile) |
---|
| 919 | ## |
---|
| 920 | ## FID IID O(HOM) E(HOM) N(NM) F |
---|
| 921 | ## 457 2 490665 4.928e+05 722154 -0.009096 |
---|
| 922 | ## 457 3 464519 4.85e+05 710986 -0.0908 |
---|
| 923 | ## 1037 2 461632 4.856e+05 712025 -0.106 |
---|
| 924 | ## 1037 1 491845 4.906e+05 719353 0.005577 |
---|
| 925 | try: |
---|
| 926 | f = open(ihetfile,'r') |
---|
| 927 | except: |
---|
| 928 | f = None |
---|
| 929 | logf.write('## No %s file - did we run --het in plink?\n' % ihetfile) |
---|
| 930 | if f: |
---|
| 931 | for i,line in enumerate(f): |
---|
| 932 | ll = line.strip().split() |
---|
| 933 | if i == 0: |
---|
| 934 | head = [x.upper() for x in ll] # expect above |
---|
| 935 | fidpos = head.index('FID') |
---|
| 936 | iidpos = head.index('IID') |
---|
| 937 | fpos = head.index('F') |
---|
| 938 | n = 0 |
---|
| 939 | elif len(ll) >= fpos: # full line |
---|
| 940 | fid = ll[fidpos] |
---|
| 941 | iid = ll[iidpos] |
---|
| 942 | fhet = ll[fpos] |
---|
| 943 | id = '%s%s%s' % (fid,idjoiner,iid) |
---|
| 944 | ihetdict[id] = fhet |
---|
| 945 | f.close() # now assemble and output result list |
---|
| 946 | rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat'] |
---|
| 947 | res = [] |
---|
| 948 | fres = [] # floats for sorting |
---|
| 949 | for id in idlist: # for each snp in found order |
---|
| 950 | fid,iid = id.split(idjoiner) # recover keys |
---|
| 951 | f_missing = imissdict.get(id,'0.0') |
---|
| 952 | nmend = imenddict.get(id,'0') |
---|
| 953 | (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0')) |
---|
| 954 | fhet = ihetdict.get(id,'0.0') |
---|
| 955 | res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet]) |
---|
| 956 | try: |
---|
| 957 | ff_missing = float(f_missing) |
---|
| 958 | except: |
---|
| 959 | ff_missing = 0.0 |
---|
| 960 | try: |
---|
| 961 | inmend = int(nmend) |
---|
| 962 | except: |
---|
| 963 | inmend = 0 |
---|
| 964 | try: |
---|
| 965 | ffest = float(fest) |
---|
| 966 | except: |
---|
| 967 | fest = 0.0 |
---|
| 968 | try: |
---|
| 969 | ffhet = float(fhet) |
---|
| 970 | except: |
---|
| 971 | ffhet = 0.0 |
---|
| 972 | fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet]) |
---|
| 973 | ntokeep = max(20,len(res)/keepfrac) |
---|
| 974 | for i,col in enumerate(Tsorts): |
---|
| 975 | fres.sort(key=operator.itemgetter(col)) |
---|
| 976 | if Treverse[i]: |
---|
| 977 | fres.reverse() |
---|
| 978 | repname = Tnames[i] |
---|
| 979 | Tops[repname] = fres[0:ntokeep] |
---|
| 980 | Tops[repname] = [map(str,x) for x in Tops[repname]] |
---|
| 981 | Tops[repname].insert(0,rhead) |
---|
| 982 | res.sort() |
---|
| 983 | res.insert(0,rhead) |
---|
| 984 | logf.write('### writing %s report with %s' % ( outfile,res[0])) |
---|
| 985 | f = open(outfile,'w') |
---|
| 986 | f.write('\n'.join(['\t'.join(x) for x in res])) |
---|
| 987 | f.write('\n') |
---|
| 988 | f.close() |
---|
| 989 | return res,Tops |
---|
| 990 | |
---|
| 991 | def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ): |
---|
| 992 | """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq) |
---|
| 993 | keep a list of marker order but keep all stats in dicts |
---|
| 994 | write out a fake xls file for R or SAS etc |
---|
| 995 | kinda clunky, but.. |
---|
| 996 | TODO: ensure stable if any file not found? |
---|
| 997 | """ |
---|
| 998 | mapdict = {} |
---|
| 999 | if maplist <> None: |
---|
| 1000 | rslist = [x[1] for x in maplist] |
---|
| 1001 | offset = [(x[0],x[3]) for x in maplist] |
---|
| 1002 | mapdict = dict(zip(rslist,offset)) |
---|
| 1003 | hwefile = '%s.hwe' % froot |
---|
| 1004 | lmissfile = '%s.lmiss' % froot |
---|
| 1005 | freqfile = '%s.frq' % froot |
---|
| 1006 | lmendfile = '%s.lmendel' % froot |
---|
| 1007 | outfile = os.path.join(newfpath,outfname) |
---|
| 1008 | markerlist = [] |
---|
| 1009 | chromlist = [] |
---|
| 1010 | hwedict = {} |
---|
| 1011 | lmissdict = {} |
---|
| 1012 | freqdict = {} |
---|
| 1013 | lmenddict = {} |
---|
| 1014 | Tops = {} |
---|
| 1015 | Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel'] |
---|
| 1016 | Tsorts = [3,6,10,11] |
---|
| 1017 | Treverse = [False,True,True,True] # so first values are worse(r) |
---|
| 1018 | #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend]) |
---|
| 1019 | #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] |
---|
| 1020 | # -------------------hwe-------------------------- |
---|
| 1021 | # hwe has SNP TEST GENO O(HET) E(HET) P_HWD |
---|
| 1022 | # we want all hwe where P_HWD <> NA |
---|
| 1023 | # ah changed in 1.04 to |
---|
| 1024 | ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P |
---|
| 1025 | ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644 |
---|
| 1026 | ## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA |
---|
| 1027 | ## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644 |
---|
| 1028 | ## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975 |
---|
| 1029 | ## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA |
---|
| 1030 | ## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975 |
---|
| 1031 | ## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067 |
---|
| 1032 | ## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA |
---|
| 1033 | ## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067 |
---|
| 1034 | logf.write('## marker reports starting at %s\n' % timenow()) |
---|
| 1035 | try: |
---|
| 1036 | f = open(hwefile,'r') |
---|
| 1037 | except: |
---|
| 1038 | f = None |
---|
| 1039 | logf.write('## error - no hwefile %s found\n' % hwefile) |
---|
| 1040 | if f: |
---|
| 1041 | for i,line in enumerate(f): |
---|
| 1042 | ll = line.strip().split() |
---|
| 1043 | if i == 0: # head |
---|
| 1044 | head = [x.upper() for x in ll] # expect above |
---|
| 1045 | try: |
---|
| 1046 | testpos = head.index('TEST') |
---|
| 1047 | except: |
---|
| 1048 | testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST') |
---|
| 1049 | try: |
---|
| 1050 | ppos = head.index('P') |
---|
| 1051 | except: |
---|
| 1052 | ppos = 8 # patch - for head.index('P') |
---|
| 1053 | snppos = head.index('SNP') |
---|
| 1054 | chrpos = head.index('CHR') |
---|
| 1055 | logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos)) |
---|
| 1056 | elif len(ll) >= ppos: # full line |
---|
| 1057 | ps = ll[ppos].upper() |
---|
| 1058 | rs = ll[snppos] |
---|
| 1059 | chrom = ll[chrpos] |
---|
| 1060 | test = ll[testpos] |
---|
| 1061 | if not hwedict.get(rs,None): |
---|
| 1062 | hwedict[rs] = {} |
---|
| 1063 | markerlist.append(rs) |
---|
| 1064 | chromlist.append(chrom) # one place to find it? |
---|
| 1065 | lpvals = 0 |
---|
| 1066 | if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping |
---|
| 1067 | lpvals = '0' |
---|
| 1068 | if ps <> '1': |
---|
| 1069 | try: |
---|
| 1070 | pval = float(ps) |
---|
| 1071 | lpvals = '%f' % -math.log10(pval) |
---|
| 1072 | except: |
---|
| 1073 | pass |
---|
| 1074 | hwedict[rs][test] = (ps,lpvals) |
---|
| 1075 | else: |
---|
| 1076 | logf.write('short line #%d = %s\n' % (i,ll)) |
---|
| 1077 | f.close() |
---|
| 1078 | # ------------------missing-------------------- |
---|
| 1079 | """lmiss has |
---|
| 1080 | CHR SNP N_MISS N_GENO F_MISS |
---|
| 1081 | 1 rs12354060 0 73 0 |
---|
| 1082 | 1 rs4345758 1 73 0.0137 |
---|
| 1083 | 1 rs2691310 73 73 1 |
---|
| 1084 | 1 rs2531266 73 73 1 |
---|
| 1085 | # we want F_MISS""" |
---|
| 1086 | try: |
---|
| 1087 | f = open(lmissfile,'r') |
---|
| 1088 | except: |
---|
| 1089 | f = None |
---|
| 1090 | if f: |
---|
| 1091 | for i,line in enumerate(f): |
---|
| 1092 | ll = line.strip().split() |
---|
| 1093 | if i == 0: |
---|
| 1094 | head = [x.upper() for x in ll] # expect above |
---|
| 1095 | fracpos = head.index('F_MISS') |
---|
| 1096 | npos = head.index('N_MISS') |
---|
| 1097 | snppos = head.index('SNP') |
---|
| 1098 | elif len(ll) >= fracpos: # full line |
---|
| 1099 | rs = ll[snppos] |
---|
| 1100 | fracval = ll[fracpos] |
---|
| 1101 | lmissdict[rs] = fracval # for now, just that? |
---|
| 1102 | else: |
---|
| 1103 | lmissdict[rs] = 'NA' |
---|
| 1104 | f.close() |
---|
| 1105 | # ------------------freq------------------- |
---|
| 1106 | # frq has CHR SNP A1 A2 MAF NM |
---|
| 1107 | # we want maf |
---|
| 1108 | try: |
---|
| 1109 | f = open(freqfile,'r') |
---|
| 1110 | except: |
---|
| 1111 | f = None |
---|
| 1112 | if f: |
---|
| 1113 | for i,line in enumerate(f): |
---|
| 1114 | ll = line.strip().split() |
---|
| 1115 | if i == 0: |
---|
| 1116 | head = [x.upper() for x in ll] # expect above |
---|
| 1117 | mafpos = head.index('MAF') |
---|
| 1118 | a1pos = head.index('A1') |
---|
| 1119 | a2pos = head.index('A2') |
---|
| 1120 | snppos = head.index('SNP') |
---|
| 1121 | elif len(ll) >= mafpos: # full line |
---|
| 1122 | rs = ll[snppos] |
---|
| 1123 | a1 = ll[a1pos] |
---|
| 1124 | a2 = ll[a2pos] |
---|
| 1125 | maf = ll[mafpos] |
---|
| 1126 | freqdict[rs] = (maf,a1,a2) |
---|
| 1127 | f.close() |
---|
| 1128 | # ------------------mend------------------- |
---|
| 1129 | # lmend has CHR SNP N |
---|
| 1130 | # we want N |
---|
| 1131 | gotmend = True |
---|
| 1132 | try: |
---|
| 1133 | f = open(lmendfile,'r') |
---|
| 1134 | except: |
---|
| 1135 | gotmend = False |
---|
| 1136 | for rs in markerlist: |
---|
| 1137 | lmenddict[rs] = '0' |
---|
| 1138 | if gotmend: |
---|
| 1139 | for i,line in enumerate(f): |
---|
| 1140 | ll = line.strip().split() |
---|
| 1141 | if i == 0: |
---|
| 1142 | head = [x.upper() for x in ll] # expect above |
---|
| 1143 | npos = head.index('N') |
---|
| 1144 | snppos = head.index('SNP') |
---|
| 1145 | elif len(ll) >= npos: # full line |
---|
| 1146 | rs = ll[snppos] |
---|
| 1147 | nmend = ll[npos] |
---|
| 1148 | lmenddict[rs] = nmend |
---|
| 1149 | f.close() |
---|
| 1150 | else: |
---|
| 1151 | logf.write('No %s file - assuming not family data\n' % lmendfile) |
---|
| 1152 | # now assemble result list |
---|
| 1153 | rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] |
---|
| 1154 | res = [] |
---|
| 1155 | fres = [] |
---|
| 1156 | for rs in markerlist: # for each snp in found order |
---|
| 1157 | f_missing = lmissdict.get(rs,'NA') |
---|
| 1158 | maf,a1,a2 = freqdict.get(rs,('NA','NA','NA')) |
---|
| 1159 | hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change... |
---|
| 1160 | hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA')) |
---|
| 1161 | nmend = lmenddict.get(rs,'NA') |
---|
| 1162 | (chrom,offset)=mapdict.get(rs,('?','0')) |
---|
| 1163 | res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend]) |
---|
| 1164 | ntokeep = max(10,len(res)/keepfrac) |
---|
| 1165 | |
---|
| 1166 | def msortk(item=None): |
---|
| 1167 | """ |
---|
| 1168 | deal with non numeric sorting |
---|
| 1169 | """ |
---|
| 1170 | try: |
---|
| 1171 | return float(item) |
---|
| 1172 | except: |
---|
| 1173 | return item |
---|
| 1174 | |
---|
| 1175 | for i,col in enumerate(Tsorts): |
---|
| 1176 | res.sort(key=msortk(lambda x:x[col])) |
---|
| 1177 | if Treverse[i]: |
---|
| 1178 | res.reverse() |
---|
| 1179 | repname = Tnames[i] |
---|
| 1180 | Tops[repname] = res[0:ntokeep] |
---|
| 1181 | Tops[repname].insert(0,rhead) |
---|
| 1182 | res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order |
---|
| 1183 | res.insert(0,rhead) |
---|
| 1184 | f = open(outfile,'w') |
---|
| 1185 | f.write('\n'.join(['\t'.join(x) for x in res])) |
---|
| 1186 | f.close() |
---|
| 1187 | return res,Tops |
---|
| 1188 | |
---|
| 1189 | |
---|
| 1190 | |
---|
| 1191 | |
---|
| 1192 | def getfSize(fpath,outpath): |
---|
| 1193 | """ |
---|
| 1194 | format a nice file size string |
---|
| 1195 | """ |
---|
| 1196 | size = '' |
---|
| 1197 | fp = os.path.join(outpath,fpath) |
---|
| 1198 | if os.path.isfile(fp): |
---|
| 1199 | n = float(os.path.getsize(fp)) |
---|
| 1200 | if n > 2**20: |
---|
| 1201 | size = ' (%1.1f MB)' % (n/2**20) |
---|
| 1202 | elif n > 2**10: |
---|
| 1203 | size = ' (%1.1f KB)' % (n/2**10) |
---|
| 1204 | elif n > 0: |
---|
| 1205 | size = ' (%d B)' % (int(n)) |
---|
| 1206 | return size |
---|
| 1207 | |
---|
| 1208 | |
---|
| 1209 | if __name__ == "__main__": |
---|
| 1210 | u = """ called in xml as |
---|
| 1211 | <command interpreter="python"> |
---|
| 1212 | rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix" |
---|
| 1213 | -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink' |
---|
| 1214 | -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R' |
---|
| 1215 | </command> |
---|
| 1216 | |
---|
| 1217 | Prepare a qc report - eg: |
---|
| 1218 | print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog) |
---|
| 1219 | |
---|
| 1220 | """ |
---|
| 1221 | progname = os.path.basename(sys.argv[0]) |
---|
| 1222 | if len(sys.argv) < 9: |
---|
| 1223 | print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv) |
---|
| 1224 | sys.exit(1) |
---|
| 1225 | parser = OptionParser(usage=u, version="%prog 0.01") |
---|
| 1226 | a = parser.add_option |
---|
| 1227 | a("-i","--infile",dest="infile") |
---|
| 1228 | a("-o","--oprefix",dest="opref") |
---|
| 1229 | a("-l","--plinkexe",dest="plinke", default=plinke) |
---|
| 1230 | a("-r","--rexe",dest="rexe", default=rexe) |
---|
| 1231 | a("-s","--snps",dest="htmlf") |
---|
| 1232 | #a("-m","--markerRaw",dest="markf") |
---|
| 1233 | #a("-r","--rawsubject",dest="subjf") |
---|
| 1234 | a("-p","--patho",dest="newfpath") |
---|
| 1235 | (options,args) = parser.parse_args() |
---|
| 1236 | basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below |
---|
| 1237 | killme = string.punctuation + string.whitespace |
---|
| 1238 | trantab = string.maketrans(killme,'_'*len(killme)) |
---|
| 1239 | opref = options.opref.translate(trantab) |
---|
| 1240 | alogh,alog = tempfile.mkstemp(suffix='.txt') |
---|
| 1241 | plogh,plog = tempfile.mkstemp(suffix='.txt') |
---|
| 1242 | alogf = open(alog,'w') |
---|
| 1243 | plogf = open(plog,'w') |
---|
| 1244 | ahtmlf = options.htmlf |
---|
| 1245 | amarkf = 'MarkerDetails_%s.xls' % opref |
---|
| 1246 | asubjf = 'SubjectDetails_%s.xls' % opref |
---|
| 1247 | newfpath = options.newfpath |
---|
| 1248 | newfpath = os.path.realpath(newfpath) |
---|
| 1249 | try: |
---|
| 1250 | os.makedirs(newfpath) |
---|
| 1251 | except: |
---|
| 1252 | pass |
---|
| 1253 | ofn = basename |
---|
| 1254 | bfn = options.infile |
---|
| 1255 | try: |
---|
| 1256 | mapf = '%s.bim' % bfn |
---|
| 1257 | maplist = file(mapf,'r').readlines() |
---|
| 1258 | maplist = [x.split() for x in maplist] |
---|
| 1259 | except: |
---|
| 1260 | maplist = None |
---|
| 1261 | alogf.write('## error - cannot open %s to read map - no offsets will be available for output files') |
---|
| 1262 | #rerla@beast galaxy]$ head test-data/tinywga.bim |
---|
| 1263 | #22 rs2283802 0 21784722 4 2 |
---|
| 1264 | #22 rs2267000 0 21785366 4 2 |
---|
| 1265 | rgbin = os.path.split(rexe)[0] # get our rg bin path |
---|
| 1266 | #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug! |
---|
| 1267 | # if we could, do all at once? Nope. Probably never. |
---|
| 1268 | plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'], |
---|
| 1269 | ['--mendel',],['--check-sex',]] |
---|
| 1270 | vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0'] |
---|
| 1271 | runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase) |
---|
| 1272 | plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename], |
---|
| 1273 | ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)], |
---|
| 1274 | ['--bfile ldp_%s --het --out %s' % (basename,basename)]] |
---|
| 1275 | # subset of ld independent markers for eigenstrat and other requirements |
---|
| 1276 | vclbase = [options.plinke,'--noweb'] |
---|
| 1277 | plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase) |
---|
| 1278 | plogf.write('\n'.join(plogout)) |
---|
| 1279 | plogf.write('\n') |
---|
| 1280 | repout = os.path.join(newfpath,basename) |
---|
| 1281 | subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath, |
---|
| 1282 | logf=alogf) # writes the subject_froot.xls file |
---|
| 1283 | markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath, |
---|
| 1284 | logf=alogf,maplist=maplist) # marker_froot.xls |
---|
| 1285 | nbreaks = 100 |
---|
| 1286 | s = '## starting plotpage, newfpath=%s,m=%s,s=%s/n' % (newfpath,markers[:2],subjects[:2]) |
---|
| 1287 | alogf.write(s) |
---|
| 1288 | print s |
---|
| 1289 | plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath, |
---|
| 1290 | basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin) |
---|
| 1291 | #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe) |
---|
| 1292 | |
---|
| 1293 | # [titles[n],plotnames[n],outfnames[n] ] |
---|
| 1294 | html = [] |
---|
| 1295 | html.append('<table cellpadding="5" border="0">') |
---|
| 1296 | size = getfSize(amarkf,newfpath) |
---|
| 1297 | html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \ |
---|
| 1298 | (amarkf,'Click here to download the Marker QC Detail report file',size)) |
---|
| 1299 | size = getfSize(asubjf,newfpath) |
---|
| 1300 | html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \ |
---|
| 1301 | (asubjf,'Click here to download the Subject QC Detail report file',size)) |
---|
| 1302 | for (title,url,ofname) in plotpage: |
---|
| 1303 | ttitle = 'Ranked %s' % title |
---|
| 1304 | dat = subjectTops.get(ttitle,None) |
---|
| 1305 | if not dat: |
---|
| 1306 | dat = markerTops.get(ttitle,None) |
---|
| 1307 | imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf |
---|
| 1308 | thumbnail = os.path.join(newfpath,imghref) |
---|
| 1309 | if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack |
---|
| 1310 | imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg |
---|
| 1311 | thumbnail = os.path.join(newfpath,imghref) |
---|
| 1312 | if not os.path.exists(thumbnail): |
---|
| 1313 | html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title)) |
---|
| 1314 | else: |
---|
| 1315 | html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \ |
---|
| 1316 | % (url,imghref,title)) |
---|
| 1317 | if dat: # one or the other - write as an extra file and make a link here |
---|
| 1318 | t = '%s.xls' % (ttitle.replace(' ','_')) |
---|
| 1319 | fname = os.path.join(newfpath,t) |
---|
| 1320 | f = file(fname,'w') |
---|
| 1321 | f.write('\n'.join(['\t'.join(x) for x in dat])) # the report |
---|
| 1322 | size = getfSize(t,newfpath) |
---|
| 1323 | html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size)) |
---|
| 1324 | else: |
---|
| 1325 | html.append('</a></td><td>%s</td><td> </td></tr>' % (title)) |
---|
| 1326 | html.append('</table><hr><h3>All output files from the QC run are available below</h3>') |
---|
| 1327 | html.append('<table cellpadding="5" border="0">\n') |
---|
| 1328 | flist = os.listdir(newfpath) # we want to catch 'em all |
---|
| 1329 | flist.sort() |
---|
| 1330 | for f in flist: |
---|
| 1331 | fname = os.path.split(f)[-1] |
---|
| 1332 | size = getfSize(fname,newfpath) |
---|
| 1333 | html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size)) |
---|
| 1334 | html.append('</table>') |
---|
| 1335 | alogf.close() |
---|
| 1336 | plogf.close() |
---|
| 1337 | llog = open(alog,'r').readlines() |
---|
| 1338 | plogfile = open(plog,'r').readlines() |
---|
| 1339 | os.unlink(alog) |
---|
| 1340 | os.unlink(plog) |
---|
| 1341 | llog += plogfile # add lines from pruneld log |
---|
| 1342 | lf = file(ahtmlf,'w') # galaxy will show this as the default view |
---|
| 1343 | lf.write(galhtmlprefix % progname) |
---|
| 1344 | s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow()) |
---|
| 1345 | lf.write('<h4>%s</h4>\n' % s) |
---|
| 1346 | lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n') |
---|
| 1347 | lf.write('\n'.join(html)) |
---|
| 1348 | lf.write('<h4>QC run log contents</h4>') |
---|
| 1349 | lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs |
---|
| 1350 | if len(cruft) > 0: |
---|
| 1351 | lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup |
---|
| 1352 | lf.write('%s\n<hr>\n' % galhtmlpostfix) |
---|
| 1353 | lf.close() |
---|
| 1354 | |
---|