""" # july 2009: Need to see outliers so need to draw them last? # could use clustering on the zscores to guess real relationships for unrelateds # but definitely need to draw last # added MAX_SHOW_ROWS to limit the length of the main report page # Changes for Galaxy integration # added more robust knuth method for one pass mean and sd # no difference really - let's use scipy.mean() and scipy.std() instead... # fixed labels and changed to .xls for outlier reports so can open in excel # interesting - with a few hundred subjects, 5k gives good resolution # and 100k gives better but not by much # TODO remove non autosomal markers # TODO it would be best if label had the zmean and zsd as these are what matter for # outliers rather than the group mean/sd # mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots # to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log # so the result should be an HTML file # rgIBS.py # use a random subset of markers for a quick ibs # to identify sample dups and closely related subjects # try snpMatrix and plink and see which one works best for us? # abecasis grr plots mean*sd for every subject to show clusters # mods june 23 rml to avoid non-autosomal markers # we seem to be distinguishing parent-child by gender - 2 clouds! snpMatrix from David Clayton has: ibs.stats function to calculate the identity-by-state stats of a group of samples Description Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics about the relatedness of every pair of samples within. Usage ibs.stats(x) 8 ibs.stats Arguments x a snp.matrix-class or a X.snp.matrix-class object containing N samples Details No-calls are excluded from consideration here. Value A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated by a comma, and the columns are: Count count of identical calls, exclusing no-calls Fraction fraction of identical calls comparied to actual calls being made in both samples Warning In some applications, it may be preferable to subset a (random) selection of SNPs first - the calculation time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for simple applications such as checking for duplicate or related samples. Note This is mostly written to find mislabelled and/or duplicate samples. Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most purpose it is undesirable to use these SNPs for IBS purposes. TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility """ import sys,os,time,random,string,copy,optparse try: set except NameError: from Sets import Set as set from rgutils import timenow,plinke import plinkbinJZ opts = None verbose = False showPolygons = False class NullDevice: def write(self, s): pass tempstderr = sys.stderr # save #sys.stderr = NullDevice() # need to avoid blather about deprecation and other strange stuff from scipy # the current galaxy job runner assumes that # the job is in error if anything appears on sys.stderr # grrrrr. James wants to keep it that way instead of using the # status flag for some strange reason. 
import numpy
import scipy
from scipy import weave

sys.stderr = tempstderr

PROGNAME = os.path.split(sys.argv[0])[-1]
X_AXIS_LABEL = 'Mean Alleles Shared'
Y_AXIS_LABEL = 'SD Alleles Shared'
LEGEND_ALIGN = 'topleft'
LEGEND_TITLE = 'Relationship'
DEFAULT_SYMBOL_SIZE = 1.0  # default symbol size
DEFAULT_SYMBOL_SIZE = 0.5  # overrides the line above - 0.5 is the value actually used

### Some colors for R/rpy
R_BLACK = 1
R_RED = 2
R_GREEN = 3
R_BLUE = 4
R_CYAN = 5
R_PURPLE = 6
R_YELLOW = 7
R_GRAY = 8

### ... and some point-styles

###
PLOT_HEIGHT = 600
PLOT_WIDTH = 1150

#SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson')
#SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray')
SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen','lightgreen','gold','gray')
# dupe, parentchild, sibpair, halfsib, parents, unrel, unkn
#('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray')

OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD',
                        'RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped']
OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list)
TABLE_HEADER = 'fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'

### Relationship codes, text, and lookups/mappings
N_RELATIONSHIP_TYPES = 7
REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES)
REL_LOOKUP = {
    REL_DUPE:        ('dupe',        R_BLUE,   1),
    REL_PARENTCHILD: ('parentchild', R_YELLOW, 1),
    REL_SIBS:        ('sibpairs',    R_RED,    1),
    REL_HALFSIBS:    ('halfsibs',    R_GREEN,  1),
    REL_RELATED:     ('parents',     R_PURPLE, 1),
    REL_UNRELATED:   ('unrelated',   R_CYAN,   1),
    REL_UNKNOWN:     ('unknown',     R_GRAY,   1),
}
OUTLIER_STDEVS = {
    REL_DUPE:        2,
    REL_PARENTCHILD: 2,
    REL_SIBS:        2,
    REL_HALFSIBS:    2,
    REL_RELATED:     2,
    REL_UNRELATED:   3,
    REL_UNKNOWN:     2,
}
# note now Z can be passed in

REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)]
REL_COLORS = SVG_COLORS
REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)]

DEFAULT_MAX_SAMPLE_SIZE = 10000

REF_COUNT_HOM1 = 3
REF_COUNT_HET = 2
REF_COUNT_HOM2 = 1
MISSING = 0
MAX_SHOW_ROWS = 100  # framingham has millions - delays showing output page - so truncate and explain
MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0
MARKER_PAIRS_PER_SECOND_FAST = 70000000.0

# NOTE: galhtmlprefix originally held the standard Galaxy HTML page header (doctype,
# <head>, stylesheet links) with a %s slot for the program name; that markup was lost
# in extraction, so only a minimal placeholder remains.
galhtmlprefix = """<!-- galhtmlprefix placeholder: original Galaxy page header lost; %s = program name -->
""" SVG_HEADER = ''' Developer Works Dynamic Scatter Graph Scaling Example Given Pair Relationship Zscore gt 15 Zscore 4 to 15 Zscore lt 4 Mean Alleles Shared 1.0 1.25 1.5 1.75 2.0 SD Alleles Shared 1.0 0.75 0.5 0.25 0.0 %s ''' SVG_FOOTER = ''' unrelated mean=1.5 +/- 0.04 sdev=0.7 +/- 0.03 npairs=1152 ngenos=4783 +/- 24 (min=1000, max=5000) sibpairs s1=fid1,iid1 s2=fid2,iid2 mean=1.82 sdev=0.7 ngeno=4487 relmean=1.85 relsdev=0.65 ''' DEFAULT_MAX_SAMPLE_SIZE = 5000 REF_COUNT_HOM1 = 3 REF_COUNT_HET = 2 REF_COUNT_HOM2 = 1 MISSING = 0 MARKER_PAIRS_PER_SECOND_SLOW = 15000000 MARKER_PAIRS_PER_SECOND_FAST = 70000000 POLYGONS = { REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)), REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)), REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)), REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)), REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)), } def distance(point1, point2): """ Calculate the distance between two points """ (x1,y1) = [float(d) for d in point1] (x2,y2) = [float(d) for d in point2] dx = abs(x1 - x2) dy = abs(y1 - y2) return math.sqrt(dx**2 + dy**2) def point_inside_polygon(x, y, poly): """ Determine if a point (x,y) is inside a given polygon or not poly is a list of (x,y) pairs. Taken from: http://www.ariel.com.au/a/python-point-int-poly.html """ n = len(poly) inside = False p1x,p1y = poly[0] for i in range(n+1): p2x,p2y = poly[i % n] if y > min(p1y,p2y): if y <= max(p1y,p2y): if x <= max(p1x,p2x): if p1y != p2y: xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x if p1x == p2x or x <= xinters: inside = not inside p1x,p1y = p2x,p2y return inside def readMap(pedfile): """ """ mapfile = pedfile.replace('.ped', '.map') marker_list = [] if os.path.exists(mapfile): print 'readMap: %s' % (mapfile) fh = file(mapfile, 'r') for line in fh: marker_list.append(line.strip().split()) fh.close() print 'readMap: %s markers' % (len(marker_list)) return marker_list def calcMeanSD(useme): """ A numerically stable algorithm is given below. It also computes the mean. 
    This algorithm is due to Knuth,[1] who cites Welford.[2]

        n = 0
        mean = 0
        M2 = 0
        foreach x in data:
            n = n + 1
            delta = x - mean
            mean = mean + delta/n
            M2 = M2 + delta*(x - mean)   // This expression uses the new value of mean
        end for
        variance_n = M2/n
        variance = M2/(n - 1)
    """
    mean = 0.0
    M2 = 0.0
    sd = 0.0
    n = len(useme)
    if n > 1:
        for i, x in enumerate(useme):
            delta = x - mean
            mean = mean + delta/(i+1)   # Knuth uses n += 1 at the start of the loop
            M2 = M2 + delta*(x - mean)  # This expression uses the new value of mean
        variance = M2/(n-1)             # assume this is a sample, so lose 1 DOF
        sd = pow(variance, 0.5)
    return mean, sd


def doIBSpy(ped=None, basename='', outdir=None, logf=None,
            nrsSamples=10000, title='title', pdftoo=0, Zcutoff=2.0):
    #def doIBS(pedName, title, nrsSamples=None, pdftoo=False):
    """ started with snpMatrix but GRR uses actual IBS counts and sd's """
    repOut = []  # text strings to add to the html display
    refallele = {}
    tblf = '%s_table.xls' % (title)
    tbl = file(os.path.join(outdir, tblf), 'w')
    tbl.write(TABLE_HEADER)
    svgf = '%s.svg' % (title)
    svg = file(os.path.join(outdir, svgf), 'w')
    nMarkers = len(ped._markers)
    if nMarkers < 5:
        print >> sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers, PROGNAME)
        sys.exit(1)
    nSubjects = len(ped._subjects)
    nrsSamples = min(nMarkers, nrsSamples)
    if opts and opts.use_mito:
        markers = range(nMarkers)
        nrsSamples = min(len(markers), nrsSamples)
        sampleIndexes = sorted(random.sample(markers, nrsSamples))
    else:
        autosomals = ped.autosomal_indices()
        nrsSamples = min(len(autosomals), nrsSamples)
        sampleIndexes = sorted(random.sample(autosomals, nrsSamples))
    print ''
    print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers)
    npairs = (nSubjects*(nSubjects-1))/2  # total rows in table
    newfiles = [svgf, tblf]
    explanations = ['rgGRR Plot (requires SVG)', 'Mean by SD alleles shared - %d rows' % npairs]
    # these go with the output file links in the html file
    s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples)
    logf.write(s)
    minUsegenos = nrsSamples/2  # must have half? minimum comparable genotypes for a pair to be used
    nGenotypes = nSubjects*nrsSamples
    stime = time.time()
    emptyRows = set()
    genos = numpy.zeros((nSubjects, nrsSamples), dtype=int)
    for s in xrange(nSubjects):
        nValid = 0
        #getGenotypesByIndices(self, s, mlist, format)
        genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref')
        nValid = sum([1 for g in genos[s] if g])
        if not nValid:
            emptyRows.add(s)
            sub = ped.getSubject(s)
            print 'All missing for row %d (%s)' % (s, sub)
            logf.write('All missing for row %d (%s)\n' % (s, sub))
    rtime = time.time() - stime
    if verbose:
        print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime)

    ### Now the expensive part.  For each pair of subjects, we get the mean number
    ### and standard deviation of shared alleles over all of the markers where both
    ### subjects have a known genotype.  Identical subjects should have mean shared
    ### alleles very close to 2.0 with a standard deviation very close to 0.0.
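    # --- illustrative aside (not part of the original tool) -----------------------
    # A minimal pure-numpy sketch of the per-pair statistic computed below by the
    # scipy.weave C kernel, assuming genotypes are ref-allele counts (0 = missing,
    # 1..3 otherwise) so that shared alleles = 2 - |g1 - g2|.  The helper name
    # _pair_mean_sd_py is hypothetical; it is a slow fallback, not the tool's method.
    def _pair_mean_sd_py(g1, g2):
        ok = (g1 != MISSING) & (g2 != MISSING)    # markers called in both subjects
        ibs = 2 - numpy.abs(g1[ok] - g2[ok])      # shared alleles per marker (0, 1 or 2)
        ngeno = int(ok.sum())
        if ngeno < 2:
            return ngeno, 0.0, 0.0
        return ngeno, float(ibs.mean()), float(ibs.std(ddof=1))  # sample sd, as in calcMeanSD
    # -------------------------------------------------------------------------------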
    tot = nSubjects*(nSubjects-1)/2
    nprog = tot/10
    nMarkerpairs = tot * nrsSamples
    estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW
    estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST
    pairs = []
    pair_data = {}
    means = []    ## Mean IBS for each pair
    ngenoL = []   ## Count of comparable genotypes for each pair
    sdevs = []    ## Standard dev for each pair
    rels = []     ## A relationship code for each pair
    zmeans = [0.0 for x in xrange(tot)]  ## zmean score for each pair for the relgroup
    zstds = [0.0 for x in xrange(tot)]   ## zstd score for each pair for the relgrp
    skip = set()
    ndone = 0  ## How many have been done so far
    logf.write('Calculating %d pairs...\n' % (tot))
    logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow))
    t1sum = 0
    t2sum = 0
    t3sum = 0
    now = time.time()
    scache = {}
    _founder_cache = {}
    # NOTE: the body of this scipy.weave C snippet was damaged in extraction (the marker
    # loop and the test on ngeno were collapsed).  It is reconstructed here from the
    # surrounding description: IBS for a marker is 2 - |g1 - g2| for ref-count coded
    # genotypes, no-calls (0) are skipped, and a Welford-style update accumulates the
    # mean and sd of shared alleles for the pair.
    C_CODE = """
#include "math.h"
int i;
int a1;
int a2;
int d;
int ibs;
int sumibs = 0;
int ssqibs = 0;
int ngeno = 0;
float mean = 0;
float M2 = 0;
float delta = 0;
float sdev = 0;
float variance = 0;
for (i=0; i<nrsSamples; i++) {
    a1 = g1[i];
    a2 = g2[i];
    if (a1 != 0 && a2 != 0) {
        d = a1 - a2;
        if (d < 0) d = -d;
        ibs = 2 - d;
        ngeno += 1;
        sumibs += ibs;
        ssqibs += ibs*ibs;
        delta = ibs - mean;
        mean = mean + delta/ngeno;
        M2 = M2 + delta*(ibs - mean);
    }
}
if (ngeno > 1) {
    variance = M2/(ngeno-1);
    sdev = sqrt(variance);
    //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev);
}
//printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev);
result[0] = ngeno;
result[1] = mean;
result[2] = sdev;
return_val = ngeno;
"""
    started = time.time()
    for s1 in xrange(nSubjects):
        if s1 in emptyRows:
            continue
        (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1))
        isFounder1 = _founder_cache.setdefault(s1, (did1 == mid1))
        g1 = genos[s1]
        for s2 in xrange(s1+1, nSubjects):
            if s2 in emptyRows:
                continue
            t1s = time.time()
            (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2))
            g2 = genos[s2]
            isFounder2 = _founder_cache.setdefault(s2, (did2 == mid2))
            # Determine the relationship for this pair
            relcode = REL_UNKNOWN
            if (fid2 == fid1):
                if iid1 == iid2:
                    relcode = REL_DUPE
                elif (did2 == did1) and (mid2 == mid1) and did1 != mid1:
                    relcode = REL_SIBS
                elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1):
                    relcode = REL_PARENTCHILD
                elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)):
                    relcode = REL_HALFSIBS
                else:
                    # People in the same family should be marked as some other
                    # form of related.  In general, these people will have a
                    # pretty random spread of similarity.  This distinction is
                    # probably not very useful most of the time.
                    relcode = REL_RELATED
            else:
                ### Different families
                relcode = REL_UNRELATED
            t1e = time.time()
            t1sum += t1e-t1s

            ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count
            ### the number of contributing genotypes.  These values are not actually
            ### calculated here, but instead are looked up in a table for speed.
            ### FIXME: This is still too slow ...
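            # NOTE (editorial comment): the lookup-table remark above appears to be
            # historical - in this version the per-pair count, mean and sd of shared
            # alleles are computed by the inline C kernel in C_CODE via scipy.weave.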
            result = [0.0, 0.0, 0.0]
            ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result'])
            if ngeno >= minUsegenos:
                _, mean, sdev = result
                means.append(mean)
                sdevs.append(sdev)
                ngenoL.append(ngeno)
                pairs.append((s1, s2))
                rels.append(relcode)
            else:
                skip.add(ndone)  # signal no comparable genotypes for this pair
            ndone += 1
            t2e = time.time()
            t2sum += t2e-t1e
            t3e = time.time()
            t3sum += t3e-t2e
    logme = ['T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum), 'TOT: %s' % (t3e-now),
             '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' %
             (len(skip), float(len(skip))/float(tot)*100)]
    logf.write('%s\n' % '\t'.join(logme))

    ### Calculate mean and standard deviation of scores on a per relationship
    ### type basis, allowing us to flag outliers for each particular relationship
    ### type
    relstats = {}
    relCounts = {}
    outlierFiles = {}
    for relCode, relInfo in REL_LOOKUP.items():
        relName, relColor, relStyle = relInfo
        useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode]
        relCounts[relCode] = len(useme)
        mm = scipy.mean(useme)
        ms = scipy.std(useme)
        useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode]
        sm = scipy.mean(useme)
        ss = scipy.std(useme)
        relstats[relCode] = {'sd': (sm, ss), 'mean': (mm, ms)}
        s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \
            (relName, relCounts[relCode], mm, ms, sm, ss)
        logf.write(s)

    ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|)
    ### within each group, for each pair, z=(groupmean-pairmean)/groupsd
    available = len(means)
    logf.write('%d pairs are available of %d\n' % (available, tot))
    ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n'
    ### logf.write(s)
    pairnum = 0
    offset = 0
    nOutliers = 0
    cexs = []
    outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)])
    zsdmax = 0
    for s1 in range(nSubjects):
        if s1 in emptyRows:
            continue
        (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1]
        for s2 in range(s1+1, nSubjects):
            if s2 in emptyRows:
                continue
            if pairnum not in skip:
                ### Get group stats for this relationship
                (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2]
                try:
                    r = rels[offset]
                except IndexError:
                    logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d\n' %
                               (offset, available, pairnum, len(rels)))
                notfound = ('?', ('?', '0', '0'))
                relInfo = REL_LOOKUP.get(r, notfound)
                relName, relColor, relStyle = relInfo
                rmm, rmd = relstats[r]['mean']  # group mean, group meansd alleles shared
                rdm, rdd = relstats[r]['sd']    # group sdmean, group sdsd alleles shared
                try:
                    zsd = (sdevs[offset] - rdm)/rdd  # distance from group mean in group sd units
                except:
                    zsd = 1
                if abs(zsd) > zsdmax:
                    zsdmax = zsd  # keep for sort scaling
                try:
                    zmean = (means[offset] - rmm)/rmd  # distance from group mean
                except:
                    zmean = 1
                zmeans[offset] = zmean
                zstds[offset] = zsd
                pid = (s1, s2)
                zrad = max(zsd, zmean)
                if zrad < 4:
                    zrad = 2
                elif 4 < zrad < 15:
                    zrad = 3  # to 9
                else:  # > 15 6=24+
                    zrad = zrad/4
                    zrad = min(zrad, 6)  # scale limit
                zrad = max(2, max(zsd, zmean))  # as > 2, z grows
                pair_data[pid] = (zmean, zsd, r, zrad)
                mean = means[offset]
                sdev = sdevs[offset]
                if max(zsd, zmean) > Zcutoff:  # is potentially interesting
                    outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2,
                                              rmm, rmd, rdm, rdd, did1, mid1, did2, mid2))
                    nOutliers += 1
                tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
                          (fid1, iid1, fid2, iid2, mean, sdev, zmean, zsd, ngenoL[offset], relName,
                           did1, mid1, did2, mid2))
                offset += 1
            pairnum += 1
    logf.write('Outliers: %s\n' % (nOutliers))
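    # --- illustrative aside (not part of the original tool) -----------------------
    # A minimal sketch of the per-pair z-scoring used above: each pair is scored by its
    # distance from its relationship group's mean, in group-sd units, and the larger of
    # |zmean| and |zsd| is compared with Zcutoff.  The helper name is hypothetical.
    def _pair_zscores(pair_mean, pair_sd, groupstats):
        (rmm, rmd) = groupstats['mean']   # group mean and sd of the pair means
        (rdm, rdd) = groupstats['sd']     # group mean and sd of the pair sdevs
        zmean = (pair_mean - rmm) / rmd if rmd else 0.0
        zsd = (pair_sd - rdm) / rdd if rdd else 0.0
        return zmean, zsd, max(abs(zmean), abs(zsd))
    # e.g. a pair would be flagged when _pair_zscores(m, s, relstats[r])[2] > Zcutoff
    # -------------------------------------------------------------------------------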
    ### Write outlier files for each relationship type
    # NOTE: the HTML fragments appended to repOut in this section originally carried
    # full markup (headings, <table>/<tr>/<td> tags) that was lost in extraction;
    # minimal equivalent markup has been restored below.
    repOut.append('<hr/><h3>Outliers in tab delimited files linked above are also listed below</h3>')
    lzsd = round(numpy.log10(zsdmax)) + 1
    scalefactor = 10**lzsd
    for relCode, relInfo in REL_LOOKUP.items():
        relName, _, _ = relInfo
        outliers = outlierRecords[relCode]
        if not outliers:
            continue
        outliers = [(scalefactor*int(abs(x[3])) + int(abs(x[2])), x) for x in outliers]  # decorate
        outliers.sort()
        outliers.reverse()  # largest deviation first
        outliers = [x[1] for x in outliers]  # undecorate
        nrows = len(outliers)
        truncated = 0
        if nrows > MAX_SHOW_ROWS:
            s = '<h4>%s outlying pairs (top %d of %d) from %s</h4>' % \
                (relName, MAX_SHOW_ROWS, nrows, title)
            truncated = nrows - MAX_SHOW_ROWS
        else:
            s = '<h4>%s outlying pairs (n=%d) from %s</h4>' % (relName, nrows, title)
        repOut.append(s)
        fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
        fhpath = os.path.join(outdir, fhname)
        fh = open(fhpath, 'w')
        newfiles.append(fhname)
        explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName, title, len(outliers), Zcutoff))
        fh.write('%s\n' % OUTLIERS_HEADER)
        s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list])
        repOut.append('<table border="1"><tr>%s</tr>' % s)
        for n, rec in enumerate(outliers):
            #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
            s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
            fh.write('%s%s\n' % (s, relName))
            # rec = (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1, mid1, did2, mid2)
            s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec)
            s = '<tr>%s<td>%s</td></tr>' % (s, relName)
            if n < MAX_SHOW_ROWS:
                repOut.append('%s' % s)
        if truncated > 0:
            repOut.append('<p>WARNING: %d rows truncated - see outlier file for all %d rows</p>' %
                          (truncated, nrows))
        fh.close()
        repOut.append('</table><hr/>')

    ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
    ### if requested
    logf.write('Plotting ...')
    pointColors = [REL_COLORS[rel] for rel in rels]
    pointStyles = [REL_POINTS[rel] for rel in rels]
    mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
    svg.write(SVG_HEADER % (SVG_COLORS[0], SVG_COLORS[1], SVG_COLORS[2], SVG_COLORS[3], SVG_COLORS[4],
              SVG_COLORS[5], SVG_COLORS[6], SVG_COLORS[0], SVG_COLORS[0], SVG_COLORS[1], SVG_COLORS[1],
              SVG_COLORS[2], SVG_COLORS[2], SVG_COLORS[3], SVG_COLORS[3], SVG_COLORS[4], SVG_COLORS[4],
              SVG_COLORS[5], SVG_COLORS[5], SVG_COLORS[6], SVG_COLORS[6], mainTitle))
    #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
    #rpy.r.par(mai=(1,1,1,0.5))
    #rpy.r('par(xaxs="i",yaxs="i")')
    #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs,
    #           col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
    #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
    #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
    #rpy.r.dev_off()

    ### We will now go through each relationship type to partition plot points
    ### into "bulk" and "outlier" groups.  Bulk points will represent common
    ### mean/sdev pairs and will cover the majority of the points in the plot --
    ### they will use generic tooltip information about all of the pairs
    ### represented by that point.  "Outlier" points will be uncommon pairs,
    ### with very specific information in their tooltips.  It would be nice to
    ### keep the total number of plotted points in the SVG representation to
    ### ~10000 (certainly less than 100000?)
    pointMap = {}
    orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0), x) for x in REL_LOOKUP.keys()]))]
    # do we really want this? I want out of zone points last and big
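    # --- illustrative aside (not part of the original tool) -----------------------
    # A minimal sketch of the plot-coordinate mapping used in the loop below: the x
    # axis spans mean shared alleles 1.0..2.0 and the y axis spans sd 0.0..1.0, with
    # y flipped because the SVG origin is the top-left corner.  Hypothetical name.
    def _svg_xy(mean_ibs, sd_ibs):
        x = int((mean_ibs - 1.0) * PLOT_WIDTH)
        y = int(PLOT_HEIGHT - (sd_ibs * PLOT_HEIGHT))
        return x, y
    # e.g. _svg_xy(2.0, 0.0) -> (1150, 600): a perfect dupe plots at the bottom-right
    # -------------------------------------------------------------------------------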
    for relCode in orderedRels:
        svgColor = SVG_COLORS[relCode]
        relName, relColor, relStyle = REL_LOOKUP[relCode]
        # NOTE: the SVG element templates originally embedded in the svg.write() calls
        # below (a <g> group per relationship and <circle> elements wired to the
        # ECMAScript tooltip code in SVG_HEADER) were lost in extraction.  Minimal
        # reconstructions that keep the original argument lists are used instead.
        svg.write('<g id="%s" style="stroke:%s; fill:%s;">\n' % (relName, svgColor, svgColor))
        pMap = pointMap.setdefault(relCode, {})
        nPoints = 0
        rpairs = []
        rgenos = []
        rmeans = []
        rsdevs = []
        rz = []
        for x, rel in enumerate(rels):  # all pairs
            if rel == relCode:
                s1, s2 = pairs[x]
                pid = (s1, s2)
                zmean, zsd, r, zrad = pair_data[pid][:4]
                rpairs.append(pairs[x])
                rgenos.append(ngenoL[x])
                rmeans.append(means[x])
                rsdevs.append(sdevs[x])
                rz.append(zrad)
        ### Now add the svg point group for this relationship to the svg file
        for x in range(len(rmeans)):
            svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH)            # changed so mean scale is 1-2
            svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT))   # changed so sd scale is 0-1
            s1, s2 = rpairs[x]
            (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1]
            (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2]
            ngenos = rgenos[x]
            nPoints += 1
            point = pMap.setdefault((svgX, svgY), [])
            point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, rz[x]))
        for (svgX, svgY) in pMap:
            points = pMap[(svgX, svgY)]
            svgX = int(svgX)
            svgY = int(svgY)
            if len(points) > 1:
                mmean, dmean = calcMeanSD([p[0] for p in points])
                msdev, dsdev = calcMeanSD([p[1] for p in points])
                mgeno, dgeno = calcMeanSD([p[-2] for p in points])  # p[-2] is the genotype count (p[-1] is the z radius)
                mingeno = min([p[-2] for p in points])
                maxgeno = max([p[-2] for p in points])
                svg.write("""<circle cx="%d" cy="%d" r="2" class="rel%d"><title>mean=%1.2f (%1.2f) sdev=%1.2f (%1.2f) npairs=%d ngenos=%1.1f (%1.1f, min=%d, max=%d)</title></circle>\n""" %
                          (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno))
            else:
                mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12]
                rmean = float(relstats[relCode]['mean'][0])
                rsdev = float(relstats[relCode]['sd'][0])
                if zrad < 4:
                    zrad = 2
                elif 4 < zrad < 9:
                    zrad = 3  # to 9
                else:  # > 9 5=15+
                    zrad = zrad/3
                    zrad = min(zrad, 5)  # scale limit
                if zrad <= 3:
                    svg.write('<circle cx="%d" cy="%d" r="%d" class="rel%d"><title>s1=%s,%s (%s,%s) s2=%s,%s (%s,%s) mean=%f sdev=%f ngeno=%d relmean=%f relsdev=%f</title></circle>\n' %
                              (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
                else:
                    # highlight pairs a long way from expectation by outlining the circle in red
                    svg.write("""<circle cx="%d" cy="%d" r="%d" fill="%s" stroke="red" stroke-width="2" class="rel%d"><title>s1=%s,%s (%s,%s) s2=%s,%s (%s,%s) mean=%f sdev=%f ngeno=%d relmean=%f relsdev=%f</title></circle>\n""" % \
                              (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
        svg.write('</g>\n')
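    # --- illustrative aside (not part of the original tool) -----------------------
    # A minimal sketch showing how the empirical POLYGONS regions defined near the top
    # of this file could be combined with point_inside_polygon() to guess a relationship
    # from a pair's (mean, sd) alone; the tool itself only draws these polygons when
    # showPolygons is set.  The helper name is hypothetical.
    def _classify_by_polygon(mean_ibs, sd_ibs):
        for rel, poly in POLYGONS.items():
            if point_inside_polygon(mean_ibs, sd_ibs, poly):
                return rel
        return REL_UNKNOWN
    # e.g. _classify_by_polygon(1.98, 0.05) falls inside the REL_DUPE box
    # -------------------------------------------------------------------------------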
    ### Create a pdf as well if indicated on the command line
    ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf!
##    if pdftoo:
##        pdfname = '%s.pdf' % (title)
##        rpy.r.pdf(pdfname, 6, 6)
##        rpy.r.par(mai=(1,1,1,0.5))
##        rpy.r('par(xaxs="i",yaxs="i")')
##        rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL,
##                   cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
##        rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
##        rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
##        rpy.r.dev_off()

    ### Draw polygons
    if showPolygons:
        # NOTE: the group/polygon markup in the svg.write() calls below was lost in
        # extraction; minimal reconstructions with the original arguments are used.
        svg.write('<g id="polygons">\n')
        for rel, poly in POLYGONS.items():
            points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly])
            svg.write('<polygon points="%s" fill="none" stroke="%s"/>\n' % (points, SVG_COLORS[rel]))
        svg.write('</g>\n')

    svg.write(SVG_FOOTER)
    svg.close()
    return newfiles, explanations, repOut


def doIBS(n=100):
    """parse parameters from galaxy
    expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'
    rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
    '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    """
    u = """rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
    '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z' """
    if len(sys.argv) < 7:
        print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
        print >> sys.stdout, u
        sys.exit(1)
    ts = '%s%s' % (string.punctuation, string.whitespace)
    ptran = string.maketrans(ts, '_'*len(ts))
    inpath = sys.argv[1]
    ldinpath = os.path.split(inpath)[0]
    basename = sys.argv[2]
    outhtml = sys.argv[3]
    newfilepath = sys.argv[4]
    title = sys.argv[5].translate(ptran)
    logfname = 'Log_%s.txt' % title
    logpath = os.path.join(newfilepath, logfname)  # log was a child - make part of html extra_files_path zoo
    n = int(sys.argv[6])
    try:
        Zcutoff = float(sys.argv[7])
    except:
        Zcutoff = 2.0
    try:
        os.makedirs(newfilepath)
    except:
        pass
    logf = file(logpath, 'w')
    efp, ibase_name = os.path.split(inpath)  # need to use these for outputs in files_path
    ped = plinkbinJZ.BPed(inpath)
    ped.parse(quick=True)
    if ped is None:
        print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (inpath, basename)
        sys.exit(1)
    newfiles, explanations, repOut = doIBSpy(ped=ped, basename=basename, outdir=newfilepath,
                                             logf=logf, nrsSamples=n, title=title, pdftoo=0, Zcutoff=Zcutoff)
    logf.close()
    logfs = file(logpath, 'r').readlines()
    lf = file(outhtml, 'w')
    lf.write(galhtmlprefix % PROGNAME)
    # this is a mess. todo clean up - should each datatype have its own directory? Yes,
    # probably. Then titles are universal - but userId libraries are separate.
    # NOTE: the HTML fragments written to the output page below originally carried markup
    # that was lost in extraction; minimal equivalent markup has been restored.
    s = 'Output from %s run at %s<br/>\n' % (PROGNAME, timenow())
    lf.write('<h4>%s</h4>\n' % s)
    fixed = ["'%s'" % x for x in sys.argv]  # add quotes just in case
    s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n' % (' '.join(fixed))
    lf.write(s)
    # various ways of displaying svg - experiments related to missing svg mimetype on test (!)
    # NOTE: the alternative SVG-embedding snippets tried here (object/embed/img markup
    # referencing newfiles[0]) were lost in extraction, and the source is truncated at
    # this point; the statements below are kept as they appeared.
    #s = """ """ % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    s = """ """ % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    #s = """