""" # july 2009: Need to see outliers so need to draw them last? # could use clustering on the zscores to guess real relationships for unrelateds # but definitely need to draw last # added MAX_SHOW_ROWS to limit the length of the main report page # Changes for Galaxy integration # added more robust knuth method for one pass mean and sd # no difference really - let's use scipy.mean() and scipy.std() instead... # fixed labels and changed to .xls for outlier reports so can open in excel # interesting - with a few hundred subjects, 5k gives good resolution # and 100k gives better but not by much # TODO remove non autosomal markers # TODO it would be best if label had the zmean and zsd as these are what matter for # outliers rather than the group mean/sd # mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots # to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log # so the result should be an HTML file # rgIBS.py # use a random subset of markers for a quick ibs # to identify sample dups and closely related subjects # try snpMatrix and plink and see which one works best for us? # abecasis grr plots mean*sd for every subject to show clusters # mods june 23 rml to avoid non-autosomal markers # we seem to be distinguishing parent-child by gender - 2 clouds! snpMatrix from David Clayton has: ibs.stats function to calculate the identity-by-state stats of a group of samples Description Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics about the relatedness of every pair of samples within. Usage ibs.stats(x) 8 ibs.stats Arguments x a snp.matrix-class or a X.snp.matrix-class object containing N samples Details No-calls are excluded from consideration here. 
Value A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated by a comma, and the columns are: Count count of identical calls, exclusing no-calls Fraction fraction of identical calls comparied to actual calls being made in both samples Warning In some applications, it may be preferable to subset a (random) selection of SNPs first - the calculation time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for simple applications such as checking for duplicate or related samples. Note This is mostly written to find mislabelled and/or duplicate samples. Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most purpose it is undesirable to use these SNPs for IBS purposes. TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility """ import sys,os,time,random,string,copy,optparse try: set except NameError: from Sets import Set as set from rgutils import timenow,plinke import plinkbinJZ opts = None verbose = False showPolygons = False class NullDevice: def write(self, s): pass tempstderr = sys.stderr # save #sys.stderr = NullDevice() # need to avoid blather about deprecation and other strange stuff from scipy # the current galaxy job runner assumes that # the job is in error if anything appears on sys.stderr # grrrrr. James wants to keep it that way instead of using the # status flag for some strange reason. 
# Presumably he doesn't use R or (in this case, scipy)
import numpy
import scipy
from scipy import weave

sys.stderr = tempstderr  # restore stderr now that scipy has been imported

PROGNAME = os.path.split(sys.argv[0])[-1]
X_AXIS_LABEL = 'Mean Alleles Shared'
Y_AXIS_LABEL = 'SD Alleles Shared'
LEGEND_ALIGN = 'topleft'
LEGEND_TITLE = 'Relationship'
# BUG FIX: DEFAULT_SYMBOL_SIZE was assigned twice (1.0 then 0.5); the first
# assignment was dead code, so only the effective value is kept.
DEFAULT_SYMBOL_SIZE = 0.5  # default symbol size

### Some colors for R/rpy
R_BLACK = 1
R_RED = 2
R_GREEN = 3
R_BLUE = 4
R_CYAN = 5
R_PURPLE = 6
R_YELLOW = 7
R_GRAY = 8
### ... and some point-styles
###
PLOT_HEIGHT = 600
PLOT_WIDTH = 1150

#SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson')
#SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray')
SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray')
# dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
#('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray')

# Column headers for the tab-delimited outlier report (.xls so Excel opens it).
OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped']
OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list)
TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'

### Relationship codes, text, and lookups/mappings
N_RELATIONSHIP_TYPES = 7
REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES)

# relcode -> (label, R color index, R point style)
REL_LOOKUP = {
    REL_DUPE: ('dupe', R_BLUE, 1),
    REL_PARENTCHILD: ('parentchild', R_YELLOW, 1),
    REL_SIBS: ('sibpairs', R_RED, 1),
    REL_HALFSIBS: ('halfsibs', R_GREEN, 1),
    REL_RELATED: ('parents', R_PURPLE, 1),
    REL_UNRELATED: ('unrelated', R_CYAN, 1),
    REL_UNKNOWN: ('unknown', R_GRAY, 1),
}

# relcode -> z-score threshold (in stdevs) for calling a pair an outlier
OUTLIER_STDEVS = {
    REL_DUPE: 2,
    REL_PARENTCHILD: 2,
    REL_SIBS: 2,
    REL_HALFSIBS: 2,
    REL_RELATED: 2,
    REL_UNRELATED: 3,
    REL_UNKNOWN: 2,
}
# note now Z can be passed in
REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)]
REL_COLORS = SVG_COLORS REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)] DEFAULT_MAX_SAMPLE_SIZE = 10000 REF_COUNT_HOM1 = 3 REF_COUNT_HET = 2 REF_COUNT_HOM2 = 1 MISSING = 0 MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0 MARKER_PAIRS_PER_SECOND_FAST = 70000000.0 galhtmlprefix = """
%s | ' % x for x in OUTLIERS_HEADER_list]) repOut.append('
---|
%f | %f | %f | %f | %s | %s | %s | %s | %f | %f | %f | %f | %s | %s | %s | %s | ''' % tuple(rec) s = '%s%s | ' % (s,relName) if n < MAX_SHOW_ROWS: repOut.append('
')
# NOTE(review): the lines below are the interior of a plotting routine whose
# `def` line lies outside this chunk, and the original indentation has been
# lost in this paste; the final svg.write(...) call is truncated mid-string.
# Bytes are preserved as-is; only comments are added.
### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
### if requested
logf.write('Plotting ...')
# Map each pair's relationship code to its plot color and point style.
pointColors = [REL_COLORS[rel] for rel in rels]
pointStyles = [REL_POINTS[rel] for rel in rels]
mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
# Emit the SVG preamble; SVG_HEADER (defined elsewhere) takes the per-relationship
# colors twice each (fill + stroke, presumably - TODO confirm) plus the title.
svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
# Former rpy/R jpeg plotting path, kept for reference.
#rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
#rpy.r.par(mai=(1,1,1,0.5))
#rpy.r('par(xaxs="i",yaxs="i")')
#rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
#rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
#rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
#rpy.r.dev_off()
### We will now go through each relationship type to partition plot points
### into "bulk" and "outlier" groups. Bulk points will represent common
### mean/sdev pairs and will cover the majority of the points in the plot --
### they will use generic tooltip informtion about all of the pairs
### represented by that point. "Outlier" points will be uncommon pairs,
### with very specific information in their tooltips. It would be nice to
### keep hte total number of plotted points in the SVG representation to
### ~10000 (certainly less than 100000?)
pointMap = {}
# Relationship codes ordered by descending pair count, so the most common
# groups are handled first and the rare ones drawn later (on top).
orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))]
# do we really want this? I want out of zone points last and big
for relCode in orderedRels:
svgColor = SVG_COLORS[relCode]
relName, relColor, relStyle = REL_LOOKUP[relCode]
# NOTE(review): the svg.write call below was truncated by whatever mangled
# this file - the SVG group-markup string content is missing.
svg.write('
%s\n
%s