Context Navigation

rgGRR.py @ 2

リビジョン 2, 45.2 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	"""
	2	# july 2009: Need to see outliers so need to draw them last?
	3	# could use clustering on the zscores to guess real relationships for unrelateds
	4	# but definitely need to draw last
	5	# added MAX_SHOW_ROWS to limit the length of the main report page
	6	# Changes for Galaxy integration
	7	# added more robust knuth method for one pass mean and sd
	8	# no difference really - let's use scipy.mean() and scipy.std() instead...
	9	# fixed labels and changed to .xls for outlier reports so can open in excel
	10	# interesting - with a few hundred subjects, 5k gives good resolution
	11	# and 100k gives better but not by much
	12	# TODO remove non autosomal markers
	13	# TODO it would be best if label had the zmean and zsd as these are what matter for
	14	# outliers rather than the group mean/sd
	15	# mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots
	16	# to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log
	17	# so the result should be an HTML file
	18
	19	# rgIBS.py
	20	# use a random subset of markers for a quick ibs
	21	# to identify sample dups and closely related subjects
	22	# try snpMatrix and plink and see which one works best for us?
	23	# abecasis grr plots mean*sd for every subject to show clusters
	24	# mods june 23 rml to avoid non-autosomal markers
	25	# we seem to be distinguishing parent-child by gender - 2 clouds!
	26
	27
	28	snpMatrix from David Clayton has:
	29	ibs.stats function to calculate the identity-by-state stats of a group of samples
	30	Description
	31	Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics
	32	about the relatedness of every pair of samples within.
	33
	34	Usage
	35	ibs.stats(x)
	36	8 ibs.stats
	37	Arguments
	38	x a snp.matrix-class or a X.snp.matrix-class object containing N samples
	39	Details
	40	No-calls are excluded from consideration here.
	41	Value
	42	A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated
	43	by a comma, and the columns are:
	44	Count count of identical calls, exclusing no-calls
	45	Fraction fraction of identical calls comparied to actual calls being made in both samples
	46	Warning
	47	In some applications, it may be preferable to subset a (random) selection of SNPs first - the
	48	calculation
	49	time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the
	50	calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for
	51	simple applications such as checking for duplicate or related samples.
	52	Note
	53	This is mostly written to find mislabelled and/or duplicate samples.
	54	Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most
	55	purpose it is undesirable to use these SNPs for IBS purposes.
	56	TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to
	57	subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility
	58	"""
	59	import sys,os,time,random,string,copy,optparse
	60
	61	try:
	62	set
	63	except NameError:
	64	from Sets import Set as set
	65
	66	from rgutils import timenow,plinke
	67
	68	import plinkbinJZ
	69
	70
	71	opts = None
	72	verbose = False
	73
	74	showPolygons = False
	75
	76	class NullDevice:
	77	def write(self, s):
	78	pass
	79
	80	tempstderr = sys.stderr # save
	81	#sys.stderr = NullDevice()
	82	# need to avoid blather about deprecation and other strange stuff from scipy
	83	# the current galaxy job runner assumes that
	84	# the job is in error if anything appears on sys.stderr
	85	# grrrrr. James wants to keep it that way instead of using the
	86	# status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy)
	87	import numpy
	88	import scipy
	89	from scipy import weave
	90
	91
	92	sys.stderr=tempstderr
	93
	94
	95	PROGNAME = os.path.split(sys.argv[0])[-1]
	96	X_AXIS_LABEL = 'Mean Alleles Shared'
	97	Y_AXIS_LABEL = 'SD Alleles Shared'
	98	LEGEND_ALIGN = 'topleft'
	99	LEGEND_TITLE = 'Relationship'
	100	DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size
	101	DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size
	102
	103	### Some colors for R/rpy
	104	R_BLACK = 1
	105	R_RED = 2
	106	R_GREEN = 3
	107	R_BLUE = 4
	108	R_CYAN = 5
	109	R_PURPLE = 6
	110	R_YELLOW = 7
	111	R_GRAY = 8
	112
	113	### ... and some point-styles
	114
	115	###
	116	PLOT_HEIGHT = 600
	117	PLOT_WIDTH = 1150
	118
	119
	120	#SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson')
	121	#SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray')
	122	SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray')
	123	# dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
	124	#('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray')
	125
	126	OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped']
	127	OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list)
	128	TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'
	129
	130
	131	### Relationship codes, text, and lookups/mappings
	132	N_RELATIONSHIP_TYPES = 7
	133	REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES)
	134	REL_LOOKUP = {
	135	REL_DUPE: ('dupe', R_BLUE, 1),
	136	REL_PARENTCHILD: ('parentchild', R_YELLOW, 1),
	137	REL_SIBS: ('sibpairs', R_RED, 1),
	138	REL_HALFSIBS: ('halfsibs', R_GREEN, 1),
	139	REL_RELATED: ('parents', R_PURPLE, 1),
	140	REL_UNRELATED: ('unrelated', R_CYAN, 1),
	141	REL_UNKNOWN: ('unknown', R_GRAY, 1),
	142	}
	143	OUTLIER_STDEVS = {
	144	REL_DUPE: 2,
	145	REL_PARENTCHILD: 2,
	146	REL_SIBS: 2,
	147	REL_HALFSIBS: 2,
	148	REL_RELATED: 2,
	149	REL_UNRELATED: 3,
	150	REL_UNKNOWN: 2,
	151	}
	152	# note now Z can be passed in
	153
	154	REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)]
	155	REL_COLORS = SVG_COLORS
	156	REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)]
	157
	158	DEFAULT_MAX_SAMPLE_SIZE = 10000
	159
	160	REF_COUNT_HOM1 = 3
	161	REF_COUNT_HET = 2
	162	REF_COUNT_HOM2 = 1
	163	MISSING = 0
	164	MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain
	165	MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0
	166	MARKER_PAIRS_PER_SECOND_FAST = 70000000.0
	167
	168
	169	galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
	170	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
	171	<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
	172	<head>
	173	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	174	<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
	175	<title></title>
	176	<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
	177	</head>
	178	<body>
	179	<div class="document">
	180	"""
	181
	182
	183	SVG_HEADER = '''<?xml version="1.0" standalone="no"?>
	184	<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd">
	185
	186	<svg width="1280" height="800"
	187	xmlns="http://www.w3.org/2000/svg" version="1.2"
	188	xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()">
	189
	190	<script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/>
	191	<script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/>
	192	<script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/>
	193	<script type="text/ecmascript">
	194	<![CDATA[
	195	var checkBoxes = new Array();
	196	var radioGroupBandwidth;
	197	var colours = ['%s','%s','%s','%s','%s','%s','%s'];
	198	function init() {
	199	var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12};
	200	var dist = 12;
	201	var yOffset = 4;
	202
	203	//A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
	204	checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer);
	205	checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer);
	206	checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer);
	207	checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer);
	208	checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer);
	209	checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer);
	210	checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer);
	211
	212	}
	213
	214	function hideShowLayer(id, status, label) {
	215	var vis = "hidden";
	216	if (status) {
	217	vis = "visible";
	218	}
	219	document.getElementById(id).setAttributeNS(null, 'visibility', vis);
	220	}
	221
	222	function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) {
	223	var x = parseInt(evt.pageX)-250;
	224	var y = parseInt(evt.pageY)-110;
	225	switch(rel) {
	226	case 0:
	227	fill = colours[rel];
	228	relt = "dupe";
	229	break;
	230	case 1:
	231	fill = colours[rel];
	232	relt = "parentchild";
	233	break;
	234	case 2:
	235	fill = colours[rel];
	236	relt = "sibpairs";
	237	break;
	238	case 3:
	239	fill = colours[rel];
	240	relt = "halfsibs";
	241	break;
	242	case 4:
	243	fill = colours[rel];
	244	relt = "parents";
	245	break;
	246	case 5:
	247	fill = colours[rel];
	248	relt = "unrelated";
	249	break;
	250	case 6:
	251	fill = colours[rel];
	252	relt = "unknown";
	253	break;
	254	default:
	255	fill = "cyan";
	256	relt = "ERROR_CODE: "+rel;
	257	}
	258
	259	document.getElementById("btRel").textContent = "GROUP: "+relt;
	260	document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm;
	261	document.getElementById("btSdev").textContent = "sdev="+dm+" +/- "+dd;
	262	document.getElementById("btPair").textContent = "npairs="+n;
	263	document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")";
	264	document.getElementById("btHead").setAttribute('fill', fill);
	265
	266	var tt = document.getElementById("btTip");
	267	tt.setAttribute("transform", "translate("+x+","+y+")");
	268	tt.setAttribute('visibility', 'visible');
	269	}
	270
	271	function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) {
	272	var x = parseInt(evt.pageX)-150;
	273	var y = parseInt(evt.pageY)-180;
	274
	275	switch(rel) {
	276	case 0:
	277	fill = colours[rel];
	278	relt = "dupe";
	279	break;
	280	case 1:
	281	fill = colours[rel];
	282	relt = "parentchild";
	283	break;
	284	case 2:
	285	fill = colours[rel];
	286	relt = "sibpairs";
	287	break;
	288	case 3:
	289	fill = colours[rel];
	290	relt = "halfsibs";
	291	break;
	292	case 4:
	293	fill = colours[rel];
	294	relt = "parents";
	295	break;
	296	case 5:
	297	fill = colours[rel];
	298	relt = "unrelated";
	299	break;
	300	case 6:
	301	fill = colours[rel];
	302	relt = "unknown";
	303	break;
	304	default:
	305	fill = "cyan";
	306	relt = "ERROR_CODE: "+rel;
	307	}
	308
	309	document.getElementById("otRel").textContent = "PAIR: "+relt;
	310	document.getElementById("otS1").textContent = "s1="+s1;
	311	document.getElementById("otS2").textContent = "s2="+s2;
	312	document.getElementById("otMean").textContent = "mean="+mean;
	313	document.getElementById("otSdev").textContent = "sdev="+sdev;
	314	document.getElementById("otGeno").textContent = "ngenos="+ngeno;
	315	document.getElementById("otRmean").textContent = "relmean="+rmean;
	316	document.getElementById("otRsdev").textContent = "relsdev="+rsdev;
	317	document.getElementById("otHead").setAttribute('fill', fill);
	318
	319	var tt = document.getElementById("otTip");
	320	tt.setAttribute("transform", "translate("+x+","+y+")");
	321	tt.setAttribute('visibility', 'visible');
	322	}
	323
	324	function hideBTT(evt) {
	325	document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden');
	326	}
	327
	328	function hideOTT(evt) {
	329	document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden');
	330	}
	331
	332	]]>
	333	</script>
	334	<defs>
	335	<!-- symbols for check boxes -->
	336	<symbol id="cbRect" overflow="visible">
	337	<rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/>
	338	</symbol>
	339	<symbol id="cbCross" overflow="visible">
	340	<g pointer-events="none" stroke="black" stroke-width="1">
	341	<line x1="-3" y1="-3" x2="3" y2="3"/>
	342	<line x1="3" y1="-3" x2="-3" y2="3"/>
	343	</g>
	344	</symbol>
	345	</defs>
	346
	347	<desc>Developer Works Dynamic Scatter Graph Scaling Example</desc>
	348
	349	<!-- Now Draw the main X and Y axis -->
	350	<g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges">
	351	<!-- X Axis top and bottom -->
	352	<path d="M 100 100 L 1250 100 Z"/>
	353	<path d="M 100 700 L 1250 700 Z"/>
	354
	355	<!-- Y Axis left and right -->
	356	<path d="M 100 100 L 100 700 Z"/>
	357	<path d="M 1250 100 L 1250 700 Z"/>
	358	</g>
	359
	360	<g transform="translate(100,100)">
	361
	362	<!-- Grid Lines -->
	363	<g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges">
	364
	365	<!-- Vertical grid lines -->
	366	<line x1="125" y1="0" x2="115" y2="600" />
	367	<line x1="230" y1="0" x2="230" y2="600" />
	368	<line x1="345" y1="0" x2="345" y2="600" />
	369	<line x1="460" y1="0" x2="460" y2="600" />
	370	<line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" />
	371	<line x1="690" y1="0" x2="690" y2="600" />
	372	<line x1="805" y1="0" x2="805" y2="600" />
	373	<line x1="920" y1="0" x2="920" y2="600" />
	374	<line x1="1035" y1="0" x2="1035" y2="600" />
	375
	376	<!-- Horizontal grid lines -->
	377	<line x1="0" y1="60" x2="1150" y2="60" />
	378	<line x1="0" y1="120" x2="1150" y2="120" />
	379	<line x1="0" y1="180" x2="1150" y2="180" />
	380	<line x1="0" y1="240" x2="1150" y2="240" />
	381	<line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" />
	382	<line x1="0" y1="360" x2="1150" y2="360" />
	383	<line x1="0" y1="420" x2="1150" y2="420" />
	384	<line x1="0" y1="480" x2="1150" y2="480" />
	385	<line x1="0" y1="540" x2="1150" y2="540" />
	386	</g>
	387
	388	<!-- Legend -->
	389	<g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)">
	390	<rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" />
	391	<text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text>
	392	<rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
	393	<rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
	394	<rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
	395	<rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
	396	<rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
	397	<rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
	398	<rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
	399	<text x="15" y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text>
	400	<circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
	401	<text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text>
	402	<circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
	403	<text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text>
	404	<circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
	405	<g id="checkboxes">
	406	</g>
	407	</g>
	408
	409
	410	<g style='fill:black; stroke:none' font-size="17" font-family="Arial">
	411	<!-- X Axis Labels -->
	412	<text x="480" y="660">Mean Alleles Shared</text>
	413	<text x="0" y="630" >1.0</text>
	414	<text x="277" y="630" >1.25</text>
	415	<text x="564" y="630" >1.5</text>
	416	<text x="842" y="630" >1.75</text>
	417	<text x="1140" y="630" >2.0</text>
	418	</g>
	419
	420	<g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial">
	421	<!-- Y Axis Labels -->
	422	<text x="-350" y="-40">SD Alleles Shared</text>
	423	<text x="-20" y="-10" >1.0</text>
	424	<text x="-165" y="-10" >0.75</text>
	425	<text x="-310" y="-10" >0.5</text>
	426	<text x="-455" y="-10" >0.25</text>
	427	<text x="-600" y="-10" >0.0</text>
	428	</g>
	429
	430	<!-- Plot Title -->
	431	<g style="fill:black; stroke:none" font-size="18" font-family="Arial">
	432	<text x="425" y="-30">%s</text>
	433	</g>
	434
	435	<!-- One group/layer of points for each relationship type -->
	436	'''
	437
	438	SVG_FOOTER = '''
	439	<!-- End of Data -->
	440	</g>
	441	<g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
	442	<rect width="250" height="110" style="fill:silver" rx="2" ry="2"/>
	443	<rect id="btHead" width="250" height="20" rx="2" ry="2" />
	444	<text id="btRel" y="14" x="85">unrelated</text>
	445	<text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text>
	446	<text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text>
	447	<text id="btPair" y="80" x="4">npairs=1152</text>
	448	<text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text>
	449	</g>
	450
	451	<g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
	452	<rect width="150" height="180" style="fill:silver" rx="2" ry="2"/>
	453	<rect id="otHead" width="150" height="20" rx="2" ry="2" />
	454	<text id="otRel" y="14" x="40">sibpairs</text>
	455	<text id="otS1" y="40" x="4">s1=fid1,iid1</text>
	456	<text id="otS2" y="60" x="4">s2=fid2,iid2</text>
	457	<text id="otMean" y="80" x="4">mean=1.82</text>
	458	<text id="otSdev" y="100" x="4">sdev=0.7</text>
	459	<text id="otGeno" y="120" x="4">ngeno=4487</text>
	460	<text id="otRmean" y="140" x="4">relmean=1.85</text>
	461	<text id="otRsdev" y="160" x="4">relsdev=0.65</text>
	462	</g>
	463	</svg>
	464	'''
	465
	466
	467	DEFAULT_MAX_SAMPLE_SIZE = 5000
	468
	469	REF_COUNT_HOM1 = 3
	470	REF_COUNT_HET = 2
	471	REF_COUNT_HOM2 = 1
	472	MISSING = 0
	473
	474	MARKER_PAIRS_PER_SECOND_SLOW = 15000000
	475	MARKER_PAIRS_PER_SECOND_FAST = 70000000
	476
	477	POLYGONS = {
	478	REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)),
	479	REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)),
	480	REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)),
	481	REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)),
	482	REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)),
	483	}
	484
	485	def distance(point1, point2):
	486	""" Calculate the distance between two points
	487	"""
	488	(x1,y1) = [float(d) for d in point1]
	489	(x2,y2) = [float(d) for d in point2]
	490	dx = abs(x1 - x2)
	491	dy = abs(y1 - y2)
	492	return math.sqrt(dx2 + dy2)
	493
	494	def point_inside_polygon(x, y, poly):
	495	""" Determine if a point (x,y) is inside a given polygon or not
	496	poly is a list of (x,y) pairs.
	497
	498	Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
	499	"""
	500
	501	n = len(poly)
	502	inside = False
	503
	504	p1x,p1y = poly[0]
	505	for i in range(n+1):
	506	p2x,p2y = poly[i % n]
	507	if y > min(p1y,p2y):
	508	if y <= max(p1y,p2y):
	509	if x <= max(p1x,p2x):
	510	if p1y != p2y:
	511	xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x
	512	if p1x == p2x or x <= xinters:
	513	inside = not inside
	514	p1x,p1y = p2x,p2y
	515	return inside
	516
	517	def readMap(pedfile):
	518	"""
	519	"""
	520	mapfile = pedfile.replace('.ped', '.map')
	521	marker_list = []
	522	if os.path.exists(mapfile):
	523	print 'readMap: %s' % (mapfile)
	524	fh = file(mapfile, 'r')
	525	for line in fh:
	526	marker_list.append(line.strip().split())
	527	fh.close()
	528	print 'readMap: %s markers' % (len(marker_list))
	529	return marker_list
	530
	531	def calcMeanSD(useme):
	532	"""
	533	A numerically stable algorithm is given below. It also computes the mean.
	534	This algorithm is due to Knuth,[1] who cites Welford.[2]
	535	n = 0
	536	mean = 0
	537	M2 = 0
	538
	539	foreach x in data:
	540	n = n + 1
	541	delta = x - mean
	542	mean = mean + delta/n
	543	M2 = M2 + delta*(x - mean) // This expression uses the new value of mean
	544	end for
	545
	546	variance_n = M2/n
	547	variance = M2/(n - 1)
	548	"""
	549	mean = 0.0
	550	M2 = 0.0
	551	sd = 0.0
	552	n = len(useme)
	553	if n > 1:
	554	for i,x in enumerate(useme):
	555	delta = x - mean
	556	mean = mean + delta/(i+1) # knuth uses n+=1 at start
	557	M2 = M2 + delta*(x - mean) # This expression uses the new value of mean
	558	variance = M2/(n-1) # assume is sample so lose 1 DOF
	559	sd = pow(variance,0.5)
	560	return mean,sd
	561
	562
	563	def doIBSpy(ped=None,basename='',outdir=None,logf=None,
	564	nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0):
	565	#def doIBS(pedName, title, nrsSamples=None, pdftoo=False):
	566	""" started with snpmatrix but GRR uses actual IBS counts and sd's
	567	"""
	568	repOut = [] # text strings to add to the html display
	569	refallele = {}
	570	tblf = '%s_table.xls' % (title)
	571	tbl = file(os.path.join(outdir,tblf), 'w')
	572	tbl.write(TABLE_HEADER)
	573	svgf = '%s.svg' % (title)
	574	svg = file(os.path.join(outdir,svgf), 'w')
	575
	576	nMarkers = len(ped._markers)
	577	if nMarkers < 5:
	578	print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME)
	579	sys.exit(1)
	580	nSubjects = len(ped._subjects)
	581	nrsSamples = min(nMarkers, nrsSamples)
	582	if opts and opts.use_mito:
	583	markers = range(nMarkers)
	584	nrsSamples = min(len(markers), nrsSamples)
	585	sampleIndexes = sorted(random.sample(markers, nrsSamples))
	586	else:
	587	autosomals = ped.autosomal_indices()
	588	nrsSamples = min(len(autosomals), nrsSamples)
	589	sampleIndexes = sorted(random.sample(autosomals, nrsSamples))
	590
	591	print ''
	592	print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers)
	593	npairs = (nSubjects*(nSubjects-1))/2 # total rows in table
	594	newfiles=[svgf,tblf]
	595	explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs]
	596	# these go with the output file links in the html file
	597	s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples)
	598	logf.write(s)
	599	minUsegenos = nrsSamples/2 # must have half?
	600	nGenotypes = nSubjects*nrsSamples
	601	stime = time.time()
	602	emptyRows = set()
	603	genos = numpy.zeros((nSubjects, nrsSamples), dtype=int)
	604	for s in xrange(nSubjects):
	605	nValid = 0
	606	#getGenotypesByIndices(self, s, mlist, format)
	607	genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref')
	608	nValid = sum([1 for g in genos[s] if g])
	609	if not nValid:
	610	emptyRows.add(s)
	611	sub = ped.getSubject(s)
	612	print 'All missing for row %d (%s)' % (s, sub)
	613	logf.write('All missing for row %d (%s)\n' % (s, sub))
	614	rtime = time.time() - stime
	615	if verbose:
	616	print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime)
	617
	618
	619	### Now the expensive part. For each pair of subjects, we get the mean number
	620	### and standard deviation of shared alleles over all of the markers where both
	621	### subjects have a known genotype. Identical subjects should have mean shared
	622	### alleles very close to 2.0 with a standard deviation very close to 0.0.
	623	tot = nSubjects*(nSubjects-1)/2
	624	nprog = tot/10
	625	nMarkerpairs = tot * nrsSamples
	626	estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW
	627	estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST
	628
	629	pairs = []
	630	pair_data = {}
	631	means = [] ## Mean IBS for each pair
	632	ngenoL = [] ## Count of comparable genotypes for each pair
	633	sdevs = [] ## Standard dev for each pair
	634	rels = [] ## A relationship code for each pair
	635	zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup
	636	zstds = [0.0 for x in xrange(tot)] ## zstd score for each pair for the relgrp
	637	skip = set()
	638	ndone = 0 ## How many have been done so far
	639
	640	logf.write('Calculating %d pairs...\n' % (tot))
	641	logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow))
	642
	643	t1sum = 0
	644	t2sum = 0
	645	t3sum = 0
	646	now = time.time()
	647	scache = {}
	648	_founder_cache = {}
	649	C_CODE = """
	650	#include "math.h"
	651	int i;
	652	int sumibs = 0;
	653	int ssqibs = 0;
	654	int ngeno = 0;
	655	float mean = 0;
	656	float M2 = 0;
	657	float delta = 0;
	658	float sdev=0;
	659	float variance=0;
	660	for (i=0; i<nrsSamples; i++) {
	661	int a1 = g1[i];
	662	int a2 = g2[i];
	663	if (a1 != 0 && a2 != 0) {
	664	ngeno += 1;
	665	int shared = 2-abs(a1-a2);
	666	delta = shared - mean;
	667	mean = mean + delta/ngeno;
	668	M2 += delta*(shared-mean);
	669	// yes that second time, the updated mean is used see calcmeansd above;
	670	//printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared);
	671	}
	672	}
	673	if (ngeno > 1) {
	674	variance = M2/(ngeno-1);
	675	sdev = sqrt(variance);
	676	//printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev);
	677	}
	678	//printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev);
	679	result[0] = ngeno;
	680	result[1] = mean;
	681	result[2] = sdev;
	682	return_val = ngeno;
	683	"""
	684	started = time.time()
	685	for s1 in xrange(nSubjects):
	686	if s1 in emptyRows:
	687	continue
	688	(fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1))
	689
	690	isFounder1 = _founder_cache.setdefault(s1, (did1==mid1))
	691	g1 = genos[s1]
	692
	693	for s2 in xrange(s1+1, nSubjects):
	694	if s2 in emptyRows:
	695	continue
	696	t1s = time.time()
	697
	698	(fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2))
	699
	700	g2 = genos[s2]
	701	isFounder2 = _founder_cache.setdefault(s2, (did2==mid2))
	702
	703	# Determine the relationship for this pair
	704	relcode = REL_UNKNOWN
	705	if (fid2 == fid1):
	706	if iid1 == iid2:
	707	relcode = REL_DUPE
	708	elif (did2 == did1) and (mid2 == mid1) and did1 != mid1:
	709	relcode = REL_SIBS
	710	elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1):
	711	relcode = REL_PARENTCHILD
	712	elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)):
	713	relcode = REL_HALFSIBS
	714	else:
	715	# People in the same family should be marked as some other
	716	# form of related. In general, these people will have a
	717	# pretty random spread of similarity. This distinction is
	718	# probably not very useful most of the time
	719	relcode = REL_RELATED
	720	else:
	721	### Different families
	722	relcode = REL_UNRELATED
	723
	724	t1e = time.time()
	725	t1sum += t1e-t1s
	726
	727
	728	### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count
	729	### the number of contributing genotypes. These values are not actually
	730	### calculated here, but instead are looked up in a table for speed.
	731	### FIXME: This is still too slow ...
	732	result = [0.0, 0.0, 0.0]
	733	ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result'])
	734	if ngeno >= minUsegenos:
	735	_, mean, sdev = result
	736	means.append(mean)
	737	sdevs.append(sdev)
	738	ngenoL.append(ngeno)
	739	pairs.append((s1, s2))
	740	rels.append(relcode)
	741	else:
	742	skip.add(ndone) # signal no comparable genotypes for this pair
	743	ndone += 1
	744	t2e = time.time()
	745	t2sum += t2e-t1e
	746	t3e = time.time()
	747	t3sum += t3e-t2e
	748
	749	logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now),
	750	'%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip),
	751	float(len(skip))/float(tot)*100)]
	752	logf.write('%s\n' % '\t'.join(logme))
	753	### Calculate mean and standard deviation of scores on a per relationship
	754	### type basis, allowing us to flag outliers for each particular relationship
	755	### type
	756	relstats = {}
	757	relCounts = {}
	758	outlierFiles = {}
	759	for relCode, relInfo in REL_LOOKUP.items():
	760	relName, relColor, relStyle = relInfo
	761	useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode]
	762	relCounts[relCode] = len(useme)
	763	mm = scipy.mean(useme)
	764	ms = scipy.std(useme)
	765	useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode]
	766	sm = scipy.mean(useme)
	767	ss = scipy.std(useme)
	768	relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)}
	769	s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \
	770	(relName,relCounts[relCode], mm, ms, sm, ss)
	771	logf.write(s)
	772
	773	### now fake z scores for each subject like abecasis recommends max(\|zmu\|,\|zsd\|)
	774	### within each group, for each pair, z=(groupmean-pairmean)/groupsd
	775	available = len(means)
	776	logf.write('%d pairs are available of %d\n' % (available, tot))
	777	### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n'
	778	### logf.write(s)
	779	pairnum = 0
	780	offset = 0
	781	nOutliers = 0
	782	cexs = []
	783	outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)])
	784	zsdmax = 0
	785	for s1 in range(nSubjects):
	786	if s1 in emptyRows:
	787	continue
	788	(fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1]
	789	for s2 in range(s1+1, nSubjects):
	790	if s2 in emptyRows:
	791	continue
	792	if pairnum not in skip:
	793	### Get group stats for this relationship
	794	(fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2]
	795	try:
	796	r = rels[offset]
	797	except IndexError:
	798	logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d', offset, available, pairnum, len(rels))
	799	notfound = ('?',('?','0','0'))
	800	relInfo = REL_LOOKUP.get(r,notfound)
	801	relName, relColor, relStyle = relInfo
	802	rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared
	803	rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared
	804
	805	try:
	806	zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units
	807	except:
	808	zsd = 1
	809	if abs(zsd) > zsdmax:
	810	zsdmax = zsd # keep for sort scaling
	811	try:
	812	zmean = (means[offset] - rmm)/rmd # distance from group mean
	813	except:
	814	zmean = 1
	815	zmeans[offset] = zmean
	816	zstds[offset] = zsd
	817	pid=(s1,s2)
	818	zrad = max(zsd,zmean)
	819	if zrad < 4:
	820	zrad = 2
	821	elif 4 < zrad < 15:
	822	zrad = 3 # to 9
	823	else: # > 15 6=24+
	824	zrad=zrad/4
	825	zrad = min(zrad,6) # scale limit
	826	zrad = max(2,max(zsd,zmean)) # as > 2, z grows
	827	pair_data[pid] = (zmean,zsd,r,zrad)
	828	if max(zsd,zmean) > Zcutoff: # is potentially interesting
	829	mean = means[offset]
	830	sdev = sdevs[offset]
	831	outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2))
	832	nOutliers += 1
	833	tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
	834	(fid1, iid1, fid2, iid2, mean, sdev, zmean,zsd, ngeno, relName, did1,mid1,did2,mid2))
	835	offset += 1
	836	pairnum += 1
	837	logf.write( 'Outliers: %s\n' % (nOutliers))
	838
	839	### Write outlier files for each relationship type
	840	repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>')
	841	lzsd = round(numpy.log10(zsdmax)) + 1
	842	scalefactor = 10**lzsd
	843	for relCode, relInfo in REL_LOOKUP.items():
	844	relName, _, _ = relInfo
	845	outliers = outlierRecords[relCode]
	846	if not outliers:
	847	continue
	848	outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate
	849	outliers.sort()
	850	outliers.reverse() # largest deviation first
	851	outliers = [x[1] for x in outliers] # undecorate
	852	nrows = len(outliers)
	853	truncated = 0
	854	if nrows > MAX_SHOW_ROWS:
	855	s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \
	856	(relName,MAX_SHOW_ROWS,nrows,title)
	857	truncated = nrows - MAX_SHOW_ROWS
	858	else:
	859	s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title)
	860	repOut.append(s)
	861	fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
	862	fhpath = os.path.join(outdir,fhname)
	863	fh = open(fhpath, 'w')
	864	newfiles.append(fhname)
	865	explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff))
	866	fh.write(OUTLIERS_HEADER)
	867	s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list])
	868	repOut.append('<tr align="center">%s</tr>' % s)
	869	for n,rec in enumerate(outliers):
	870	#(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
	871	s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
	872	fh.write('%s%s\n' % (s,relName))
	873	# (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2))
	874	s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td>
	875	<td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec)
	876	s = '%s<td>%s</td>' % (s,relName)
	877	if n < MAX_SHOW_ROWS:
	878	repOut.append('<tr align="center">%s</tr>' % s)
	879	if truncated > 0:
	880	repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated,
	881	nrows))
	882	fh.close()
	883	repOut.append('</table><p>')
	884
	885	### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
	886	### if requested
	887	logf.write('Plotting ...')
	888	pointColors = [REL_COLORS[rel] for rel in rels]
	889	pointStyles = [REL_POINTS[rel] for rel in rels]
	890
	891	mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
	892	svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
	893	SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
	894	SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
	895	SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
	896	#rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
	897	#rpy.r.par(mai=(1,1,1,0.5))
	898	#rpy.r('par(xaxs="i",yaxs="i")')
	899	#rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
	900	#rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
	901	#rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
	902	#rpy.r.dev_off()
	903
	904	### We will now go through each relationship type to partition plot points
	905	### into "bulk" and "outlier" groups. Bulk points will represent common
	906	### mean/sdev pairs and will cover the majority of the points in the plot --
	907	### they will use generic tooltip informtion about all of the pairs
	908	### represented by that point. "Outlier" points will be uncommon pairs,
	909	### with very specific information in their tooltips. It would be nice to
	910	### keep hte total number of plotted points in the SVG representation to
	911	### ~10000 (certainly less than 100000?)
	912	pointMap = {}
	913	orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))]
	914	# do we really want this? I want out of zone points last and big
	915	for relCode in orderedRels:
	916	svgColor = SVG_COLORS[relCode]
	917	relName, relColor, relStyle = REL_LOOKUP[relCode]
	918	svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor))
	919	pMap = pointMap.setdefault(relCode, {})
	920	nPoints = 0
	921	rpairs=[]
	922	rgenos=[]
	923	rmeans=[]
	924	rsdevs=[]
	925	rz = []
	926	for x,rel in enumerate(rels): # all pairs
	927	if rel == relCode:
	928	s1,s2 = pairs[x]
	929	pid=(s1,s2)
	930	zmean,zsd,r,zrad = pair_data[pid][:4]
	931	rpairs.append(pairs[x])
	932	rgenos.append(ngenoL[x])
	933	rmeans.append(means[x])
	934	rsdevs.append(sdevs[x])
	935	rz.append(zrad)
	936	### Now add the svg point group for this relationship to the svg file
	937	for x in range(len(rmeans)):
	938	svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2
	939	svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1
	940	s1, s2 = rpairs[x]
	941	(fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1]
	942	(fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2]
	943	ngenos = rgenos[x]
	944	nPoints += 1
	945	point = pMap.setdefault((svgX, svgY), [])
	946	point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x]))
	947	for (svgX, svgY) in pMap:
	948	points = pMap[(svgX, svgY)]
	949	svgX = int(svgX)
	950	svgY = int(svgY)
	951	if len(points) > 1:
	952	mmean,dmean = calcMeanSD([p[0] for p in points])
	953	msdev,dsdev = calcMeanSD([p[1] for p in points])
	954	mgeno,dgeno = calcMeanSD([p[-1] for p in points])
	955	mingeno = min([p[-1] for p in points])
	956	maxgeno = max([p[-1] for p in points])
	957	svg.write("""<circle cx="%d" cy="%d" r="2"
	958	onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)"
	959	onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno))
	960	else:
	961	mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12]
	962	rmean = float(relstats[relCode]['mean'][0])
	963	rsdev = float(relstats[relCode]['sd'][0])
	964	if zrad < 4:
	965	zrad = 2
	966	elif 4 < zrad < 9:
	967	zrad = 3 # to 9
	968	else: # > 9 5=15+
	969	zrad=zrad/3
	970	zrad = min(zrad,5) # scale limit
	971	if zrad <= 3:
	972	svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
	973	else: # highlight pairs a long way from expectation by outlining circle in red
	974	svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;"
	975	onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)"
	976	onmouseout="hideOTT(evt)" />\n""" % \
	977	(svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
	978	svg.write('</g>\n')
	979
	980	### Create a pdf as well if indicated on the command line
	981	### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf!
	982	## if pdftoo:
	983	## pdfname = '%s.pdf' % (title)
	984	## rpy.r.pdf(pdfname, 6, 6)
	985	## rpy.r.par(mai=(1,1,1,0.5))
	986	## rpy.r('par(xaxs="i",yaxs="i")')
	987	## rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
	988	## rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
	989	## rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
	990	## rpy.r.dev_off()
	991
	992	### Draw polygons
	993	if showPolygons:
	994	svg.write('<g id="polygons" cursor="pointer">\n')
	995	for rel, poly in POLYGONS.items():
	996	points = ' '.join(['%s,%s' % ((p[0]-1.0)float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]PLOT_HEIGHT)) for p in poly])
	997	svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel]))
	998	svg.write('</g>\n')
	999
	1000
	1001	svg.write(SVG_FOOTER)
	1002	svg.close()
	1003	return newfiles,explanations,repOut
	1004
	1005	def doIBS(n=100):
	1006	"""parse parameters from galaxy
	1007	expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'
	1008	<command interpreter="python">
	1009	rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
	1010	'$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
	1011	</command>
	1012
	1013	"""
	1014	u="""<command interpreter="python">
	1015	rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
	1016	'$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
	1017	</command>
	1018	"""
	1019
	1020
	1021	if len(sys.argv) < 7:
	1022	print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
	1023	print >> sys.stdout, u
	1024	sys.exit(1)
	1025	ts = '%s%s' % (string.punctuation,string.whitespace)
	1026	ptran = string.maketrans(ts,'_'*len(ts))
	1027	inpath = sys.argv[1]
	1028	ldinpath = os.path.split(inpath)[0]
	1029	basename = sys.argv[2]
	1030	outhtml = sys.argv[3]
	1031	newfilepath = sys.argv[4]
	1032	title = sys.argv[5].translate(ptran)
	1033	logfname = 'Log_%s.txt' % title
	1034	logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo
	1035	n = int(sys.argv[6])
	1036	try:
	1037	Zcutoff = float(sys.argv[7])
	1038	except:
	1039	Zcutoff = 2.0
	1040	try:
	1041	os.makedirs(newfilepath)
	1042	except:
	1043	pass
	1044	logf = file(logpath,'w')
	1045	efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path
	1046	ped = plinkbinJZ.BPed(inpath)
	1047	ped.parse(quick=True)
	1048	if ped == None:
	1049	print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (ldreduced,basename)
	1050	sys.exit(1)
	1051	newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath,
	1052	logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff)
	1053	logf.close()
	1054	logfs = file(logpath,'r').readlines()
	1055	lf = file(outhtml,'w')
	1056	lf.write(galhtmlprefix % PROGNAME)
	1057	# this is a mess. todo clean up - should each datatype have it's own directory? Yes
	1058	# probably. Then titles are universal - but userId libraries are separate.
	1059	s = '<div>Output from %s run at %s<br>\n' % (PROGNAME,timenow())
	1060	lf.write('<h4>%s</h4>\n' % s)
	1061	fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case
	1062	s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed))
	1063	lf.write(s)
	1064	# various ways of displaying svg - experiments related to missing svg mimetype on test (!)
	1065	#s = """<object data="%s" type="image/svg+xml" width="%d" height="%d">
	1066	# <embed src="%s" type="image/svg+xml" width="%d" height="%d" />
	1067	# </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
	1068	s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
	1069	#s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
	1070	lf.write(s)
	1071	lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n')
	1072	for i in range(len(newfiles)):
	1073	if i == 0:
	1074	lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (newfiles[i],explanations[i]))
	1075	else:
	1076	lf.write('<li><a href="%s">%s</a></li>\n' % (newfiles[i],explanations[i]))
	1077	flist = os.listdir(newfilepath)
	1078	for fname in flist:
	1079	if not fname in newfiles:
	1080	lf.write('<li><a href="%s">%s</a></li>\n' % (fname,fname))
	1081	lf.write('</ol></div>')
	1082	lf.write('<div>%s</div>' % ('\n'.join(repOut))) # repOut is a list of tables
	1083	lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname,''.join(logfs)))
	1084	lf.write('</body></html>\n')
	1085	lf.close()
	1086	logf.close()
	1087
	1088	if __name__ == '__main__':
	1089	doIBS()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/rgGRR.py @ 2

異なるフォーマットでダウンロード: