Context Navigation

rgGRR.py

リビジョン 2, 45.2 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	"""
2	# july 2009: Need to see outliers so need to draw them last?
3	# could use clustering on the zscores to guess real relationships for unrelateds
4	# but definitely need to draw last
5	# added MAX_SHOW_ROWS to limit the length of the main report page
6	# Changes for Galaxy integration
7	# added more robust knuth method for one pass mean and sd
8	# no difference really - let's use scipy.mean() and scipy.std() instead...
9	# fixed labels and changed to .xls for outlier reports so can open in excel
10	# interesting - with a few hundred subjects, 5k gives good resolution
11	# and 100k gives better but not by much
12	# TODO remove non autosomal markers
13	# TODO it would be best if label had the zmean and zsd as these are what matter for
14	# outliers rather than the group mean/sd
15	# mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots
16	# to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log
17	# so the result should be an HTML file
18
19	# rgIBS.py
20	# use a random subset of markers for a quick ibs
21	# to identify sample dups and closely related subjects
22	# try snpMatrix and plink and see which one works best for us?
23	# abecasis grr plots mean*sd for every subject to show clusters
24	# mods june 23 rml to avoid non-autosomal markers
25	# we seem to be distinguishing parent-child by gender - 2 clouds!
26
27
28	snpMatrix from David Clayton has:
29	ibs.stats function to calculate the identity-by-state stats of a group of samples
30	Description
31	Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics
32	about the relatedness of every pair of samples within.
33
34	Usage
35	ibs.stats(x)
36	8 ibs.stats
37	Arguments
38	x a snp.matrix-class or a X.snp.matrix-class object containing N samples
39	Details
40	No-calls are excluded from consideration here.
41	Value
42	A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated
43	by a comma, and the columns are:
44	Count count of identical calls, excluding no-calls
45	Fraction fraction of identical calls compared to actual calls being made in both samples
46	Warning
47	In some applications, it may be preferable to subset a (random) selection of SNPs first - the
48	calculation
49	time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the
50	calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for
51	simple applications such as checking for duplicate or related samples.
52	Note
53	This is mostly written to find mislabelled and/or duplicate samples.
54	Illumina indexes its SNPs in alphabetical order, so the mitochondrial SNPs come first - for most
55	purposes it is undesirable to use these SNPs for IBS calculations.
56	TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to
57	subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility
58	"""
59	import sys,os,time,random,string,copy,optparse
60
# Compatibility shim for very old interpreters that lack the builtin ``set``.
# The fallback module is the lower-case ``sets`` (Python 2.3 stdlib); the
# previous spelling ``Sets`` does not exist, so the fallback could never work.
try:
    set
except NameError:
    from sets import Set as set
65
66	from rgutils import timenow,plinke
67
68	import plinkbinJZ
69
70
# Module-level run-time switches.
# opts: parsed command-line options, or None when none have been set;
#       doIBSpy consults opts.use_mito only when opts is truthy.
# verbose: when true, doIBSpy prints genotype-read timing.
# showPolygons: presumably toggles drawing of the POLYGONS regions on the
#       plot - NOTE(review): not used in the visible part of the file, confirm.
opts = None
verbose = showPolygons = False
75
class NullDevice(object):
    """File-like sink that silently discards everything sent to it.

    Can be assigned to a stream such as ``sys.stderr`` to suppress output.
    Besides ``write`` it also provides ``writelines`` and ``flush`` so that
    code which treats the stream as a regular file object (for example
    calling ``sys.stderr.flush()``) does not raise AttributeError.
    """

    def write(self, s):
        """Accept *s* and throw it away."""
        pass

    def writelines(self, lines):
        """Accept an iterable of strings and throw it away."""
        pass

    def flush(self):
        """Nothing is buffered, so there is nothing to flush."""
        pass
79
80	tempstderr = sys.stderr # save
81	#sys.stderr = NullDevice()
82	# need to avoid blather about deprecation and other strange stuff from scipy
83	# the current galaxy job runner assumes that
84	# the job is in error if anything appears on sys.stderr
85	# grrrrr. James wants to keep it that way instead of using the
86	# status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy)
87	import numpy
88	import scipy
89	from scipy import weave
90
91
92	sys.stderr=tempstderr
93
94
# File name of the running script (without directory), used in messages.
PROGNAME = os.path.basename(sys.argv[0])

# Plot labelling.
X_AXIS_LABEL = 'Mean Alleles Shared'
Y_AXIS_LABEL = 'SD Alleles Shared'
LEGEND_ALIGN = 'topleft'
LEGEND_TITLE = 'Relationship'

# Default plot symbol size.  A preceding assignment of 1.0 was dead code
# (it was overwritten immediately), so only the effective value is kept.
DEFAULT_SYMBOL_SIZE = 0.5
102
### Integer colour codes for R/rpy, numbered consecutively from 1
(R_BLACK, R_RED, R_GREEN, R_BLUE,
 R_CYAN, R_PURPLE, R_YELLOW, R_GRAY) = range(1, 9)

### Size of the plotting area
PLOT_WIDTH, PLOT_HEIGHT = 1150, 600
118
119
# One SVG colour name per relationship type, in this order:
# dupe, parentchild, sibpair, halfsib, parents, unrel, unkn
SVG_COLORS = ('cyan', 'dodgerblue', 'mediumpurple', 'forestgreen',
              'lightgreen', 'gold', 'gray')

# Column names of the per-relationship outlier reports; the list form is
# also used to build the HTML table header.
OUTLIERS_HEADER_list = [
    'Mean', 'Sdev', 'ZMean', 'ZSdev',
    'FID1', 'IID1', 'FID2', 'IID2',
    'RelMean_M', 'RelMean_SD', 'RelSD_M', 'RelSD_SD',
    'PID1', 'MID1', 'PID2', 'MID2', 'Ped',
]
# Tab-delimited header line.  It is newline-terminated (like TABLE_HEADER)
# because it is written to the outlier file as-is; without the newline the
# first data row ends up appended to the header line.
OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list) + '\n'

# Header line of the all-pairs table file.
TABLE_HEADER = 'fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'
129
130
### Relationship codes, text, and lookups/mappings
N_RELATIONSHIP_TYPES = 7
(REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS,
 REL_RELATED, REL_UNRELATED, REL_UNKNOWN) = range(N_RELATIONSHIP_TYPES)

# relationship code -> (name, R colour code, point style)
REL_LOOKUP = dict([
    (REL_DUPE, ('dupe', R_BLUE, 1)),
    (REL_PARENTCHILD, ('parentchild', R_YELLOW, 1)),
    (REL_SIBS, ('sibpairs', R_RED, 1)),
    (REL_HALFSIBS, ('halfsibs', R_GREEN, 1)),
    (REL_RELATED, ('parents', R_PURPLE, 1)),
    (REL_UNRELATED, ('unrelated', R_CYAN, 1)),
    (REL_UNKNOWN, ('unknown', R_GRAY, 1)),
])

# relationship code -> outlier threshold in standard deviations:
# 2 for every relationship type except unrelated pairs, which use 3
OUTLIER_STDEVS = dict.fromkeys(range(N_RELATIONSHIP_TYPES), 2)
OUTLIER_STDEVS[REL_UNRELATED] = 3
# note now Z can be passed in

# Per-code lists (index == relationship code) of names and point styles,
# plus the colours used for drawing.
REL_STATES = []
REL_POINTS = []
for _code in range(N_RELATIONSHIP_TYPES):
    _entry = REL_LOOKUP[_code]
    REL_STATES.append(_entry[0])
    REL_POINTS.append(_entry[2])
REL_COLORS = SVG_COLORS
157
# Default upper bound on the number of markers sampled.
DEFAULT_MAX_SAMPLE_SIZE = 10000

# Integer genotype codes; 0 marks a missing genotype.
REF_COUNT_HOM1, REF_COUNT_HET, REF_COUNT_HOM2, MISSING = 3, 2, 1, 0

# framingham has millions - delays showing output page - so truncate and explain
MAX_SHOW_ROWS = 100

# Throughput figures (marker pairs per second) used for run-time estimates.
MARKER_PAIRS_PER_SECOND_SLOW = 1.5e7
MARKER_PAIRS_PER_SECOND_FAST = 7.0e7
167
168
# Opening boilerplate of the XHTML report page.  Contains exactly one %s
# placeholder (inside the "generator" meta tag).
galhtmlprefix = '\n'.join((
    '<?xml version="1.0" encoding="utf-8" ?>',
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">',
    '<head>',
    '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
    '<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />',
    '<title></title>',
    '<link rel="stylesheet" href="/static/style/base.css" type="text/css" />',
    '</head>',
    '<body>',
    '<div class="document">',
    '',  # the template ends with a newline
))
181
182
# Opening part of the interactive SVG plot: scripts, axes, grid, legend,
# axis labels and title.  It is a %-format template with 22 %s placeholders,
# in this order:
#   7  colour names for the javascript ``colours`` array,
#   14 (fill, stroke) pairs for the seven legend swatches,
#   1  plot title.
# Fixes relative to the previous template:
#   * showBTT printed ``dm`` (deviation of the mean) on the "sdev=" line
#     instead of ``md``, so the group sdev tooltip showed the wrong value;
#   * the first vertical grid line ran from x=125 to x=115 (slanted and off
#     the 115-unit grid); it is now vertical at x=115;
#   * ``fill`` and ``relt`` are declared with ``var`` instead of leaking as
#     implicit globals.
SVG_HEADER = '''<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd">

<svg width="1280" height="800"
xmlns="http://www.w3.org/2000/svg" version="1.2"
xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()">

<script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/>
<script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/>
<script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/>
<script type="text/ecmascript">
<![CDATA[
var checkBoxes = new Array();
var radioGroupBandwidth;
var colours = ['%s','%s','%s','%s','%s','%s','%s'];
function init() {
var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12};
var dist = 12;
var yOffset = 4;

//A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer);
checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer);
checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer);
checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer);
checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer);
checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer);
checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer);

}

function hideShowLayer(id, status, label) {
var vis = "hidden";
if (status) {
vis = "visible";
}
document.getElementById(id).setAttributeNS(null, 'visibility', vis);
}

function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) {
var x = parseInt(evt.pageX)-250;
var y = parseInt(evt.pageY)-110;
var fill, relt;
switch(rel) {
case 0:
fill = colours[rel];
relt = "dupe";
break;
case 1:
fill = colours[rel];
relt = "parentchild";
break;
case 2:
fill = colours[rel];
relt = "sibpairs";
break;
case 3:
fill = colours[rel];
relt = "halfsibs";
break;
case 4:
fill = colours[rel];
relt = "parents";
break;
case 5:
fill = colours[rel];
relt = "unrelated";
break;
case 6:
fill = colours[rel];
relt = "unknown";
break;
default:
fill = "cyan";
relt = "ERROR_CODE: "+rel;
}

document.getElementById("btRel").textContent = "GROUP: "+relt;
document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm;
document.getElementById("btSdev").textContent = "sdev="+md+" +/- "+dd;
document.getElementById("btPair").textContent = "npairs="+n;
document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")";
document.getElementById("btHead").setAttribute('fill', fill);

var tt = document.getElementById("btTip");
tt.setAttribute("transform", "translate("+x+","+y+")");
tt.setAttribute('visibility', 'visible');
}

function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) {
var x = parseInt(evt.pageX)-150;
var y = parseInt(evt.pageY)-180;
var fill, relt;

switch(rel) {
case 0:
fill = colours[rel];
relt = "dupe";
break;
case 1:
fill = colours[rel];
relt = "parentchild";
break;
case 2:
fill = colours[rel];
relt = "sibpairs";
break;
case 3:
fill = colours[rel];
relt = "halfsibs";
break;
case 4:
fill = colours[rel];
relt = "parents";
break;
case 5:
fill = colours[rel];
relt = "unrelated";
break;
case 6:
fill = colours[rel];
relt = "unknown";
break;
default:
fill = "cyan";
relt = "ERROR_CODE: "+rel;
}

document.getElementById("otRel").textContent = "PAIR: "+relt;
document.getElementById("otS1").textContent = "s1="+s1;
document.getElementById("otS2").textContent = "s2="+s2;
document.getElementById("otMean").textContent = "mean="+mean;
document.getElementById("otSdev").textContent = "sdev="+sdev;
document.getElementById("otGeno").textContent = "ngenos="+ngeno;
document.getElementById("otRmean").textContent = "relmean="+rmean;
document.getElementById("otRsdev").textContent = "relsdev="+rsdev;
document.getElementById("otHead").setAttribute('fill', fill);

var tt = document.getElementById("otTip");
tt.setAttribute("transform", "translate("+x+","+y+")");
tt.setAttribute('visibility', 'visible');
}

function hideBTT(evt) {
document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden');
}

function hideOTT(evt) {
document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden');
}

]]>
</script>
<defs>
<!-- symbols for check boxes -->
<symbol id="cbRect" overflow="visible">
<rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/>
</symbol>
<symbol id="cbCross" overflow="visible">
<g pointer-events="none" stroke="black" stroke-width="1">
<line x1="-3" y1="-3" x2="3" y2="3"/>
<line x1="3" y1="-3" x2="-3" y2="3"/>
</g>
</symbol>
</defs>

<desc>Developer Works Dynamic Scatter Graph Scaling Example</desc>

<!-- Now Draw the main X and Y axis -->
<g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges">
<!-- X Axis top and bottom -->
<path d="M 100 100 L 1250 100 Z"/>
<path d="M 100 700 L 1250 700 Z"/>

<!-- Y Axis left and right -->
<path d="M 100 100 L 100 700 Z"/>
<path d="M 1250 100 L 1250 700 Z"/>
</g>

<g transform="translate(100,100)">

<!-- Grid Lines -->
<g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges">

<!-- Vertical grid lines -->
<line x1="115" y1="0" x2="115" y2="600" />
<line x1="230" y1="0" x2="230" y2="600" />
<line x1="345" y1="0" x2="345" y2="600" />
<line x1="460" y1="0" x2="460" y2="600" />
<line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" />
<line x1="690" y1="0" x2="690" y2="600" />
<line x1="805" y1="0" x2="805" y2="600" />
<line x1="920" y1="0" x2="920" y2="600" />
<line x1="1035" y1="0" x2="1035" y2="600" />

<!-- Horizontal grid lines -->
<line x1="0" y1="60" x2="1150" y2="60" />
<line x1="0" y1="120" x2="1150" y2="120" />
<line x1="0" y1="180" x2="1150" y2="180" />
<line x1="0" y1="240" x2="1150" y2="240" />
<line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" />
<line x1="0" y1="360" x2="1150" y2="360" />
<line x1="0" y1="420" x2="1150" y2="420" />
<line x1="0" y1="480" x2="1150" y2="480" />
<line x1="0" y1="540" x2="1150" y2="540" />
</g>

<!-- Legend -->
<g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)">
<rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" />
<text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text>
<rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<text x="15" y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text>
<circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
<text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text>
<circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
<text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text>
<circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
<g id="checkboxes">
</g>
</g>


<g style='fill:black; stroke:none' font-size="17" font-family="Arial">
<!-- X Axis Labels -->
<text x="480" y="660">Mean Alleles Shared</text>
<text x="0" y="630" >1.0</text>
<text x="277" y="630" >1.25</text>
<text x="564" y="630" >1.5</text>
<text x="842" y="630" >1.75</text>
<text x="1140" y="630" >2.0</text>
</g>

<g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial">
<!-- Y Axis Labels -->
<text x="-350" y="-40">SD Alleles Shared</text>
<text x="-20" y="-10" >1.0</text>
<text x="-165" y="-10" >0.75</text>
<text x="-310" y="-10" >0.5</text>
<text x="-455" y="-10" >0.25</text>
<text x="-600" y="-10" >0.0</text>
</g>

<!-- Plot Title -->
<g style="fill:black; stroke:none" font-size="18" font-family="Arial">
<text x="425" y="-30">%s</text>
</g>

<!-- One group/layer of points for each relationship type -->
'''
437
# Closing part of the SVG plot: ends the translated plot group, then defines
# the two initially hidden tooltip groups ("btTip" and "otTip") whose text
# elements are looked up by id from the page's javascript, and closes <svg>.
# The template starts and ends with a newline.
SVG_FOOTER = '\n'.join((
    '',
    '<!-- End of Data -->',
    '</g>',
    '<g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">',
    '<rect width="250" height="110" style="fill:silver" rx="2" ry="2"/>',
    '<rect id="btHead" width="250" height="20" rx="2" ry="2" />',
    '<text id="btRel" y="14" x="85">unrelated</text>',
    '<text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text>',
    '<text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text>',
    '<text id="btPair" y="80" x="4">npairs=1152</text>',
    '<text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text>',
    '</g>',
    '',
    '<g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">',
    '<rect width="150" height="180" style="fill:silver" rx="2" ry="2"/>',
    '<rect id="otHead" width="150" height="20" rx="2" ry="2" />',
    '<text id="otRel" y="14" x="40">sibpairs</text>',
    '<text id="otS1" y="40" x="4">s1=fid1,iid1</text>',
    '<text id="otS2" y="60" x="4">s2=fid2,iid2</text>',
    '<text id="otMean" y="80" x="4">mean=1.82</text>',
    '<text id="otSdev" y="100" x="4">sdev=0.7</text>',
    '<text id="otGeno" y="120" x="4">ngeno=4487</text>',
    '<text id="otRmean" y="140" x="4">relmean=1.85</text>',
    '<text id="otRsdev" y="160" x="4">relsdev=0.65</text>',
    '</g>',
    '</svg>',
    '',
))
465
466
# NOTE: these names are also assigned earlier in this module; because the
# assignments below run later, the values here are the ones in effect.
DEFAULT_MAX_SAMPLE_SIZE = 5000

# Integer genotype codes; 0 marks a missing genotype.
REF_COUNT_HOM1 = 3
REF_COUNT_HET = 2
REF_COUNT_HOM2 = 1
MISSING = 0

# Throughput figures (marker pairs per second) used for run-time estimates.
# They must be floats: the estimates divide an integer pair count by these
# values, and with int constants Python 2 integer division truncates the
# estimated seconds to a whole number (0 for small jobs).
MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0
MARKER_PAIRS_PER_SECOND_FAST = 70000000.0
476
# relationship code -> four (x, y) vertices of a quadrilateral.
# NOTE(review): presumably (mean, sd) plot coordinates outlining the region
# where pairs of that relationship are expected - confirm against the
# plotting code.  No polygon is defined for REL_RELATED or REL_UNKNOWN.
POLYGONS = dict([
    (REL_UNRELATED, (
        (1.360, 0.655),
        (1.385, 0.730),
        (1.620, 0.575),
        (1.610, 0.505),
    )),
    (REL_HALFSIBS, (
        (1.630, 0.500),
        (1.630, 0.550),
        (1.648, 0.540),
        (1.648, 0.490),
    )),
    (REL_SIBS, (
        (1.660, 0.510),
        (1.665, 0.560),
        (1.820, 0.410),
        (1.820, 0.390),
    )),
    (REL_PARENTCHILD, (
        (1.650, 0.470),
        (1.650, 0.490),
        (1.750, 0.440),
        (1.750, 0.420),
    )),
    (REL_DUPE, (
        (1.970, 0.000),
        (1.970, 0.150),
        (2.000, 0.150),
        (2.000, 0.000),
    )),
])
484
def distance(point1, point2):
    """Return the Euclidean distance between two 2-D points.

    Each point is a sequence of exactly two values convertible with
    ``float()`` (numbers or numeric strings).  A point with any other number
    of elements raises ValueError from the tuple unpacking.

    The previous body referenced the undefined names ``dx2``/``dy2`` and a
    ``math`` module that this file never imports, so every call failed with
    NameError.
    """
    # Imported here because the module-level import list does not include math.
    import math

    (x1, y1) = [float(d) for d in point1]
    (x2, y2) = [float(d) for d in point2]
    return math.hypot(x1 - x2, y1 - y2)
493
def point_inside_polygon(x, y, poly):
    """Return True if the point (x, y) lies inside the polygon *poly*.

    poly is a sequence of (x, y) vertex pairs; the polygon is closed
    implicitly (last vertex joins the first).  Uses the even-odd ray-casting
    rule: each edge whose y-range is crossed at or to the right of the point
    flips the inside/outside state.  Points exactly on an edge may be
    reported either way.

    An empty polygon contains no points, so False is returned for it.

    Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
    """
    n = len(poly)
    if n == 0:
        return False

    inside = False
    p1x, p1y = poly[0]
    for i in range(n + 1):
        p2x, p2y = poly[i % n]
        # The edge p1-p2 counts only if y lies in its half-open y-range
        # (which also guarantees p1y != p2y) and the point is not to the
        # right of the whole edge.
        if min(p1y, p2y) < y <= max(p1y, p2y) and x <= max(p1x, p2x):
            # float() forces true division; with integer coordinates Python 2
            # integer division would truncate the intersection abscissa.
            xinters = float(y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
            if p1x == p2x or x <= xinters:
                inside = not inside
        p1x, p1y = p2x, p2y
    return inside
516
def readMap(pedfile):
    """Read the marker map file that accompanies a .ped file.

    The map file name is derived from *pedfile* by swapping a trailing
    ``.ped`` extension for ``.map``.  Only the extension is swapped, so a
    ``.ped`` substring elsewhere in the path (for example in a directory
    name) is left alone.  If *pedfile* does not end in ``.ped`` the older
    behaviour of replacing every ``.ped`` occurrence is kept.

    Returns a list with one entry per line of the map file, each entry being
    the list of whitespace-separated fields on that line.  Returns an empty
    list when the map file does not exist.  Progress is reported on stdout.
    """
    if pedfile.endswith('.ped'):
        mapfile = pedfile[:-len('.ped')] + '.map'
    else:
        mapfile = pedfile.replace('.ped', '.map')
    marker_list = []
    if os.path.exists(mapfile):
        # print(...) with a single parenthesised string prints the same text
        # under the Python 2 print statement and the print function.
        print('readMap: %s' % (mapfile))
        fh = open(mapfile, 'r')
        try:
            for line in fh:
                marker_list.append(line.strip().split())
        finally:
            # always release the handle, even if reading fails part-way
            fh.close()
        print('readMap: %s markers' % (len(marker_list)))
    return marker_list
530
def calcMeanSD(useme):
    """Return (mean, sample standard deviation) of the numbers in *useme*.

    One-pass, numerically stable algorithm due to Knuth, who cites Welford:

        n = 0; mean = 0; M2 = 0
        for x in data:
            n = n + 1
            delta = x - mean
            mean = mean + delta/n
            M2 = M2 + delta*(x - mean)   # uses the new value of mean
        variance = M2/(n - 1)

    *useme* may be any iterable of numbers.  The standard deviation uses the
    sample (n - 1) denominator and is 0.0 when there are fewer than two
    values.  The mean is 0.0 for empty input and, for a single value, that
    value itself (previously a single-element input wrongly gave mean 0.0).
    """
    mean = 0.0
    M2 = 0.0
    n = 0
    for x in useme:
        n += 1
        delta = x - mean
        mean = mean + delta / n
        M2 = M2 + delta * (x - mean)  # this expression uses the new value of mean
    sd = 0.0
    if n > 1:
        variance = M2 / (n - 1)  # assume is sample so lose 1 DOF
        sd = pow(variance, 0.5)
    return mean, sd
561
562
563	def doIBSpy(ped=None,basename='',outdir=None,logf=None,
564	nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0):
565	#def doIBS(pedName, title, nrsSamples=None, pdftoo=False):
566	""" started with snpmatrix but GRR uses actual IBS counts and sd's
567	"""
568	repOut = [] # text strings to add to the html display
569	refallele = {}
570	tblf = '%s_table.xls' % (title)
571	tbl = file(os.path.join(outdir,tblf), 'w')
572	tbl.write(TABLE_HEADER)
573	svgf = '%s.svg' % (title)
574	svg = file(os.path.join(outdir,svgf), 'w')
575
576	nMarkers = len(ped._markers)
577	if nMarkers < 5:
578	print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME)
579	sys.exit(1)
580	nSubjects = len(ped._subjects)
581	nrsSamples = min(nMarkers, nrsSamples)
582	if opts and opts.use_mito:
583	markers = range(nMarkers)
584	nrsSamples = min(len(markers), nrsSamples)
585	sampleIndexes = sorted(random.sample(markers, nrsSamples))
586	else:
587	autosomals = ped.autosomal_indices()
588	nrsSamples = min(len(autosomals), nrsSamples)
589	sampleIndexes = sorted(random.sample(autosomals, nrsSamples))
590
591	print ''
592	print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers)
593	npairs = (nSubjects*(nSubjects-1))/2 # total rows in table
594	newfiles=[svgf,tblf]
595	explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs]
596	# these go with the output file links in the html file
597	s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples)
598	logf.write(s)
599	minUsegenos = nrsSamples/2 # must have half?
600	nGenotypes = nSubjects*nrsSamples
601	stime = time.time()
602	emptyRows = set()
603	genos = numpy.zeros((nSubjects, nrsSamples), dtype=int)
604	for s in xrange(nSubjects):
605	nValid = 0
606	#getGenotypesByIndices(self, s, mlist, format)
607	genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref')
608	nValid = sum([1 for g in genos[s] if g])
609	if not nValid:
610	emptyRows.add(s)
611	sub = ped.getSubject(s)
612	print 'All missing for row %d (%s)' % (s, sub)
613	logf.write('All missing for row %d (%s)\n' % (s, sub))
614	rtime = time.time() - stime
615	if verbose:
616	print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime)
617
618
619	### Now the expensive part. For each pair of subjects, we get the mean number
620	### and standard deviation of shared alleles over all of the markers where both
621	### subjects have a known genotype. Identical subjects should have mean shared
622	### alleles very close to 2.0 with a standard deviation very close to 0.0.
623	tot = nSubjects*(nSubjects-1)/2
624	nprog = tot/10
625	nMarkerpairs = tot * nrsSamples
626	estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW
627	estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST
628
629	pairs = []
630	pair_data = {}
631	means = [] ## Mean IBS for each pair
632	ngenoL = [] ## Count of comparable genotypes for each pair
633	sdevs = [] ## Standard dev for each pair
634	rels = [] ## A relationship code for each pair
635	zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup
636	zstds = [0.0 for x in xrange(tot)] ## zstd score for each pair for the relgrp
637	skip = set()
638	ndone = 0 ## How many have been done so far
639
640	logf.write('Calculating %d pairs...\n' % (tot))
641	logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow))
642
643	t1sum = 0
644	t2sum = 0
645	t3sum = 0
646	now = time.time()
647	scache = {}
648	_founder_cache = {}
649	C_CODE = """
650	#include "math.h"
651	int i;
652	int sumibs = 0;
653	int ssqibs = 0;
654	int ngeno = 0;
655	float mean = 0;
656	float M2 = 0;
657	float delta = 0;
658	float sdev=0;
659	float variance=0;
660	for (i=0; i<nrsSamples; i++) {
661	int a1 = g1[i];
662	int a2 = g2[i];
663	if (a1 != 0 && a2 != 0) {
664	ngeno += 1;
665	int shared = 2-abs(a1-a2);
666	delta = shared - mean;
667	mean = mean + delta/ngeno;
668	M2 += delta*(shared-mean);
669	// yes that second time, the updated mean is used see calcmeansd above;
670	//printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared);
671	}
672	}
673	if (ngeno > 1) {
674	variance = M2/(ngeno-1);
675	sdev = sqrt(variance);
676	//printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev);
677	}
678	//printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev);
679	result[0] = ngeno;
680	result[1] = mean;
681	result[2] = sdev;
682	return_val = ngeno;
683	"""
684	started = time.time()
685	for s1 in xrange(nSubjects):
686	if s1 in emptyRows:
687	continue
688	(fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1))
689
690	isFounder1 = _founder_cache.setdefault(s1, (did1==mid1))
691	g1 = genos[s1]
692
693	for s2 in xrange(s1+1, nSubjects):
694	if s2 in emptyRows:
695	continue
696	t1s = time.time()
697
698	(fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2))
699
700	g2 = genos[s2]
701	isFounder2 = _founder_cache.setdefault(s2, (did2==mid2))
702
703	# Determine the relationship for this pair
704	relcode = REL_UNKNOWN
705	if (fid2 == fid1):
706	if iid1 == iid2:
707	relcode = REL_DUPE
708	elif (did2 == did1) and (mid2 == mid1) and did1 != mid1:
709	relcode = REL_SIBS
710	elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1):
711	relcode = REL_PARENTCHILD
712	elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)):
713	relcode = REL_HALFSIBS
714	else:
715	# People in the same family should be marked as some other
716	# form of related. In general, these people will have a
717	# pretty random spread of similarity. This distinction is
718	# probably not very useful most of the time
719	relcode = REL_RELATED
720	else:
721	### Different families
722	relcode = REL_UNRELATED
723
724	t1e = time.time()
725	t1sum += t1e-t1s
726
727
728	### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count
729	### the number of contributing genotypes. These values are not actually
730	### calculated here, but instead are looked up in a table for speed.
731	### FIXME: This is still too slow ...
732	result = [0.0, 0.0, 0.0]
733	ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result'])
734	if ngeno >= minUsegenos:
735	_, mean, sdev = result
736	means.append(mean)
737	sdevs.append(sdev)
738	ngenoL.append(ngeno)
739	pairs.append((s1, s2))
740	rels.append(relcode)
741	else:
742	skip.add(ndone) # signal no comparable genotypes for this pair
743	ndone += 1
744	t2e = time.time()
745	t2sum += t2e-t1e
746	t3e = time.time()
747	t3sum += t3e-t2e
748
749	logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now),
750	'%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip),
751	float(len(skip))/float(tot)*100)]
752	logf.write('%s\n' % '\t'.join(logme))
753	### Calculate mean and standard deviation of scores on a per relationship
754	### type basis, allowing us to flag outliers for each particular relationship
755	### type
756	relstats = {}
757	relCounts = {}
758	outlierFiles = {}
759	for relCode, relInfo in REL_LOOKUP.items():
760	relName, relColor, relStyle = relInfo
761	useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode]
762	relCounts[relCode] = len(useme)
763	mm = scipy.mean(useme)
764	ms = scipy.std(useme)
765	useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode]
766	sm = scipy.mean(useme)
767	ss = scipy.std(useme)
768	relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)}
769	s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \
770	(relName,relCounts[relCode], mm, ms, sm, ss)
771	logf.write(s)
772
773	### now fake z scores for each subject like abecasis recommends max(\|zmu\|,\|zsd\|)
774	### within each group, for each pair, z=(groupmean-pairmean)/groupsd
775	available = len(means)
776	logf.write('%d pairs are available of %d\n' % (available, tot))
777	### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n'
778	### logf.write(s)
779	pairnum = 0
780	offset = 0
781	nOutliers = 0
782	cexs = []
783	outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)])
784	zsdmax = 0
785	for s1 in range(nSubjects):
786	if s1 in emptyRows:
787	continue
788	(fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1]
789	for s2 in range(s1+1, nSubjects):
790	if s2 in emptyRows:
791	continue
792	if pairnum not in skip:
793	### Get group stats for this relationship
794	(fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2]
795	try:
796	r = rels[offset]
797	except IndexError:
798	logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d', offset, available, pairnum, len(rels))
799	notfound = ('?',('?','0','0'))
800	relInfo = REL_LOOKUP.get(r,notfound)
801	relName, relColor, relStyle = relInfo
802	rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared
803	rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared
804
805	try:
806	zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units
807	except:
808	zsd = 1
809	if abs(zsd) > zsdmax:
810	zsdmax = zsd # keep for sort scaling
811	try:
812	zmean = (means[offset] - rmm)/rmd # distance from group mean
813	except:
814	zmean = 1
815	zmeans[offset] = zmean
816	zstds[offset] = zsd
817	pid=(s1,s2)
818	zrad = max(zsd,zmean)
819	if zrad < 4:
820	zrad = 2
821	elif 4 < zrad < 15:
822	zrad = 3 # to 9
823	else: # > 15 6=24+
824	zrad=zrad/4
825	zrad = min(zrad,6) # scale limit
826	zrad = max(2,max(zsd,zmean)) # as > 2, z grows
827	pair_data[pid] = (zmean,zsd,r,zrad)
828	if max(zsd,zmean) > Zcutoff: # is potentially interesting
829	mean = means[offset]
830	sdev = sdevs[offset]
831	outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2))
832	nOutliers += 1
833	tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
834	(fid1, iid1, fid2, iid2, mean, sdev, zmean,zsd, ngeno, relName, did1,mid1,did2,mid2))
835	offset += 1
836	pairnum += 1
837	logf.write( 'Outliers: %s\n' % (nOutliers))
838
839	### Write outlier files for each relationship type
840	repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>')
841	lzsd = round(numpy.log10(zsdmax)) + 1
842	scalefactor = 10**lzsd
843	for relCode, relInfo in REL_LOOKUP.items():
844	relName, _, _ = relInfo
845	outliers = outlierRecords[relCode]
846	if not outliers:
847	continue
848	outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate
849	outliers.sort()
850	outliers.reverse() # largest deviation first
851	outliers = [x[1] for x in outliers] # undecorate
852	nrows = len(outliers)
853	truncated = 0
854	if nrows > MAX_SHOW_ROWS:
855	s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \
856	(relName,MAX_SHOW_ROWS,nrows,title)
857	truncated = nrows - MAX_SHOW_ROWS
858	else:
859	s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title)
860	repOut.append(s)
861	fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
862	fhpath = os.path.join(outdir,fhname)
863	fh = open(fhpath, 'w')
864	newfiles.append(fhname)
865	explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff))
866	fh.write(OUTLIERS_HEADER)
867	s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list])
868	repOut.append('<tr align="center">%s</tr>' % s)
869	for n,rec in enumerate(outliers):
870	#(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
871	s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
872	fh.write('%s%s\n' % (s,relName))
873	# (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2))
874	s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td>
875	<td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec)
876	s = '%s<td>%s</td>' % (s,relName)
877	if n < MAX_SHOW_ROWS:
878	repOut.append('<tr align="center">%s</tr>' % s)
879	if truncated > 0:
880	repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated,
881	nrows))
882	fh.close()
883	repOut.append('</table><p>')
884
885	### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
886	### if requested
887	logf.write('Plotting ...')
888	pointColors = [REL_COLORS[rel] for rel in rels]
889	pointStyles = [REL_POINTS[rel] for rel in rels]
890
891	mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
892	svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
893	SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
894	SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
895	SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
896	#rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
897	#rpy.r.par(mai=(1,1,1,0.5))
898	#rpy.r('par(xaxs="i",yaxs="i")')
899	#rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
900	#rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
901	#rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
902	#rpy.r.dev_off()
903
904	### We will now go through each relationship type to partition plot points
905	### into "bulk" and "outlier" groups. Bulk points will represent common
906	### mean/sdev pairs and will cover the majority of the points in the plot --
907	### they will use generic tooltip information about all of the pairs
908	### represented by that point. "Outlier" points will be uncommon pairs,
909	### with very specific information in their tooltips. It would be nice to
910	### keep the total number of plotted points in the SVG representation to
911	### ~10000 (certainly less than 100000?)
912	pointMap = {}
913	orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))]
914	# do we really want this? I want out of zone points last and big
915	for relCode in orderedRels:
916	svgColor = SVG_COLORS[relCode]
917	relName, relColor, relStyle = REL_LOOKUP[relCode]
918	svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor))
919	pMap = pointMap.setdefault(relCode, {})
920	nPoints = 0
921	rpairs=[]
922	rgenos=[]
923	rmeans=[]
924	rsdevs=[]
925	rz = []
926	for x,rel in enumerate(rels): # all pairs
927	if rel == relCode:
928	s1,s2 = pairs[x]
929	pid=(s1,s2)
930	zmean,zsd,r,zrad = pair_data[pid][:4]
931	rpairs.append(pairs[x])
932	rgenos.append(ngenoL[x])
933	rmeans.append(means[x])
934	rsdevs.append(sdevs[x])
935	rz.append(zrad)
936	### Now add the svg point group for this relationship to the svg file
937	for x in range(len(rmeans)):
938	svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2
939	svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1
940	s1, s2 = rpairs[x]
941	(fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1]
942	(fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2]
943	ngenos = rgenos[x]
944	nPoints += 1
945	point = pMap.setdefault((svgX, svgY), [])
946	point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x]))
947	for (svgX, svgY) in pMap:
948	points = pMap[(svgX, svgY)]
949	svgX = int(svgX)
950	svgY = int(svgY)
951	if len(points) > 1:
952	mmean,dmean = calcMeanSD([p[0] for p in points])
953	msdev,dsdev = calcMeanSD([p[1] for p in points])
954	mgeno,dgeno = calcMeanSD([p[-1] for p in points])
955	mingeno = min([p[-1] for p in points])
956	maxgeno = max([p[-1] for p in points])
957	svg.write("""<circle cx="%d" cy="%d" r="2"
958	onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)"
959	onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno))
960	else:
961	mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12]
962	rmean = float(relstats[relCode]['mean'][0])
963	rsdev = float(relstats[relCode]['sd'][0])
964	if zrad < 4:
965	zrad = 2
966	elif 4 < zrad < 9:
967	zrad = 3 # to 9
968	else: # > 9 5=15+
969	zrad=zrad/3
970	zrad = min(zrad,5) # scale limit
971	if zrad <= 3:
972	svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
973	else: # highlight pairs a long way from expectation by outlining circle in red
974	svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;"
975	onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)"
976	onmouseout="hideOTT(evt)" />\n""" % \
977	(svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
978	svg.write('</g>\n')
979
980	### Create a pdf as well if indicated on the command line
981	### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf!
982	## if pdftoo:
983	## pdfname = '%s.pdf' % (title)
984	## rpy.r.pdf(pdfname, 6, 6)
985	## rpy.r.par(mai=(1,1,1,0.5))
986	## rpy.r('par(xaxs="i",yaxs="i")')
987	## rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
988	## rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
989	## rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
990	## rpy.r.dev_off()
991
992	### Draw polygons
993	if showPolygons:
994	svg.write('<g id="polygons" cursor="pointer">\n')
995	for rel, poly in POLYGONS.items():
996	points = ' '.join(['%s,%s' % ((p[0]-1.0)float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]PLOT_HEIGHT)) for p in poly])
997	svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel]))
998	svg.write('</g>\n')
999
1000
1001	svg.write(SVG_FOOTER)
1002	svg.close()
1003	return newfiles,explanations,repOut
1004
def doIBS(n=100):
    """Galaxy / command-line entry point: run doIBSpy and write the HTML report.

    Arguments are taken positionally from ``sys.argv``:

        1. inpath      - input path handed to ``plinkbinJZ.BPed``
        2. basename    - passed through to ``doIBSpy`` as ``basename``
        3. outhtml     - path of the HTML report written by this function
        4. newfilepath - output directory (created if needed); the log file
                         ``Log_<title>.txt`` is written here
        5. title       - job title; every punctuation or whitespace
                         character is replaced by ``_``
        6. n           - integer, passed to ``doIBSpy`` as ``nrsSamples``
        7. Zcutoff     - optional float; defaults to 2.0 when missing or
                         not parseable as a number

    Matching Galaxy tool definition:

    <command interpreter="python">
    rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
    '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    </command>

    The ``n`` keyword parameter is retained only for backward compatibility;
    the value used is always read from ``sys.argv[6]``.

    Exits with status 1 (after printing usage to stdout) when fewer than
    six arguments are supplied.
    """
    u = """<command interpreter="python">
rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
'$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
</command>
"""

    if len(sys.argv) < 7:
        # write() instead of the py2-only "print >>" statement; output is identical
        sys.stdout.write('Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please\n')
        sys.stdout.write('%s\n' % u)
        sys.exit(1)

    inpath = sys.argv[1]
    basename = sys.argv[2]
    outhtml = sys.argv[3]
    newfilepath = sys.argv[4]

    # Make the title safe for use in file names: punctuation/whitespace -> '_'
    unsafe = '%s%s' % (string.punctuation, string.whitespace)
    cleaned = []
    for ch in sys.argv[5]:
        if ch in unsafe:
            cleaned.append('_')
        else:
            cleaned.append(ch)
    title = ''.join(cleaned)

    logfname = 'Log_%s.txt' % title
    # the log lives alongside the other outputs in the html extra files path
    logpath = os.path.join(newfilepath, logfname)
    n = int(sys.argv[6])
    try:
        Zcutoff = float(sys.argv[7])
    except (IndexError, ValueError):
        # cutoff is optional; fall back to the default when absent or malformed
        Zcutoff = 2.0
    try:
        os.makedirs(newfilepath)
    except OSError:
        # normally "already exists"; any real failure surfaces when the log is opened
        pass

    logf = open(logpath, 'w')
    try:
        ped = plinkbinJZ.BPed(inpath)
        # guard must run before parse(), and must only use names that exist here
        if ped is None:
            sys.stderr.write('## doIBSpy problem - cannot open %s or %s - cannot run\n' % (inpath, basename))
            sys.exit(1)
        ped.parse(quick=True)
        newfiles, explanations, repOut = doIBSpy(ped=ped, basename=basename, outdir=newfilepath,
                                                 logf=logf, nrsSamples=n, title=title, pdftoo=0,
                                                 Zcutoff=Zcutoff)
    finally:
        logf.close()

    # re-read the finished log so it can be embedded in the report
    logreader = open(logpath, 'r')
    try:
        logfs = logreader.readlines()
    finally:
        logreader.close()

    lf = open(outhtml, 'w')
    try:
        lf.write(galhtmlprefix % PROGNAME)
        # TODO clean up - should each datatype have its own directory? Probably yes;
        # then titles are universal - but userId libraries are separate.
        s = '<div>Output from %s run at %s<br>\n' % (PROGNAME, timenow())
        lf.write('<h4>%s</h4>\n' % s)
        fixed = ["'%s'" % x for x in sys.argv]  # add quotes just in case
        s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed))
        lf.write(s)
        # the first new file is embedded as the svg plot (embed chosen over object/iframe)
        s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0], PLOT_WIDTH, PLOT_HEIGHT)
        lf.write(s)
        lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n')
        for i, fname in enumerate(newfiles):
            if i == 0:
                # first entry gets an explicit svg mime type on its link
                lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (fname, explanations[i]))
            else:
                lf.write('<li><a href="%s">%s</a></li>\n' % (fname, explanations[i]))
        # also link anything else found in the output directory; sorted so the
        # report is deterministic (os.listdir order is arbitrary)
        for fname in sorted(os.listdir(newfilepath)):
            if fname not in newfiles:
                lf.write('<li><a href="%s">%s</a></li>\n' % (fname, fname))
        lf.write('</ol></div>')
        lf.write('<div>%s</div>' % ('\n'.join(repOut)))  # repOut is a list of tables
        lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname, ''.join(logfs)))
        lf.write('</body></html>\n')
    finally:
        lf.close()
1087
# Script entry point: run the Galaxy command-line driver only when this file is executed directly, not when it is imported.
1088	if __name__ == '__main__':
1089	doIBS()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/rgGRR.py

異なるフォーマットでダウンロード: