1 | """ |
---|
2 | # july 2009: Need to see outliers so need to draw them last? |
---|
3 | # could use clustering on the zscores to guess real relationships for unrelateds |
---|
4 | # but definitely need to draw last |
---|
5 | # added MAX_SHOW_ROWS to limit the length of the main report page |
---|
6 | # Changes for Galaxy integration |
---|
7 | # added more robust knuth method for one pass mean and sd |
---|
8 | # no difference really - let's use scipy.mean() and scipy.std() instead... |
---|
9 | # fixed labels and changed to .xls for outlier reports so can open in excel |
---|
10 | # interesting - with a few hundred subjects, 5k gives good resolution |
---|
11 | # and 100k gives better but not by much |
---|
12 | # TODO remove non autosomal markers |
---|
13 | # TODO it would be best if label had the zmean and zsd as these are what matter for |
---|
14 | # outliers rather than the group mean/sd |
---|
15 | # mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots |
---|
16 | # to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log |
---|
17 | # so the result should be an HTML file |
---|
18 | |
---|
19 | # rgIBS.py |
---|
20 | # use a random subset of markers for a quick ibs |
---|
21 | # to identify sample dups and closely related subjects |
---|
22 | # try snpMatrix and plink and see which one works best for us? |
---|
23 | # abecasis grr plots mean*sd for every subject to show clusters |
---|
24 | # mods june 23 rml to avoid non-autosomal markers |
---|
25 | # we seem to be distinguishing parent-child by gender - 2 clouds! |
---|
26 | |
---|
27 | |
---|
28 | snpMatrix from David Clayton has: |
---|
29 | ibs.stats function to calculate the identity-by-state stats of a group of samples |
---|
30 | Description |
---|
31 | Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics |
---|
32 | about the relatedness of every pair of samples within. |
---|
33 | |
---|
34 | Usage |
---|
35 | ibs.stats(x) |
---|
36 | 8 ibs.stats |
---|
37 | Arguments |
---|
38 | x a snp.matrix-class or a X.snp.matrix-class object containing N samples |
---|
39 | Details |
---|
40 | No-calls are excluded from consideration here. |
---|
41 | Value |
---|
42 | A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated |
---|
43 | by a comma, and the columns are: |
---|
44 | Count count of identical calls, exclusing no-calls |
---|
45 | Fraction fraction of identical calls comparied to actual calls being made in both samples |
---|
46 | Warning |
---|
47 | In some applications, it may be preferable to subset a (random) selection of SNPs first - the |
---|
48 | calculation |
---|
49 | time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the |
---|
50 | calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for |
---|
51 | simple applications such as checking for duplicate or related samples. |
---|
52 | Note |
---|
53 | This is mostly written to find mislabelled and/or duplicate samples. |
---|
54 | Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most |
---|
55 | purpose it is undesirable to use these SNPs for IBS purposes. |
---|
56 | TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to |
---|
57 | subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility |
---|
58 | """ |
---|
59 | import sys,os,time,random,string,copy,optparse |
---|
60 | |
---|
61 | try: |
---|
62 | set |
---|
63 | except NameError: |
---|
64 | from Sets import Set as set |
---|
65 | |
---|
66 | from rgutils import timenow,plinke |
---|
67 | |
---|
68 | import plinkbinJZ |
---|
69 | |
---|
70 | |
---|
71 | opts = None |
---|
72 | verbose = False |
---|
73 | |
---|
74 | showPolygons = False |
---|
75 | |
---|
76 | class NullDevice: |
---|
77 | def write(self, s): |
---|
78 | pass |
---|
79 | |
---|
80 | tempstderr = sys.stderr # save |
---|
81 | #sys.stderr = NullDevice() |
---|
82 | # need to avoid blather about deprecation and other strange stuff from scipy |
---|
83 | # the current galaxy job runner assumes that |
---|
84 | # the job is in error if anything appears on sys.stderr |
---|
85 | # grrrrr. James wants to keep it that way instead of using the |
---|
86 | # status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy) |
---|
87 | import numpy |
---|
88 | import scipy |
---|
89 | from scipy import weave |
---|
90 | |
---|
91 | |
---|
92 | sys.stderr=tempstderr |
---|
93 | |
---|
94 | |
---|
95 | PROGNAME = os.path.split(sys.argv[0])[-1] |
---|
96 | X_AXIS_LABEL = 'Mean Alleles Shared' |
---|
97 | Y_AXIS_LABEL = 'SD Alleles Shared' |
---|
98 | LEGEND_ALIGN = 'topleft' |
---|
99 | LEGEND_TITLE = 'Relationship' |
---|
100 | DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size |
---|
101 | DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size |
---|
102 | |
---|
103 | ### Some colors for R/rpy |
---|
104 | R_BLACK = 1 |
---|
105 | R_RED = 2 |
---|
106 | R_GREEN = 3 |
---|
107 | R_BLUE = 4 |
---|
108 | R_CYAN = 5 |
---|
109 | R_PURPLE = 6 |
---|
110 | R_YELLOW = 7 |
---|
111 | R_GRAY = 8 |
---|
112 | |
---|
113 | ### ... and some point-styles |
---|
114 | |
---|
115 | ### |
---|
116 | PLOT_HEIGHT = 600 |
---|
117 | PLOT_WIDTH = 1150 |
---|
118 | |
---|
119 | |
---|
120 | #SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson') |
---|
121 | #SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray') |
---|
122 | SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray') |
---|
123 | # dupe,parentchild,sibpair,halfsib,parents,unrel,unkn |
---|
124 | #('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray') |
---|
125 | |
---|
126 | OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped'] |
---|
127 | OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list) |
---|
128 | TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n' |
---|
129 | |
---|
130 | |
---|
131 | ### Relationship codes, text, and lookups/mappings |
---|
132 | N_RELATIONSHIP_TYPES = 7 |
---|
133 | REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES) |
---|
134 | REL_LOOKUP = { |
---|
135 | REL_DUPE: ('dupe', R_BLUE, 1), |
---|
136 | REL_PARENTCHILD: ('parentchild', R_YELLOW, 1), |
---|
137 | REL_SIBS: ('sibpairs', R_RED, 1), |
---|
138 | REL_HALFSIBS: ('halfsibs', R_GREEN, 1), |
---|
139 | REL_RELATED: ('parents', R_PURPLE, 1), |
---|
140 | REL_UNRELATED: ('unrelated', R_CYAN, 1), |
---|
141 | REL_UNKNOWN: ('unknown', R_GRAY, 1), |
---|
142 | } |
---|
143 | OUTLIER_STDEVS = { |
---|
144 | REL_DUPE: 2, |
---|
145 | REL_PARENTCHILD: 2, |
---|
146 | REL_SIBS: 2, |
---|
147 | REL_HALFSIBS: 2, |
---|
148 | REL_RELATED: 2, |
---|
149 | REL_UNRELATED: 3, |
---|
150 | REL_UNKNOWN: 2, |
---|
151 | } |
---|
152 | # note now Z can be passed in |
---|
153 | |
---|
154 | REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)] |
---|
155 | REL_COLORS = SVG_COLORS |
---|
156 | REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)] |
---|
157 | |
---|
158 | DEFAULT_MAX_SAMPLE_SIZE = 10000 |
---|
159 | |
---|
160 | REF_COUNT_HOM1 = 3 |
---|
161 | REF_COUNT_HET = 2 |
---|
162 | REF_COUNT_HOM2 = 1 |
---|
163 | MISSING = 0 |
---|
164 | MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain |
---|
165 | MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0 |
---|
166 | MARKER_PAIRS_PER_SECOND_FAST = 70000000.0 |
---|
167 | |
---|
168 | |
---|
169 | galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> |
---|
170 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
---|
171 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> |
---|
172 | <head> |
---|
173 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
---|
174 | <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> |
---|
175 | <title></title> |
---|
176 | <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> |
---|
177 | </head> |
---|
178 | <body> |
---|
179 | <div class="document"> |
---|
180 | """ |
---|
181 | |
---|
182 | |
---|
183 | SVG_HEADER = '''<?xml version="1.0" standalone="no"?> |
---|
184 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd"> |
---|
185 | |
---|
186 | <svg width="1280" height="800" |
---|
187 | xmlns="http://www.w3.org/2000/svg" version="1.2" |
---|
188 | xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()"> |
---|
189 | |
---|
190 | <script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/> |
---|
191 | <script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/> |
---|
192 | <script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/> |
---|
193 | <script type="text/ecmascript"> |
---|
194 | <![CDATA[ |
---|
195 | var checkBoxes = new Array(); |
---|
196 | var radioGroupBandwidth; |
---|
197 | var colours = ['%s','%s','%s','%s','%s','%s','%s']; |
---|
198 | function init() { |
---|
199 | var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12}; |
---|
200 | var dist = 12; |
---|
201 | var yOffset = 4; |
---|
202 | |
---|
203 | //A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn |
---|
204 | checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer); |
---|
205 | checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer); |
---|
206 | checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer); |
---|
207 | checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer); |
---|
208 | checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer); |
---|
209 | checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer); |
---|
210 | checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer); |
---|
211 | |
---|
212 | } |
---|
213 | |
---|
214 | function hideShowLayer(id, status, label) { |
---|
215 | var vis = "hidden"; |
---|
216 | if (status) { |
---|
217 | vis = "visible"; |
---|
218 | } |
---|
219 | document.getElementById(id).setAttributeNS(null, 'visibility', vis); |
---|
220 | } |
---|
221 | |
---|
222 | function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) { |
---|
223 | var x = parseInt(evt.pageX)-250; |
---|
224 | var y = parseInt(evt.pageY)-110; |
---|
225 | switch(rel) { |
---|
226 | case 0: |
---|
227 | fill = colours[rel]; |
---|
228 | relt = "dupe"; |
---|
229 | break; |
---|
230 | case 1: |
---|
231 | fill = colours[rel]; |
---|
232 | relt = "parentchild"; |
---|
233 | break; |
---|
234 | case 2: |
---|
235 | fill = colours[rel]; |
---|
236 | relt = "sibpairs"; |
---|
237 | break; |
---|
238 | case 3: |
---|
239 | fill = colours[rel]; |
---|
240 | relt = "halfsibs"; |
---|
241 | break; |
---|
242 | case 4: |
---|
243 | fill = colours[rel]; |
---|
244 | relt = "parents"; |
---|
245 | break; |
---|
246 | case 5: |
---|
247 | fill = colours[rel]; |
---|
248 | relt = "unrelated"; |
---|
249 | break; |
---|
250 | case 6: |
---|
251 | fill = colours[rel]; |
---|
252 | relt = "unknown"; |
---|
253 | break; |
---|
254 | default: |
---|
255 | fill = "cyan"; |
---|
256 | relt = "ERROR_CODE: "+rel; |
---|
257 | } |
---|
258 | |
---|
259 | document.getElementById("btRel").textContent = "GROUP: "+relt; |
---|
260 | document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm; |
---|
261 | document.getElementById("btSdev").textContent = "sdev="+dm+" +/- "+dd; |
---|
262 | document.getElementById("btPair").textContent = "npairs="+n; |
---|
263 | document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")"; |
---|
264 | document.getElementById("btHead").setAttribute('fill', fill); |
---|
265 | |
---|
266 | var tt = document.getElementById("btTip"); |
---|
267 | tt.setAttribute("transform", "translate("+x+","+y+")"); |
---|
268 | tt.setAttribute('visibility', 'visible'); |
---|
269 | } |
---|
270 | |
---|
271 | function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) { |
---|
272 | var x = parseInt(evt.pageX)-150; |
---|
273 | var y = parseInt(evt.pageY)-180; |
---|
274 | |
---|
275 | switch(rel) { |
---|
276 | case 0: |
---|
277 | fill = colours[rel]; |
---|
278 | relt = "dupe"; |
---|
279 | break; |
---|
280 | case 1: |
---|
281 | fill = colours[rel]; |
---|
282 | relt = "parentchild"; |
---|
283 | break; |
---|
284 | case 2: |
---|
285 | fill = colours[rel]; |
---|
286 | relt = "sibpairs"; |
---|
287 | break; |
---|
288 | case 3: |
---|
289 | fill = colours[rel]; |
---|
290 | relt = "halfsibs"; |
---|
291 | break; |
---|
292 | case 4: |
---|
293 | fill = colours[rel]; |
---|
294 | relt = "parents"; |
---|
295 | break; |
---|
296 | case 5: |
---|
297 | fill = colours[rel]; |
---|
298 | relt = "unrelated"; |
---|
299 | break; |
---|
300 | case 6: |
---|
301 | fill = colours[rel]; |
---|
302 | relt = "unknown"; |
---|
303 | break; |
---|
304 | default: |
---|
305 | fill = "cyan"; |
---|
306 | relt = "ERROR_CODE: "+rel; |
---|
307 | } |
---|
308 | |
---|
309 | document.getElementById("otRel").textContent = "PAIR: "+relt; |
---|
310 | document.getElementById("otS1").textContent = "s1="+s1; |
---|
311 | document.getElementById("otS2").textContent = "s2="+s2; |
---|
312 | document.getElementById("otMean").textContent = "mean="+mean; |
---|
313 | document.getElementById("otSdev").textContent = "sdev="+sdev; |
---|
314 | document.getElementById("otGeno").textContent = "ngenos="+ngeno; |
---|
315 | document.getElementById("otRmean").textContent = "relmean="+rmean; |
---|
316 | document.getElementById("otRsdev").textContent = "relsdev="+rsdev; |
---|
317 | document.getElementById("otHead").setAttribute('fill', fill); |
---|
318 | |
---|
319 | var tt = document.getElementById("otTip"); |
---|
320 | tt.setAttribute("transform", "translate("+x+","+y+")"); |
---|
321 | tt.setAttribute('visibility', 'visible'); |
---|
322 | } |
---|
323 | |
---|
324 | function hideBTT(evt) { |
---|
325 | document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden'); |
---|
326 | } |
---|
327 | |
---|
328 | function hideOTT(evt) { |
---|
329 | document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden'); |
---|
330 | } |
---|
331 | |
---|
332 | ]]> |
---|
333 | </script> |
---|
334 | <defs> |
---|
335 | <!-- symbols for check boxes --> |
---|
336 | <symbol id="cbRect" overflow="visible"> |
---|
337 | <rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/> |
---|
338 | </symbol> |
---|
339 | <symbol id="cbCross" overflow="visible"> |
---|
340 | <g pointer-events="none" stroke="black" stroke-width="1"> |
---|
341 | <line x1="-3" y1="-3" x2="3" y2="3"/> |
---|
342 | <line x1="3" y1="-3" x2="-3" y2="3"/> |
---|
343 | </g> |
---|
344 | </symbol> |
---|
345 | </defs> |
---|
346 | |
---|
347 | <desc>Developer Works Dynamic Scatter Graph Scaling Example</desc> |
---|
348 | |
---|
349 | <!-- Now Draw the main X and Y axis --> |
---|
350 | <g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges"> |
---|
351 | <!-- X Axis top and bottom --> |
---|
352 | <path d="M 100 100 L 1250 100 Z"/> |
---|
353 | <path d="M 100 700 L 1250 700 Z"/> |
---|
354 | |
---|
355 | <!-- Y Axis left and right --> |
---|
356 | <path d="M 100 100 L 100 700 Z"/> |
---|
357 | <path d="M 1250 100 L 1250 700 Z"/> |
---|
358 | </g> |
---|
359 | |
---|
360 | <g transform="translate(100,100)"> |
---|
361 | |
---|
362 | <!-- Grid Lines --> |
---|
363 | <g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges"> |
---|
364 | |
---|
365 | <!-- Vertical grid lines --> |
---|
366 | <line x1="125" y1="0" x2="115" y2="600" /> |
---|
367 | <line x1="230" y1="0" x2="230" y2="600" /> |
---|
368 | <line x1="345" y1="0" x2="345" y2="600" /> |
---|
369 | <line x1="460" y1="0" x2="460" y2="600" /> |
---|
370 | <line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" /> |
---|
371 | <line x1="690" y1="0" x2="690" y2="600" /> |
---|
372 | <line x1="805" y1="0" x2="805" y2="600" /> |
---|
373 | <line x1="920" y1="0" x2="920" y2="600" /> |
---|
374 | <line x1="1035" y1="0" x2="1035" y2="600" /> |
---|
375 | |
---|
376 | <!-- Horizontal grid lines --> |
---|
377 | <line x1="0" y1="60" x2="1150" y2="60" /> |
---|
378 | <line x1="0" y1="120" x2="1150" y2="120" /> |
---|
379 | <line x1="0" y1="180" x2="1150" y2="180" /> |
---|
380 | <line x1="0" y1="240" x2="1150" y2="240" /> |
---|
381 | <line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" /> |
---|
382 | <line x1="0" y1="360" x2="1150" y2="360" /> |
---|
383 | <line x1="0" y1="420" x2="1150" y2="420" /> |
---|
384 | <line x1="0" y1="480" x2="1150" y2="480" /> |
---|
385 | <line x1="0" y1="540" x2="1150" y2="540" /> |
---|
386 | </g> |
---|
387 | |
---|
388 | <!-- Legend --> |
---|
389 | <g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)"> |
---|
390 | <rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" /> |
---|
391 | <text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text> |
---|
392 | <rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> |
---|
393 | <rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> |
---|
394 | <rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> |
---|
395 | <rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> |
---|
396 | <rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> |
---|
397 | <rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> |
---|
398 | <rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> |
---|
399 | <text x="15" y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text> |
---|
400 | <circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/> |
---|
401 | <text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text> |
---|
402 | <circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/> |
---|
403 | <text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text> |
---|
404 | <circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/> |
---|
405 | <g id="checkboxes"> |
---|
406 | </g> |
---|
407 | </g> |
---|
408 | |
---|
409 | |
---|
410 | <g style='fill:black; stroke:none' font-size="17" font-family="Arial"> |
---|
411 | <!-- X Axis Labels --> |
---|
412 | <text x="480" y="660">Mean Alleles Shared</text> |
---|
413 | <text x="0" y="630" >1.0</text> |
---|
414 | <text x="277" y="630" >1.25</text> |
---|
415 | <text x="564" y="630" >1.5</text> |
---|
416 | <text x="842" y="630" >1.75</text> |
---|
417 | <text x="1140" y="630" >2.0</text> |
---|
418 | </g> |
---|
419 | |
---|
420 | <g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial"> |
---|
421 | <!-- Y Axis Labels --> |
---|
422 | <text x="-350" y="-40">SD Alleles Shared</text> |
---|
423 | <text x="-20" y="-10" >1.0</text> |
---|
424 | <text x="-165" y="-10" >0.75</text> |
---|
425 | <text x="-310" y="-10" >0.5</text> |
---|
426 | <text x="-455" y="-10" >0.25</text> |
---|
427 | <text x="-600" y="-10" >0.0</text> |
---|
428 | </g> |
---|
429 | |
---|
430 | <!-- Plot Title --> |
---|
431 | <g style="fill:black; stroke:none" font-size="18" font-family="Arial"> |
---|
432 | <text x="425" y="-30">%s</text> |
---|
433 | </g> |
---|
434 | |
---|
435 | <!-- One group/layer of points for each relationship type --> |
---|
436 | ''' |
---|
437 | |
---|
438 | SVG_FOOTER = ''' |
---|
439 | <!-- End of Data --> |
---|
440 | </g> |
---|
441 | <g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial"> |
---|
442 | <rect width="250" height="110" style="fill:silver" rx="2" ry="2"/> |
---|
443 | <rect id="btHead" width="250" height="20" rx="2" ry="2" /> |
---|
444 | <text id="btRel" y="14" x="85">unrelated</text> |
---|
445 | <text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text> |
---|
446 | <text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text> |
---|
447 | <text id="btPair" y="80" x="4">npairs=1152</text> |
---|
448 | <text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text> |
---|
449 | </g> |
---|
450 | |
---|
451 | <g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial"> |
---|
452 | <rect width="150" height="180" style="fill:silver" rx="2" ry="2"/> |
---|
453 | <rect id="otHead" width="150" height="20" rx="2" ry="2" /> |
---|
454 | <text id="otRel" y="14" x="40">sibpairs</text> |
---|
455 | <text id="otS1" y="40" x="4">s1=fid1,iid1</text> |
---|
456 | <text id="otS2" y="60" x="4">s2=fid2,iid2</text> |
---|
457 | <text id="otMean" y="80" x="4">mean=1.82</text> |
---|
458 | <text id="otSdev" y="100" x="4">sdev=0.7</text> |
---|
459 | <text id="otGeno" y="120" x="4">ngeno=4487</text> |
---|
460 | <text id="otRmean" y="140" x="4">relmean=1.85</text> |
---|
461 | <text id="otRsdev" y="160" x="4">relsdev=0.65</text> |
---|
462 | </g> |
---|
463 | </svg> |
---|
464 | ''' |
---|
465 | |
---|
466 | |
---|
467 | DEFAULT_MAX_SAMPLE_SIZE = 5000 |
---|
468 | |
---|
469 | REF_COUNT_HOM1 = 3 |
---|
470 | REF_COUNT_HET = 2 |
---|
471 | REF_COUNT_HOM2 = 1 |
---|
472 | MISSING = 0 |
---|
473 | |
---|
474 | MARKER_PAIRS_PER_SECOND_SLOW = 15000000 |
---|
475 | MARKER_PAIRS_PER_SECOND_FAST = 70000000 |
---|
476 | |
---|
477 | POLYGONS = { |
---|
478 | REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)), |
---|
479 | REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)), |
---|
480 | REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)), |
---|
481 | REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)), |
---|
482 | REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)), |
---|
483 | } |
---|
484 | |
---|
485 | def distance(point1, point2): |
---|
486 | """ Calculate the distance between two points |
---|
487 | """ |
---|
488 | (x1,y1) = [float(d) for d in point1] |
---|
489 | (x2,y2) = [float(d) for d in point2] |
---|
490 | dx = abs(x1 - x2) |
---|
491 | dy = abs(y1 - y2) |
---|
492 | return math.sqrt(dx**2 + dy**2) |
---|
493 | |
---|
494 | def point_inside_polygon(x, y, poly): |
---|
495 | """ Determine if a point (x,y) is inside a given polygon or not |
---|
496 | poly is a list of (x,y) pairs. |
---|
497 | |
---|
498 | Taken from: http://www.ariel.com.au/a/python-point-int-poly.html |
---|
499 | """ |
---|
500 | |
---|
501 | n = len(poly) |
---|
502 | inside = False |
---|
503 | |
---|
504 | p1x,p1y = poly[0] |
---|
505 | for i in range(n+1): |
---|
506 | p2x,p2y = poly[i % n] |
---|
507 | if y > min(p1y,p2y): |
---|
508 | if y <= max(p1y,p2y): |
---|
509 | if x <= max(p1x,p2x): |
---|
510 | if p1y != p2y: |
---|
511 | xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x |
---|
512 | if p1x == p2x or x <= xinters: |
---|
513 | inside = not inside |
---|
514 | p1x,p1y = p2x,p2y |
---|
515 | return inside |
---|
516 | |
---|
517 | def readMap(pedfile): |
---|
518 | """ |
---|
519 | """ |
---|
520 | mapfile = pedfile.replace('.ped', '.map') |
---|
521 | marker_list = [] |
---|
522 | if os.path.exists(mapfile): |
---|
523 | print 'readMap: %s' % (mapfile) |
---|
524 | fh = file(mapfile, 'r') |
---|
525 | for line in fh: |
---|
526 | marker_list.append(line.strip().split()) |
---|
527 | fh.close() |
---|
528 | print 'readMap: %s markers' % (len(marker_list)) |
---|
529 | return marker_list |
---|
530 | |
---|
531 | def calcMeanSD(useme): |
---|
532 | """ |
---|
533 | A numerically stable algorithm is given below. It also computes the mean. |
---|
534 | This algorithm is due to Knuth,[1] who cites Welford.[2] |
---|
535 | n = 0 |
---|
536 | mean = 0 |
---|
537 | M2 = 0 |
---|
538 | |
---|
539 | foreach x in data: |
---|
540 | n = n + 1 |
---|
541 | delta = x - mean |
---|
542 | mean = mean + delta/n |
---|
543 | M2 = M2 + delta*(x - mean) // This expression uses the new value of mean |
---|
544 | end for |
---|
545 | |
---|
546 | variance_n = M2/n |
---|
547 | variance = M2/(n - 1) |
---|
548 | """ |
---|
549 | mean = 0.0 |
---|
550 | M2 = 0.0 |
---|
551 | sd = 0.0 |
---|
552 | n = len(useme) |
---|
553 | if n > 1: |
---|
554 | for i,x in enumerate(useme): |
---|
555 | delta = x - mean |
---|
556 | mean = mean + delta/(i+1) # knuth uses n+=1 at start |
---|
557 | M2 = M2 + delta*(x - mean) # This expression uses the new value of mean |
---|
558 | variance = M2/(n-1) # assume is sample so lose 1 DOF |
---|
559 | sd = pow(variance,0.5) |
---|
560 | return mean,sd |
---|
561 | |
---|
562 | |
---|
563 | def doIBSpy(ped=None,basename='',outdir=None,logf=None, |
---|
564 | nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0): |
---|
565 | #def doIBS(pedName, title, nrsSamples=None, pdftoo=False): |
---|
566 | """ started with snpmatrix but GRR uses actual IBS counts and sd's |
---|
567 | """ |
---|
568 | repOut = [] # text strings to add to the html display |
---|
569 | refallele = {} |
---|
570 | tblf = '%s_table.xls' % (title) |
---|
571 | tbl = file(os.path.join(outdir,tblf), 'w') |
---|
572 | tbl.write(TABLE_HEADER) |
---|
573 | svgf = '%s.svg' % (title) |
---|
574 | svg = file(os.path.join(outdir,svgf), 'w') |
---|
575 | |
---|
576 | nMarkers = len(ped._markers) |
---|
577 | if nMarkers < 5: |
---|
578 | print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME) |
---|
579 | sys.exit(1) |
---|
580 | nSubjects = len(ped._subjects) |
---|
581 | nrsSamples = min(nMarkers, nrsSamples) |
---|
582 | if opts and opts.use_mito: |
---|
583 | markers = range(nMarkers) |
---|
584 | nrsSamples = min(len(markers), nrsSamples) |
---|
585 | sampleIndexes = sorted(random.sample(markers, nrsSamples)) |
---|
586 | else: |
---|
587 | autosomals = ped.autosomal_indices() |
---|
588 | nrsSamples = min(len(autosomals), nrsSamples) |
---|
589 | sampleIndexes = sorted(random.sample(autosomals, nrsSamples)) |
---|
590 | |
---|
591 | print '' |
---|
592 | print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers) |
---|
593 | npairs = (nSubjects*(nSubjects-1))/2 # total rows in table |
---|
594 | newfiles=[svgf,tblf] |
---|
595 | explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs] |
---|
596 | # these go with the output file links in the html file |
---|
597 | s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples) |
---|
598 | logf.write(s) |
---|
599 | minUsegenos = nrsSamples/2 # must have half? |
---|
600 | nGenotypes = nSubjects*nrsSamples |
---|
601 | stime = time.time() |
---|
602 | emptyRows = set() |
---|
603 | genos = numpy.zeros((nSubjects, nrsSamples), dtype=int) |
---|
604 | for s in xrange(nSubjects): |
---|
605 | nValid = 0 |
---|
606 | #getGenotypesByIndices(self, s, mlist, format) |
---|
607 | genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref') |
---|
608 | nValid = sum([1 for g in genos[s] if g]) |
---|
609 | if not nValid: |
---|
610 | emptyRows.add(s) |
---|
611 | sub = ped.getSubject(s) |
---|
612 | print 'All missing for row %d (%s)' % (s, sub) |
---|
613 | logf.write('All missing for row %d (%s)\n' % (s, sub)) |
---|
614 | rtime = time.time() - stime |
---|
615 | if verbose: |
---|
616 | print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime) |
---|
617 | |
---|
618 | |
---|
619 | ### Now the expensive part. For each pair of subjects, we get the mean number |
---|
620 | ### and standard deviation of shared alleles over all of the markers where both |
---|
621 | ### subjects have a known genotype. Identical subjects should have mean shared |
---|
622 | ### alleles very close to 2.0 with a standard deviation very close to 0.0. |
---|
623 | tot = nSubjects*(nSubjects-1)/2 |
---|
624 | nprog = tot/10 |
---|
625 | nMarkerpairs = tot * nrsSamples |
---|
626 | estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW |
---|
627 | estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST |
---|
628 | |
---|
629 | pairs = [] |
---|
630 | pair_data = {} |
---|
631 | means = [] ## Mean IBS for each pair |
---|
632 | ngenoL = [] ## Count of comparable genotypes for each pair |
---|
633 | sdevs = [] ## Standard dev for each pair |
---|
634 | rels = [] ## A relationship code for each pair |
---|
635 | zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup |
---|
636 | zstds = [0.0 for x in xrange(tot)] ## zstd score for each pair for the relgrp |
---|
637 | skip = set() |
---|
638 | ndone = 0 ## How many have been done so far |
---|
639 | |
---|
640 | logf.write('Calculating %d pairs...\n' % (tot)) |
---|
641 | logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow)) |
---|
642 | |
---|
643 | t1sum = 0 |
---|
644 | t2sum = 0 |
---|
645 | t3sum = 0 |
---|
646 | now = time.time() |
---|
647 | scache = {} |
---|
648 | _founder_cache = {} |
---|
649 | C_CODE = """ |
---|
650 | #include "math.h" |
---|
651 | int i; |
---|
652 | int sumibs = 0; |
---|
653 | int ssqibs = 0; |
---|
654 | int ngeno = 0; |
---|
655 | float mean = 0; |
---|
656 | float M2 = 0; |
---|
657 | float delta = 0; |
---|
658 | float sdev=0; |
---|
659 | float variance=0; |
---|
660 | for (i=0; i<nrsSamples; i++) { |
---|
661 | int a1 = g1[i]; |
---|
662 | int a2 = g2[i]; |
---|
663 | if (a1 != 0 && a2 != 0) { |
---|
664 | ngeno += 1; |
---|
665 | int shared = 2-abs(a1-a2); |
---|
666 | delta = shared - mean; |
---|
667 | mean = mean + delta/ngeno; |
---|
668 | M2 += delta*(shared-mean); |
---|
669 | // yes that second time, the updated mean is used see calcmeansd above; |
---|
670 | //printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared); |
---|
671 | } |
---|
672 | } |
---|
673 | if (ngeno > 1) { |
---|
674 | variance = M2/(ngeno-1); |
---|
675 | sdev = sqrt(variance); |
---|
676 | //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev); |
---|
677 | } |
---|
678 | //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev); |
---|
679 | result[0] = ngeno; |
---|
680 | result[1] = mean; |
---|
681 | result[2] = sdev; |
---|
682 | return_val = ngeno; |
---|
683 | """ |
---|
684 | started = time.time() |
---|
685 | for s1 in xrange(nSubjects): |
---|
686 | if s1 in emptyRows: |
---|
687 | continue |
---|
688 | (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1)) |
---|
689 | |
---|
690 | isFounder1 = _founder_cache.setdefault(s1, (did1==mid1)) |
---|
691 | g1 = genos[s1] |
---|
692 | |
---|
693 | for s2 in xrange(s1+1, nSubjects): |
---|
694 | if s2 in emptyRows: |
---|
695 | continue |
---|
696 | t1s = time.time() |
---|
697 | |
---|
698 | (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2)) |
---|
699 | |
---|
700 | g2 = genos[s2] |
---|
701 | isFounder2 = _founder_cache.setdefault(s2, (did2==mid2)) |
---|
702 | |
---|
703 | # Determine the relationship for this pair |
---|
704 | relcode = REL_UNKNOWN |
---|
705 | if (fid2 == fid1): |
---|
706 | if iid1 == iid2: |
---|
707 | relcode = REL_DUPE |
---|
708 | elif (did2 == did1) and (mid2 == mid1) and did1 != mid1: |
---|
709 | relcode = REL_SIBS |
---|
710 | elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1): |
---|
711 | relcode = REL_PARENTCHILD |
---|
712 | elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)): |
---|
713 | relcode = REL_HALFSIBS |
---|
714 | else: |
---|
715 | # People in the same family should be marked as some other |
---|
716 | # form of related. In general, these people will have a |
---|
717 | # pretty random spread of similarity. This distinction is |
---|
718 | # probably not very useful most of the time |
---|
719 | relcode = REL_RELATED |
---|
720 | else: |
---|
721 | ### Different families |
---|
722 | relcode = REL_UNRELATED |
---|
723 | |
---|
724 | t1e = time.time() |
---|
725 | t1sum += t1e-t1s |
---|
726 | |
---|
727 | |
---|
728 | ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count |
---|
729 | ### the number of contributing genotypes. These values are not actually |
---|
730 | ### calculated here, but instead are looked up in a table for speed. |
---|
731 | ### FIXME: This is still too slow ... |
---|
732 | result = [0.0, 0.0, 0.0] |
---|
733 | ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result']) |
---|
734 | if ngeno >= minUsegenos: |
---|
735 | _, mean, sdev = result |
---|
736 | means.append(mean) |
---|
737 | sdevs.append(sdev) |
---|
738 | ngenoL.append(ngeno) |
---|
739 | pairs.append((s1, s2)) |
---|
740 | rels.append(relcode) |
---|
741 | else: |
---|
742 | skip.add(ndone) # signal no comparable genotypes for this pair |
---|
743 | ndone += 1 |
---|
744 | t2e = time.time() |
---|
745 | t2sum += t2e-t1e |
---|
746 | t3e = time.time() |
---|
747 | t3sum += t3e-t2e |
---|
748 | |
---|
749 | logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now), |
---|
750 | '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip), |
---|
751 | float(len(skip))/float(tot)*100)] |
---|
752 | logf.write('%s\n' % '\t'.join(logme)) |
---|
753 | ### Calculate mean and standard deviation of scores on a per relationship |
---|
754 | ### type basis, allowing us to flag outliers for each particular relationship |
---|
755 | ### type |
---|
756 | relstats = {} |
---|
757 | relCounts = {} |
---|
758 | outlierFiles = {} |
---|
759 | for relCode, relInfo in REL_LOOKUP.items(): |
---|
760 | relName, relColor, relStyle = relInfo |
---|
761 | useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode] |
---|
762 | relCounts[relCode] = len(useme) |
---|
763 | mm = scipy.mean(useme) |
---|
764 | ms = scipy.std(useme) |
---|
765 | useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode] |
---|
766 | sm = scipy.mean(useme) |
---|
767 | ss = scipy.std(useme) |
---|
768 | relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)} |
---|
769 | s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \ |
---|
770 | (relName,relCounts[relCode], mm, ms, sm, ss) |
---|
771 | logf.write(s) |
---|
772 | |
---|
773 | ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|) |
---|
774 | ### within each group, for each pair, z=(groupmean-pairmean)/groupsd |
---|
775 | available = len(means) |
---|
776 | logf.write('%d pairs are available of %d\n' % (available, tot)) |
---|
777 | ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n' |
---|
778 | ### logf.write(s) |
---|
779 | pairnum = 0 |
---|
780 | offset = 0 |
---|
781 | nOutliers = 0 |
---|
782 | cexs = [] |
---|
783 | outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)]) |
---|
784 | zsdmax = 0 |
---|
785 | for s1 in range(nSubjects): |
---|
786 | if s1 in emptyRows: |
---|
787 | continue |
---|
788 | (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1] |
---|
789 | for s2 in range(s1+1, nSubjects): |
---|
790 | if s2 in emptyRows: |
---|
791 | continue |
---|
792 | if pairnum not in skip: |
---|
793 | ### Get group stats for this relationship |
---|
794 | (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2] |
---|
795 | try: |
---|
796 | r = rels[offset] |
---|
797 | except IndexError: |
---|
798 | logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d', offset, available, pairnum, len(rels)) |
---|
799 | notfound = ('?',('?','0','0')) |
---|
800 | relInfo = REL_LOOKUP.get(r,notfound) |
---|
801 | relName, relColor, relStyle = relInfo |
---|
802 | rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared |
---|
803 | rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared |
---|
804 | |
---|
805 | try: |
---|
806 | zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units |
---|
807 | except: |
---|
808 | zsd = 1 |
---|
809 | if abs(zsd) > zsdmax: |
---|
810 | zsdmax = zsd # keep for sort scaling |
---|
811 | try: |
---|
812 | zmean = (means[offset] - rmm)/rmd # distance from group mean |
---|
813 | except: |
---|
814 | zmean = 1 |
---|
815 | zmeans[offset] = zmean |
---|
816 | zstds[offset] = zsd |
---|
817 | pid=(s1,s2) |
---|
818 | zrad = max(zsd,zmean) |
---|
819 | if zrad < 4: |
---|
820 | zrad = 2 |
---|
821 | elif 4 < zrad < 15: |
---|
822 | zrad = 3 # to 9 |
---|
823 | else: # > 15 6=24+ |
---|
824 | zrad=zrad/4 |
---|
825 | zrad = min(zrad,6) # scale limit |
---|
826 | zrad = max(2,max(zsd,zmean)) # as > 2, z grows |
---|
827 | pair_data[pid] = (zmean,zsd,r,zrad) |
---|
828 | if max(zsd,zmean) > Zcutoff: # is potentially interesting |
---|
829 | mean = means[offset] |
---|
830 | sdev = sdevs[offset] |
---|
831 | outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2)) |
---|
832 | nOutliers += 1 |
---|
833 | tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \ |
---|
834 | (fid1, iid1, fid2, iid2, mean, sdev, zmean,zsd, ngeno, relName, did1,mid1,did2,mid2)) |
---|
835 | offset += 1 |
---|
836 | pairnum += 1 |
---|
837 | logf.write( 'Outliers: %s\n' % (nOutliers)) |
---|
838 | |
---|
839 | ### Write outlier files for each relationship type |
---|
840 | repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>') |
---|
841 | lzsd = round(numpy.log10(zsdmax)) + 1 |
---|
842 | scalefactor = 10**lzsd |
---|
843 | for relCode, relInfo in REL_LOOKUP.items(): |
---|
844 | relName, _, _ = relInfo |
---|
845 | outliers = outlierRecords[relCode] |
---|
846 | if not outliers: |
---|
847 | continue |
---|
848 | outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate |
---|
849 | outliers.sort() |
---|
850 | outliers.reverse() # largest deviation first |
---|
851 | outliers = [x[1] for x in outliers] # undecorate |
---|
852 | nrows = len(outliers) |
---|
853 | truncated = 0 |
---|
854 | if nrows > MAX_SHOW_ROWS: |
---|
855 | s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \ |
---|
856 | (relName,MAX_SHOW_ROWS,nrows,title) |
---|
857 | truncated = nrows - MAX_SHOW_ROWS |
---|
858 | else: |
---|
859 | s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title) |
---|
860 | repOut.append(s) |
---|
861 | fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName) |
---|
862 | fhpath = os.path.join(outdir,fhname) |
---|
863 | fh = open(fhpath, 'w') |
---|
864 | newfiles.append(fhname) |
---|
865 | explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff)) |
---|
866 | fh.write(OUTLIERS_HEADER) |
---|
867 | s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list]) |
---|
868 | repOut.append('<tr align="center">%s</tr>' % s) |
---|
869 | for n,rec in enumerate(outliers): |
---|
870 | #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec |
---|
871 | s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec) |
---|
872 | fh.write('%s%s\n' % (s,relName)) |
---|
873 | # (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2)) |
---|
874 | s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td> |
---|
875 | <td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec) |
---|
876 | s = '%s<td>%s</td>' % (s,relName) |
---|
877 | if n < MAX_SHOW_ROWS: |
---|
878 | repOut.append('<tr align="center">%s</tr>' % s) |
---|
879 | if truncated > 0: |
---|
880 | repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated, |
---|
881 | nrows)) |
---|
882 | fh.close() |
---|
883 | repOut.append('</table><p>') |
---|
884 | |
---|
885 | ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format |
---|
886 | ### if requested |
---|
887 | logf.write('Plotting ...') |
---|
888 | pointColors = [REL_COLORS[rel] for rel in rels] |
---|
889 | pointStyles = [REL_POINTS[rel] for rel in rels] |
---|
890 | |
---|
891 | mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples) |
---|
892 | svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4], |
---|
893 | SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1], |
---|
894 | SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4], |
---|
895 | SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle)) |
---|
896 | #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white') |
---|
897 | #rpy.r.par(mai=(1,1,1,0.5)) |
---|
898 | #rpy.r('par(xaxs="i",yaxs="i")') |
---|
899 | #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2)) |
---|
900 | #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE) |
---|
901 | #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted') |
---|
902 | #rpy.r.dev_off() |
---|
903 | |
---|
904 | ### We will now go through each relationship type to partition plot points |
---|
905 | ### into "bulk" and "outlier" groups. Bulk points will represent common |
---|
906 | ### mean/sdev pairs and will cover the majority of the points in the plot -- |
---|
907 | ### they will use generic tooltip informtion about all of the pairs |
---|
908 | ### represented by that point. "Outlier" points will be uncommon pairs, |
---|
909 | ### with very specific information in their tooltips. It would be nice to |
---|
910 | ### keep hte total number of plotted points in the SVG representation to |
---|
911 | ### ~10000 (certainly less than 100000?) |
---|
912 | pointMap = {} |
---|
913 | orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))] |
---|
914 | # do we really want this? I want out of zone points last and big |
---|
915 | for relCode in orderedRels: |
---|
916 | svgColor = SVG_COLORS[relCode] |
---|
917 | relName, relColor, relStyle = REL_LOOKUP[relCode] |
---|
918 | svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor)) |
---|
919 | pMap = pointMap.setdefault(relCode, {}) |
---|
920 | nPoints = 0 |
---|
921 | rpairs=[] |
---|
922 | rgenos=[] |
---|
923 | rmeans=[] |
---|
924 | rsdevs=[] |
---|
925 | rz = [] |
---|
926 | for x,rel in enumerate(rels): # all pairs |
---|
927 | if rel == relCode: |
---|
928 | s1,s2 = pairs[x] |
---|
929 | pid=(s1,s2) |
---|
930 | zmean,zsd,r,zrad = pair_data[pid][:4] |
---|
931 | rpairs.append(pairs[x]) |
---|
932 | rgenos.append(ngenoL[x]) |
---|
933 | rmeans.append(means[x]) |
---|
934 | rsdevs.append(sdevs[x]) |
---|
935 | rz.append(zrad) |
---|
936 | ### Now add the svg point group for this relationship to the svg file |
---|
937 | for x in range(len(rmeans)): |
---|
938 | svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2 |
---|
939 | svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1 |
---|
940 | s1, s2 = rpairs[x] |
---|
941 | (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1] |
---|
942 | (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2] |
---|
943 | ngenos = rgenos[x] |
---|
944 | nPoints += 1 |
---|
945 | point = pMap.setdefault((svgX, svgY), []) |
---|
946 | point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x])) |
---|
947 | for (svgX, svgY) in pMap: |
---|
948 | points = pMap[(svgX, svgY)] |
---|
949 | svgX = int(svgX) |
---|
950 | svgY = int(svgY) |
---|
951 | if len(points) > 1: |
---|
952 | mmean,dmean = calcMeanSD([p[0] for p in points]) |
---|
953 | msdev,dsdev = calcMeanSD([p[1] for p in points]) |
---|
954 | mgeno,dgeno = calcMeanSD([p[-1] for p in points]) |
---|
955 | mingeno = min([p[-1] for p in points]) |
---|
956 | maxgeno = max([p[-1] for p in points]) |
---|
957 | svg.write("""<circle cx="%d" cy="%d" r="2" |
---|
958 | onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)" |
---|
959 | onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno)) |
---|
960 | else: |
---|
961 | mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12] |
---|
962 | rmean = float(relstats[relCode]['mean'][0]) |
---|
963 | rsdev = float(relstats[relCode]['sd'][0]) |
---|
964 | if zrad < 4: |
---|
965 | zrad = 2 |
---|
966 | elif 4 < zrad < 9: |
---|
967 | zrad = 3 # to 9 |
---|
968 | else: # > 9 5=15+ |
---|
969 | zrad=zrad/3 |
---|
970 | zrad = min(zrad,5) # scale limit |
---|
971 | if zrad <= 3: |
---|
972 | svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) |
---|
973 | else: # highlight pairs a long way from expectation by outlining circle in red |
---|
974 | svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;" |
---|
975 | onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" |
---|
976 | onmouseout="hideOTT(evt)" />\n""" % \ |
---|
977 | (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) |
---|
978 | svg.write('</g>\n') |
---|
979 | |
---|
980 | ### Create a pdf as well if indicated on the command line |
---|
981 | ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf! |
---|
982 | ## if pdftoo: |
---|
983 | ## pdfname = '%s.pdf' % (title) |
---|
984 | ## rpy.r.pdf(pdfname, 6, 6) |
---|
985 | ## rpy.r.par(mai=(1,1,1,0.5)) |
---|
986 | ## rpy.r('par(xaxs="i",yaxs="i")') |
---|
987 | ## rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2)) |
---|
988 | ## rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE) |
---|
989 | ## rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted') |
---|
990 | ## rpy.r.dev_off() |
---|
991 | |
---|
992 | ### Draw polygons |
---|
993 | if showPolygons: |
---|
994 | svg.write('<g id="polygons" cursor="pointer">\n') |
---|
995 | for rel, poly in POLYGONS.items(): |
---|
996 | points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly]) |
---|
997 | svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel])) |
---|
998 | svg.write('</g>\n') |
---|
999 | |
---|
1000 | |
---|
1001 | svg.write(SVG_FOOTER) |
---|
1002 | svg.close() |
---|
1003 | return newfiles,explanations,repOut |
---|
1004 | |
---|
1005 | def doIBS(n=100): |
---|
1006 | """parse parameters from galaxy |
---|
1007 | expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n' |
---|
1008 | <command interpreter="python"> |
---|
1009 | rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name" |
---|
1010 | '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z' |
---|
1011 | </command> |
---|
1012 | |
---|
1013 | """ |
---|
1014 | u="""<command interpreter="python"> |
---|
1015 | rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name" |
---|
1016 | '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z' |
---|
1017 | </command> |
---|
1018 | """ |
---|
1019 | |
---|
1020 | |
---|
1021 | if len(sys.argv) < 7: |
---|
1022 | print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please' |
---|
1023 | print >> sys.stdout, u |
---|
1024 | sys.exit(1) |
---|
1025 | ts = '%s%s' % (string.punctuation,string.whitespace) |
---|
1026 | ptran = string.maketrans(ts,'_'*len(ts)) |
---|
1027 | inpath = sys.argv[1] |
---|
1028 | ldinpath = os.path.split(inpath)[0] |
---|
1029 | basename = sys.argv[2] |
---|
1030 | outhtml = sys.argv[3] |
---|
1031 | newfilepath = sys.argv[4] |
---|
1032 | title = sys.argv[5].translate(ptran) |
---|
1033 | logfname = 'Log_%s.txt' % title |
---|
1034 | logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo |
---|
1035 | n = int(sys.argv[6]) |
---|
1036 | try: |
---|
1037 | Zcutoff = float(sys.argv[7]) |
---|
1038 | except: |
---|
1039 | Zcutoff = 2.0 |
---|
1040 | try: |
---|
1041 | os.makedirs(newfilepath) |
---|
1042 | except: |
---|
1043 | pass |
---|
1044 | logf = file(logpath,'w') |
---|
1045 | efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path |
---|
1046 | ped = plinkbinJZ.BPed(inpath) |
---|
1047 | ped.parse(quick=True) |
---|
1048 | if ped == None: |
---|
1049 | print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (ldreduced,basename) |
---|
1050 | sys.exit(1) |
---|
1051 | newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath, |
---|
1052 | logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff) |
---|
1053 | logf.close() |
---|
1054 | logfs = file(logpath,'r').readlines() |
---|
1055 | lf = file(outhtml,'w') |
---|
1056 | lf.write(galhtmlprefix % PROGNAME) |
---|
1057 | # this is a mess. todo clean up - should each datatype have it's own directory? Yes |
---|
1058 | # probably. Then titles are universal - but userId libraries are separate. |
---|
1059 | s = '<div>Output from %s run at %s<br>\n' % (PROGNAME,timenow()) |
---|
1060 | lf.write('<h4>%s</h4>\n' % s) |
---|
1061 | fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case |
---|
1062 | s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed)) |
---|
1063 | lf.write(s) |
---|
1064 | # various ways of displaying svg - experiments related to missing svg mimetype on test (!) |
---|
1065 | #s = """<object data="%s" type="image/svg+xml" width="%d" height="%d"> |
---|
1066 | # <embed src="%s" type="image/svg+xml" width="%d" height="%d" /> |
---|
1067 | # </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT) |
---|
1068 | s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT) |
---|
1069 | #s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT) |
---|
1070 | lf.write(s) |
---|
1071 | lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n') |
---|
1072 | for i in range(len(newfiles)): |
---|
1073 | if i == 0: |
---|
1074 | lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (newfiles[i],explanations[i])) |
---|
1075 | else: |
---|
1076 | lf.write('<li><a href="%s">%s</a></li>\n' % (newfiles[i],explanations[i])) |
---|
1077 | flist = os.listdir(newfilepath) |
---|
1078 | for fname in flist: |
---|
1079 | if not fname in newfiles: |
---|
1080 | lf.write('<li><a href="%s">%s</a></li>\n' % (fname,fname)) |
---|
1081 | lf.write('</ol></div>') |
---|
1082 | lf.write('<div>%s</div>' % ('\n'.join(repOut))) # repOut is a list of tables |
---|
1083 | lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname,''.join(logfs))) |
---|
1084 | lf.write('</body></html>\n') |
---|
1085 | lf.close() |
---|
1086 | logf.close() |
---|
1087 | |
---|
1088 | if __name__ == '__main__': |
---|
1089 | doIBS() |
---|