| 1 | """ | 
|---|
| 2 | # july 2009: Need to see outliers so need to draw them last? | 
|---|
| 3 | # could use clustering on the zscores to guess real relationships for unrelateds | 
|---|
| 4 | # but definitely need to draw last | 
|---|
| 5 | # added MAX_SHOW_ROWS to limit the length of the main report page | 
|---|
| 6 | # Changes for Galaxy integration | 
|---|
| 7 | # added more robust knuth method for one pass mean and sd | 
|---|
| 8 | # no difference really - let's use scipy.mean() and scipy.std() instead... | 
|---|
| 9 | # fixed labels and changed to .xls for outlier reports so can open in excel | 
|---|
| 10 | # interesting - with a few hundred subjects, 5k gives good resolution | 
|---|
| 11 | # and 100k gives better but not by much | 
|---|
| 12 | # TODO remove non autosomal markers | 
|---|
| 13 | # TODO it would be best if label had the zmean and zsd as these are what matter for | 
|---|
| 14 | # outliers rather than the group mean/sd | 
|---|
| 15 | # mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots | 
|---|
| 16 | # to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log | 
|---|
| 17 | # so the result should be an HTML file | 
|---|
| 18 |  | 
|---|
| 19 | # rgIBS.py | 
|---|
| 20 | # use a random subset of markers for a quick ibs | 
|---|
| 21 | # to identify sample dups and closely related subjects | 
|---|
| 22 | # try snpMatrix and plink and see which one works best for us? | 
|---|
| 23 | # abecasis grr plots mean*sd for every subject to show clusters | 
|---|
| 24 | # mods june 23 rml to avoid non-autosomal markers | 
|---|
| 25 | # we seem to be distinguishing parent-child by gender - 2 clouds! | 
|---|
| 26 |  | 
|---|
| 27 |  | 
|---|
| 28 | snpMatrix from David Clayton has: | 
|---|
| 29 | ibs.stats function to calculate the identity-by-state stats of a group of samples | 
|---|
| 30 | Description | 
|---|
| 31 | Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics | 
|---|
| 32 | about the relatedness of every pair of samples within. | 
|---|
| 33 |  | 
|---|
| 34 | Usage | 
|---|
| 35 | ibs.stats(x) | 
|---|
| 36 | 8 ibs.stats | 
|---|
| 37 | Arguments | 
|---|
| 38 | x a snp.matrix-class or a X.snp.matrix-class object containing N samples | 
|---|
| 39 | Details | 
|---|
| 40 | No-calls are excluded from consideration here. | 
|---|
| 41 | Value | 
|---|
| 42 | A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated | 
|---|
| 43 | by a comma, and the columns are: | 
|---|
| 44 | Count count of identical calls, exclusing no-calls | 
|---|
| 45 | Fraction fraction of identical calls comparied to actual calls being made in both samples | 
|---|
| 46 | Warning | 
|---|
| 47 | In some applications, it may be preferable to subset a (random) selection of SNPs first - the | 
|---|
| 48 | calculation | 
|---|
| 49 | time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the | 
|---|
| 50 | calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for | 
|---|
| 51 | simple applications such as checking for duplicate or related samples. | 
|---|
| 52 | Note | 
|---|
| 53 | This is mostly written to find mislabelled and/or duplicate samples. | 
|---|
| 54 | Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most | 
|---|
| 55 | purpose it is undesirable to use these SNPs for IBS purposes. | 
|---|
| 56 | TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to | 
|---|
| 57 | subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility | 
|---|
| 58 | """ | 
|---|
| 59 | import sys,os,time,random,string,copy,optparse | 
|---|
| 60 |  | 
|---|
| 61 | try: | 
|---|
| 62 | set | 
|---|
| 63 | except NameError: | 
|---|
| 64 | from Sets import Set as set | 
|---|
| 65 |  | 
|---|
| 66 | from rgutils import timenow,plinke | 
|---|
| 67 |  | 
|---|
| 68 | import plinkbinJZ | 
|---|
| 69 |  | 
|---|
| 70 |  | 
|---|
| 71 | opts = None | 
|---|
| 72 | verbose = False | 
|---|
| 73 |  | 
|---|
| 74 | showPolygons = False | 
|---|
| 75 |  | 
|---|
| 76 | class NullDevice: | 
|---|
| 77 | def write(self, s): | 
|---|
| 78 | pass | 
|---|
| 79 |  | 
|---|
| 80 | tempstderr = sys.stderr # save | 
|---|
| 81 | #sys.stderr = NullDevice() | 
|---|
| 82 | # need to avoid blather about deprecation and other strange stuff from scipy | 
|---|
| 83 | # the current galaxy job runner assumes that | 
|---|
| 84 | # the job is in error if anything appears on sys.stderr | 
|---|
| 85 | # grrrrr. James wants to keep it that way instead of using the | 
|---|
| 86 | # status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy) | 
|---|
| 87 | import numpy | 
|---|
| 88 | import scipy | 
|---|
| 89 | from scipy import weave | 
|---|
| 90 |  | 
|---|
| 91 |  | 
|---|
| 92 | sys.stderr=tempstderr | 
|---|
| 93 |  | 
|---|
| 94 |  | 
|---|
| 95 | PROGNAME = os.path.split(sys.argv[0])[-1] | 
|---|
| 96 | X_AXIS_LABEL = 'Mean Alleles Shared' | 
|---|
| 97 | Y_AXIS_LABEL = 'SD Alleles Shared' | 
|---|
| 98 | LEGEND_ALIGN = 'topleft' | 
|---|
| 99 | LEGEND_TITLE = 'Relationship' | 
|---|
| 100 | DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size | 
|---|
| 101 | DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size | 
|---|
| 102 |  | 
|---|
| 103 | ### Some colors for R/rpy | 
|---|
| 104 | R_BLACK  = 1 | 
|---|
| 105 | R_RED    = 2 | 
|---|
| 106 | R_GREEN  = 3 | 
|---|
| 107 | R_BLUE   = 4 | 
|---|
| 108 | R_CYAN   = 5 | 
|---|
| 109 | R_PURPLE = 6 | 
|---|
| 110 | R_YELLOW = 7 | 
|---|
| 111 | R_GRAY   = 8 | 
|---|
| 112 |  | 
|---|
| 113 | ### ... and some point-styles | 
|---|
| 114 |  | 
|---|
| 115 | ### | 
|---|
| 116 | PLOT_HEIGHT = 600 | 
|---|
| 117 | PLOT_WIDTH = 1150 | 
|---|
| 118 |  | 
|---|
| 119 |  | 
|---|
| 120 | #SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson') | 
|---|
| 121 | #SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray') | 
|---|
| 122 | SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray') | 
|---|
| 123 | # dupe,parentchild,sibpair,halfsib,parents,unrel,unkn | 
|---|
| 124 | #('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray') | 
|---|
| 125 |  | 
|---|
| 126 | OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped'] | 
|---|
| 127 | OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list) | 
|---|
| 128 | TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n' | 
|---|
| 129 |  | 
|---|
| 130 |  | 
|---|
| 131 | ### Relationship codes, text, and lookups/mappings | 
|---|
| 132 | N_RELATIONSHIP_TYPES = 7 | 
|---|
| 133 | REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES) | 
|---|
| 134 | REL_LOOKUP = { | 
|---|
| 135 | REL_DUPE:        ('dupe',        R_BLUE,   1), | 
|---|
| 136 | REL_PARENTCHILD: ('parentchild', R_YELLOW, 1), | 
|---|
| 137 | REL_SIBS:        ('sibpairs',    R_RED,    1), | 
|---|
| 138 | REL_HALFSIBS:    ('halfsibs',    R_GREEN,  1), | 
|---|
| 139 | REL_RELATED:     ('parents',     R_PURPLE, 1), | 
|---|
| 140 | REL_UNRELATED:   ('unrelated',   R_CYAN,   1), | 
|---|
| 141 | REL_UNKNOWN:     ('unknown',     R_GRAY,   1), | 
|---|
| 142 | } | 
|---|
| 143 | OUTLIER_STDEVS = { | 
|---|
| 144 | REL_DUPE:        2, | 
|---|
| 145 | REL_PARENTCHILD: 2, | 
|---|
| 146 | REL_SIBS:        2, | 
|---|
| 147 | REL_HALFSIBS:    2, | 
|---|
| 148 | REL_RELATED:     2, | 
|---|
| 149 | REL_UNRELATED:   3, | 
|---|
| 150 | REL_UNKNOWN:     2, | 
|---|
| 151 | } | 
|---|
| 152 | # note now Z can be passed in | 
|---|
| 153 |  | 
|---|
| 154 | REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)] | 
|---|
| 155 | REL_COLORS = SVG_COLORS | 
|---|
| 156 | REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)] | 
|---|
| 157 |  | 
|---|
| 158 | DEFAULT_MAX_SAMPLE_SIZE = 10000 | 
|---|
| 159 |  | 
|---|
| 160 | REF_COUNT_HOM1 = 3 | 
|---|
| 161 | REF_COUNT_HET  = 2 | 
|---|
| 162 | REF_COUNT_HOM2 = 1 | 
|---|
| 163 | MISSING        = 0 | 
|---|
| 164 | MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain | 
|---|
| 165 | MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0 | 
|---|
| 166 | MARKER_PAIRS_PER_SECOND_FAST = 70000000.0 | 
|---|
| 167 |  | 
|---|
| 168 |  | 
|---|
| 169 | galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> | 
|---|
| 170 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | 
|---|
| 171 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | 
|---|
| 172 | <head> | 
|---|
| 173 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | 
|---|
| 174 | <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> | 
|---|
| 175 | <title></title> | 
|---|
| 176 | <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> | 
|---|
| 177 | </head> | 
|---|
| 178 | <body> | 
|---|
| 179 | <div class="document"> | 
|---|
| 180 | """ | 
|---|
| 181 |  | 
|---|
| 182 |  | 
|---|
| 183 | SVG_HEADER = '''<?xml version="1.0" standalone="no"?> | 
|---|
| 184 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd"> | 
|---|
| 185 |  | 
|---|
| 186 | <svg width="1280" height="800" | 
|---|
| 187 | xmlns="http://www.w3.org/2000/svg" version="1.2" | 
|---|
| 188 | xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()"> | 
|---|
| 189 |  | 
|---|
| 190 | <script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/> | 
|---|
| 191 | <script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/> | 
|---|
| 192 | <script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/> | 
|---|
| 193 | <script type="text/ecmascript"> | 
|---|
| 194 | <![CDATA[ | 
|---|
| 195 | var checkBoxes = new Array(); | 
|---|
| 196 | var radioGroupBandwidth; | 
|---|
| 197 | var colours = ['%s','%s','%s','%s','%s','%s','%s']; | 
|---|
| 198 | function init() { | 
|---|
| 199 | var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12}; | 
|---|
| 200 | var dist = 12; | 
|---|
| 201 | var yOffset = 4; | 
|---|
| 202 |  | 
|---|
| 203 | //A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn | 
|---|
| 204 | checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer); | 
|---|
| 205 | checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer); | 
|---|
| 206 | checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer); | 
|---|
| 207 | checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer); | 
|---|
| 208 | checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer); | 
|---|
| 209 | checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer); | 
|---|
| 210 | checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer); | 
|---|
| 211 |  | 
|---|
| 212 | } | 
|---|
| 213 |  | 
|---|
| 214 | function hideShowLayer(id, status, label) { | 
|---|
| 215 | var vis = "hidden"; | 
|---|
| 216 | if (status) { | 
|---|
| 217 | vis = "visible"; | 
|---|
| 218 | } | 
|---|
| 219 | document.getElementById(id).setAttributeNS(null, 'visibility', vis); | 
|---|
| 220 | } | 
|---|
| 221 |  | 
|---|
| 222 | function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) { | 
|---|
| 223 | var x = parseInt(evt.pageX)-250; | 
|---|
| 224 | var y = parseInt(evt.pageY)-110; | 
|---|
| 225 | switch(rel) { | 
|---|
| 226 | case 0: | 
|---|
| 227 | fill = colours[rel]; | 
|---|
| 228 | relt = "dupe"; | 
|---|
| 229 | break; | 
|---|
| 230 | case 1: | 
|---|
| 231 | fill = colours[rel]; | 
|---|
| 232 | relt = "parentchild"; | 
|---|
| 233 | break; | 
|---|
| 234 | case 2: | 
|---|
| 235 | fill = colours[rel]; | 
|---|
| 236 | relt = "sibpairs"; | 
|---|
| 237 | break; | 
|---|
| 238 | case 3: | 
|---|
| 239 | fill = colours[rel]; | 
|---|
| 240 | relt = "halfsibs"; | 
|---|
| 241 | break; | 
|---|
| 242 | case 4: | 
|---|
| 243 | fill = colours[rel]; | 
|---|
| 244 | relt = "parents"; | 
|---|
| 245 | break; | 
|---|
| 246 | case 5: | 
|---|
| 247 | fill = colours[rel]; | 
|---|
| 248 | relt = "unrelated"; | 
|---|
| 249 | break; | 
|---|
| 250 | case 6: | 
|---|
| 251 | fill = colours[rel]; | 
|---|
| 252 | relt = "unknown"; | 
|---|
| 253 | break; | 
|---|
| 254 | default: | 
|---|
| 255 | fill = "cyan"; | 
|---|
| 256 | relt = "ERROR_CODE: "+rel; | 
|---|
| 257 | } | 
|---|
| 258 |  | 
|---|
| 259 | document.getElementById("btRel").textContent = "GROUP: "+relt; | 
|---|
| 260 | document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm; | 
|---|
| 261 | document.getElementById("btSdev").textContent = "sdev="+dm+" +/- "+dd; | 
|---|
| 262 | document.getElementById("btPair").textContent = "npairs="+n; | 
|---|
| 263 | document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")"; | 
|---|
| 264 | document.getElementById("btHead").setAttribute('fill', fill); | 
|---|
| 265 |  | 
|---|
| 266 | var tt = document.getElementById("btTip"); | 
|---|
| 267 | tt.setAttribute("transform", "translate("+x+","+y+")"); | 
|---|
| 268 | tt.setAttribute('visibility', 'visible'); | 
|---|
| 269 | } | 
|---|
| 270 |  | 
|---|
| 271 | function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) { | 
|---|
| 272 | var x = parseInt(evt.pageX)-150; | 
|---|
| 273 | var y = parseInt(evt.pageY)-180; | 
|---|
| 274 |  | 
|---|
| 275 | switch(rel) { | 
|---|
| 276 | case 0: | 
|---|
| 277 | fill = colours[rel]; | 
|---|
| 278 | relt = "dupe"; | 
|---|
| 279 | break; | 
|---|
| 280 | case 1: | 
|---|
| 281 | fill = colours[rel]; | 
|---|
| 282 | relt = "parentchild"; | 
|---|
| 283 | break; | 
|---|
| 284 | case 2: | 
|---|
| 285 | fill = colours[rel]; | 
|---|
| 286 | relt = "sibpairs"; | 
|---|
| 287 | break; | 
|---|
| 288 | case 3: | 
|---|
| 289 | fill = colours[rel]; | 
|---|
| 290 | relt = "halfsibs"; | 
|---|
| 291 | break; | 
|---|
| 292 | case 4: | 
|---|
| 293 | fill = colours[rel]; | 
|---|
| 294 | relt = "parents"; | 
|---|
| 295 | break; | 
|---|
| 296 | case 5: | 
|---|
| 297 | fill = colours[rel]; | 
|---|
| 298 | relt = "unrelated"; | 
|---|
| 299 | break; | 
|---|
| 300 | case 6: | 
|---|
| 301 | fill = colours[rel]; | 
|---|
| 302 | relt = "unknown"; | 
|---|
| 303 | break; | 
|---|
| 304 | default: | 
|---|
| 305 | fill = "cyan"; | 
|---|
| 306 | relt = "ERROR_CODE: "+rel; | 
|---|
| 307 | } | 
|---|
| 308 |  | 
|---|
| 309 | document.getElementById("otRel").textContent = "PAIR: "+relt; | 
|---|
| 310 | document.getElementById("otS1").textContent = "s1="+s1; | 
|---|
| 311 | document.getElementById("otS2").textContent = "s2="+s2; | 
|---|
| 312 | document.getElementById("otMean").textContent = "mean="+mean; | 
|---|
| 313 | document.getElementById("otSdev").textContent = "sdev="+sdev; | 
|---|
| 314 | document.getElementById("otGeno").textContent = "ngenos="+ngeno; | 
|---|
| 315 | document.getElementById("otRmean").textContent = "relmean="+rmean; | 
|---|
| 316 | document.getElementById("otRsdev").textContent = "relsdev="+rsdev; | 
|---|
| 317 | document.getElementById("otHead").setAttribute('fill', fill); | 
|---|
| 318 |  | 
|---|
| 319 | var tt = document.getElementById("otTip"); | 
|---|
| 320 | tt.setAttribute("transform", "translate("+x+","+y+")"); | 
|---|
| 321 | tt.setAttribute('visibility', 'visible'); | 
|---|
| 322 | } | 
|---|
| 323 |  | 
|---|
| 324 | function hideBTT(evt) { | 
|---|
| 325 | document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden'); | 
|---|
| 326 | } | 
|---|
| 327 |  | 
|---|
| 328 | function hideOTT(evt) { | 
|---|
| 329 | document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden'); | 
|---|
| 330 | } | 
|---|
| 331 |  | 
|---|
| 332 | ]]> | 
|---|
| 333 | </script> | 
|---|
| 334 | <defs> | 
|---|
| 335 | <!-- symbols for check boxes --> | 
|---|
| 336 | <symbol id="cbRect" overflow="visible"> | 
|---|
| 337 | <rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/> | 
|---|
| 338 | </symbol> | 
|---|
| 339 | <symbol id="cbCross" overflow="visible"> | 
|---|
| 340 | <g pointer-events="none" stroke="black" stroke-width="1"> | 
|---|
| 341 | <line x1="-3" y1="-3" x2="3" y2="3"/> | 
|---|
| 342 | <line x1="3" y1="-3" x2="-3" y2="3"/> | 
|---|
| 343 | </g> | 
|---|
| 344 | </symbol> | 
|---|
| 345 | </defs> | 
|---|
| 346 |  | 
|---|
| 347 | <desc>Developer Works Dynamic Scatter Graph Scaling Example</desc> | 
|---|
| 348 |  | 
|---|
| 349 | <!-- Now Draw the main X and Y axis --> | 
|---|
| 350 | <g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges"> | 
|---|
| 351 | <!-- X Axis top and bottom --> | 
|---|
| 352 | <path d="M 100 100 L 1250 100 Z"/> | 
|---|
| 353 | <path d="M 100 700 L 1250 700 Z"/> | 
|---|
| 354 |  | 
|---|
| 355 | <!-- Y Axis left and right --> | 
|---|
| 356 | <path d="M 100  100 L 100  700 Z"/> | 
|---|
| 357 | <path d="M 1250 100 L 1250 700 Z"/> | 
|---|
| 358 | </g> | 
|---|
| 359 |  | 
|---|
| 360 | <g transform="translate(100,100)"> | 
|---|
| 361 |  | 
|---|
| 362 | <!-- Grid Lines --> | 
|---|
| 363 | <g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges"> | 
|---|
| 364 |  | 
|---|
| 365 | <!-- Vertical grid lines --> | 
|---|
| 366 | <line x1="125" y1="0" x2="115" y2="600" /> | 
|---|
| 367 | <line x1="230" y1="0" x2="230" y2="600" /> | 
|---|
| 368 | <line x1="345" y1="0" x2="345" y2="600" /> | 
|---|
| 369 | <line x1="460" y1="0" x2="460" y2="600" /> | 
|---|
| 370 | <line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" /> | 
|---|
| 371 | <line x1="690" y1="0" x2="690" y2="600"   /> | 
|---|
| 372 | <line x1="805" y1="0" x2="805" y2="600"   /> | 
|---|
| 373 | <line x1="920" y1="0" x2="920" y2="600"   /> | 
|---|
| 374 | <line x1="1035" y1="0" x2="1035" y2="600" /> | 
|---|
| 375 |  | 
|---|
| 376 | <!-- Horizontal grid lines --> | 
|---|
| 377 | <line x1="0" y1="60" x2="1150" y2="60"   /> | 
|---|
| 378 | <line x1="0" y1="120" x2="1150" y2="120" /> | 
|---|
| 379 | <line x1="0" y1="180" x2="1150" y2="180" /> | 
|---|
| 380 | <line x1="0" y1="240" x2="1150" y2="240" /> | 
|---|
| 381 | <line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" /> | 
|---|
| 382 | <line x1="0" y1="360" x2="1150" y2="360" /> | 
|---|
| 383 | <line x1="0" y1="420" x2="1150" y2="420" /> | 
|---|
| 384 | <line x1="0" y1="480" x2="1150" y2="480" /> | 
|---|
| 385 | <line x1="0" y1="540" x2="1150" y2="540" /> | 
|---|
| 386 | </g> | 
|---|
| 387 |  | 
|---|
| 388 | <!-- Legend --> | 
|---|
| 389 | <g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)"> | 
|---|
| 390 | <rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" /> | 
|---|
| 391 | <text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text> | 
|---|
| 392 | <rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> | 
|---|
| 393 | <rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> | 
|---|
| 394 | <rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> | 
|---|
| 395 | <rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> | 
|---|
| 396 | <rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> | 
|---|
| 397 | <rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> | 
|---|
| 398 | <rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/> | 
|---|
| 399 | <text x="15"  y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text> | 
|---|
| 400 | <circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/> | 
|---|
| 401 | <text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text> | 
|---|
| 402 | <circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/> | 
|---|
| 403 | <text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text> | 
|---|
| 404 | <circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/> | 
|---|
| 405 | <g id="checkboxes"> | 
|---|
| 406 | </g> | 
|---|
| 407 | </g> | 
|---|
| 408 |  | 
|---|
| 409 |  | 
|---|
| 410 | <g style='fill:black; stroke:none' font-size="17" font-family="Arial"> | 
|---|
| 411 | <!-- X Axis Labels --> | 
|---|
| 412 | <text x="480" y="660">Mean Alleles Shared</text> | 
|---|
| 413 | <text x="0"    y="630" >1.0</text> | 
|---|
| 414 | <text x="277"  y="630" >1.25</text> | 
|---|
| 415 | <text x="564"  y="630" >1.5</text> | 
|---|
| 416 | <text x="842" y="630" >1.75</text> | 
|---|
| 417 | <text x="1140" y="630" >2.0</text> | 
|---|
| 418 | </g> | 
|---|
| 419 |  | 
|---|
| 420 | <g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial"> | 
|---|
| 421 | <!-- Y Axis Labels --> | 
|---|
| 422 | <text x="-350" y="-40">SD Alleles Shared</text> | 
|---|
| 423 | <text x="-20" y="-10" >1.0</text> | 
|---|
| 424 | <text x="-165" y="-10" >0.75</text> | 
|---|
| 425 | <text x="-310" y="-10" >0.5</text> | 
|---|
| 426 | <text x="-455" y="-10" >0.25</text> | 
|---|
| 427 | <text x="-600" y="-10" >0.0</text> | 
|---|
| 428 | </g> | 
|---|
| 429 |  | 
|---|
| 430 | <!-- Plot Title --> | 
|---|
| 431 | <g style="fill:black; stroke:none" font-size="18" font-family="Arial"> | 
|---|
| 432 | <text x="425" y="-30">%s</text> | 
|---|
| 433 | </g> | 
|---|
| 434 |  | 
|---|
| 435 | <!-- One group/layer of points for each relationship type --> | 
|---|
| 436 | ''' | 
|---|
| 437 |  | 
|---|
| 438 | SVG_FOOTER = ''' | 
|---|
| 439 | <!-- End of Data --> | 
|---|
| 440 | </g> | 
|---|
| 441 | <g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial"> | 
|---|
| 442 | <rect width="250" height="110" style="fill:silver" rx="2" ry="2"/> | 
|---|
| 443 | <rect id="btHead" width="250" height="20" rx="2" ry="2" /> | 
|---|
| 444 | <text id="btRel" y="14" x="85">unrelated</text> | 
|---|
| 445 | <text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text> | 
|---|
| 446 | <text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text> | 
|---|
| 447 | <text id="btPair" y="80" x="4">npairs=1152</text> | 
|---|
| 448 | <text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text> | 
|---|
| 449 | </g> | 
|---|
| 450 |  | 
|---|
| 451 | <g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial"> | 
|---|
| 452 | <rect width="150" height="180" style="fill:silver" rx="2" ry="2"/> | 
|---|
| 453 | <rect id="otHead" width="150" height="20" rx="2" ry="2" /> | 
|---|
| 454 | <text id="otRel" y="14" x="40">sibpairs</text> | 
|---|
| 455 | <text id="otS1" y="40" x="4">s1=fid1,iid1</text> | 
|---|
| 456 | <text id="otS2" y="60" x="4">s2=fid2,iid2</text> | 
|---|
| 457 | <text id="otMean" y="80" x="4">mean=1.82</text> | 
|---|
| 458 | <text id="otSdev" y="100" x="4">sdev=0.7</text> | 
|---|
| 459 | <text id="otGeno" y="120" x="4">ngeno=4487</text> | 
|---|
| 460 | <text id="otRmean" y="140" x="4">relmean=1.85</text> | 
|---|
| 461 | <text id="otRsdev" y="160" x="4">relsdev=0.65</text> | 
|---|
| 462 | </g> | 
|---|
| 463 | </svg> | 
|---|
| 464 | ''' | 
|---|
| 465 |  | 
|---|
| 466 |  | 
|---|
| 467 | DEFAULT_MAX_SAMPLE_SIZE = 5000 | 
|---|
| 468 |  | 
|---|
| 469 | REF_COUNT_HOM1 = 3 | 
|---|
| 470 | REF_COUNT_HET  = 2 | 
|---|
| 471 | REF_COUNT_HOM2 = 1 | 
|---|
| 472 | MISSING        = 0 | 
|---|
| 473 |  | 
|---|
| 474 | MARKER_PAIRS_PER_SECOND_SLOW = 15000000 | 
|---|
| 475 | MARKER_PAIRS_PER_SECOND_FAST = 70000000 | 
|---|
| 476 |  | 
|---|
| 477 | POLYGONS = { | 
|---|
| 478 | REL_UNRELATED:   ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)), | 
|---|
| 479 | REL_HALFSIBS:    ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)), | 
|---|
| 480 | REL_SIBS:        ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)), | 
|---|
| 481 | REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)), | 
|---|
| 482 | REL_DUPE:        ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)), | 
|---|
| 483 | } | 
|---|
| 484 |  | 
|---|
| 485 | def distance(point1, point2): | 
|---|
| 486 | """ Calculate the distance between two points | 
|---|
| 487 | """ | 
|---|
| 488 | (x1,y1) = [float(d) for d in point1] | 
|---|
| 489 | (x2,y2) = [float(d) for d in point2] | 
|---|
| 490 | dx = abs(x1 - x2) | 
|---|
| 491 | dy = abs(y1 - y2) | 
|---|
| 492 | return math.sqrt(dx**2 + dy**2) | 
|---|
| 493 |  | 
|---|
| 494 | def point_inside_polygon(x, y, poly): | 
|---|
| 495 | """ Determine if a point (x,y) is inside a given polygon or not | 
|---|
| 496 | poly is a list of (x,y) pairs. | 
|---|
| 497 |  | 
|---|
| 498 | Taken from: http://www.ariel.com.au/a/python-point-int-poly.html | 
|---|
| 499 | """ | 
|---|
| 500 |  | 
|---|
| 501 | n = len(poly) | 
|---|
| 502 | inside = False | 
|---|
| 503 |  | 
|---|
| 504 | p1x,p1y = poly[0] | 
|---|
| 505 | for i in range(n+1): | 
|---|
| 506 | p2x,p2y = poly[i % n] | 
|---|
| 507 | if y > min(p1y,p2y): | 
|---|
| 508 | if y <= max(p1y,p2y): | 
|---|
| 509 | if x <= max(p1x,p2x): | 
|---|
| 510 | if p1y != p2y: | 
|---|
| 511 | xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x | 
|---|
| 512 | if p1x == p2x or x <= xinters: | 
|---|
| 513 | inside = not inside | 
|---|
| 514 | p1x,p1y = p2x,p2y | 
|---|
| 515 | return inside | 
|---|
| 516 |  | 
|---|
| 517 | def readMap(pedfile): | 
|---|
| 518 | """ | 
|---|
| 519 | """ | 
|---|
| 520 | mapfile = pedfile.replace('.ped', '.map') | 
|---|
| 521 | marker_list = [] | 
|---|
| 522 | if os.path.exists(mapfile): | 
|---|
| 523 | print 'readMap: %s' % (mapfile) | 
|---|
| 524 | fh = file(mapfile, 'r') | 
|---|
| 525 | for line in fh: | 
|---|
| 526 | marker_list.append(line.strip().split()) | 
|---|
| 527 | fh.close() | 
|---|
| 528 | print 'readMap: %s markers' % (len(marker_list)) | 
|---|
| 529 | return marker_list | 
|---|
| 530 |  | 
|---|
| 531 | def calcMeanSD(useme): | 
|---|
| 532 | """ | 
|---|
| 533 | A numerically stable algorithm is given below. It also computes the mean. | 
|---|
| 534 | This algorithm is due to Knuth,[1] who cites Welford.[2] | 
|---|
| 535 | n = 0 | 
|---|
| 536 | mean = 0 | 
|---|
| 537 | M2 = 0 | 
|---|
| 538 |  | 
|---|
| 539 | foreach x in data: | 
|---|
| 540 | n = n + 1 | 
|---|
| 541 | delta = x - mean | 
|---|
| 542 | mean = mean + delta/n | 
|---|
| 543 | M2 = M2 + delta*(x - mean)      // This expression uses the new value of mean | 
|---|
| 544 | end for | 
|---|
| 545 |  | 
|---|
| 546 | variance_n = M2/n | 
|---|
| 547 | variance = M2/(n - 1) | 
|---|
| 548 | """ | 
|---|
| 549 | mean = 0.0 | 
|---|
| 550 | M2 = 0.0 | 
|---|
| 551 | sd = 0.0 | 
|---|
| 552 | n = len(useme) | 
|---|
| 553 | if n > 1: | 
|---|
| 554 | for i,x in enumerate(useme): | 
|---|
| 555 | delta = x - mean | 
|---|
| 556 | mean = mean + delta/(i+1) # knuth uses n+=1 at start | 
|---|
| 557 | M2 = M2 + delta*(x - mean)      # This expression uses the new value of mean | 
|---|
| 558 | variance = M2/(n-1) # assume is sample so lose 1 DOF | 
|---|
| 559 | sd = pow(variance,0.5) | 
|---|
| 560 | return mean,sd | 
|---|
| 561 |  | 
|---|
| 562 |  | 
|---|
| 563 | def doIBSpy(ped=None,basename='',outdir=None,logf=None, | 
|---|
| 564 | nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0): | 
|---|
| 565 | #def doIBS(pedName, title, nrsSamples=None, pdftoo=False): | 
|---|
| 566 | """ started with snpmatrix but GRR uses actual IBS counts and sd's | 
|---|
| 567 | """ | 
|---|
| 568 | repOut = [] # text strings to add to the html display | 
|---|
| 569 | refallele = {} | 
|---|
| 570 | tblf = '%s_table.xls' % (title) | 
|---|
| 571 | tbl = file(os.path.join(outdir,tblf), 'w') | 
|---|
| 572 | tbl.write(TABLE_HEADER) | 
|---|
| 573 | svgf = '%s.svg' % (title) | 
|---|
| 574 | svg = file(os.path.join(outdir,svgf), 'w') | 
|---|
| 575 |  | 
|---|
| 576 | nMarkers = len(ped._markers) | 
|---|
| 577 | if nMarkers < 5: | 
|---|
| 578 | print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME) | 
|---|
| 579 | sys.exit(1) | 
|---|
| 580 | nSubjects = len(ped._subjects) | 
|---|
| 581 | nrsSamples = min(nMarkers, nrsSamples) | 
|---|
| 582 | if opts and opts.use_mito: | 
|---|
| 583 | markers = range(nMarkers) | 
|---|
| 584 | nrsSamples = min(len(markers), nrsSamples) | 
|---|
| 585 | sampleIndexes = sorted(random.sample(markers, nrsSamples)) | 
|---|
| 586 | else: | 
|---|
| 587 | autosomals = ped.autosomal_indices() | 
|---|
| 588 | nrsSamples = min(len(autosomals), nrsSamples) | 
|---|
| 589 | sampleIndexes = sorted(random.sample(autosomals, nrsSamples)) | 
|---|
| 590 |  | 
|---|
| 591 | print '' | 
|---|
| 592 | print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers) | 
|---|
| 593 | npairs = (nSubjects*(nSubjects-1))/2 # total rows in table | 
|---|
| 594 | newfiles=[svgf,tblf] | 
|---|
| 595 | explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs] | 
|---|
| 596 | # these go with the output file links in the html file | 
|---|
| 597 | s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples) | 
|---|
| 598 | logf.write(s) | 
|---|
| 599 | minUsegenos = nrsSamples/2 # must have half? | 
|---|
| 600 | nGenotypes = nSubjects*nrsSamples | 
|---|
| 601 | stime = time.time() | 
|---|
| 602 | emptyRows = set() | 
|---|
| 603 | genos = numpy.zeros((nSubjects, nrsSamples), dtype=int) | 
|---|
| 604 | for s in xrange(nSubjects): | 
|---|
| 605 | nValid = 0 | 
|---|
| 606 | #getGenotypesByIndices(self, s, mlist, format) | 
|---|
| 607 | genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref') | 
|---|
| 608 | nValid = sum([1 for g in genos[s] if g]) | 
|---|
| 609 | if not nValid: | 
|---|
| 610 | emptyRows.add(s) | 
|---|
| 611 | sub = ped.getSubject(s) | 
|---|
| 612 | print 'All missing for row %d (%s)' % (s, sub) | 
|---|
| 613 | logf.write('All missing for row %d (%s)\n' % (s, sub)) | 
|---|
| 614 | rtime = time.time() - stime | 
|---|
| 615 | if verbose: | 
|---|
| 616 | print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime) | 
|---|
| 617 |  | 
|---|
| 618 |  | 
|---|
| 619 | ### Now the expensive part.  For each pair of subjects, we get the mean number | 
|---|
| 620 | ### and standard deviation of shared alleles over all of the markers where both | 
|---|
| 621 | ### subjects have a known genotype.  Identical subjects should have mean shared | 
|---|
| 622 | ### alleles very close to 2.0 with a standard deviation very close to 0.0. | 
|---|
| 623 | tot = nSubjects*(nSubjects-1)/2 | 
|---|
| 624 | nprog = tot/10 | 
|---|
| 625 | nMarkerpairs = tot * nrsSamples | 
|---|
| 626 | estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW | 
|---|
| 627 | estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST | 
|---|
| 628 |  | 
|---|
| 629 | pairs = [] | 
|---|
| 630 | pair_data = {} | 
|---|
| 631 | means = []    ## Mean IBS for each pair | 
|---|
| 632 | ngenoL = []   ## Count of comparable genotypes for each pair | 
|---|
| 633 | sdevs = []    ## Standard dev for each pair | 
|---|
| 634 | rels  = []    ## A relationship code for each pair | 
|---|
| 635 | zmeans  = [0.0 for x in xrange(tot)]    ## zmean score for each pair for the relgroup | 
|---|
| 636 | zstds  = [0.0 for x in xrange(tot)]   ## zstd score for each pair for the relgrp | 
|---|
| 637 | skip = set() | 
|---|
| 638 | ndone = 0     ## How many have been done so far | 
|---|
| 639 |  | 
|---|
| 640 | logf.write('Calculating %d pairs...\n' % (tot)) | 
|---|
| 641 | logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow)) | 
|---|
| 642 |  | 
|---|
| 643 | t1sum = 0 | 
|---|
| 644 | t2sum = 0 | 
|---|
| 645 | t3sum = 0 | 
|---|
| 646 | now = time.time() | 
|---|
| 647 | scache = {} | 
|---|
| 648 | _founder_cache = {} | 
|---|
| 649 | C_CODE = """ | 
|---|
| 650 | #include "math.h" | 
|---|
| 651 | int i; | 
|---|
| 652 | int sumibs = 0; | 
|---|
| 653 | int ssqibs = 0; | 
|---|
| 654 | int ngeno  = 0; | 
|---|
| 655 | float mean = 0; | 
|---|
| 656 | float M2 = 0; | 
|---|
| 657 | float delta = 0; | 
|---|
| 658 | float sdev=0; | 
|---|
| 659 | float variance=0; | 
|---|
| 660 | for (i=0; i<nrsSamples; i++) { | 
|---|
| 661 | int a1 = g1[i]; | 
|---|
| 662 | int a2 = g2[i]; | 
|---|
| 663 | if (a1 != 0 && a2 != 0) { | 
|---|
| 664 | ngeno += 1; | 
|---|
| 665 | int shared = 2-abs(a1-a2); | 
|---|
| 666 | delta = shared - mean; | 
|---|
| 667 | mean = mean + delta/ngeno; | 
|---|
| 668 | M2 += delta*(shared-mean); | 
|---|
| 669 | // yes that second time, the updated mean is used see calcmeansd above; | 
|---|
| 670 | //printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared); | 
|---|
| 671 | } | 
|---|
| 672 | } | 
|---|
| 673 | if (ngeno > 1) { | 
|---|
| 674 | variance = M2/(ngeno-1); | 
|---|
| 675 | sdev = sqrt(variance); | 
|---|
| 676 | //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev); | 
|---|
| 677 | } | 
|---|
| 678 | //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev); | 
|---|
| 679 | result[0] = ngeno; | 
|---|
| 680 | result[1] = mean; | 
|---|
| 681 | result[2] = sdev; | 
|---|
| 682 | return_val = ngeno; | 
|---|
| 683 | """ | 
|---|
| 684 | started = time.time() | 
|---|
| 685 | for s1 in xrange(nSubjects): | 
|---|
| 686 | if s1 in emptyRows: | 
|---|
| 687 | continue | 
|---|
| 688 | (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1)) | 
|---|
| 689 |  | 
|---|
| 690 | isFounder1 = _founder_cache.setdefault(s1, (did1==mid1)) | 
|---|
| 691 | g1 = genos[s1] | 
|---|
| 692 |  | 
|---|
| 693 | for s2 in xrange(s1+1, nSubjects): | 
|---|
| 694 | if s2 in emptyRows: | 
|---|
| 695 | continue | 
|---|
| 696 | t1s = time.time() | 
|---|
| 697 |  | 
|---|
| 698 | (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2)) | 
|---|
| 699 |  | 
|---|
| 700 | g2 = genos[s2] | 
|---|
| 701 | isFounder2 = _founder_cache.setdefault(s2, (did2==mid2)) | 
|---|
| 702 |  | 
|---|
| 703 | # Determine the relationship for this pair | 
|---|
| 704 | relcode = REL_UNKNOWN | 
|---|
| 705 | if (fid2 == fid1): | 
|---|
| 706 | if iid1 == iid2: | 
|---|
| 707 | relcode = REL_DUPE | 
|---|
| 708 | elif (did2 == did1) and (mid2 == mid1) and did1 != mid1: | 
|---|
| 709 | relcode = REL_SIBS | 
|---|
| 710 | elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1): | 
|---|
| 711 | relcode = REL_PARENTCHILD | 
|---|
| 712 | elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)): | 
|---|
| 713 | relcode = REL_HALFSIBS | 
|---|
| 714 | else: | 
|---|
| 715 | # People in the same family should be marked as some other | 
|---|
| 716 | # form of related.  In general, these people will have a | 
|---|
| 717 | # pretty random spread of similarity. This distinction is | 
|---|
| 718 | # probably not very useful most of the time | 
|---|
| 719 | relcode = REL_RELATED | 
|---|
| 720 | else: | 
|---|
| 721 | ### Different families | 
|---|
| 722 | relcode = REL_UNRELATED | 
|---|
| 723 |  | 
|---|
| 724 | t1e = time.time() | 
|---|
| 725 | t1sum += t1e-t1s | 
|---|
| 726 |  | 
|---|
| 727 |  | 
|---|
| 728 | ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count | 
|---|
| 729 | ### the number of contributing genotypes.  These values are not actually | 
|---|
| 730 | ### calculated here, but instead are looked up in a table for speed. | 
|---|
| 731 | ### FIXME: This is still too slow ... | 
|---|
| 732 | result = [0.0, 0.0, 0.0] | 
|---|
| 733 | ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result']) | 
|---|
| 734 | if ngeno >= minUsegenos: | 
|---|
| 735 | _, mean, sdev = result | 
|---|
| 736 | means.append(mean) | 
|---|
| 737 | sdevs.append(sdev) | 
|---|
| 738 | ngenoL.append(ngeno) | 
|---|
| 739 | pairs.append((s1, s2)) | 
|---|
| 740 | rels.append(relcode) | 
|---|
| 741 | else: | 
|---|
| 742 | skip.add(ndone) # signal no comparable genotypes for this pair | 
|---|
| 743 | ndone += 1 | 
|---|
| 744 | t2e = time.time() | 
|---|
| 745 | t2sum += t2e-t1e | 
|---|
| 746 | t3e = time.time() | 
|---|
| 747 | t3sum += t3e-t2e | 
|---|
| 748 |  | 
|---|
| 749 | logme = [ 'T1:  %s' % (t1sum), 'T2:  %s' % (t2sum), 'T3:  %s' % (t3sum),'TOT: %s' % (t3e-now), | 
|---|
| 750 | '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip), | 
|---|
| 751 | float(len(skip))/float(tot)*100)] | 
|---|
| 752 | logf.write('%s\n' % '\t'.join(logme)) | 
|---|
| 753 | ### Calculate mean and standard deviation of scores on a per relationship | 
|---|
| 754 | ### type basis, allowing us to flag outliers for each particular relationship | 
|---|
| 755 | ### type | 
|---|
| 756 | relstats = {} | 
|---|
| 757 | relCounts = {} | 
|---|
| 758 | outlierFiles = {} | 
|---|
| 759 | for relCode, relInfo in REL_LOOKUP.items(): | 
|---|
| 760 | relName, relColor, relStyle = relInfo | 
|---|
| 761 | useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode] | 
|---|
| 762 | relCounts[relCode] = len(useme) | 
|---|
| 763 | mm = scipy.mean(useme) | 
|---|
| 764 | ms = scipy.std(useme) | 
|---|
| 765 | useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode] | 
|---|
| 766 | sm = scipy.mean(useme) | 
|---|
| 767 | ss = scipy.std(useme) | 
|---|
| 768 | relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)} | 
|---|
| 769 | s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \ | 
|---|
| 770 | (relName,relCounts[relCode], mm, ms, sm, ss) | 
|---|
| 771 | logf.write(s) | 
|---|
| 772 |  | 
|---|
| 773 | ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|) | 
|---|
| 774 | ### within each group, for each pair, z=(groupmean-pairmean)/groupsd | 
|---|
| 775 | available = len(means) | 
|---|
| 776 | logf.write('%d pairs are available of %d\n' % (available, tot)) | 
|---|
| 777 | ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n' | 
|---|
| 778 | ### logf.write(s) | 
|---|
| 779 | pairnum   = 0 | 
|---|
| 780 | offset    = 0 | 
|---|
| 781 | nOutliers = 0 | 
|---|
| 782 | cexs      = [] | 
|---|
| 783 | outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)]) | 
|---|
| 784 | zsdmax = 0 | 
|---|
| 785 | for s1 in range(nSubjects): | 
|---|
| 786 | if s1 in emptyRows: | 
|---|
| 787 | continue | 
|---|
| 788 | (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1] | 
|---|
| 789 | for s2 in range(s1+1, nSubjects): | 
|---|
| 790 | if s2 in emptyRows: | 
|---|
| 791 | continue | 
|---|
| 792 | if pairnum not in skip: | 
|---|
| 793 | ### Get group stats for this relationship | 
|---|
| 794 | (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2] | 
|---|
| 795 | try: | 
|---|
| 796 | r = rels[offset] | 
|---|
| 797 | except IndexError: | 
|---|
| 798 | logf.write('###OOPS offset %d available %d  pairnum %d  len(rels) %d', offset, available, pairnum, len(rels)) | 
|---|
| 799 | notfound = ('?',('?','0','0')) | 
|---|
| 800 | relInfo = REL_LOOKUP.get(r,notfound) | 
|---|
| 801 | relName, relColor, relStyle = relInfo | 
|---|
| 802 | rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared | 
|---|
| 803 | rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared | 
|---|
| 804 |  | 
|---|
| 805 | try: | 
|---|
| 806 | zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units | 
|---|
| 807 | except: | 
|---|
| 808 | zsd = 1 | 
|---|
| 809 | if abs(zsd) > zsdmax: | 
|---|
| 810 | zsdmax = zsd # keep for sort scaling | 
|---|
| 811 | try: | 
|---|
| 812 | zmean = (means[offset] - rmm)/rmd # distance from group mean | 
|---|
| 813 | except: | 
|---|
| 814 | zmean = 1 | 
|---|
| 815 | zmeans[offset] = zmean | 
|---|
| 816 | zstds[offset] = zsd | 
|---|
| 817 | pid=(s1,s2) | 
|---|
| 818 | zrad = max(zsd,zmean) | 
|---|
| 819 | if zrad < 4: | 
|---|
| 820 | zrad = 2 | 
|---|
| 821 | elif 4 < zrad < 15: | 
|---|
| 822 | zrad = 3 # to 9 | 
|---|
| 823 | else: # > 15 6=24+ | 
|---|
| 824 | zrad=zrad/4 | 
|---|
| 825 | zrad = min(zrad,6) # scale limit | 
|---|
| 826 | zrad = max(2,max(zsd,zmean)) # as > 2, z grows | 
|---|
| 827 | pair_data[pid] = (zmean,zsd,r,zrad) | 
|---|
| 828 | if max(zsd,zmean) > Zcutoff: # is potentially interesting | 
|---|
| 829 | mean = means[offset] | 
|---|
| 830 | sdev = sdevs[offset] | 
|---|
| 831 | outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2)) | 
|---|
| 832 | nOutliers += 1 | 
|---|
| 833 | tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \ | 
|---|
| 834 | (fid1, iid1, fid2, iid2, mean, sdev, zmean,zsd, ngeno, relName, did1,mid1,did2,mid2)) | 
|---|
| 835 | offset += 1 | 
|---|
| 836 | pairnum += 1 | 
|---|
| 837 | logf.write( 'Outliers: %s\n' % (nOutliers)) | 
|---|
| 838 |  | 
|---|
| 839 | ### Write outlier files for each relationship type | 
|---|
| 840 | repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>') | 
|---|
| 841 | lzsd = round(numpy.log10(zsdmax)) + 1 | 
|---|
| 842 | scalefactor = 10**lzsd | 
|---|
| 843 | for relCode, relInfo in REL_LOOKUP.items(): | 
|---|
| 844 | relName, _, _ = relInfo | 
|---|
| 845 | outliers = outlierRecords[relCode] | 
|---|
| 846 | if not outliers: | 
|---|
| 847 | continue | 
|---|
| 848 | outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate | 
|---|
| 849 | outliers.sort() | 
|---|
| 850 | outliers.reverse() # largest deviation first | 
|---|
| 851 | outliers = [x[1] for x in outliers] # undecorate | 
|---|
| 852 | nrows = len(outliers) | 
|---|
| 853 | truncated = 0 | 
|---|
| 854 | if nrows > MAX_SHOW_ROWS: | 
|---|
| 855 | s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \ | 
|---|
| 856 | (relName,MAX_SHOW_ROWS,nrows,title) | 
|---|
| 857 | truncated = nrows - MAX_SHOW_ROWS | 
|---|
| 858 | else: | 
|---|
| 859 | s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title) | 
|---|
| 860 | repOut.append(s) | 
|---|
| 861 | fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName) | 
|---|
| 862 | fhpath = os.path.join(outdir,fhname) | 
|---|
| 863 | fh = open(fhpath, 'w') | 
|---|
| 864 | newfiles.append(fhname) | 
|---|
| 865 | explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff)) | 
|---|
| 866 | fh.write(OUTLIERS_HEADER) | 
|---|
| 867 | s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list]) | 
|---|
| 868 | repOut.append('<tr align="center">%s</tr>' % s) | 
|---|
| 869 | for n,rec in enumerate(outliers): | 
|---|
| 870 | #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec | 
|---|
| 871 | s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec) | 
|---|
| 872 | fh.write('%s%s\n' % (s,relName)) | 
|---|
| 873 | # (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2)) | 
|---|
| 874 | s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td> | 
|---|
| 875 | <td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec) | 
|---|
| 876 | s = '%s<td>%s</td>' % (s,relName) | 
|---|
| 877 | if n < MAX_SHOW_ROWS: | 
|---|
| 878 | repOut.append('<tr align="center">%s</tr>' % s) | 
|---|
| 879 | if truncated > 0: | 
|---|
| 880 | repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated, | 
|---|
| 881 | nrows)) | 
|---|
| 882 | fh.close() | 
|---|
| 883 | repOut.append('</table><p>') | 
|---|
| 884 |  | 
|---|
| 885 | ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format | 
|---|
| 886 | ### if requested | 
|---|
| 887 | logf.write('Plotting ...') | 
|---|
| 888 | pointColors = [REL_COLORS[rel] for rel in rels] | 
|---|
| 889 | pointStyles = [REL_POINTS[rel] for rel in rels] | 
|---|
| 890 |  | 
|---|
| 891 | mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples) | 
|---|
| 892 | svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4], | 
|---|
| 893 | SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1], | 
|---|
| 894 | SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4], | 
|---|
| 895 | SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle)) | 
|---|
| 896 | #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white') | 
|---|
| 897 | #rpy.r.par(mai=(1,1,1,0.5)) | 
|---|
| 898 | #rpy.r('par(xaxs="i",yaxs="i")') | 
|---|
| 899 | #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2)) | 
|---|
| 900 | #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE) | 
|---|
| 901 | #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted') | 
|---|
| 902 | #rpy.r.dev_off() | 
|---|
| 903 |  | 
|---|
| 904 | ### We will now go through each relationship type to partition plot points | 
|---|
| 905 | ### into "bulk" and "outlier" groups.  Bulk points will represent common | 
|---|
| 906 | ### mean/sdev pairs and will cover the majority of the points in the plot -- | 
|---|
| 907 | ### they will use generic tooltip informtion about all of the pairs | 
|---|
| 908 | ### represented by that point.  "Outlier" points will be uncommon pairs, | 
|---|
| 909 | ### with very specific information in their tooltips.  It would be nice to | 
|---|
| 910 | ### keep hte total number of plotted points in the SVG representation to | 
|---|
| 911 | ### ~10000 (certainly less than 100000?) | 
|---|
| 912 | pointMap = {} | 
|---|
| 913 | orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))] | 
|---|
| 914 | # do we really want this? I want out of zone points last and big | 
|---|
| 915 | for relCode in orderedRels: | 
|---|
| 916 | svgColor = SVG_COLORS[relCode] | 
|---|
| 917 | relName, relColor, relStyle = REL_LOOKUP[relCode] | 
|---|
| 918 | svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor)) | 
|---|
| 919 | pMap = pointMap.setdefault(relCode, {}) | 
|---|
| 920 | nPoints = 0 | 
|---|
| 921 | rpairs=[] | 
|---|
| 922 | rgenos=[] | 
|---|
| 923 | rmeans=[] | 
|---|
| 924 | rsdevs=[] | 
|---|
| 925 | rz = [] | 
|---|
| 926 | for x,rel in enumerate(rels): # all pairs | 
|---|
| 927 | if rel == relCode: | 
|---|
| 928 | s1,s2 = pairs[x] | 
|---|
| 929 | pid=(s1,s2) | 
|---|
| 930 | zmean,zsd,r,zrad = pair_data[pid][:4] | 
|---|
| 931 | rpairs.append(pairs[x]) | 
|---|
| 932 | rgenos.append(ngenoL[x]) | 
|---|
| 933 | rmeans.append(means[x]) | 
|---|
| 934 | rsdevs.append(sdevs[x]) | 
|---|
| 935 | rz.append(zrad) | 
|---|
| 936 | ### Now add the svg point group for this relationship to the svg file | 
|---|
| 937 | for x in range(len(rmeans)): | 
|---|
| 938 | svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2 | 
|---|
| 939 | svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1 | 
|---|
| 940 | s1, s2 = rpairs[x] | 
|---|
| 941 | (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1] | 
|---|
| 942 | (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2] | 
|---|
| 943 | ngenos = rgenos[x] | 
|---|
| 944 | nPoints += 1 | 
|---|
| 945 | point = pMap.setdefault((svgX, svgY), []) | 
|---|
| 946 | point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x])) | 
|---|
| 947 | for (svgX, svgY) in pMap: | 
|---|
| 948 | points = pMap[(svgX, svgY)] | 
|---|
| 949 | svgX = int(svgX) | 
|---|
| 950 | svgY = int(svgY) | 
|---|
| 951 | if len(points) > 1: | 
|---|
| 952 | mmean,dmean = calcMeanSD([p[0] for p in points]) | 
|---|
| 953 | msdev,dsdev = calcMeanSD([p[1] for p in points]) | 
|---|
| 954 | mgeno,dgeno = calcMeanSD([p[-1] for p in points]) | 
|---|
| 955 | mingeno = min([p[-1] for p in points]) | 
|---|
| 956 | maxgeno = max([p[-1] for p in points]) | 
|---|
| 957 | svg.write("""<circle cx="%d" cy="%d" r="2" | 
|---|
| 958 | onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)" | 
|---|
| 959 | onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno)) | 
|---|
| 960 | else: | 
|---|
| 961 | mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12] | 
|---|
| 962 | rmean = float(relstats[relCode]['mean'][0]) | 
|---|
| 963 | rsdev = float(relstats[relCode]['sd'][0]) | 
|---|
| 964 | if zrad < 4: | 
|---|
| 965 | zrad = 2 | 
|---|
| 966 | elif 4 < zrad < 9: | 
|---|
| 967 | zrad = 3 # to 9 | 
|---|
| 968 | else: # > 9 5=15+ | 
|---|
| 969 | zrad=zrad/3 | 
|---|
| 970 | zrad = min(zrad,5) # scale limit | 
|---|
| 971 | if zrad <= 3: | 
|---|
| 972 | svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) | 
|---|
| 973 | else: # highlight pairs a long way from expectation by outlining circle in red | 
|---|
| 974 | svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;" | 
|---|
| 975 | onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" | 
|---|
| 976 | onmouseout="hideOTT(evt)" />\n""" % \ | 
|---|
| 977 | (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) | 
|---|
| 978 | svg.write('</g>\n') | 
|---|
| 979 |  | 
|---|
| 980 | ### Create a pdf as well if indicated on the command line | 
|---|
| 981 | ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf! | 
|---|
| 982 | ##    if pdftoo: | 
|---|
| 983 | ##        pdfname = '%s.pdf' % (title) | 
|---|
| 984 | ##        rpy.r.pdf(pdfname, 6, 6) | 
|---|
| 985 | ##        rpy.r.par(mai=(1,1,1,0.5)) | 
|---|
| 986 | ##        rpy.r('par(xaxs="i",yaxs="i")') | 
|---|
| 987 | ##        rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2)) | 
|---|
| 988 | ##        rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE) | 
|---|
| 989 | ##        rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted') | 
|---|
| 990 | ##        rpy.r.dev_off() | 
|---|
| 991 |  | 
|---|
| 992 | ### Draw polygons | 
|---|
| 993 | if showPolygons: | 
|---|
| 994 | svg.write('<g id="polygons" cursor="pointer">\n') | 
|---|
| 995 | for rel, poly in POLYGONS.items(): | 
|---|
| 996 | points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly]) | 
|---|
| 997 | svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel])) | 
|---|
| 998 | svg.write('</g>\n') | 
|---|
| 999 |  | 
|---|
| 1000 |  | 
|---|
| 1001 | svg.write(SVG_FOOTER) | 
|---|
| 1002 | svg.close() | 
|---|
| 1003 | return newfiles,explanations,repOut | 
|---|
| 1004 |  | 
|---|
| 1005 | def doIBS(n=100): | 
|---|
| 1006 | """parse parameters from galaxy | 
|---|
| 1007 | expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n' | 
|---|
| 1008 | <command interpreter="python"> | 
|---|
| 1009 | rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name" | 
|---|
| 1010 | '$out_file1' '$out_file1.files_path' "$title1"  '$n' '$Z' | 
|---|
| 1011 | </command> | 
|---|
| 1012 |  | 
|---|
| 1013 | """ | 
|---|
| 1014 | u="""<command interpreter="python"> | 
|---|
| 1015 | rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name" | 
|---|
| 1016 | '$out_file1' '$out_file1.files_path' "$title1"  '$n' '$Z' | 
|---|
| 1017 | </command> | 
|---|
| 1018 | """ | 
|---|
| 1019 |  | 
|---|
| 1020 |  | 
|---|
| 1021 | if len(sys.argv) < 7: | 
|---|
| 1022 | print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please' | 
|---|
| 1023 | print >> sys.stdout, u | 
|---|
| 1024 | sys.exit(1) | 
|---|
| 1025 | ts = '%s%s' % (string.punctuation,string.whitespace) | 
|---|
| 1026 | ptran =  string.maketrans(ts,'_'*len(ts)) | 
|---|
| 1027 | inpath = sys.argv[1] | 
|---|
| 1028 | ldinpath = os.path.split(inpath)[0] | 
|---|
| 1029 | basename = sys.argv[2] | 
|---|
| 1030 | outhtml = sys.argv[3] | 
|---|
| 1031 | newfilepath = sys.argv[4] | 
|---|
| 1032 | title = sys.argv[5].translate(ptran) | 
|---|
| 1033 | logfname = 'Log_%s.txt' % title | 
|---|
| 1034 | logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo | 
|---|
| 1035 | n = int(sys.argv[6]) | 
|---|
| 1036 | try: | 
|---|
| 1037 | Zcutoff = float(sys.argv[7]) | 
|---|
| 1038 | except: | 
|---|
| 1039 | Zcutoff = 2.0 | 
|---|
| 1040 | try: | 
|---|
| 1041 | os.makedirs(newfilepath) | 
|---|
| 1042 | except: | 
|---|
| 1043 | pass | 
|---|
| 1044 | logf = file(logpath,'w') | 
|---|
| 1045 | efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path | 
|---|
| 1046 | ped = plinkbinJZ.BPed(inpath) | 
|---|
| 1047 | ped.parse(quick=True) | 
|---|
| 1048 | if ped == None: | 
|---|
| 1049 | print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (ldreduced,basename) | 
|---|
| 1050 | sys.exit(1) | 
|---|
| 1051 | newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath, | 
|---|
| 1052 | logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff) | 
|---|
| 1053 | logf.close() | 
|---|
| 1054 | logfs = file(logpath,'r').readlines() | 
|---|
| 1055 | lf = file(outhtml,'w') | 
|---|
| 1056 | lf.write(galhtmlprefix % PROGNAME) | 
|---|
| 1057 | # this is a mess. todo clean up - should each datatype have it's own directory? Yes | 
|---|
| 1058 | # probably. Then titles are universal - but userId libraries are separate. | 
|---|
| 1059 | s = '<div>Output from %s run at %s<br>\n' % (PROGNAME,timenow()) | 
|---|
| 1060 | lf.write('<h4>%s</h4>\n' % s) | 
|---|
| 1061 | fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case | 
|---|
| 1062 | s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed)) | 
|---|
| 1063 | lf.write(s) | 
|---|
| 1064 | # various ways of displaying svg - experiments related to missing svg mimetype on test (!) | 
|---|
| 1065 | #s = """<object data="%s" type="image/svg+xml"  width="%d" height="%d"> | 
|---|
| 1066 | #       <embed src="%s" type="image/svg+xml" width="%d" height="%d" /> | 
|---|
| 1067 | #       </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT) | 
|---|
| 1068 | s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT) | 
|---|
| 1069 | #s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT) | 
|---|
| 1070 | lf.write(s) | 
|---|
| 1071 | lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n') | 
|---|
| 1072 | for i in range(len(newfiles)): | 
|---|
| 1073 | if i == 0: | 
|---|
| 1074 | lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (newfiles[i],explanations[i])) | 
|---|
| 1075 | else: | 
|---|
| 1076 | lf.write('<li><a href="%s">%s</a></li>\n' % (newfiles[i],explanations[i])) | 
|---|
| 1077 | flist = os.listdir(newfilepath) | 
|---|
| 1078 | for fname in flist: | 
|---|
| 1079 | if not fname in newfiles: | 
|---|
| 1080 | lf.write('<li><a href="%s">%s</a></li>\n' % (fname,fname)) | 
|---|
| 1081 | lf.write('</ol></div>') | 
|---|
| 1082 | lf.write('<div>%s</div>' % ('\n'.join(repOut))) # repOut is a list of tables | 
|---|
| 1083 | lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname,''.join(logfs))) | 
|---|
| 1084 | lf.write('</body></html>\n') | 
|---|
| 1085 | lf.close() | 
|---|
| 1086 | logf.close() | 
|---|
| 1087 |  | 
|---|
| 1088 | if __name__ == '__main__': | 
|---|
| 1089 | doIBS() | 
|---|