#! /usr/local/bin/python2.4
# pedigree data faker
# specifically designed for scalability testing of
# Shaun Purcell's PLINK package
# derived from John Ziniti's original suggestion
# allele frequency spectrum and random mating added
# ross lazarus me fecit january 13 2007
# copyright ross lazarus 2007
# without psyco
# generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so,
# so 500k snps (a billion genotypes), at about 4 trios/min, will take a couple of hours to generate
# psyco makes it literally twice as quick!!
# all rights reserved except as granted under the terms of the LGPL
# see http://www.gnu.org/licenses/lgpl.html
# for a copy of the license you receive with this software
# and for your rights and obligations
# especially if you wish to modify or redistribute this code
# january 19 added random missingness inducer
# currently about 15M genos/minute without psyco, 30M/minute with,
# so a billion genos should take about 40 minutes with psyco or 80 without...
# added mendel error generator jan 23 rml

import random, sys, time, os, string, re, glob

from optparse import OptionParser


width = 500000
ALLELES = ['1','2','3','4']
prog = os.path.split(sys.argv[0])[-1]
debug = 0
34 | """Natural-order sorting, supporting embedded numbers. |
---|
35 | # found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html |
---|
36 | note test code there removed to conserve brain space |
---|
37 | foo9bar2 < foo10bar2 < foo10bar10 |
---|
38 | |
---|
39 | """ |
---|
40 | import random, re, sys |
---|
41 | |
---|
42 | def natsort_key(item): |
---|
43 | chunks = re.split('(\d+(?:\.\d+)?)', item) |
---|
44 | for ii in range(len(chunks)): |
---|
45 | if chunks[ii] and chunks[ii][0] in '0123456789': |
---|
46 | if '.' in chunks[ii]: numtype = float |
---|
47 | else: numtype = int |
---|
48 | # wrap in tuple with '0' to explicitly specify numbers come first |
---|
49 | chunks[ii] = (0, numtype(chunks[ii])) |
---|
50 | else: |
---|
51 | chunks[ii] = (1, chunks[ii]) |
---|
52 | return (chunks, item) |
---|
53 | |
---|
54 | def natsort(seq): |
---|
55 | "Sort a sequence of text strings in a reasonable order." |
---|
56 | alist = [item for item in seq] |
---|
57 | alist.sort(key=natsort_key) |
---|
58 | return alist |
---|
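
# Example of the ordering natsort() produces (embedded numbers compare numerically, not lexically):
#   natsort(['rs10', 'rs2', 'rs1']) -> ['rs1', 'rs2', 'rs10']
# whereas a plain sort() would give ['rs1', 'rs10', 'rs2'].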

def makeUniformMAFdist(low=0.02, high=0.5):
    """Fake a maf distribution to make the data
    more interesting - uniform over low..high"""
    MAFdistribution = []
    for i in xrange(int(100*low),int(100*high)+1):
        freq = i/100.0 # uniform
        MAFdistribution.append(freq)
    return MAFdistribution

def makeTriangularMAFdist(low=0.02, high=0.5, beta=5):
    """Fake a non-uniform maf distribution to make the data
    more interesting - more rare alleles"""
    MAFdistribution = []
    for i in xrange(int(100*low),int(100*high)+1):
        freq = (51 - i)/100.0 # large numbers of small allele freqs
        for j in range(beta*i): # or i*i for crude exponential distribution
            MAFdistribution.append(freq)
    return MAFdistribution
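
# How the triangular weighting works with the defaults (low=0.02, high=0.5, beta=5):
# i runs from 2 to 50, freq = (51 - i)/100.0, and each freq is appended beta*i times,
# so the rarest frequency (0.01) appears 250 times while the commonest (0.49) appears
# only 10 times; random.choice() over this list therefore favours rare alleles.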

def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000):
    """header row
    """
    res = ['%s_%s_%s' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))]
    return ' '.join(res)

def makeMap(width=500000, MAFdistribution=[], useGP=False):
    """make snp allele and frequency tables for consistent generation"""
    usegp = 1
    snpdb = 'snp126'
    hgdb = 'hg18'
    alleles = []
    freqs = []
    rslist = []
    chromlist = []
    poslist = []
    for snp in range(width):
        random.shuffle(ALLELES)
        alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles!
        freqs.append(random.choice(MAFdistribution)) # more rare alleles
    if useGP:
        try:
            import MySQLdb
            genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3')
            curs = genome.cursor() # use default cursor
        except:
            if debug:
                print 'cannot connect to local copy of golden path'
            usegp = 0
    if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated....
        curs.execute('use %s' % hgdb)
        print 'Collecting %d real rs numbers - this may take a while' % width
        # get a random draw of enough reasonable (hapmap) snps with frequency data
        s = '''select distinct chrom,chromEnd,name from %s where avHet > 0 and chrom not like '%%random'
        group by name order by rand() limit %d''' % (snpdb,width)
        curs.execute(s)
        reslist = curs.fetchall()
        reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr
        reslist = natsort(reslist)
        for s in reslist:
            chrom,pos,rs = s.split('\t')
            rslist.append(rs)
            chromlist.append(chrom)
            poslist.append(pos)
    else:
        chrom = '1'
        for snp in range(width):
            pos = '%d' % (1000*snp)
            rs = 'rs00%d' % snp
            rslist.append(rs)
            chromlist.append(chrom)
            poslist.append(pos)
    return alleles, freqs, rslist, chromlist, poslist
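
# makeMap() returns parallel lists indexed by snp: alleles[snp] is a two-letter list drawn
# from ALLELES (e.g. ['2', '4']), freqs[snp] is that snp's minor allele frequency sampled
# from the supplied MAF distribution, and rslist/chromlist/poslist carry the map columns
# (faked as rs00<n>, chromosome '1' and 1kb spacing unless a local golden path copy is queried).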

def writeMap(fprefix='', fpath='./', rslist=[], chromlist=[], poslist=[], width=500000):
    """make a faked plink compatible map file - fbat files
    have the map written as a header line"""
    outf = '%s.map' % (fprefix)
    outf = os.path.join(fpath,outf)
    amap = open(outf, 'w')
    res = ['%s\t%s\t0\t%s' % (chromlist[x],rslist[x],poslist[x]) for x in range(len(rslist))]
    res.append('')
    amap.write('\n'.join(res))
    amap.close()
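
# Each map row is tab delimited as chromosome, rs name, genetic distance (always 0 here)
# and physical position, so a faked marker comes out as e.g. 1  rs0042  0  42000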

def makeMissing(genos=[], missrate=0.03, missval='0'):
    """impose some random missingness"""
    nsnps = len(genos)
    for snp in range(nsnps): # genos here excludes the first 6 pedigree columns
        if random.random() <= missrate:
            genos[snp] = '%s %s' % (missval,missval)
    return genos

def makeTriomissing(genos=[], missrate=0.03, missval='0'):
    """impose some random missingness on a trio - moth eaten like real data"""
    nsnps = len(genos[0])
    for snp in range(nsnps):
        for person in (0,1,2): # dad, mum, kid
            if random.random() <= missrate:
                genos[person][snp] = '%s %s' % (missval,missval)
    return genos

def makeTriomendel(p1g=(0,0), p2g=(0,0), kiddip=(0,0)):
    """impose some random mendels on a trio
    there are 8 of the 9 mating types we can simulate reasonable errors for
    Note, since random mating dual het parents can produce any genotype we can't generate an interesting
    error for them, so the overall mendel rate will be lower than mendrate, depending on
    allele frequency..."""
    if p1g[0] != p1g[1] and p2g[0] != p2g[1]: # both parents het
        return kiddip # cannot simulate a mendel error - anything is legal!
    elif p1g[0] != p1g[1]: # p1 is het parent so p2 must be hom
        if p2g[0] == 0: # make child the opposite hom to p2 for an error
            kiddip = (1,1)
        else:
            kiddip = (0,0)
    elif p2g[0] != p2g[1]: # p2 is het parent so p1 must be hom
        if p1g[0] == 0: # make child the opposite hom to p1 for an error
            kiddip = (1,1)
        else:
            kiddip = (0,0)
    elif p1g[0] == p1g[1]: # p1 is hom and if we get here p2 must also be hom
        if p1g[0] == p2g[0]: # both parents same hom - make child het or opposite hom for an error
            if random.random() <= 0.5:
                kiddip = (0,1)
            else:
                if p1g[0] == 0:
                    kiddip = (1,1)
                else:
                    kiddip = (0,0)
        else: # parents are opposite homs - return any hom as an error
            if random.random() <= 0.5:
                kiddip = (0,0)
            else:
                kiddip = (1,1)
    return kiddip
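
# Worked example: if dad is het (0,1) and mum is homozygous common (0,0), makeTriomendel()
# forces the child to (1,1); the child then carries no allele from mum, which PLINK will
# flag as a Mendelian inconsistency. When both parents are het the call simply returns the
# original kiddip, so no error can be injected for that mating type.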

def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0):
    """this family is a simple trio, constructed by random mating two random genotypes
    TODO: why not generate from chromosomes - eg hapmap
    set each haplotype locus according to the conditional
    probability implied by the surrounding loci - eg use both neighboring pairs, triplets
    and quads as observed in hapmap ceu"""
    dadped = '%d 1 0 0 1 1 %s'
    mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :)
    kidped = '%d 3 1 2 %d %d %s'
    family = [] # result accumulator
    sex = random.choice((1,2)) # for the kid
    affected = random.choice((1,2))
    genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles
    for snp in xrange(width):
        f = freqs[snp]
        for i in range(2): # do dad and mum
            p = random.random()
            a1 = a2 = 0
            if p <= f: # a rare allele
                a1 = 1
            p = random.random()
            if p <= f: # a rare allele
                a2 = 1
            if a1 > a2:
                a1,a2 = a2,a1 # so ordering consistent - 00,01,11
            dip = (a1,a2)
            genos[i].append(dip) # tuples of 0,1
        a1 = random.choice(genos[0][snp]) # dad gamete
        a2 = random.choice(genos[1][snp]) # mum gamete
        if a1 > a2:
            a1,a2 = a2,a1 # so ordering consistent - 00,01,11
        kiddip = (a1,a2) # random mating
        genos[2].append(kiddip)
        if mendrate > 0:
            if random.random() <= mendrate:
                genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp],kiddip)
        achoice = alleles[snp]
        for g in genos: # now convert this snp to alleles using the allele list
            a1 = achoice[g[snp][0]] # get allele letter
            a2 = achoice[g[snp][1]]
            g[snp] = '%s %s' % (a1,a2)
    if missrate > 0:
        genos = makeTriomissing(genos=genos,missrate=missrate,missval=missval)
    family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of the trio
    family.append(mumped % (trio,' '.join(genos[1])))
    family.append(kidped % (trio,sex,affected,' '.join(genos[2])))
    return family
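
# Each returned row is a whitespace delimited ped line: family id, individual id, father id,
# mother id, sex, affection status, then one 'a1 a2' pair per snp - for example a kid row
# for trio 7 might start '7 3 1 2 2 1 ' followed by width genotype pairs.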

def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate=0.03, missval='0'):
    """make an entire genotype vector for an independent subject"""
    sex = random.choice((1,2))
    if not aff:
        aff = random.choice((1,2))
    genos = [] # 0/1 for common,rare initially, then xform to alleles
    family = []
    personped = '%d 1 0 0 %d %d %s'
    for snp in xrange(width):
        achoice = alleles[snp]
        f = freqs[snp]
        p = random.random()
        a1 = a2 = 0
        if p <= f: # a rare allele
            a1 = 1
        p = random.random()
        if p <= f: # a rare allele
            a2 = 1
        if a1 > a2:
            a1,a2 = a2,a1 # so ordering consistent - 00,01,11
        a1 = achoice[a1] # get allele letter
        a2 = achoice[a2]
        g = '%s %s' % (a1,a2)
        genos.append(g)
    if missrate > 0.0:
        genos = makeMissing(genos=genos,missrate=missrate,missval=missval)
    family.append(personped % (id,sex,aff,' '.join(genos)))
    return family
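
# makeHapmap() below calls timenow(), which is not defined anywhere in this script;
# a minimal stand-in, assuming the usual convention of a human readable timestamp:
def timenow():
    """return current time as a readable string"""
    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))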

def makeHapmap(fprefix='fakebigped', width=100, aff=[], freqs={},
               alleles={}, nsubj=2000, trios=True, mendrate=0.03, missrate=0.03, missval='0'):
    """ fake a hapmap file and a pedigree file for eg haploview
    this is arranged as the transpose of a ped file - cols are subjects, rows are markers
    so we need to generate differently since we can't do the transpose in ram reliably for
    a few billion genotypes...
    """
    # NB: this function looks unfinished and is never called from the main program below -
    # ids, race, line, gene, probeid and flank are not defined anywhere in this script
    outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s'
    cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
              "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"]
    yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
              "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"]
    sampids = ids
    if trios:
        ts = '%d trios' % int(nsubj/3.)
    else:
        ts = '%d unrelated subjects' % nsubj
    res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),]
    res.append('# ross lazarus me fecit')
    res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts
    outf = open('%s.hmap' % (fprefix), 'w')
    started = time.time()
    if trios:
        ntrios = int(nsubj/3.)
        for n in range(ntrios):
            row = cfake5[:] # get first fields
            row = map(str,row)
            if race == "YRI":
                row += yfake5
            elif race == 'CEU':
                row += cfake5
            else:
                row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode
            row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order
            res.append(' '.join(row))
    res.append('')
    outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000)
    f = open(outfname,'w')
    f.write('\n'.join(res))
    f.close()
    print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname)

def makePed(fprefix='fakebigped', fpath='./',
            width=500000, nsubj=2000, MAFdistribution=[], alleles={},
            freqs={}, fbatstyle=True, mendrate=0.0, missrate=0.03, missval='0', fbathead=''):
    """fake trios with mendel consistent random mating genotypes in offspring
    with consistent alleles and MAFs for the sample"""
    res = []
    if fbatstyle: # add a header row with the marker names
        res.append(fbathead) # header row for fbat
    outfname = '%s.ped' % (fprefix)
    outfname = os.path.join(fpath,outfname)
    outf = open(outfname,'w')
    ntrios = int(nsubj/3.)
    started = time.time()
    for trio in xrange(ntrios):
        family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio,
                         missrate=missrate, mendrate=mendrate, missval=missval)
        res += family
        if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable
            if (trio + 1) % 50 == 0: # show progress
                dur = time.time() - started
                if dur == 0:
                    dur = 1.0
                print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur)
            outf.write('\n'.join(res))
            outf.write('\n')
            res = []
    if len(res) > 0: # some left
        outf.write('\n'.join(res))
        outf.write('\n')
    outf.close()
    if debug:
        print '##makeped : %6.1f seconds total runtime' % (time.time() - started)

def makeIndep(fprefix='fakebigped', fpath='./',
              width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[],
              alleles={}, freqs={}, fbatstyle=True, missrate=0.03, missval='0', fbathead=''):
    """fake a random sample from a random mating sample
    with consistent alleles and MAFs"""
    res = []
    Ntot = Nunaff + Naff
    status = [1,]*Nunaff
    status += [2,]*Naff
    outf = '%s.ped' % (fprefix)
    outf = os.path.join(fpath,outf)
    outf = open(outf, 'w')
    started = time.time()
    if fbatstyle: # add a header row with the marker names
        res.append(fbathead) # header row for fbat
    for id in xrange(Ntot):
        if id < Nunaff:
            aff = 1
        else:
            aff = 2
        family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1,
                            missrate=missrate, missval=missval)
        res += family
        if (id % 50 == 0): # write out to keep ram requirements reasonable
            if (id % 200 == 0): # show progress
                dur = time.time() - started
                if dur == 0:
                    dur = 1.0
                print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur)
            outf.write('\n'.join(res))
            outf.write('\n')
            res = []
    if len(res) > 0: # some left
        outf.write('\n'.join(res))
        outf.write('\n')
    outf.close()
    print '## makeindep: %6.1f seconds total runtime' % (time.time() - started)

u = """
Generate either trios or independent subjects with a prespecified
number of random alleles and a uniform or triangular MAF distribution for
stress testing. No LD is simulated - alleles are random. Offspring for
trios are generated by random mating of the random parental alleles, so there are
no Mendelian errors unless the -M option is used. Mendelian errors are generated
randomly according to the possible errors given the parental mating type, although
this is fresh code and not guaranteed to work quite right yet - comments welcomed.

Enquiries to ross.lazarus@gmail.com

eg to generate 700 trios with 500k snps, use:
fakebigped.py -n 2100 -s 500000
or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use:
fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02

fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000
will make fbat compatible myfake.ped with 100k markers in
666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing

fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05
will make fbat compatible myfake.ped with 100k markers in
666 trios (total close to 2000 subjects), a uniform MAF distribution,
about 5% Mendelian errors and about 5% MCAR missing

fakebigped.py -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -d Triangular -l
will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does),
with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively),
a triangular MAF distribution (more rare alleles) and about 5% MCAR missing

You should see about 1/4 million genotypes/second, so about an hour for
500k snps in 2k subjects and about a 4GB ped file - these are BIG!!

"""

galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
<title></title>
<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
</head>
<body>
<div class="document">
"""

def doImport(outfile=None, outpath=None):
    """ import into one of the new html composite data types for Rgenetics
    Dan Blankenberg with mods by Ross Lazarus
    October 2007
    """
    flist = glob.glob(os.path.join(outpath,'*'))
    outf = open(outfile,'w')
    outf.write(galhtmlprefix % prog)
    for i, data in enumerate( flist ):
        outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
    outf.write('<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>')
    outf.write('%s called with command line:<br><pre>' % prog)
    outf.write(' '.join(sys.argv))
    outf.write('\n</pre>\n')
    outf.write("</div></body></html>")
    outf.close()

if __name__ == "__main__":
    parser = OptionParser(usage=u, version="%prog 0.01")
    a = parser.add_option
    a("-n","--nsubjects",type="int",dest="Ntot",
      help="nsubj: total number of subjects",default=2000)
    a("-t","--title",dest="title",
      help="title: file basename for outputs",default='fakeped')
    a("-c","--cases",type="int",dest="Naff",
      help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default=0)
    a("-s","--snps",dest="width",type="int",
      help="snps: total number of snps per subject", default=1000)
    a("-d","--distribution",dest="MAFdist",default="Uniform",
      help="MAF distribution - default is Uniform, can be Triangular")
    a("-o","--outf",dest="outf",
      help="Output file", default='fakeped')
    a("-p","--outpath",dest="outpath",
      help="Path for output files", default='./')
    a("-l","--pLink",dest="outstyle", default='L',
      help="Ped files as for Plink - no header, separate Map file - default is Plink style")
    a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)")
    a("-m","--missing",dest="missrate",type="float",
      help="missing: probability of missing MCAR - default 0.0", default=0.0)
    a("-v","--valmiss",dest="missval",
      help="missing character: Missing allele code - usually 0 or N - default 0", default="0")
    a("-M","--Mendelrate",dest="mendrate",type="float",
      help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0)
    a("-H","--noHGRS",dest="useHG",type="int",
      help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True)
    (options,args) = parser.parse_args()
    low = options.lowmaf
    try:
        os.makedirs(options.outpath)
    except:
        pass
    if options.MAFdist.upper().startswith('U'):
        mafDist = makeUniformMAFdist(low=low, high=0.5)
    else:
        mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5)
    alleles, freqs, rslist, chromlist, poslist = makeMap(width=int(options.width),
                                                         MAFdistribution=mafDist, useGP=False)
    fbathead = []
    s = string.whitespace + string.punctuation
    trantab = string.maketrans(s,'_'*len(s))
    title = string.translate(options.title,trantab)

    if options.outstyle == 'F':
        fbatstyle = True
        fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width)
    else:
        fbatstyle = False
        writeMap(fprefix=title, rslist=rslist, fpath=options.outpath,
                 chromlist=chromlist, poslist=poslist, width=options.width)
    if options.Naff > 0: # make case control data
        makeIndep(fprefix=title, fpath=options.outpath,
                  width=options.width, Nunaff=options.Ntot-options.Naff,
                  Naff=options.Naff, MAFdistribution=mafDist, alleles=alleles, freqs=freqs,
                  fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval,
                  fbathead=fbathead)
    else:
        makePed(fprefix=title, fpath=options.outpath,
                width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot,
                alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate,
                mendrate=options.mendrate, missval=options.missval,
                fbathead=fbathead)
    doImport(outfile=options.outf, outpath=options.outpath)