[2] | 1 | #! /usr/local/bin/python2.4 |
---|
| 2 | # pedigree data faker |
---|
| 3 | # specifically designed for scalability testing of |
---|
| 4 | # Shaun Purcel's PLINK package |
---|
| 5 | # derived from John Ziniti's original suggestion |
---|
| 6 | # allele frequency spectrum and random mating added |
---|
| 7 | # ross lazarus me fecit january 13 2007 |
---|
| 8 | # copyright ross lazarus 2007 |
---|
| 9 | # without psyco |
---|
| 10 | # generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so. |
---|
| 11 | # so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate |
---|
| 12 | # psyco makes it literally twice as quick!! |
---|
| 13 | # all rights reserved except as granted under the terms of the LGPL |
---|
| 14 | # see http://www.gnu.org/licenses/lgpl.html |
---|
| 15 | # for a copy of the license you receive with this software |
---|
| 16 | # and for your rights and obligations |
---|
| 17 | # especially if you wish to modify or redistribute this code |
---|
| 18 | # january 19 added random missingness inducer |
---|
| 19 | # currently about 15M genos/minute without psyco, 30M/minute with |
---|
| 20 | # so a billion genos should take about 40 minutes with psyco or 80 without... |
---|
| 21 | # added mendel error generator jan 23 rml |
---|
| 22 | |
---|
| 23 | |
---|
| 24 | import random,sys,time,os,string
|
---|
| 25 | |
---|
| 26 | from optparse import OptionParser |
---|
| 27 | |
---|
| 28 | |
---|
width = 500000 # default number of snps per subject if not overridden
ALLELES = ['1','2','3','4'] # allele codes; shuffled to pick two distinct codes per snp
prog = os.path.split(sys.argv[0])[-1] # this script's basename, used in output headers
debug = 0 # nonzero enables extra diagnostic output
---|
| 33 | |
---|
| 34 | """Natural-order sorting, supporting embedded numbers. |
---|
| 35 | # found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html |
---|
| 36 | note test code there removed to conserve brain space |
---|
| 37 | foo9bar2 < foo10bar2 < foo10bar10 |
---|
| 38 | |
---|
| 39 | """ |
---|
| 40 | import random, re, sys |
---|
| 41 | |
---|
def natsort_key(item):
    """Sort key for natural ordering: digit runs compare numerically.

    Splits item into alternating text and number chunks. Numeric chunks
    become (0, value) and text chunks (1, text) so that numbers sort
    before text and compare by magnitude: foo9bar2 < foo10bar2 < foo10bar10.
    The original string is appended as a tie-breaker.
    """
    # raw string: avoids the invalid \d escape warning on newer Pythons
    chunks = re.split(r'(\d+(?:\.\d+)?)', item)
    for ii, chunk in enumerate(chunks):
        if chunk and chunk[0] in '0123456789':
            if '.' in chunk:
                numtype = float
            else:
                numtype = int
            # wrap in tuple with 0 to explicitly specify numbers come first
            chunks[ii] = (0, numtype(chunk))
        else:
            chunks[ii] = (1, chunk)
    return (chunks, item)
---|
| 53 | |
---|
def natsort(seq):
    "Sort a sequence of text strings in a reasonable order."
    # sorted() copies the input and sorts the copy - same result as the
    # explicit copy-then-sort the original used
    return sorted(seq, key=natsort_key)
---|
| 59 | |
---|
| 60 | |
---|
def makeUniformMAFdist(low=0.02, high=0.5):
    """Fake a minor allele frequency distribution to make the data
    more interesting. Provide a uniform low..high distribution in
    0.01 steps, endpoints inclusive.
    """
    MAFdistribution = []
    # range (not Python-2-only xrange) keeps this runnable under 2 and 3;
    # behavior in the loop is identical
    for i in range(int(100*low), int(100*high)+1):
        freq = i/100.0 # uniform
        MAFdistribution.append(freq)
    return MAFdistribution
---|
| 69 | |
---|
def makeTriangularMAFdist(low=0.02, high=0.5, beta=5):
    """Fake a non-uniform maf distribution to make the data
    more interesting - more rare alleles.

    Each frequency (51-i)/100 is repeated beta*i times, so small
    frequencies appear far more often than large ones.
    """
    MAFdistribution = []
    # range (not Python-2-only xrange) keeps this runnable under 2 and 3
    for i in range(int(100*low), int(100*high)+1):
        freq = (51 - i)/100.0 # large numbers of small allele freqs
        for j in range(beta*i): # or i*i for crude exponential distribution
            MAFdistribution.append(freq)
    return MAFdistribution
---|
| 79 | |
---|
def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000):
    """Build the fbat header row: one space-separated
    chrom_pos_rs token per marker."""
    tokens = []
    for i, rs in enumerate(rslist):
        tokens.append('%s_%s_%s' % (chromlist[i], poslist[i], rs))
    return ' '.join(tokens)
---|
| 85 | |
---|
def makeMap( width=500000, MAFdistribution=[], useGP=False):
    """make snp allele and frequency tables for consistent generation

    Returns (alleles, freqs, rslist, chromlist, poslist), one entry per snp:
    alleles[i] is a 2-list of distinct allele codes, freqs[i] a MAF drawn at
    random from MAFdistribution, and the last three give marker name, chrom
    and position. When useGP is True this tries a local UCSC golden path
    MySQL mirror for real rs numbers; otherwise (or if the connection
    fails) it fabricates markers 1kb apart on chromosome 1.
    """
    usegp = 1
    snpdb = 'snp126' # UCSC snp table to draw real rs numbers from
    hgdb = 'hg18'
    alleles = []
    freqs = []
    rslist = []
    chromlist = []
    poslist = []
    for snp in range(width):
        # shuffle the global allele codes and take the first two, which
        # are therefore always distinct
        random.shuffle(ALLELES)
        alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles!
        freqs.append(random.choice(MAFdistribution)) # more rare alleles
    if useGP:
        try:
            import MySQLdb
            genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3')
            curs = genome.cursor() # use default cursor
        except: # NOTE(review): bare except hides the real import/connect error
            if debug:
                print 'cannot connect to local copy of golden path'
            usegp = 0
    if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated....
        curs.execute('use %s' % hgdb)
        print 'Collecting %d real rs numbers - this may take a while' % width
        # get a random draw of enough reasonable (hapmap) snps with frequency data
        s = '''select distinct chrom,chromEnd, name from %s where avHet > 0 and chrom not like '%%random'
        group by name order by rand() limit %d''' % (snpdb,width)
        curs.execute(s)
        reslist = curs.fetchall()
        # strip the leading 'chr' from chrom and zero-pad positions so the
        # natural sort below orders markers by chromosome then offset
        reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr
        reslist = natsort(reslist)
        for s in reslist:
            chrom,pos,rs = s.split('\t')
            rslist.append(rs)
            chromlist.append(chrom)
            poslist.append(pos)
    else:
        # no golden path available: fabricate rs numbers on chromosome 1
        chrom = '1'
        for snp in range(width):
            pos = '%d' % (1000*snp)
            rs = 'rs00%d' % snp
            rslist.append(rs)
            chromlist.append(chrom)
            poslist.append(pos)
    return alleles,freqs, rslist, chromlist, poslist
---|
| 133 | |
---|
def writeMap(fprefix = '', fpath='./', rslist=[], chromlist=[], poslist=[], width = 500000):
    """Write a faked plink compatible .map file - fbat files
    carry the map as a header line instead.

    One tab-separated row per marker: chrom, rs, 0 (genetic distance),
    position; the file ends with a newline.
    """
    mapfname = os.path.join(fpath, '%s.map' % (fprefix))
    rows = []
    for i in range(len(rslist)):
        rows.append('%s\t%s\t0\t%s' % (chromlist[i], rslist[i], poslist[i]))
    rows.append('') # force a trailing newline on join
    mapfile = open(mapfname, 'w')
    mapfile.write('\n'.join(rows))
    mapfile.close()
---|
| 144 | |
---|
def makeMissing(genos=[], missrate = 0.03, missval = '0'):
    """Impose random (MCAR) missingness: each genotype string in genos is
    independently replaced by the missing code with probability missrate.
    Mutates genos in place and also returns it."""
    nsnps = len(genos)
    for idx in range(nsnps):
        if random.random() <= missrate:
            genos[idx] = '%s %s' % (missval, missval)
    return genos
---|
| 152 | |
---|
def makeTriomissing(genos=[], missrate = 0.03, missval = '0'):
    """Impose random missingness on a trio - moth eaten like real data.

    genos is [dad, mum, kid], each a list of genotype strings; each
    genotype is independently blanked with probability missrate.
    Mutates genos in place and also returns it.

    Bug fix: the original wrapped this in a redundant ``for person in
    (0,1)`` loop whose variable was immediately shadowed by the inner
    per-person loop, so every genotype got two chances to go missing and
    the effective rate was 1-(1-missrate)**2 rather than missrate.
    """
    nsnps = len(genos[0]) # all three members carry the same marker count
    for snp in range(nsnps):
        for person in (0, 1, 2):
            if random.random() <= missrate:
                genos[person][snp] = '%s %s' % (missval, missval)
    return genos
---|
| 162 | |
---|
| 163 | |
---|
def makeTriomendel(p1g=(0,0),p2g=(0,0), kiddip = (0,0)):
    """impose some random mendels on a trio
    there are 8 of the 9 mating types we can simulate reasonable errors for
    Note, since random mating dual het parents can produce any genotype we can't generate an interesting
    error for them, so the overall mendel rate will be lower than mendrate, depending on
    allele frequency...

    Genotypes are diploid tuples of 0 (common) / 1 (rare) alleles; returns
    a kid genotype that is mendel-inconsistent with parents p1g/p2g where
    possible, otherwise the supplied kiddip unchanged.

    Fix: replaced the Python-2-only ``<>`` operator with ``!=`` (a
    SyntaxError under Python 3; ``!=`` is valid in both).
    """
    if p1g[0] != p1g[1] and p2g[0] != p2g[1]: # both parents het
        return kiddip # cannot simulate a mendel error - anything is legal!
    elif (p1g[0] != p1g[1]): # p1 is het parent so p2 must be hom
        if p2g[0] == 0: # - make child p2 opposite hom for error
            kiddip = (1,1)
        else:
            kiddip = (0,0)
    elif (p2g[0] != p2g[1]): # p2 is het parent so p1 must be hom
        if p1g[0] == 0: # - make child p1 opposite hom for error
            kiddip = (1,1)
        else:
            kiddip = (0,0)
    elif (p1g[0] == p1g[1]): # p1 is hom parent and if we get here p2 must also be hom
        if p1g[0] == p2g[0]: # both parents are same hom - make child either het or opposite hom for error
            if random.random() <= 0.5:
                kiddip = (0,1)
            else:
                if p1g[0] == 0:
                    kiddip = (1,1)
                else:
                    kiddip = (0,0)
        else: # parents are opposite hom - return any hom as an error
            if random.random() <= 0.5:
                kiddip = (0,0)
            else:
                kiddip = (1,1)
    return kiddip
---|
| 197 | |
---|
| 198 | |
---|
| 199 | |
---|
| 200 | |
---|
def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0):
    """this family is a simple trio, constructed by random mating two random genotypes
    TODO: why not generate from chromosomes - eg hapmap
    set each haplotype locus according to the conditional
    probability implied by the surrounding loci - eg use both neighboring pairs, triplets
    and quads as observed in hapmap ceu

    Returns a 3-element list of ped-file rows (dad, mum, kid) for family
    number *trio*. Parents are drawn allele-by-allele from freqs[snp],
    the kid by random mating of parental gametes; kid genotypes are then
    optionally corrupted with mendel errors (mendrate) and the whole trio
    with missingness (missrate).
    """
    dadped = '%d 1 0 0 1 1 %s'
    mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :)
    kidped = '%d 3 1 2 %d %d %s'
    family = [] # result accumulator
    sex = random.choice((1,2)) # for the kid
    affected = random.choice((1,2))
    genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles
    # parent1...kidn lists of 0/1 for common,rare initially, then xformed to alleles
    for snp in xrange(width):
        f = freqs[snp]
        for i in range(2): # do dad and mum
            p = random.random()
            a1 = a2 = 0
            if p <= f: # a rare allele
                a1 = 1
            p = random.random()
            if p <= f: # a rare allele
                a2 = 1
            if a1 > a2:
                a1,a2 = a2,a1 # so ordering consistent - 00,01,11
            dip = (a1,a2)
            genos[i].append(dip) # tuples of 0,1
        a1 = random.choice(genos[0][snp]) # dad gamete
        a2 = random.choice(genos[1][snp]) # mum gamete
        if a1 > a2:
            a1,a2 = a2,a1 # so ordering consistent - 00,01,11
        kiddip = (a1,a2) # NSFW mating!
        genos[2].append(kiddip)
        if mendrate > 0:
            # with probability mendrate, replace the kid genotype with a
            # mendel-inconsistent one for this parental mating type
            if random.random() <= mendrate:
                genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp], kiddip)
        achoice = alleles[snp]
        for g in genos: # now convert to alleles using allele dict
            a1 = achoice[g[snp][0]] # get allele letter
            a2 = achoice[g[snp][1]]
            g[snp] = '%s %s' % (a1,a2)
    if missrate > 0:
        genos = makeTriomissing(genos=genos,missrate=missrate, missval=missval)
    family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of trio
    family.append(mumped % (trio,' '.join(genos[1])))
    family.append(kidped % (trio,sex,affected,' '.join(genos[2])))
    return family
---|
| 249 | |
---|
def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate = 0.03, missval='0'):
    """Make an entire genotype vector for an independent subject.

    Returns a single-element list holding one ped-file row: family id,
    within-family id 1, no parents, random sex, affection status aff
    (randomized if falsy), then width genotypes drawn per-snp from
    freqs[snp] using the allele codes in alleles[snp].

    Fixes over the original: ``range`` instead of the Python-2-only
    ``xrange`` (equivalent in a for loop), and the dead local ``poly``
    removed.
    """
    sex = random.choice((1,2))
    if not aff:
        aff = random.choice((1,2))
    genos = [] # 0/1 for common,rare initially, then xform to alleles
    family = []
    personped = '%d 1 0 0 %d %d %s'
    for snp in range(width):
        achoice = alleles[snp]
        f = freqs[snp]
        p = random.random()
        a1 = a2 = 0
        if p <= f: # a rare allele
            a1 = 1
        p = random.random()
        if p <= f: # a rare allele
            a2 = 1
        if a1 > a2:
            a1,a2 = a2,a1 # so ordering consistent - 00,01,11
        a1 = achoice[a1] # get allele letter
        a2 = achoice[a2]
        genos.append('%s %s' % (a1,a2))
    if missrate > 0.0:
        genos = makeMissing(genos=genos,missrate=missrate, missval=missval)
    family.append(personped % (id,sex,aff,' '.join(genos)))
    return family
---|
| 279 | |
---|
def makeHapmap(fprefix= 'fakebigped',width=100, aff=[], freqs={},
    alleles={}, nsubj = 2000, trios = True, mendrate=0.03, missrate = 0.03, missval='0'):
    """ fake a hapmap file and a pedigree file for eg haploview
    this is arranged as the transpose of a ped file - cols are subjects, rows are markers
    so we need to generate differently since we can't do the transpose in ram reliably for
    a few billion genotypes...

    NOTE(review): this function appears unfinished/dead. It references
    several names defined nowhere in this file (ids, copy, race, line,
    gene, probeid, flank, timenow), iterates ``for n in ntrios`` over an
    int, and uses the Python-2-only ``file`` builtin - it will raise as
    soon as it is called. It is never called from __main__. Left
    byte-identical pending a decision to repair or remove it.
    """
    outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s'
    cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
    "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"]
    yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
    "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"]
    sampids = ids # NOTE(review): 'ids' is undefined in this file
    if trios:
        ts = '%d trios' % int(nsubj/3.)
    else:
        ts = '%d unrelated subjects' % nsubj
    res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),] # NOTE(review): timenow undefined
    res.append('# ross lazarus me fecit')
    res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts
    outf = open('%s.hmap' % (fprefix), 'w') # NOTE(review): opened but never written or closed
    started = time.time()
    if trios:
        ntrios = int(nsubj/3.)
        for n in ntrios: # each is a dict # NOTE(review): iterating an int - TypeError
            row = copy.copy(cfake5) # get first fields # NOTE(review): copy is never imported
            row = map(str,row)
            if race == "YRI": # NOTE(review): 'race' is undefined
                row += yfake5
            elif race == 'CEU':
                row += cfake5
            else:
                row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode
            row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order # NOTE(review): 'line' undefined
            res.append(' '.join(row))
    res.append('')
    outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000) # NOTE(review): gene/probeid/flank undefined
    f = file(outfname,'w')
    f.write('\n'.join(res))
    f.close()
    print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname)
---|
| 321 | |
---|
| 322 | |
---|
| 323 | def makePed(fprefix= 'fakebigped', fpath='./',
|
---|
| 324 | width=500000, nsubj=2000, MAFdistribution=[],alleles={}, |
---|
| 325 | freqs={}, fbatstyle=True, mendrate = 0.0, missrate = 0.03, missval='0',fbathead=''): |
---|
| 326 | """fake trios with mendel consistent random mating genotypes in offspring |
---|
| 327 | with consistent alleles and MAFs for the sample""" |
---|
| 328 | res = [] |
---|
| 329 | if fbatstyle: # add a header row with the marker names |
---|
| 330 | res.append(fbathead) # header row for fbat
|
---|
| 331 | outfname = '%s.ped'% (fprefix)
|
---|
| 332 | outfname = os.path.join(fpath,outfname)
|
---|
| 333 | outf = open(outfname,'w') |
---|
| 334 | ntrios = int(nsubj/3.) |
---|
| 335 | outf = open(outfile, 'w') |
---|
| 336 | started = time.time() |
---|
| 337 | for trio in xrange(ntrios): |
---|
| 338 | family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio, |
---|
| 339 | missrate = missrate, mendrate=mendrate, missval=missval) |
---|
| 340 | res += family |
---|
| 341 | if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable |
---|
| 342 | if (trio + 1) % 50 == 0: # show progress |
---|
| 343 | dur = time.time() - started |
---|
| 344 | if dur == 0: |
---|
| 345 | dur = 1.0 |
---|
| 346 | print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur) |
---|
| 347 | outf.write('\n'.join(res)) |
---|
| 348 | outf.write('\n') |
---|
| 349 | res = [] |
---|
| 350 | if len(res) > 0: # some left |
---|
| 351 | outf.write('\n'.join(res)) |
---|
| 352 | outf.write('\n') |
---|
| 353 | outf.close() |
---|
| 354 | if debug:
|
---|
| 355 | print '##makeped : %6.1f seconds total runtime' % (time.time() - started) |
---|
| 356 | |
---|
| 357 | def makeIndep(fprefix = 'fakebigped', fpath='./',
|
---|
| 358 | width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[], |
---|
| 359 | alleles={}, freqs={}, fbatstyle=True, missrate = 0.03, missval='0',fbathead=''): |
---|
| 360 | """fake a random sample from a random mating sample |
---|
| 361 | with consistent alleles and MAFs""" |
---|
| 362 | res = [] |
---|
| 363 | Ntot = Nunaff + Naff |
---|
| 364 | status = [1,]*Nunaff |
---|
| 365 | status += [2,]*Nunaff
|
---|
| 366 | outf = '%s.ped' % (fprefix)
|
---|
| 367 | outf = os.path.join(fpath,outf) |
---|
| 368 | outf = open(outf, 'w') |
---|
| 369 | started = time.time() |
---|
| 370 | #sample = personMaker(width=width, affs=status, freqs=freqs, alleles=alleles, Ntomake=Ntot) |
---|
| 371 | if fbatstyle: # add a header row with the marker names |
---|
| 372 | res.append(fbathead) # header row for fbat |
---|
| 373 | for id in xrange(Ntot): |
---|
| 374 | if id < Nunaff: |
---|
| 375 | aff = 1 |
---|
| 376 | else: |
---|
| 377 | aff = 2 |
---|
| 378 | family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1) |
---|
| 379 | res += family |
---|
| 380 | if (id % 50 == 0): # write out to keep ram requirements reasonable |
---|
| 381 | if (id % 200 == 0): # show progress |
---|
| 382 | dur = time.time() - started |
---|
| 383 | if dur == 0: |
---|
| 384 | dur = 1.0 |
---|
| 385 | print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur) |
---|
| 386 | outf.write('\n'.join(res)) |
---|
| 387 | outf.write('\n') |
---|
| 388 | res = [] |
---|
| 389 | if len(res) > 0: # some left |
---|
| 390 | outf.write('\n'.join(res)) |
---|
| 391 | outf.write('\n') |
---|
| 392 | outf.close() |
---|
| 393 | print '## makeindep: %6.1f seconds total runtime' % (time.time() - started) |
---|
| 394 | |
---|
| 395 | u = """ |
---|
| 396 | Generate either trios or independent subjects with a prespecified |
---|
| 397 | number of random alleles and a uniform or triangular MAF distribution for |
---|
| 398 | stress testing. No LD is simulated - alleles are random. Offspring for |
---|
| 399 | trios are generated by random mating the random parental alleles so there are |
---|
| 400 | no Mendelian errors unless the -M option is used. Mendelian errors are generated |
---|
| 401 | randomly according to the possible errors given the parental mating type although |
---|
| 402 | this is fresh code and not guaranteed to work quite right yet - comments welcomed |
---|
| 403 | |
---|
| 404 | Enquiries to ross.lazarus@gmail.com |
---|
| 405 | |
---|
| 406 | eg to generate 700 trios with 500k snps, use: |
---|
| 407 | fakebigped.py -n 2100 -s 500000 |
---|
| 408 | or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use: |
---|
| 409 | fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02 |
---|
| 410 | |
---|
| 411 | fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 |
---|
| 412 | will make fbat compatible myfake.ped with 100k markers in |
---|
| 413 | 666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing |
---|
| 414 | |
---|
| 415 | fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05 |
---|
| 416 | will make fbat compatible myfake.ped with 100k markers in |
---|
| 417 | 666 trios (total close to 2000 subjects), a uniform MAF distribution, |
---|
| 418 | about 5% Mendelian errors and about 5% MCAR missing |
---|
| 419 | |
---|
| 420 | |
---|
| 421 | fakebigped.py -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -l |
---|
| 422 | will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does), |
---|
| 423 | with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively), |
---|
| 424 | a triangular MAF distribution (more rare alleles) and about 5% MCAR missing |
---|
| 425 | |
---|
| 426 | You should see about 1/4 million genotypes/second so about an hour for a |
---|
| 427 | 500k snps in 2k subjects and about a 4GB ped file - these are BIG!! |
---|
| 428 | |
---|
| 429 | """ |
---|
| 430 | |
---|
| 431 | import sys, os, glob |
---|
| 432 | |
---|
| 433 | galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> |
---|
| 434 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
---|
| 435 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> |
---|
| 436 | <head> |
---|
| 437 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
---|
| 438 | <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> |
---|
| 439 | <title></title> |
---|
| 440 | <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> |
---|
| 441 | </head> |
---|
| 442 | <body> |
---|
| 443 | <div class="document"> |
---|
| 444 | """ |
---|
| 445 | |
---|
| 446 | |
---|
def doImport(outfile=None,outpath=None):
    """ import into one of the new html composite data types for Rgenetics
    Dan Blankenberg with mods by Ross Lazarus
    October 2007

    Writes an html index page to outfile listing (as relative links)
    every file found in outpath, followed by the command line used.
    """
    page = open(outfile,'w')
    page.write(galhtmlprefix % prog)
    for fname in glob.glob(os.path.join(outpath,'*')):
        base = os.path.split(fname)[-1]
        page.write('<li><a href="%s">%s</a></li>\n' % (base,base))
    page.write('<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>')
    page.write('%s called with command line:<br><pre>' % prog)
    page.write(' '.join(sys.argv))
    page.write('\n</pre>\n')
    page.write("</div></body></html>")
    page.close()
---|
| 463 | |
---|
| 464 | |
---|
| 465 | |
---|
| 466 | if __name__ == "__main__": |
---|
| 467 | """ |
---|
| 468 | """ |
---|
| 469 | parser = OptionParser(usage=u, version="%prog 0.01") |
---|
| 470 | a = parser.add_option |
---|
| 471 | a("-n","--nsubjects",type="int",dest="Ntot", |
---|
| 472 | help="nsubj: total number of subjects",default=2000) |
---|
| 473 | a("-t","--title",dest="title", |
---|
| 474 | help="title: file basename for outputs",default='fakeped') |
---|
| 475 | a("-c","--cases",type="int",dest="Naff", |
---|
| 476 | help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default = 0) |
---|
| 477 | a("-s","--snps",dest="width",type="int", |
---|
| 478 | help="snps: total number of snps per subject", default=1000) |
---|
| 479 | a("-d","--distribution",dest="MAFdist",default="Uniform", |
---|
| 480 | help="MAF distribution - default is Uniform, can be Triangular") |
---|
| 481 | a("-o","--outf",dest="outf", |
---|
| 482 | help="Output file", default = 'fakeped') |
---|
| 483 | a("-p","--outpath",dest="outpath", |
---|
| 484 | help="Path for output files", default = './') |
---|
| 485 | a("-l","--pLink",dest="outstyle", default='L', |
---|
| 486 | help="Ped files as for Plink - no header, separate Map file - default is Plink style") |
---|
| 487 | a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)") |
---|
| 488 | a("-m","--missing",dest="missrate",type="float", |
---|
| 489 | help="missing: probability of missing MCAR - default 0.0", default=0.0) |
---|
| 490 | a("-v","--valmiss",dest="missval", |
---|
| 491 | help="missing character: Missing allele code - usually 0 or N - default 0", default="0") |
---|
| 492 | a("-M","--Mendelrate",dest="mendrate",type="float", |
---|
| 493 | help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0) |
---|
| 494 | a("-H","--noHGRS",dest="useHG",type="int", |
---|
| 495 | help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True) |
---|
| 496 | (options,args) = parser.parse_args() |
---|
| 497 | low = options.lowmaf |
---|
| 498 | try:
|
---|
| 499 | os.makedirs(options.outpath)
|
---|
| 500 | except:
|
---|
| 501 | pass
|
---|
| 502 | if options.MAFdist.upper() == 'U': |
---|
| 503 | mafDist = makeUniformMAFdist(low=low, high=0.5) |
---|
| 504 | else: |
---|
| 505 | mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5)
|
---|
| 506 | alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width),
|
---|
| 507 | MAFdistribution=mafDist, useGP=False)
|
---|
| 508 | fbathead = []
|
---|
| 509 | s = string.whitespace+string.punctuation
|
---|
| 510 | trantab = string.maketrans(s,'_'*len(s))
|
---|
| 511 | title = string.translate(options.title,trantab)
|
---|
| 512 |
|
---|
| 513 | if options.outstyle == 'F':
|
---|
| 514 | fbatstyle = True |
---|
| 515 | fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width) |
---|
| 516 | else:
|
---|
| 517 | fbatstyle = False |
---|
| 518 | writeMap(fprefix=title, rslist=rslist, fpath=options.outpath,
|
---|
| 519 | chromlist=chromlist, poslist=poslist, width=options.width) |
---|
| 520 | if options.Naff > 0: # make case control data |
---|
| 521 | makeIndep(fprefix = title, fpath=options.outpath,
|
---|
| 522 | width=options.width, Nunaff=options.Ntot-options.Naff, |
---|
| 523 | Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs, |
---|
| 524 | fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval, |
---|
| 525 | fbathead=fbathead) |
---|
| 526 | else: |
---|
| 527 | makePed(fprefix=options.fprefix, fpath=options.fpath,
|
---|
| 528 | width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot, |
---|
| 529 | alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate, |
---|
| 530 | mendrate=options.mendrate, missval=options.missval, |
---|
| 531 | fbathead=fbathead)
|
---|
| 532 | doImport(outfile=options.outf,outpath=options.outpath) |
---|
| 533 | |
---|
| 534 | |
---|
| 535 | |
---|