#! /usr/local/bin/python2.4 # pedigree data faker # specifically designed for scalability testing of # Shaun Purcel's PLINK package # derived from John Ziniti's original suggestion # allele frequency spectrum and random mating added # ross lazarus me fecit january 13 2007 # copyright ross lazarus 2007 # without psyco # generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so. # so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate # psyco makes it literally twice as quick!! # all rights reserved except as granted under the terms of the LGPL # see http://www.gnu.org/licenses/lgpl.html # for a copy of the license you receive with this software # and for your rights and obligations # especially if you wish to modify or redistribute this code # january 19 added random missingness inducer # currently about 15M genos/minute without psyco, 30M/minute with # so a billion genos should take about 40 minutes with psyco or 80 without... # added mendel error generator jan 23 rml import random,sys,time,os,string from optparse import OptionParser width = 500000 ALLELES = ['1','2','3','4'] prog = os.path.split(sys.argv[0])[-1] debug = 0 """Natural-order sorting, supporting embedded numbers. # found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html note test code there removed to conserve brain space foo9bar2 < foo10bar2 < foo10bar10 """ import random, re, sys def natsort_key(item): chunks = re.split('(\d+(?:\.\d+)?)', item) for ii in range(len(chunks)): if chunks[ii] and chunks[ii][0] in '0123456789': if '.' in chunks[ii]: numtype = float else: numtype = int # wrap in tuple with '0' to explicitly specify numbers come first chunks[ii] = (0, numtype(chunks[ii])) else: chunks[ii] = (1, chunks[ii]) return (chunks, item) def natsort(seq): "Sort a sequence of text strings in a reasonable order." alist = [item for item in seq] alist.sort(key=natsort_key) return alist def makeUniformMAFdist(low=0.02, high=0.5): """Fake a non-uniform maf distribution to make the data more interesting. Provide uniform 0.02-0.5 distribution""" MAFdistribution = [] for i in xrange(int(100*low),int(100*high)+1): freq = i/100.0 # uniform MAFdistribution.append(freq) return MAFdistribution def makeTriangularMAFdist(low=0.02, high=0.5, beta=5): """Fake a non-uniform maf distribution to make the data more interesting - more rare alleles """ MAFdistribution = [] for i in xrange(int(100*low),int(100*high)+1): freq = (51 - i)/100.0 # large numbers of small allele freqs for j in range(beta*i): # or i*i for crude exponential distribution MAFdistribution.append(freq) return MAFdistribution def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000): """header row """ res = ['%s_%s_%s' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))] return ' '.join(res) def makeMap( width=500000, MAFdistribution=[], useGP=False): """make snp allele and frequency tables for consistent generation""" usegp = 1 snpdb = 'snp126' hgdb = 'hg18' alleles = [] freqs = [] rslist = [] chromlist = [] poslist = [] for snp in range(width): random.shuffle(ALLELES) alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles! freqs.append(random.choice(MAFdistribution)) # more rare alleles if useGP: try: import MySQLdb genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3') curs = genome.cursor() # use default cursor except: if debug: print 'cannot connect to local copy of golden path' usegp = 0 if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated.... curs.execute('use %s' % hgdb) print 'Collecting %d real rs numbers - this may take a while' % width # get a random draw of enough reasonable (hapmap) snps with frequency data s = '''select distinct chrom,chromEnd, name from %s where avHet > 0 and chrom not like '%%random' group by name order by rand() limit %d''' % (snpdb,width) curs.execute(s) reslist = curs.fetchall() reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr reslist = natsort(reslist) for s in reslist: chrom,pos,rs = s.split('\t') rslist.append(rs) chromlist.append(chrom) poslist.append(pos) else: chrom = '1' for snp in range(width): pos = '%d' % (1000*snp) rs = 'rs00%d' % snp rslist.append(rs) chromlist.append(chrom) poslist.append(pos) return alleles,freqs, rslist, chromlist, poslist def writeMap(fprefix = '', fpath='./', rslist=[], chromlist=[], poslist=[], width = 500000): """make a faked plink compatible map file - fbat files have the map written as a header line""" outf = '%s.map'% (fprefix) outf = os.path.join(fpath,outf) amap = open(outf, 'w') res = ['%s\t%s\t0\t%s' % (chromlist[x],rslist[x],poslist[x]) for x in range(len(rslist))] res.append('') amap.write('\n'.join(res)) amap.close() def makeMissing(genos=[], missrate = 0.03, missval = '0'): """impose some random missingness""" nsnps = len(genos) for snp in range(nsnps): # ignore first 6 columns if random.random() <= missrate: genos[snp] = '%s %s' % (missval,missval) return genos def makeTriomissing(genos=[], missrate = 0.03, missval = '0'): """impose some random missingness on a trio - moth eaten like real data""" for person in (0,1): nsnps = len(genos[person]) for snp in range(nsnps): for person in [0,1,2]: if random.random() <= missrate: genos[person][snp] = '%s %s' % (missval,missval) return genos def makeTriomendel(p1g=(0,0),p2g=(0,0), kiddip = (0,0)): """impose some random mendels on a trio there are 8 of the 9 mating types we can simulate reasonable errors for Note, since random mating dual het parents can produce any genotype we can't generate an interesting error for them, so the overall mendel rate will be lower than mendrate, depending on allele frequency...""" if p1g[0] <> p1g[1] and p2g[0] <> p2g[1]: # both parents het return kiddip # cannot simulate a mendel error - anything is legal! elif (p1g[0] <> p1g[1]): # p1 is het parent so p2 must be hom if p2g[0] == 0: # - make child p2 opposite hom for error kiddip = (1,1) else: kiddip = (0,0) elif (p2g[0] <> p2g[1]): # p2 is het parent so p1 must be hom if p1g[0] == 0: # - make child p1 opposite hom for error kiddip = (1,1) else: kiddip = (0,0) elif (p1g[0] == p1g[1]): # p1 is hom parent and if we get here p2 must also be hom if p1g[0] == p2g[0]: # both parents are same hom - make child either het or opposite hom for error if random.random() <= 0.5: kiddip = (0,1) else: if p1g[0] == 0: kiddip = (1,1) else: kiddip = (0,0) else: # parents are opposite hom - return any hom as an error if random.random() <= 0.5: kiddip = (0,0) else: kiddip = (1,1) return kiddip def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0): """this family is a simple trio, constructed by random mating two random genotypes TODO: why not generate from chromosomes - eg hapmap set each haplotype locus according to the conditional probability implied by the surrounding loci - eg use both neighboring pairs, triplets and quads as observed in hapmap ceu""" dadped = '%d 1 0 0 1 1 %s' mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :) kidped = '%d 3 1 2 %d %d %s' family = [] # result accumulator sex = random.choice((1,2)) # for the kid affected = random.choice((1,2)) genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles # parent1...kidn lists of 0/1 for common,rare initially, then xformed to alleles for snp in xrange(width): f = freqs[snp] for i in range(2): # do dad and mum p = random.random() a1 = a2 = 0 if p <= f: # a rare allele a1 = 1 p = random.random() if p <= f: # a rare allele a2 = 1 if a1 > a2: a1,a2 = a2,a1 # so ordering consistent - 00,01,11 dip = (a1,a2) genos[i].append(dip) # tuples of 0,1 a1 = random.choice(genos[0][snp]) # dad gamete a2 = random.choice(genos[1][snp]) # mum gamete if a1 > a2: a1,a2 = a2,a1 # so ordering consistent - 00,01,11 kiddip = (a1,a2) # NSFW mating! genos[2].append(kiddip) if mendrate > 0: if random.random() <= mendrate: genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp], kiddip) achoice = alleles[snp] for g in genos: # now convert to alleles using allele dict a1 = achoice[g[snp][0]] # get allele letter a2 = achoice[g[snp][1]] g[snp] = '%s %s' % (a1,a2) if missrate > 0: genos = makeTriomissing(genos=genos,missrate=missrate, missval=missval) family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of trio family.append(mumped % (trio,' '.join(genos[1]))) family.append(kidped % (trio,sex,affected,' '.join(genos[2]))) return family def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate = 0.03, missval='0'): """make an entire genotype vector for an independent subject""" sex = random.choice((1,2)) if not aff: aff = random.choice((1,2)) genos = [] #0/1 for common,rare initially, then xform to alleles family = [] personped = '%d 1 0 0 %d %d %s' poly = (0,1) for snp in xrange(width): achoice = alleles[snp] f = freqs[snp] p = random.random() a1 = a2 = 0 if p <= f: # a rare allele a1 = 1 p = random.random() if p <= f: # a rare allele a2 = 1 if a1 > a2: a1,a2 = a2,a1 # so ordering consistent - 00,01,11 a1 = achoice[a1] # get allele letter a2 = achoice[a2] g = '%s %s' % (a1,a2) genos.append(g) if missrate > 0.0: genos = makeMissing(genos=genos,missrate=missrate, missval=missval) family.append(personped % (id,sex,aff,' '.join(genos))) return family def makeHapmap(fprefix= 'fakebigped',width=100, aff=[], freqs={}, alleles={}, nsubj = 2000, trios = True, mendrate=0.03, missrate = 0.03, missval='0'): """ fake a hapmap file and a pedigree file for eg haploview this is arranged as the transpose of a ped file - cols are subjects, rows are markers so we need to generate differently since we can't do the transpose in ram reliably for a few billion genotypes... """ outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s' cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1", "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"] yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1", "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"] sampids = ids if trios: ts = '%d trios' % int(nsubj/3.) else: ts = '%d unrelated subjects' % nsubj res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),] res.append('# ross lazarus me fecit') res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts outf = open('%s.hmap' % (fprefix), 'w') started = time.time() if trios: ntrios = int(nsubj/3.) for n in ntrios: # each is a dict row = copy.copy(cfake5) # get first fields row = map(str,row) if race == "YRI": row += yfake5 elif race == 'CEU': row += cfake5 else: row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order res.append(' '.join(row)) res.append('') outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000) f = file(outfname,'w') f.write('\n'.join(res)) f.close() print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname) def makePed(fprefix= 'fakebigped', fpath='./', width=500000, nsubj=2000, MAFdistribution=[],alleles={}, freqs={}, fbatstyle=True, mendrate = 0.0, missrate = 0.03, missval='0',fbathead=''): """fake trios with mendel consistent random mating genotypes in offspring with consistent alleles and MAFs for the sample""" res = [] if fbatstyle: # add a header row with the marker names res.append(fbathead) # header row for fbat outfname = '%s.ped'% (fprefix) outfname = os.path.join(fpath,outfname) outf = open(outfname,'w') ntrios = int(nsubj/3.) outf = open(outfile, 'w') started = time.time() for trio in xrange(ntrios): family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio, missrate = missrate, mendrate=mendrate, missval=missval) res += family if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable if (trio + 1) % 50 == 0: # show progress dur = time.time() - started if dur == 0: dur = 1.0 print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur) outf.write('\n'.join(res)) outf.write('\n') res = [] if len(res) > 0: # some left outf.write('\n'.join(res)) outf.write('\n') outf.close() if debug: print '##makeped : %6.1f seconds total runtime' % (time.time() - started) def makeIndep(fprefix = 'fakebigped', fpath='./', width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[], alleles={}, freqs={}, fbatstyle=True, missrate = 0.03, missval='0',fbathead=''): """fake a random sample from a random mating sample with consistent alleles and MAFs""" res = [] Ntot = Nunaff + Naff status = [1,]*Nunaff status += [2,]*Nunaff outf = '%s.ped' % (fprefix) outf = os.path.join(fpath,outf) outf = open(outf, 'w') started = time.time() #sample = personMaker(width=width, affs=status, freqs=freqs, alleles=alleles, Ntomake=Ntot) if fbatstyle: # add a header row with the marker names res.append(fbathead) # header row for fbat for id in xrange(Ntot): if id < Nunaff: aff = 1 else: aff = 2 family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1) res += family if (id % 50 == 0): # write out to keep ram requirements reasonable if (id % 200 == 0): # show progress dur = time.time() - started if dur == 0: dur = 1.0 print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur) outf.write('\n'.join(res)) outf.write('\n') res = [] if len(res) > 0: # some left outf.write('\n'.join(res)) outf.write('\n') outf.close() print '## makeindep: %6.1f seconds total runtime' % (time.time() - started) u = """ Generate either trios or independent subjects with a prespecified number of random alleles and a uniform or triangular MAF distribution for stress testing. No LD is simulated - alleles are random. Offspring for trios are generated by random mating the random parental alleles so there are no Mendelian errors unless the -M option is used. Mendelian errors are generated randomly according to the possible errors given the parental mating type although this is fresh code and not guaranteed to work quite right yet - comments welcomed Enquiries to ross.lazarus@gmail.com eg to generate 700 trios with 500k snps, use: fakebigped.py -n 2100 -s 500000 or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use: fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 will make fbat compatible myfake.ped with 100k markers in 666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05 will make fbat compatible myfake.ped with 100k markers in 666 trios (total close to 2000 subjects), a uniform MAF distribution, about 5% Mendelian errors and about 5% MCAR missing fakebigped.py -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -l will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does), with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively), a triangular MAF distribution (more rare alleles) and about 5% MCAR missing You should see about 1/4 million genotypes/second so about an hour for a 500k snps in 2k subjects and about a 4GB ped file - these are BIG!! """ import sys, os, glob galhtmlprefix = """
""" def doImport(outfile=None,outpath=None): """ import into one of the new html composite data types for Rgenetics Dan Blankenberg with mods by Ross Lazarus October 2007 """ flist = glob.glob(os.path.join(outpath,'*')) outf = open(outfile,'w') outf.write(galhtmlprefix % prog) for i, data in enumerate( flist ): outf.write('
  • %s
  • \n' % (os.path.split(data)[-1],os.path.split(data)[-1])) outf.write('

    This is simulated null genotype data generated by Rgenetics!

    ') outf.write('%s called with command line:
    ' % prog)
        outf.write(' '.join(sys.argv))
        outf.write('\n
    \n') outf.write("
    ") outf.close() if __name__ == "__main__": """ """ parser = OptionParser(usage=u, version="%prog 0.01") a = parser.add_option a("-n","--nsubjects",type="int",dest="Ntot", help="nsubj: total number of subjects",default=2000) a("-t","--title",dest="title", help="title: file basename for outputs",default='fakeped') a("-c","--cases",type="int",dest="Naff", help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default = 0) a("-s","--snps",dest="width",type="int", help="snps: total number of snps per subject", default=1000) a("-d","--distribution",dest="MAFdist",default="Uniform", help="MAF distribution - default is Uniform, can be Triangular") a("-o","--outf",dest="outf", help="Output file", default = 'fakeped') a("-p","--outpath",dest="outpath", help="Path for output files", default = './') a("-l","--pLink",dest="outstyle", default='L', help="Ped files as for Plink - no header, separate Map file - default is Plink style") a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)") a("-m","--missing",dest="missrate",type="float", help="missing: probability of missing MCAR - default 0.0", default=0.0) a("-v","--valmiss",dest="missval", help="missing character: Missing allele code - usually 0 or N - default 0", default="0") a("-M","--Mendelrate",dest="mendrate",type="float", help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0) a("-H","--noHGRS",dest="useHG",type="int", help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True) (options,args) = parser.parse_args() low = options.lowmaf try: os.makedirs(options.outpath) except: pass if options.MAFdist.upper() == 'U': mafDist = makeUniformMAFdist(low=low, high=0.5) else: mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5) alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width), MAFdistribution=mafDist, useGP=False) fbathead = [] s = string.whitespace+string.punctuation trantab = string.maketrans(s,'_'*len(s)) title = string.translate(options.title,trantab) if options.outstyle == 'F': fbatstyle = True fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width) else: fbatstyle = False writeMap(fprefix=title, rslist=rslist, fpath=options.outpath, chromlist=chromlist, poslist=poslist, width=options.width) if options.Naff > 0: # make case control data makeIndep(fprefix = title, fpath=options.outpath, width=options.width, Nunaff=options.Ntot-options.Naff, Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval, fbathead=fbathead) else: makePed(fprefix=options.fprefix, fpath=options.fpath, width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot, alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate, mendrate=options.mendrate, missval=options.missval, fbathead=fbathead) doImport(outfile=options.outf,outpath=options.outpath)