root/galaxy-central/tools/rgenetics/rgfakePed.py

リビジョン 2, 22.3 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#! /usr/local/bin/python2.4
2# pedigree data faker
3# specifically designed for scalability testing of
4# Shaun Purcel's PLINK package
5# derived from John Ziniti's original suggestion
6# allele frequency spectrum and random mating added
7# ross lazarus me fecit january 13 2007
8# copyright ross lazarus 2007
9# without psyco
10# generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so.
11# so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate
12# psyco makes it literally twice as quick!!
13# all rights reserved except as granted under the terms of the LGPL
14# see http://www.gnu.org/licenses/lgpl.html
15# for a copy of the license you receive with this software
16# and for your rights and obligations
17# especially if you wish to modify or redistribute this code
18# january 19 added random missingness inducer
19# currently about 15M genos/minute without psyco, 30M/minute with
20# so a billion genos should take about 40 minutes with psyco or 80 without...
21# added mendel error generator jan 23 rml
22
23
24import random,sys,time,os,string
25
26from optparse import OptionParser
27
28   
29width = 500000
30ALLELES = ['1','2','3','4']
31prog = os.path.split(sys.argv[0])[-1]
32debug = 0
33
34"""Natural-order sorting, supporting embedded numbers.
35# found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html
36note test code there removed to conserve brain space
37foo9bar2 < foo10bar2 < foo10bar10
38
39"""
40import random, re, sys
41
42def natsort_key(item):
43    chunks = re.split('(\d+(?:\.\d+)?)', item)
44    for ii in range(len(chunks)):
45        if chunks[ii] and chunks[ii][0] in '0123456789':
46            if '.' in chunks[ii]: numtype = float
47            else: numtype = int
48            # wrap in tuple with '0' to explicitly specify numbers come first
49            chunks[ii] = (0, numtype(chunks[ii]))
50        else:
51            chunks[ii] = (1, chunks[ii])
52    return (chunks, item)
53
54def natsort(seq):
55    "Sort a sequence of text strings in a reasonable order."
56    alist = [item for item in seq]
57    alist.sort(key=natsort_key)
58    return alist
59
60
61def makeUniformMAFdist(low=0.02, high=0.5):
62    """Fake a non-uniform maf distribution to make the data
63    more interesting. Provide uniform 0.02-0.5 distribution"""
64    MAFdistribution = []
65    for i in xrange(int(100*low),int(100*high)+1):
66       freq = i/100.0 # uniform
67       MAFdistribution.append(freq)
68    return MAFdistribution
69
70def makeTriangularMAFdist(low=0.02, high=0.5, beta=5):
71    """Fake a non-uniform maf distribution to make the data
72    more interesting - more rare alleles """
73    MAFdistribution = []
74    for i in xrange(int(100*low),int(100*high)+1):
75       freq = (51 - i)/100.0 # large numbers of small allele freqs
76       for j in range(beta*i): # or i*i for crude exponential distribution
77            MAFdistribution.append(freq)
78    return MAFdistribution
79
80def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000):
81    """header row
82    """
83    res = ['%s_%s_%s' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))]
84    return ' '.join(res)
85
86def makeMap( width=500000, MAFdistribution=[], useGP=False):
87    """make snp allele and frequency tables for consistent generation"""
88    usegp = 1
89    snpdb = 'snp126'
90    hgdb = 'hg18'
91    alleles = []
92    freqs = []
93    rslist = []
94    chromlist = []
95    poslist = []
96    for snp in range(width):
97        random.shuffle(ALLELES)
98        alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles!
99        freqs.append(random.choice(MAFdistribution)) # more rare alleles
100    if useGP:
101        try:
102            import MySQLdb
103            genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3')
104            curs = genome.cursor() # use default cursor
105        except:
106            if debug:
107                print 'cannot connect to local copy of golden path'
108            usegp = 0
109    if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated....
110        curs.execute('use %s' % hgdb)
111        print 'Collecting %d real rs numbers - this may take a while' % width
112        # get a random draw of enough reasonable (hapmap) snps with frequency data
113        s = '''select distinct chrom,chromEnd, name from %s where avHet > 0 and chrom not like '%%random'
114        group by name order by rand() limit %d''' % (snpdb,width)
115        curs.execute(s)
116        reslist = curs.fetchall()
117        reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr
118        reslist = natsort(reslist)
119        for s in reslist:
120            chrom,pos,rs = s.split('\t')
121            rslist.append(rs)
122            chromlist.append(chrom)
123            poslist.append(pos)
124    else:
125        chrom = '1'
126        for snp in range(width):
127            pos = '%d' % (1000*snp)
128            rs = 'rs00%d' % snp
129            rslist.append(rs)
130            chromlist.append(chrom)
131            poslist.append(pos)
132    return alleles,freqs, rslist, chromlist, poslist
133
134def writeMap(fprefix = '', fpath='./', rslist=[], chromlist=[], poslist=[], width = 500000):
135    """make a faked plink compatible map file - fbat files
136    have the map written as a header line"""
137    outf = '%s.map'% (fprefix)
138    outf = os.path.join(fpath,outf)
139    amap = open(outf, 'w')
140    res = ['%s\t%s\t0\t%s' % (chromlist[x],rslist[x],poslist[x]) for x in range(len(rslist))]
141    res.append('')
142    amap.write('\n'.join(res))
143    amap.close()
144
145def makeMissing(genos=[], missrate = 0.03, missval = '0'):
146    """impose some random missingness"""
147    nsnps = len(genos)
148    for snp in range(nsnps): # ignore first 6 columns
149        if random.random() <= missrate:
150            genos[snp] = '%s %s' % (missval,missval)
151    return genos
152
153def makeTriomissing(genos=[], missrate = 0.03, missval = '0'):
154    """impose some random missingness on a trio - moth eaten like real data"""
155    for person in (0,1):
156        nsnps = len(genos[person])
157        for snp in range(nsnps):
158            for person in [0,1,2]:
159                if random.random() <= missrate:
160                    genos[person][snp] = '%s %s' % (missval,missval)
161    return genos
162
163
164def makeTriomendel(p1g=(0,0),p2g=(0,0), kiddip = (0,0)):
165    """impose some random mendels on a trio
166    there are 8 of the 9 mating types we can simulate reasonable errors for
167    Note, since random mating dual het parents can produce any genotype we can't generate an interesting
168    error for them, so the overall mendel rate will be lower than mendrate, depending on
169    allele frequency..."""
170    if p1g[0] <> p1g[1] and p2g[0] <> p2g[1]: # both parents het
171            return kiddip # cannot simulate a mendel error - anything is legal!
172    elif (p1g[0] <> p1g[1]): # p1 is het parent so p2 must be hom
173        if p2g[0] == 0: # - make child p2 opposite hom for error
174            kiddip = (1,1)
175        else:
176            kiddip = (0,0)
177    elif (p2g[0] <> p2g[1]): # p2 is het parent so p1 must be hom
178        if p1g[0] == 0: # - make child p1 opposite hom for error
179            kiddip = (1,1)
180        else:
181            kiddip = (0,0)
182    elif (p1g[0] == p1g[1]): # p1 is hom parent and if we get here p2 must also be hom
183        if p1g[0] == p2g[0]: # both parents are same hom - make child either het or opposite hom for error
184            if random.random() <= 0.5:
185                kiddip = (0,1)
186            else:
187                if p1g[0] == 0:
188                    kiddip = (1,1)
189                else:
190                    kiddip = (0,0)
191        else: # parents are opposite hom - return any hom as an error
192            if random.random() <= 0.5:
193                kiddip = (0,0)
194            else:
195                kiddip = (1,1)
196    return kiddip
197           
198           
199
200
201def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0):
202    """this family is a simple trio, constructed by random mating two random genotypes
203    TODO: why not generate from chromosomes - eg hapmap
204    set each haplotype locus according to the conditional
205    probability implied by the surrounding loci - eg use both neighboring pairs, triplets
206    and quads as observed in hapmap ceu"""
207    dadped = '%d 1 0 0 1 1 %s'
208    mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :)
209    kidped = '%d 3 1 2 %d %d %s'
210    family = [] # result accumulator
211    sex = random.choice((1,2)) # for the kid
212    affected = random.choice((1,2))
213    genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles
214    # parent1...kidn lists of 0/1 for common,rare initially, then xformed to alleles
215    for snp in xrange(width):
216        f = freqs[snp]           
217        for i in range(2): # do dad and mum
218            p = random.random()
219            a1 = a2 = 0
220            if p <= f: # a rare allele
221               a1 = 1
222            p = random.random()
223            if p <= f: # a rare allele
224               a2 = 1
225            if a1 > a2:
226                a1,a2 = a2,a1 # so ordering consistent - 00,01,11
227            dip = (a1,a2)
228            genos[i].append(dip) # tuples of 0,1
229        a1 = random.choice(genos[0][snp]) # dad gamete 
230        a2 = random.choice(genos[1][snp]) # mum gamete
231        if a1 > a2:
232            a1,a2 = a2,a1 # so ordering consistent - 00,01,11
233        kiddip = (a1,a2) # NSFW mating!
234        genos[2].append(kiddip)
235        if mendrate > 0:
236            if random.random() <= mendrate:
237                genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp], kiddip)
238        achoice = alleles[snp]
239        for g in genos: # now convert to alleles using allele dict
240          a1 = achoice[g[snp][0]] # get allele letter
241          a2 = achoice[g[snp][1]]             
242          g[snp] = '%s %s' % (a1,a2)
243    if missrate > 0:
244        genos = makeTriomissing(genos=genos,missrate=missrate, missval=missval)
245    family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of trio
246    family.append(mumped % (trio,' '.join(genos[1])))
247    family.append(kidped % (trio,sex,affected,' '.join(genos[2])))
248    return family
249
250def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate = 0.03, missval='0'):
251    """make an entire genotype vector for an independent subject"""
252    sex = random.choice((1,2))
253    if not aff:
254        aff = random.choice((1,2))
255    genos = [] #0/1 for common,rare initially, then xform to alleles
256    family = []
257    personped = '%d 1 0 0 %d %d %s'
258    poly = (0,1)
259    for snp in xrange(width):
260        achoice = alleles[snp]
261        f = freqs[snp]
262        p = random.random()
263        a1 = a2 = 0
264        if p <= f: # a rare allele
265           a1 = 1
266        p = random.random()
267        if p <= f: # a rare allele
268           a2 = 1
269        if a1 > a2:
270            a1,a2 = a2,a1 # so ordering consistent - 00,01,11
271        a1 = achoice[a1] # get allele letter
272        a2 = achoice[a2]
273        g = '%s %s' % (a1,a2)
274        genos.append(g)
275    if missrate > 0.0:
276        genos = makeMissing(genos=genos,missrate=missrate, missval=missval)
277    family.append(personped % (id,sex,aff,' '.join(genos)))
278    return family
279
280def makeHapmap(fprefix= 'fakebigped',width=100, aff=[], freqs={},
281               alleles={}, nsubj = 2000, trios = True, mendrate=0.03, missrate = 0.03, missval='0'):
282    """ fake a hapmap file and a pedigree file for eg haploview
283    this is arranged as the transpose of a ped file - cols are subjects, rows are markers
284    so we need to generate differently since we can't do the transpose in ram reliably for
285    a few billion genotypes...
286    """
287    outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s'
288    cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
289"urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"]
290    yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
291"urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"]
292    sampids = ids
293    if trios:
294        ts = '%d trios' % int(nsubj/3.)
295    else:
296        ts = '%d unrelated subjects' % nsubj
297    res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),]
298    res.append('# ross lazarus me fecit')
299    res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts
300    outf = open('%s.hmap' % (fprefix), 'w')
301    started = time.time()
302    if trios:
303        ntrios = int(nsubj/3.)
304        for n in ntrios: # each is a dict
305            row = copy.copy(cfake5) # get first fields
306            row = map(str,row)
307            if race == "YRI":
308                row += yfake5
309            elif race == 'CEU':
310                row += cfake5
311            else:
312                row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode
313            row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order
314            res.append(' '.join(row))
315    res.append('')
316    outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000)
317    f = file(outfname,'w')
318    f.write('\n'.join(res))
319    f.close()
320    print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname)
321     
322
323def makePed(fprefix= 'fakebigped', fpath='./',
324            width=500000, nsubj=2000, MAFdistribution=[],alleles={},
325            freqs={}, fbatstyle=True, mendrate = 0.0, missrate = 0.03, missval='0',fbathead=''):
326    """fake trios with mendel consistent random mating genotypes in offspring
327    with consistent alleles and MAFs for the sample"""
328    res = []
329    if fbatstyle: # add a header row with the marker names
330        res.append(fbathead) # header row for fbat
331    outfname = '%s.ped'% (fprefix)
332    outfname = os.path.join(fpath,outfname)
333    outf = open(outfname,'w')
334    ntrios = int(nsubj/3.)
335    outf = open(outfile, 'w')
336    started = time.time()
337    for trio in xrange(ntrios):
338        family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio,
339                         missrate = missrate, mendrate=mendrate, missval=missval)
340        res += family
341        if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable
342            if (trio + 1) % 50 == 0: # show progress
343                dur = time.time() - started
344                if dur == 0:
345                    dur = 1.0
346                print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur)
347            outf.write('\n'.join(res))
348            outf.write('\n')
349            res = []
350    if len(res) > 0: # some left
351        outf.write('\n'.join(res))
352    outf.write('\n')
353    outf.close()
354    if debug:
355        print '##makeped : %6.1f seconds total runtime' % (time.time() - started)
356
357def makeIndep(fprefix = 'fakebigped', fpath='./',
358              width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[],
359              alleles={}, freqs={}, fbatstyle=True, missrate = 0.03, missval='0',fbathead=''):
360    """fake a random sample from a random mating sample
361    with consistent alleles and MAFs"""
362    res = []
363    Ntot = Nunaff + Naff
364    status = [1,]*Nunaff
365    status += [2,]*Nunaff
366    outf = '%s.ped' % (fprefix)
367    outf = os.path.join(fpath,outf)
368    outf = open(outf, 'w')
369    started = time.time()
370    #sample = personMaker(width=width, affs=status, freqs=freqs, alleles=alleles, Ntomake=Ntot)
371    if fbatstyle: # add a header row with the marker names
372        res.append(fbathead) # header row for fbat
373    for id in xrange(Ntot):
374        if id < Nunaff:
375            aff = 1
376        else:
377            aff = 2
378        family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1)
379        res += family
380        if (id % 50 == 0): # write out to keep ram requirements reasonable
381            if (id % 200 == 0): # show progress
382                dur = time.time() - started
383                if dur == 0:
384                    dur = 1.0
385                print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur)
386            outf.write('\n'.join(res))
387            outf.write('\n')
388            res = []
389    if len(res) > 0: # some left
390        outf.write('\n'.join(res))
391    outf.write('\n')
392    outf.close()
393    print '## makeindep: %6.1f seconds total runtime' % (time.time() - started)
394
395u = """
396Generate either trios or independent subjects with a prespecified
397number of random alleles and a uniform or triangular MAF distribution for
398stress testing. No LD is simulated - alleles are random. Offspring for
399trios are generated by random mating the random parental alleles so there are
400no Mendelian errors unless the -M option is used. Mendelian errors are generated
401randomly according to the possible errors given the parental mating type although
402this is fresh code and not guaranteed to work quite right yet - comments welcomed
403
404Enquiries to ross.lazarus@gmail.com
405
406eg to generate 700 trios with 500k snps, use:
407fakebigped.py -n 2100 -s 500000
408or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use:
409fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02
410
411fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000
412will make fbat compatible myfake.ped with 100k markers in
413666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing
414
415fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05
416will make fbat compatible myfake.ped with 100k markers in
417666 trios (total close to 2000 subjects), a uniform MAF distribution,
418about 5% Mendelian errors and about 5% MCAR missing
419
420
421fakebigped.py  -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -l
422will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does),
423with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively),
424a triangular MAF distribution (more rare alleles) and about 5% MCAR missing
425
426You should see about 1/4 million genotypes/second so about an hour for a
427500k snps in 2k subjects and about a 4GB ped file - these are BIG!!
428
429"""
430
431import sys, os, glob
432
433galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
434<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
435<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
436<head>
437<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
438<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
439<title></title>
440<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
441</head>
442<body>
443<div class="document">
444"""
445
446
447def doImport(outfile=None,outpath=None):
448    """ import into one of the new html composite data types for Rgenetics
449        Dan Blankenberg with mods by Ross Lazarus
450        October 2007
451    """
452    flist = glob.glob(os.path.join(outpath,'*'))
453    outf = open(outfile,'w')
454    outf.write(galhtmlprefix % prog)
455    for i, data in enumerate( flist ):
456        outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
457    outf.write('<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>')
458    outf.write('%s called with command line:<br><pre>' % prog)
459    outf.write(' '.join(sys.argv))
460    outf.write('\n</pre>\n')
461    outf.write("</div></body></html>")
462    outf.close()
463
464
465
466if __name__ == "__main__":
467    """
468    """
469    parser = OptionParser(usage=u, version="%prog 0.01")
470    a = parser.add_option
471    a("-n","--nsubjects",type="int",dest="Ntot",
472      help="nsubj: total number of subjects",default=2000)
473    a("-t","--title",dest="title",
474      help="title: file basename for outputs",default='fakeped')
475    a("-c","--cases",type="int",dest="Naff",
476      help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default = 0)
477    a("-s","--snps",dest="width",type="int",
478      help="snps: total number of snps per subject", default=1000)
479    a("-d","--distribution",dest="MAFdist",default="Uniform",
480      help="MAF distribution - default is Uniform, can be Triangular")
481    a("-o","--outf",dest="outf",
482      help="Output file", default = 'fakeped')
483    a("-p","--outpath",dest="outpath",
484      help="Path for output files", default = './')
485    a("-l","--pLink",dest="outstyle", default='L',
486      help="Ped files as for Plink - no header, separate Map file - default is Plink style")
487    a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)")
488    a("-m","--missing",dest="missrate",type="float",
489      help="missing: probability of missing MCAR - default 0.0", default=0.0)
490    a("-v","--valmiss",dest="missval",
491      help="missing character: Missing allele code - usually 0 or N - default 0", default="0")
492    a("-M","--Mendelrate",dest="mendrate",type="float",
493      help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0)   
494    a("-H","--noHGRS",dest="useHG",type="int",
495      help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True)
496    (options,args) = parser.parse_args()
497    low = options.lowmaf
498    try:
499        os.makedirs(options.outpath)
500    except:
501        pass
502    if options.MAFdist.upper() == 'U':
503        mafDist = makeUniformMAFdist(low=low, high=0.5)
504    else:
505        mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5)
506    alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width),
507                                        MAFdistribution=mafDist, useGP=False)
508    fbathead = []
509    s = string.whitespace+string.punctuation
510    trantab = string.maketrans(s,'_'*len(s))
511    title = string.translate(options.title,trantab)
512   
513    if options.outstyle == 'F':
514        fbatstyle = True
515        fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width)
516    else:
517        fbatstyle = False
518        writeMap(fprefix=title, rslist=rslist, fpath=options.outpath,
519                 chromlist=chromlist, poslist=poslist, width=options.width)
520    if options.Naff > 0: # make case control data
521        makeIndep(fprefix = title, fpath=options.outpath,
522                  width=options.width, Nunaff=options.Ntot-options.Naff,
523                  Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs,
524                  fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval,
525                  fbathead=fbathead)
526    else:
527        makePed(fprefix=options.fprefix, fpath=options.fpath,
528            width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot,
529            alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate,
530            mendrate=options.mendrate, missval=options.missval,
531                  fbathead=fbathead)
532    doImport(outfile=options.outf,outpath=options.outpath)
533
534
535       
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。