root/galaxy-central/tools/rgenetics/rgPedSub.py @ 2

リビジョン 2, 10.6 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2July 1 2009 added relatedness filter - fo/oo or all
3released under the terms of the LGPL
4copyright ross lazarus August 2007
5for the rgenetics project
6
7Special galaxy tool for the camp2007 data
8Allows grabbing genotypes from an arbitrary region
9
10Needs a mongo results file in the location hardwired below or could be passed in as
11a library parameter - but this file must have a very specific structure
12rs chrom offset float1...floatn
13
14called as
15
16    <command interpreter="python2.4">
17        campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn"
18    </command>
19
20
21"""
22
23
24import sys, array, os, string
25from rgutils import galhtmlprefix,plinke,readMap
26
27progname = os.path.split(sys.argv[0])[1]
28
29
30atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
31
32def doImport(outfile='test',flist=[]):
33    """ import into one of the new html composite data types for Rgenetics
34        Dan Blankenberg with mods by Ross Lazarus
35        October 2007
36    """
37    out = open(outfile,'w')
38    out.write(galhtmlprefix % progname)
39
40    if len(flist) > 0:
41        out.write('<ol>\n')
42        for i, data in enumerate( flist ):
43           out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
44        out.write('</ol>\n')
45    else:
46           out.write('No files found')
47    out.write("</div></body></html>")
48    out.close()
49
50def setupPedFilter(relfilter='oo',dfile=None):
51    """ figure out who to drop to satisfy relative filtering
52    note single offspring only from each family id
53    ordering of pdict keys makes this 'random' as the first one only is
54    kept if there are multiple sibs from same familyid.
55    """
56    dropId = {}
57    keepoff = (relfilter == 'oo')
58    keepfounder = (relfilter == 'fo')
59    pdict = {}
60    for row in dfile:
61        rowl = row.strip().split()
62        if len(rowl) > 6:
63            idk = (rowl[0],rowl[1])
64            pa =  (rowl[0],rowl[2]) # key for father
65            ma = (rowl[0],rowl[3]) # and mother
66            pdict[idk] = (pa,ma)
67    dfile.seek(0) # rewind
68    pk = pdict.keys()
69    for p in pk:
70        parents = pdict[p]
71        if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file
72            if keepfounder:
73                dropId[p] = 1 # flag for removal
74        elif keepoff:
75            dropId[p] = 1 # flag for removal
76    if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...!   
77        famseen = {}
78        for p in pk: # look for multiples from same family - drop all but first
79             famid = p[0]
80             if famseen.get(famid,None):
81                 dropId[p] = 1 # already got one from this family
82             famseen.setdefault(famid,1)
83    return dropId
84   
85def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
86    """ fbat format version
87    """
88    outname = os.path.join(outdir,basename)
89    pedfname = '%s.ped' % outname
90    ofile = file(pedfname, 'w')
91    rsl = ' '.join(rslist) # rslist for fbat
92    ofile.write(rsl)
93    s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50])
94    lf.write(s)
95    ofile.write('\n')
96    nrows = 0
97    for line in dfile:
98        line = line.strip()
99        if not line:
100            continue
101        line = line.replace('D','N')
102        fields = line.split()
103        preamble = fields[:6]
104        idk = (preamble[0],preamble[1])
105        dropme = dropId.get(idk,None)
106        if not dropme:
107            g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
108            g = ' '.join(g)
109            g = g.split() # we'll get there
110            g = [atrandic.get(x,'0') for x in g] # numeric alleles...
111            # hack for framingham ND
112            ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
113            nrows += 1
114    ofile.close()
115    loglist = open(logfile,'r').readlines() # retrieve log to add to html file
116    doImport(outfile,[pedfname,],loglist=loglist)
117    return nrows,pedfname
118
119def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
120    """ split out
121    """
122    outname = os.path.join(outdir,basename)
123    mapfname = '%s.map' % outname
124    pedfname = '%s.ped' % outname
125    ofile = file(pedfname, 'w')
126    # make a map file in the lped library
127    mf = file(mapfname,'w')
128    map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order
129    mf.write('%s\n' % '\n'.join(map))
130    mf.close()
131    nrows = 0
132    for line in dfile:
133        line = line.strip()
134        if not line:
135            continue
136        #line = line.replace('D','N')
137        fields = line.split()
138        preamble = fields[:6]
139        idk = (preamble[0],preamble[1])
140        dropme = dropId.get(idk,None)
141        if not dropme:
142            g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
143            g = ' '.join(g)
144            g = g.split() # we'll get there
145            g = [atrandic.get(x,'0') for x in g] # numeric alleles...
146            # hack for framingham ND
147            ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
148            nrows += 1
149    ofile.close()
150    loglist = open(logfile,'r').readlines() # retrieve log to add to html file
151    doImport(outfile,[mapfname,pedfname,logfile])
152    return nrows,pedfname
153   
154def subset():
155    """  ### Sanity check the arguments
156    now passed in as
157    <command interpreter="python">
158        rgPedSub.py $script_file
159    </command>
160
161    with
162    <configfiles>
163    <configfile name="script_file">
164    title~~~~$title
165    output1~~~~$output1
166    log_file~~~~$log_file
167    userId~~~~$userId
168    outformat~~~~$outformat
169    outdir~~~~$output1.extra_files_path
170    relfilter~~~~$relfilter
171    #if $d.source=='library'
172    inped~~~~$d.lpedIn
173    #else
174    inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name
175    #end if
176    #if $m.mtype=='grslist'
177    rslist~~~~$m.rslist
178    region~~~~ 
179    #else
180    rslist~~~~ 
181    region~~~~$m.region
182    #end if
183    </configfile>
184    </configfiles>   
185    """
186    sep = '~~~~' # arbitrary choice
187    conf = {}
188    if len(sys.argv) < 2:
189        print >> sys.stderr, "No configuration file passed as a parameter - cannot run"
190        sys.exit(1)
191    configf = sys.argv[1]
192    config = file(configf,'r').readlines()
193    for row in config:
194        row = row.strip()
195        if len(row) > 0:
196            try:
197                key,valu = row.split(sep)
198                conf[key] = valu
199            except:
200                pass
201    ss = '%s%s' % (string.punctuation,string.whitespace)
202    ptran =  string.maketrans(ss,'_'*len(ss))
203    ### Figure out what genomic region we are interested in
204    region = conf.get('region','')
205    orslist = conf.get('rslist','').replace('X',' ').lower()
206    orslist = orslist.replace(',',' ').lower()
207    # galaxy replaces newlines with XX - go figure
208    title = conf.get('title','').translate(ptran) # for outputs
209    outfile = conf.get('output1','')
210    outdir = conf.get('outdir','')
211    try:
212        os.makedirs(outdir)
213    except:
214        pass
215    outformat = conf.get('outformat','lped')
216    basename = conf.get('basename',title)
217    logfile = os.path.join(outdir,'%s.log' % title)
218    userId = conf.get('userId','') # for library
219    pedFileBase = conf.get('inped','')
220    relfilter = conf.get('relfilter','')
221    MAP_FILE = '%s.map' % pedFileBase
222    DATA_FILE = '%s.ped' % pedFileBase   
223    title = conf.get('title','lped subset')
224    lf = file(logfile,'w')
225    lf.write('config file %s = \n' % configf)
226    lf.write(''.join(config))
227    c = ''
228    spos = epos = 0
229    rslist = []
230    rsdict = {}
231    if region > '':
232        try: # TODO make a regexp?
233            c,rest = region.split(':')
234            c = c.replace('chr','')
235            rest = rest.replace(',','') # remove commas
236            spos,epos = rest.split('-')
237            spos = int(spos)
238            epos = int(epos)
239            s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos)
240            lf.write(s)
241        except:
242            s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region)
243            lf.write(s)
244            lf.close()
245            sys.exit(1)
246    else:
247        rslist = orslist.split() # galaxy replaces newlines with XX - go figure
248        rsdict = dict(zip(rslist,rslist))
249    allmarkers = False
250    if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something
251        allmarkers = True
252    ### Figure out which markers are in this region
253    markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos)
254    if len(rslist) == 0:
255            s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3])
256            lf.write(s)
257            lf.write('\n')
258            lf.close()
259            sys.exit(1)
260    s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5])
261    lf.write(s)
262    try:
263        dfile = open(DATA_FILE, 'r')
264    except: # bad input file name?
265        s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE)
266        lf.write(s)
267        lf.write('\n')
268        lf.close()
269        print >> sys.stdout, s
270        raise
271        sys.exit(1)
272    if relfilter <> 'all': # must read pedigree and figure out who to drop
273        dropId = setupPedFilter(relfilter=relfilter,dfile=dfile)
274    else:
275        dropId = {}
276    wewant = [(6+(2*snpcols[x])) for x in rslist] #
277    # column indices of first geno of each marker pair to get the markers into genomic
278    ### ... and then parse the rest of the ped file to pull out
279    ### the genotypes for all subjects for those markers
280    # /usr/local/galaxy/data/rg/1/lped/
281    if len(dropId.keys()) > 0:
282        s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter
283        lf.write(s)
284        if relfilter == 'oo':
285            s = '## note that one random offspring from each family was kept if there were multiple offspring\n'
286            lf.write(s)
287        s = 'FamilyId\tSubjectId\n'
288        lf.write(s)
289        dk = dropId.keys()
290        dk.sort()
291        for k in dk:
292            s = '%s\t%s\n' % (k[0],k[1])
293            lf.write(s)
294    lf.write('\n')
295    lf.close()
296    if outformat == 'lped':
297        nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile,
298                 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
299    elif outformat == 'fped':
300        nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile,
301                  wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
302    dfile.close()   
303
304if __name__ == "__main__":
305    subset()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。