root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/gene_reader.py

リビジョン 3, 10.2 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Readers extracting gene (exon and intron) information from bed / gtf / gff
3formats.
4
5 - GeneReader: yields exons
6 - CDSReader: yields cds_exons
7 - FeatureReader: yields cds_exons, introns, exons
8
9For gff/gtf, the start_codon stop_codon line types are merged with CDSs.
10"""
11
12import sys
13from bx.bitset import *
14from bx.bitset_utils import *
15from bx.bitset_builders import *
16
17def GeneReader( fh, format='gff' ):
18    """ yield chrom, strand, gene_exons, name """
19
20    known_formats = ( 'gff', 'gtf', 'bed')
21    if format not in known_formats:
22        print >>sys.stderr,  '%s format not in %s' % (format, ",".join( known_formats ))
23        raise '?'
24   
25    if format == 'bed':
26        for line in fh:   
27            f = line.strip().split()
28            chrom = f[0]
29            chrom_start = int(f[1])
30            name = f[4]
31            strand = f[5]
32            cdsStart = int(f[6])
33            cdsEnd = int(f[7])
34            blockCount = int(f[9])
35            blockSizes = [ int(i) for i in f[10].strip(',').split(',') ]
36            blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ]
37
38            # grab cdsStart - cdsEnd
39            gene_exons = []
40            for base,offset in zip( blockStarts, blockSizes ):
41                exon_start = base
42                exon_end = base+offset
43                gene_exons.append( (exon_start, exon_end) )
44            yield chrom, strand, gene_exons, name
45    genelist = {}
46    grouplist = []
47    if format == 'gff' or format == 'gtf':
48        for line in fh:
49            if line.startswith('#'): continue
50            fields = line.strip().split('\t')
51            if len( fields ) < 9: continue
52
53            # fields
54
55            chrom = fields[0]
56            ex_st = int( fields[3] ) - 1 # make zero-centered
57            ex_end = int( fields[4] ) #+ 1 # make exclusive
58            strand = fields[6]
59
60            if format == 'gtf':
61                group = fields[8].split(';')[0]
62            else:
63                group = fields[8]
64
65            if group not in grouplist: grouplist.append( group )
66            if group not in genelist:
67                genelist[group] = (chrom, strand, [])
68            exons_i = 2
69            genelist[group][exons_i].append( ( ex_st, ex_end ) )
70
71        sp = lambda a,b: cmp( a[0], b[0] )
72
73        #for gene in genelist.values():
74        for gene in grouplist:
75            chrom, strand, gene_exons = genelist[ gene ]
76            gene_exons = bitset_union( gene_exons )
77            yield chrom, strand, gene_exons, gene
78
79def CDSReader( fh, format='gff' ):
80    """ yield chrom, strand, cds_exons, name """
81
82    known_formats = ( 'gff', 'gtf', 'bed')
83    if format not in known_formats:
84        print >>sys.stderr,  '%s format not in %s' % (format, ",".join( known_formats ))
85        raise '?'
86   
87    if format == 'bed':
88        for line in fh:   
89            f = line.strip().split()
90            chrom = f[0]
91            chrom_start = int(f[1])
92            name = f[4]
93            strand = f[5]
94            cdsStart = int(f[6])
95            cdsEnd = int(f[7])
96            blockCount = int(f[9])
97            blockSizes = [ int(i) for i in f[10].strip(',').split(',') ]
98            blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ]
99
100            # grab cdsStart - cdsEnd
101            cds_exons = []
102            cds_seq = ''
103            genome_seq_index = []
104            for base,offset in zip( blockStarts, blockSizes ):
105                if (base + offset) < cdsStart: continue
106                if base > cdsEnd: continue
107                exon_start = max( base, cdsStart )
108                exon_end = min( base+offset, cdsEnd )
109                cds_exons.append( (exon_start, exon_end) )
110            yield chrom, strand, cds_exons, name
111
112    genelist = {}
113    grouplist = []
114    if format == 'gff' or format == 'gtf':
115        for line in fh:
116            if line.startswith('#'): continue
117            fields = line.strip().split('\t')
118            if len( fields ) < 9: continue
119            if fields[2] not in ('CDS', 'stop_codon', 'start_codon'): continue
120
121            # fields
122
123            chrom = fields[0]
124            ex_st = int( fields[3] ) - 1 # make zero-centered
125            ex_end = int( fields[4] ) #+ 1 # make exclusive
126            strand = fields[6]
127
128            if format == 'gtf':
129                group = fields[8].split(';')[0]
130            else:
131                group = fields[8]
132
133            if group not in grouplist: grouplist.append( group )
134            if group not in genelist:
135                genelist[group] = (chrom, strand, [])
136           
137            genelist[group][2].append( ( ex_st, ex_end ) )
138
139        sp = lambda a,b: cmp( a[0], b[0] )
140
141        #for gene in genelist.values():
142        for gene in grouplist:
143            chrom, strand, cds_exons = genelist[ gene ]
144            seqlen = sum([ a[1]-a[0] for a in cds_exons ])
145            overhang = seqlen % 3
146            if overhang > 0:
147                #print >>sys.stderr, "adjusting ", gene 
148                if strand == '+':
149                    cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang )
150                else:
151                    cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] )
152            cds_exons = bitset_union( cds_exons )
153            yield chrom, strand, cds_exons, gene
154
155def FeatureReader( fh, format='gff', alt_introns_subtract="exons", gtf_parse=None):
156    """
157    yield chrom, strand, cds_exons, introns, exons, name
158
159    gtf_parse Example:
160    # parse gene_id from transcript_id "AC073130.2-001"; gene_id "TES";
161    gene_name = lambda s: s.split(';')[1].split()[1].strip('"')
162
163    for chrom, strand, cds_exons, introns, exons, name in FeatureReader( sys.stdin, format='gtf', gtf_parse=gene_name )
164    """
165
166    known_formats = ( 'gff', 'gtf', 'bed')
167    if format not in known_formats:
168        print >>sys.stderr,  '%s format not in %s' % (format, ",".join( known_formats ))
169        raise '?'
170   
171    if format == 'bed':
172        for line in fh:   
173            f = line.strip().split()
174            chrom = f[0]
175            chrom_start = int(f[1])
176            name = f[4]
177            strand = f[5]
178            cdsStart = int(f[6])
179            cdsEnd = int(f[7])
180            blockCount = int(f[9])
181            blockSizes = [ int(i) for i in f[10].strip(',').split(',') ]
182            blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ]
183
184            # grab cdsStart - cdsEnd
185            cds_exons = []
186            exons = []
187           
188            cds_seq = ''
189            genome_seq_index = []
190            for base,offset in zip( blockStarts, blockSizes ):
191                if (base + offset) < cdsStart: continue
192                if base > cdsEnd: continue
193                # exons
194                exon_start = base
195                exon_end = base+offset
196                exons.append( (exon_start, exon_end) )
197                # cds exons
198                exon_start = max( base, cdsStart )
199                exon_end = min( base+offset, cdsEnd )
200                cds_exons.append( (exon_start, exon_end) )
201            cds_exons = bitset_union( cds_exons )
202            exons = bitset_union( exons )
203            introns = bitset_complement( exons )
204            yield chrom, strand, cds_exons, introns, exons, name
205
206    genelist = {}
207    grouplist = []
208    if format == 'gff' or format == 'gtf':
209        for line in fh:
210            if line.startswith('#'): continue
211            fields = line.strip().split('\t')
212            if len( fields ) < 9: continue
213
214            # fields
215
216            chrom = fields[0]
217            ex_st = int( fields[3] ) - 1 # make zero-centered
218            ex_end = int( fields[4] ) #+ 1 # make exclusive
219            strand = fields[6]
220
221            if format == 'gtf':
222                if not gtf_parse:
223                    group = fields[8].split(';')[0]
224                else:
225                    group = gtf_parse( fields[8] )
226            else:
227                group = fields[8]
228
229            # Results are listed in the same order as encountered
230            if group not in grouplist: grouplist.append( group )
231
232            if group not in genelist:
233                # chrom, strand, cds_exons, introns, exons, cds_start, cds_end
234                genelist[group] = [chrom, strand, [], [], [], None, None]
235           
236            if fields[2] == 'exon':
237                genelist[group][4].append( ( ex_st, ex_end ) )
238
239            elif fields[2] in ('CDS', 'stop_codon', 'start_codon'):
240                genelist[group][2].append( ( ex_st, ex_end ) )
241
242                if fields[2] == 'start_codon':
243                    if strand == '+': genelist[group][5] = ex_st
244                    else: genelist[group][5] = ex_end
245                if fields[2] == 'stop_codon':
246                    if strand == '+': genelist[group][5] = ex_end
247                    else: genelist[group][5] = ex_st
248
249            elif fields[2] == 'intron':
250                genelist[group][3].append( ( ex_st, ex_end ) )
251
252        for gene in grouplist:
253            chrom, strand, cds_exons, introns, exons, cds_start, cds_end = genelist[ gene ]
254
255            cds_exons = bitset_union( cds_exons )
256            exons = bitset_union( exons )
257
258            # assure that cds exons were within the cds range
259            if cds_start is not None and cds_end is not None:
260                if strand == '+':
261                    cds_exons = bitset_intersect( cds_exons, [(cds_start,cds_end)] )
262                else:
263                    cds_exons = bitset_intersect( cds_exons, [(cds_end,cds_start)] )
264
265            # assure that introns are non-overlapping with themselves or exons
266            if alt_introns_subtract:
267                if alt_introns_subtract == 'exons':
268                    introns = bitset_subtract( introns, exons )
269                if alt_introns_subtract == 'cds_exons':
270                    introns = bitset_subtract( introns, cds_exons )
271            else: introns = bitset_union( introns )
272
273            # assure CDS is a multiple of 3, trim from last exon if necessary
274            seqlen = sum([ a[1]-a[0] for a in cds_exons ])
275            overhang = seqlen % 3
276            if overhang > 0:
277                if strand == '+':
278                    cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang )
279                else:
280                    cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] )
281
282            yield chrom, strand, cds_exons, introns, exons, gene
283
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。