1 | """ |
---|
2 | Readers extracting gene (exon and intron) information from bed / gtf / gff |
---|
3 | formats. |
---|
4 | |
---|
5 | - GeneReader: yields exons |
---|
6 | - CDSReader: yields cds_exons |
---|
7 | - FeatureReader: yields cds_exons, introns, exons |
---|
8 | |
---|
9 | For gff/gtf, the start_codon stop_codon line types are merged with CDSs. |
---|
10 | """ |
---|
11 | |
---|
12 | import sys |
---|
13 | from bx.bitset import * |
---|
14 | from bx.bitset_utils import * |
---|
15 | from bx.bitset_builders import * |
---|
16 | |
---|
17 | def GeneReader( fh, format='gff' ): |
---|
18 | """ yield chrom, strand, gene_exons, name """ |
---|
19 | |
---|
20 | known_formats = ( 'gff', 'gtf', 'bed') |
---|
21 | if format not in known_formats: |
---|
22 | print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats )) |
---|
23 | raise '?' |
---|
24 | |
---|
25 | if format == 'bed': |
---|
26 | for line in fh: |
---|
27 | f = line.strip().split() |
---|
28 | chrom = f[0] |
---|
29 | chrom_start = int(f[1]) |
---|
30 | name = f[4] |
---|
31 | strand = f[5] |
---|
32 | cdsStart = int(f[6]) |
---|
33 | cdsEnd = int(f[7]) |
---|
34 | blockCount = int(f[9]) |
---|
35 | blockSizes = [ int(i) for i in f[10].strip(',').split(',') ] |
---|
36 | blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ] |
---|
37 | |
---|
38 | # grab cdsStart - cdsEnd |
---|
39 | gene_exons = [] |
---|
40 | for base,offset in zip( blockStarts, blockSizes ): |
---|
41 | exon_start = base |
---|
42 | exon_end = base+offset |
---|
43 | gene_exons.append( (exon_start, exon_end) ) |
---|
44 | yield chrom, strand, gene_exons, name |
---|
45 | genelist = {} |
---|
46 | grouplist = [] |
---|
47 | if format == 'gff' or format == 'gtf': |
---|
48 | for line in fh: |
---|
49 | if line.startswith('#'): continue |
---|
50 | fields = line.strip().split('\t') |
---|
51 | if len( fields ) < 9: continue |
---|
52 | |
---|
53 | # fields |
---|
54 | |
---|
55 | chrom = fields[0] |
---|
56 | ex_st = int( fields[3] ) - 1 # make zero-centered |
---|
57 | ex_end = int( fields[4] ) #+ 1 # make exclusive |
---|
58 | strand = fields[6] |
---|
59 | |
---|
60 | if format == 'gtf': |
---|
61 | group = fields[8].split(';')[0] |
---|
62 | else: |
---|
63 | group = fields[8] |
---|
64 | |
---|
65 | if group not in grouplist: grouplist.append( group ) |
---|
66 | if group not in genelist: |
---|
67 | genelist[group] = (chrom, strand, []) |
---|
68 | exons_i = 2 |
---|
69 | genelist[group][exons_i].append( ( ex_st, ex_end ) ) |
---|
70 | |
---|
71 | sp = lambda a,b: cmp( a[0], b[0] ) |
---|
72 | |
---|
73 | #for gene in genelist.values(): |
---|
74 | for gene in grouplist: |
---|
75 | chrom, strand, gene_exons = genelist[ gene ] |
---|
76 | gene_exons = bitset_union( gene_exons ) |
---|
77 | yield chrom, strand, gene_exons, gene |
---|
78 | |
---|
79 | def CDSReader( fh, format='gff' ): |
---|
80 | """ yield chrom, strand, cds_exons, name """ |
---|
81 | |
---|
82 | known_formats = ( 'gff', 'gtf', 'bed') |
---|
83 | if format not in known_formats: |
---|
84 | print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats )) |
---|
85 | raise '?' |
---|
86 | |
---|
87 | if format == 'bed': |
---|
88 | for line in fh: |
---|
89 | f = line.strip().split() |
---|
90 | chrom = f[0] |
---|
91 | chrom_start = int(f[1]) |
---|
92 | name = f[4] |
---|
93 | strand = f[5] |
---|
94 | cdsStart = int(f[6]) |
---|
95 | cdsEnd = int(f[7]) |
---|
96 | blockCount = int(f[9]) |
---|
97 | blockSizes = [ int(i) for i in f[10].strip(',').split(',') ] |
---|
98 | blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ] |
---|
99 | |
---|
100 | # grab cdsStart - cdsEnd |
---|
101 | cds_exons = [] |
---|
102 | cds_seq = '' |
---|
103 | genome_seq_index = [] |
---|
104 | for base,offset in zip( blockStarts, blockSizes ): |
---|
105 | if (base + offset) < cdsStart: continue |
---|
106 | if base > cdsEnd: continue |
---|
107 | exon_start = max( base, cdsStart ) |
---|
108 | exon_end = min( base+offset, cdsEnd ) |
---|
109 | cds_exons.append( (exon_start, exon_end) ) |
---|
110 | yield chrom, strand, cds_exons, name |
---|
111 | |
---|
112 | genelist = {} |
---|
113 | grouplist = [] |
---|
114 | if format == 'gff' or format == 'gtf': |
---|
115 | for line in fh: |
---|
116 | if line.startswith('#'): continue |
---|
117 | fields = line.strip().split('\t') |
---|
118 | if len( fields ) < 9: continue |
---|
119 | if fields[2] not in ('CDS', 'stop_codon', 'start_codon'): continue |
---|
120 | |
---|
121 | # fields |
---|
122 | |
---|
123 | chrom = fields[0] |
---|
124 | ex_st = int( fields[3] ) - 1 # make zero-centered |
---|
125 | ex_end = int( fields[4] ) #+ 1 # make exclusive |
---|
126 | strand = fields[6] |
---|
127 | |
---|
128 | if format == 'gtf': |
---|
129 | group = fields[8].split(';')[0] |
---|
130 | else: |
---|
131 | group = fields[8] |
---|
132 | |
---|
133 | if group not in grouplist: grouplist.append( group ) |
---|
134 | if group not in genelist: |
---|
135 | genelist[group] = (chrom, strand, []) |
---|
136 | |
---|
137 | genelist[group][2].append( ( ex_st, ex_end ) ) |
---|
138 | |
---|
139 | sp = lambda a,b: cmp( a[0], b[0] ) |
---|
140 | |
---|
141 | #for gene in genelist.values(): |
---|
142 | for gene in grouplist: |
---|
143 | chrom, strand, cds_exons = genelist[ gene ] |
---|
144 | seqlen = sum([ a[1]-a[0] for a in cds_exons ]) |
---|
145 | overhang = seqlen % 3 |
---|
146 | if overhang > 0: |
---|
147 | #print >>sys.stderr, "adjusting ", gene |
---|
148 | if strand == '+': |
---|
149 | cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang ) |
---|
150 | else: |
---|
151 | cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] ) |
---|
152 | cds_exons = bitset_union( cds_exons ) |
---|
153 | yield chrom, strand, cds_exons, gene |
---|
154 | |
---|
155 | def FeatureReader( fh, format='gff', alt_introns_subtract="exons", gtf_parse=None): |
---|
156 | """ |
---|
157 | yield chrom, strand, cds_exons, introns, exons, name |
---|
158 | |
---|
159 | gtf_parse Example: |
---|
160 | # parse gene_id from transcript_id "AC073130.2-001"; gene_id "TES"; |
---|
161 | gene_name = lambda s: s.split(';')[1].split()[1].strip('"') |
---|
162 | |
---|
163 | for chrom, strand, cds_exons, introns, exons, name in FeatureReader( sys.stdin, format='gtf', gtf_parse=gene_name ) |
---|
164 | """ |
---|
165 | |
---|
166 | known_formats = ( 'gff', 'gtf', 'bed') |
---|
167 | if format not in known_formats: |
---|
168 | print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats )) |
---|
169 | raise '?' |
---|
170 | |
---|
171 | if format == 'bed': |
---|
172 | for line in fh: |
---|
173 | f = line.strip().split() |
---|
174 | chrom = f[0] |
---|
175 | chrom_start = int(f[1]) |
---|
176 | name = f[4] |
---|
177 | strand = f[5] |
---|
178 | cdsStart = int(f[6]) |
---|
179 | cdsEnd = int(f[7]) |
---|
180 | blockCount = int(f[9]) |
---|
181 | blockSizes = [ int(i) for i in f[10].strip(',').split(',') ] |
---|
182 | blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ] |
---|
183 | |
---|
184 | # grab cdsStart - cdsEnd |
---|
185 | cds_exons = [] |
---|
186 | exons = [] |
---|
187 | |
---|
188 | cds_seq = '' |
---|
189 | genome_seq_index = [] |
---|
190 | for base,offset in zip( blockStarts, blockSizes ): |
---|
191 | if (base + offset) < cdsStart: continue |
---|
192 | if base > cdsEnd: continue |
---|
193 | # exons |
---|
194 | exon_start = base |
---|
195 | exon_end = base+offset |
---|
196 | exons.append( (exon_start, exon_end) ) |
---|
197 | # cds exons |
---|
198 | exon_start = max( base, cdsStart ) |
---|
199 | exon_end = min( base+offset, cdsEnd ) |
---|
200 | cds_exons.append( (exon_start, exon_end) ) |
---|
201 | cds_exons = bitset_union( cds_exons ) |
---|
202 | exons = bitset_union( exons ) |
---|
203 | introns = bitset_complement( exons ) |
---|
204 | yield chrom, strand, cds_exons, introns, exons, name |
---|
205 | |
---|
206 | genelist = {} |
---|
207 | grouplist = [] |
---|
208 | if format == 'gff' or format == 'gtf': |
---|
209 | for line in fh: |
---|
210 | if line.startswith('#'): continue |
---|
211 | fields = line.strip().split('\t') |
---|
212 | if len( fields ) < 9: continue |
---|
213 | |
---|
214 | # fields |
---|
215 | |
---|
216 | chrom = fields[0] |
---|
217 | ex_st = int( fields[3] ) - 1 # make zero-centered |
---|
218 | ex_end = int( fields[4] ) #+ 1 # make exclusive |
---|
219 | strand = fields[6] |
---|
220 | |
---|
221 | if format == 'gtf': |
---|
222 | if not gtf_parse: |
---|
223 | group = fields[8].split(';')[0] |
---|
224 | else: |
---|
225 | group = gtf_parse( fields[8] ) |
---|
226 | else: |
---|
227 | group = fields[8] |
---|
228 | |
---|
229 | # Results are listed in the same order as encountered |
---|
230 | if group not in grouplist: grouplist.append( group ) |
---|
231 | |
---|
232 | if group not in genelist: |
---|
233 | # chrom, strand, cds_exons, introns, exons, cds_start, cds_end |
---|
234 | genelist[group] = [chrom, strand, [], [], [], None, None] |
---|
235 | |
---|
236 | if fields[2] == 'exon': |
---|
237 | genelist[group][4].append( ( ex_st, ex_end ) ) |
---|
238 | |
---|
239 | elif fields[2] in ('CDS', 'stop_codon', 'start_codon'): |
---|
240 | genelist[group][2].append( ( ex_st, ex_end ) ) |
---|
241 | |
---|
242 | if fields[2] == 'start_codon': |
---|
243 | if strand == '+': genelist[group][5] = ex_st |
---|
244 | else: genelist[group][5] = ex_end |
---|
245 | if fields[2] == 'stop_codon': |
---|
246 | if strand == '+': genelist[group][5] = ex_end |
---|
247 | else: genelist[group][5] = ex_st |
---|
248 | |
---|
249 | elif fields[2] == 'intron': |
---|
250 | genelist[group][3].append( ( ex_st, ex_end ) ) |
---|
251 | |
---|
252 | for gene in grouplist: |
---|
253 | chrom, strand, cds_exons, introns, exons, cds_start, cds_end = genelist[ gene ] |
---|
254 | |
---|
255 | cds_exons = bitset_union( cds_exons ) |
---|
256 | exons = bitset_union( exons ) |
---|
257 | |
---|
258 | # assure that cds exons were within the cds range |
---|
259 | if cds_start is not None and cds_end is not None: |
---|
260 | if strand == '+': |
---|
261 | cds_exons = bitset_intersect( cds_exons, [(cds_start,cds_end)] ) |
---|
262 | else: |
---|
263 | cds_exons = bitset_intersect( cds_exons, [(cds_end,cds_start)] ) |
---|
264 | |
---|
265 | # assure that introns are non-overlapping with themselves or exons |
---|
266 | if alt_introns_subtract: |
---|
267 | if alt_introns_subtract == 'exons': |
---|
268 | introns = bitset_subtract( introns, exons ) |
---|
269 | if alt_introns_subtract == 'cds_exons': |
---|
270 | introns = bitset_subtract( introns, cds_exons ) |
---|
271 | else: introns = bitset_union( introns ) |
---|
272 | |
---|
273 | # assure CDS is a multiple of 3, trim from last exon if necessary |
---|
274 | seqlen = sum([ a[1]-a[0] for a in cds_exons ]) |
---|
275 | overhang = seqlen % 3 |
---|
276 | if overhang > 0: |
---|
277 | if strand == '+': |
---|
278 | cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang ) |
---|
279 | else: |
---|
280 | cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] ) |
---|
281 | |
---|
282 | yield chrom, strand, cds_exons, introns, exons, gene |
---|
283 | |
---|