Context Navigation

gene_reader.py

リビジョン 3, 10.2 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	"""
2	Readers extracting gene (exon and intron) information from bed / gtf / gff
3	formats.
4
5	- GeneReader: yields exons
6	- CDSReader: yields cds_exons
7	- FeatureReader: yields cds_exons, introns, exons
8
9	For gff/gtf, the start_codon stop_codon line types are merged with CDSs.
10	"""
11
12	import sys
13	from bx.bitset import *
14	from bx.bitset_utils import *
15	from bx.bitset_builders import *
16
17	def GeneReader( fh, format='gff' ):
18	""" yield chrom, strand, gene_exons, name """
19
20	known_formats = ( 'gff', 'gtf', 'bed')
21	if format not in known_formats:
22	print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats ))
23	raise '?'
24
25	if format == 'bed':
26	for line in fh:
27	f = line.strip().split()
28	chrom = f[0]
29	chrom_start = int(f[1])
30	name = f[4]
31	strand = f[5]
32	cdsStart = int(f[6])
33	cdsEnd = int(f[7])
34	blockCount = int(f[9])
35	blockSizes = [ int(i) for i in f[10].strip(',').split(',') ]
36	blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ]
37
38	# grab cdsStart - cdsEnd
39	gene_exons = []
40	for base,offset in zip( blockStarts, blockSizes ):
41	exon_start = base
42	exon_end = base+offset
43	gene_exons.append( (exon_start, exon_end) )
44	yield chrom, strand, gene_exons, name
45	genelist = {}
46	grouplist = []
47	if format == 'gff' or format == 'gtf':
48	for line in fh:
49	if line.startswith('#'): continue
50	fields = line.strip().split('\t')
51	if len( fields ) < 9: continue
52
53	# fields
54
55	chrom = fields[0]
56	ex_st = int( fields[3] ) - 1 # make zero-centered
57	ex_end = int( fields[4] ) #+ 1 # make exclusive
58	strand = fields[6]
59
60	if format == 'gtf':
61	group = fields[8].split(';')[0]
62	else:
63	group = fields[8]
64
65	if group not in grouplist: grouplist.append( group )
66	if group not in genelist:
67	genelist[group] = (chrom, strand, [])
68	exons_i = 2
69	genelist[group][exons_i].append( ( ex_st, ex_end ) )
70
71	sp = lambda a,b: cmp( a[0], b[0] )
72
73	#for gene in genelist.values():
74	for gene in grouplist:
75	chrom, strand, gene_exons = genelist[ gene ]
76	gene_exons = bitset_union( gene_exons )
77	yield chrom, strand, gene_exons, gene
78
79	def CDSReader( fh, format='gff' ):
80	""" yield chrom, strand, cds_exons, name """
81
82	known_formats = ( 'gff', 'gtf', 'bed')
83	if format not in known_formats:
84	print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats ))
85	raise '?'
86
87	if format == 'bed':
88	for line in fh:
89	f = line.strip().split()
90	chrom = f[0]
91	chrom_start = int(f[1])
92	name = f[4]
93	strand = f[5]
94	cdsStart = int(f[6])
95	cdsEnd = int(f[7])
96	blockCount = int(f[9])
97	blockSizes = [ int(i) for i in f[10].strip(',').split(',') ]
98	blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ]
99
100	# grab cdsStart - cdsEnd
101	cds_exons = []
102	cds_seq = ''
103	genome_seq_index = []
104	for base,offset in zip( blockStarts, blockSizes ):
105	if (base + offset) < cdsStart: continue
106	if base > cdsEnd: continue
107	exon_start = max( base, cdsStart )
108	exon_end = min( base+offset, cdsEnd )
109	cds_exons.append( (exon_start, exon_end) )
110	yield chrom, strand, cds_exons, name
111
112	genelist = {}
113	grouplist = []
114	if format == 'gff' or format == 'gtf':
115	for line in fh:
116	if line.startswith('#'): continue
117	fields = line.strip().split('\t')
118	if len( fields ) < 9: continue
119	if fields[2] not in ('CDS', 'stop_codon', 'start_codon'): continue
120
121	# fields
122
123	chrom = fields[0]
124	ex_st = int( fields[3] ) - 1 # make zero-centered
125	ex_end = int( fields[4] ) #+ 1 # make exclusive
126	strand = fields[6]
127
128	if format == 'gtf':
129	group = fields[8].split(';')[0]
130	else:
131	group = fields[8]
132
133	if group not in grouplist: grouplist.append( group )
134	if group not in genelist:
135	genelist[group] = (chrom, strand, [])
136
137	genelist[group][2].append( ( ex_st, ex_end ) )
138
139	sp = lambda a,b: cmp( a[0], b[0] )
140
141	#for gene in genelist.values():
142	for gene in grouplist:
143	chrom, strand, cds_exons = genelist[ gene ]
144	seqlen = sum([ a[1]-a[0] for a in cds_exons ])
145	overhang = seqlen % 3
146	if overhang > 0:
147	#print >>sys.stderr, "adjusting ", gene
148	if strand == '+':
149	cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang )
150	else:
151	cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] )
152	cds_exons = bitset_union( cds_exons )
153	yield chrom, strand, cds_exons, gene
154
155	def FeatureReader( fh, format='gff', alt_introns_subtract="exons", gtf_parse=None):
156	"""
157	yield chrom, strand, cds_exons, introns, exons, name
158
159	gtf_parse Example:
160	# parse gene_id from transcript_id "AC073130.2-001"; gene_id "TES";
161	gene_name = lambda s: s.split(';')[1].split()[1].strip('"')
162
163	for chrom, strand, cds_exons, introns, exons, name in FeatureReader( sys.stdin, format='gtf', gtf_parse=gene_name )
164	"""
165
166	known_formats = ( 'gff', 'gtf', 'bed')
167	if format not in known_formats:
168	print >>sys.stderr, '%s format not in %s' % (format, ",".join( known_formats ))
169	raise '?'
170
171	if format == 'bed':
172	for line in fh:
173	f = line.strip().split()
174	chrom = f[0]
175	chrom_start = int(f[1])
176	name = f[4]
177	strand = f[5]
178	cdsStart = int(f[6])
179	cdsEnd = int(f[7])
180	blockCount = int(f[9])
181	blockSizes = [ int(i) for i in f[10].strip(',').split(',') ]
182	blockStarts = [ chrom_start + int(i) for i in f[11].strip(',').split(',') ]
183
184	# grab cdsStart - cdsEnd
185	cds_exons = []
186	exons = []
187
188	cds_seq = ''
189	genome_seq_index = []
190	for base,offset in zip( blockStarts, blockSizes ):
191	if (base + offset) < cdsStart: continue
192	if base > cdsEnd: continue
193	# exons
194	exon_start = base
195	exon_end = base+offset
196	exons.append( (exon_start, exon_end) )
197	# cds exons
198	exon_start = max( base, cdsStart )
199	exon_end = min( base+offset, cdsEnd )
200	cds_exons.append( (exon_start, exon_end) )
201	cds_exons = bitset_union( cds_exons )
202	exons = bitset_union( exons )
203	introns = bitset_complement( exons )
204	yield chrom, strand, cds_exons, introns, exons, name
205
206	genelist = {}
207	grouplist = []
208	if format == 'gff' or format == 'gtf':
209	for line in fh:
210	if line.startswith('#'): continue
211	fields = line.strip().split('\t')
212	if len( fields ) < 9: continue
213
214	# fields
215
216	chrom = fields[0]
217	ex_st = int( fields[3] ) - 1 # make zero-centered
218	ex_end = int( fields[4] ) #+ 1 # make exclusive
219	strand = fields[6]
220
221	if format == 'gtf':
222	if not gtf_parse:
223	group = fields[8].split(';')[0]
224	else:
225	group = gtf_parse( fields[8] )
226	else:
227	group = fields[8]
228
229	# Results are listed in the same order as encountered
230	if group not in grouplist: grouplist.append( group )
231
232	if group not in genelist:
233	# chrom, strand, cds_exons, introns, exons, cds_start, cds_end
234	genelist[group] = [chrom, strand, [], [], [], None, None]
235
236	if fields[2] == 'exon':
237	genelist[group][4].append( ( ex_st, ex_end ) )
238
239	elif fields[2] in ('CDS', 'stop_codon', 'start_codon'):
240	genelist[group][2].append( ( ex_st, ex_end ) )
241
242	if fields[2] == 'start_codon':
243	if strand == '+': genelist[group][5] = ex_st
244	else: genelist[group][5] = ex_end
245	if fields[2] == 'stop_codon':
246	if strand == '+': genelist[group][5] = ex_end
247	else: genelist[group][5] = ex_st
248
249	elif fields[2] == 'intron':
250	genelist[group][3].append( ( ex_st, ex_end ) )
251
252	for gene in grouplist:
253	chrom, strand, cds_exons, introns, exons, cds_start, cds_end = genelist[ gene ]
254
255	cds_exons = bitset_union( cds_exons )
256	exons = bitset_union( exons )
257
258	# assure that cds exons were within the cds range
259	if cds_start is not None and cds_end is not None:
260	if strand == '+':
261	cds_exons = bitset_intersect( cds_exons, [(cds_start,cds_end)] )
262	else:
263	cds_exons = bitset_intersect( cds_exons, [(cds_end,cds_start)] )
264
265	# assure that introns are non-overlapping with themselves or exons
266	if alt_introns_subtract:
267	if alt_introns_subtract == 'exons':
268	introns = bitset_subtract( introns, exons )
269	if alt_introns_subtract == 'cds_exons':
270	introns = bitset_subtract( introns, cds_exons )
271	else: introns = bitset_union( introns )
272
273	# assure CDS is a multiple of 3, trim from last exon if necessary
274	seqlen = sum([ a[1]-a[0] for a in cds_exons ])
275	overhang = seqlen % 3
276	if overhang > 0:
277	if strand == '+':
278	cds_exons[-1] = ( cds_exons[-1][0], cds_exons[-1][1] - overhang )
279	else:
280	cds_exons[0] = ( cds_exons[0][0] + overhang, cds_exons[0][1] )
281
282	yield chrom, strand, cds_exons, introns, exons, gene
283

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/gene_reader.py

異なるフォーマットでダウンロード: