Context Navigation

maf_tile_2.py

リビジョン 3, 11.5 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	#!/usr/bin/python2.6
2
3	"""
4	'Tile' the blocks of a maf file over each of a set of intervals. The
5	highest scoring block that covers any part of a region will be used, and
6	pieces not covered by any block filled with "-" or optionally "*".
7
8	This version uses synteny annotation if found on the alignment blocks, and
9	will attempt to fill gaps with special characters depending on the type of
10	gap, similar to the projected alignment display of the UCSC genome browser:
11	'*' for new, '=' for inverse/inset, '#' for contig, 'X' for missing.
12
13	- The list of species to tile is specified by the first argument (either a
14	newick tree or just a comma separated list).
15
16	- The `seq_db` is a lookup table mapping species and chromosome names
17	to nib file for filling in the reference species sequence. In this file
18	column 1 contains the species, column 2 the chromosome or contig, and
19	column 4 the directory containing the sequences in nib format.
20
21	- The remaining arguments are a list of maf files which must have
22	corresponding ".index" files.
23
24	TODO: The seq_db format is specific to something old and obscure at PSU,
25	need to standardize.
26
27	usage: %prog list,of,species,to,keep seq_db_file indexed_maf_files ...
28	-m, --missingData: Inserts wildcards for missing block rows instead of '-'
29	-s, --strand: Use strand information for intervals, reverse complement if '-'
30	"""
31
32	import psyco_full
33
34	from cookbook import doc_optparse
35
36	import operator
37
38	import bx.align.maf as maf
39	import bx.align as align
40	from bx import misc
41	import bx.seq.nib
42	import os
43	import string
44	import sys
45
# Translation table that turns the newick punctuation "(", ")" and "," into
# spaces, so a tree such as "((a,b),c)" can be split into species names.
# Both arguments must have the same length (one replacement per character),
# hence the three spaces. `string.maketrans` is used when available and
# `str.maketrans` otherwise.
_maketrans = getattr( string, "maketrans", None ) or str.maketrans
tree_tx = _maketrans( "(),", "   " )
47
def main():
    """
    Command line entry point.

    Parses the options described in the module docstring, loads the
    sequence lookup table and the indexed maf files, then reads intervals
    from standard input (whitespace separated: source, start, end and,
    when `--strand` is given and at least six fields are present, the
    strand in the sixth field) and writes one tiled alignment per interval
    to standard output.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        sources = args[0].translate( tree_tx ).split()
        seq_db = load_seq_db( args[1] )
        index = maf.MultiIndexed( args[2:] )

        out = maf.Writer( sys.stdout )
        missing_data = bool( options.missingData )
        use_strand = bool( options.strand )
    except Exception:
        # Only trap real errors here; KeyboardInterrupt / SystemExit must
        # propagate instead of being reported as a usage problem.
        doc_optparse.exception()

    for line in sys.stdin:
        fields = line.split()
        # Blank lines and comment lines carry no interval; skip them rather
        # than failing on the unpack / int conversion below.
        if not fields or fields[0].startswith( "#" ):
            continue
        ref_src, start, end = fields[0:3]
        if use_strand and len( fields ) > 5:
            strand = fields[5]
        else:
            strand = '+'
        do_interval( sources, index, out, ref_src, int( start ), int( end ), seq_db, missing_data, strand )

    out.close()
72
def load_seq_db( fname ):
    """
    Load the sequence lookup table stored in the file `fname`.

    Every non-blank line is split on commas. Fields 1 and 2 (counting from
    zero) are joined with a "." to form the key, and field 4, stripped of
    surrounding whitespace, becomes the value.

    Returns a dict mapping "<field1>.<field2>" to the stripped field 4.
    Raises IndexError if a non-blank line has fewer than five fields.
    """
    db = {}
    # Use a context manager so the handle is closed even if a line is
    # malformed.
    with open( fname ) as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            fields = line.split( ',' )
            src = fields[1] + "." + fields[2]
            db[src] = fields[4].strip()
    return db
81
def get_fill_char( maf_status ):
    """
    Return the character that should be used to fill between blocks
    having a given status.

    The mapping is:
      "*"  new / maybe-new (including their nested variants)
      "="  inverse / insert
      "#"  contig (including the nested variant)
      "X"  missing

    Raises ValueError for any other status.
    """
    # NOTE: nested statuses are deliberately accepted and mapped onto the
    # same character as their non-nested counterparts.
    if maf_status in ( maf.MAF_NEW_STATUS, maf.MAF_MAYBE_NEW_STATUS,
                       maf.MAF_NEW_NESTED_STATUS, maf.MAF_MAYBE_NEW_NESTED_STATUS ):
        return "*"
    elif maf_status in ( maf.MAF_INVERSE_STATUS, maf.MAF_INSERT_STATUS ):
        return "="
    elif maf_status in ( maf.MAF_CONTIG_STATUS, maf.MAF_CONTIG_NESTED_STATUS ):
        return "#"
    elif maf_status == maf.MAF_MISSING_STATUS:
        return "X"
    else:
        # String exceptions are not supported by the interpreter this
        # script targets; raise a real exception carrying the bad value.
        raise ValueError( "Unknown maf status: %r" % ( maf_status, ) )
101
def guess_fill_char( left_comp, right_comp ):
    """
    For the case where there is no annotated synteny we will try to guess
    the fill character from the components flanking the unaligned stretch.

    `left_comp` is the last component seen for the species (or None if
    there was none) and `right_comp` is the component that follows.

    Returns "-" when both components come from the same source, lie on the
    same strand and are directly adjacent (left end == right start), and
    "*" in every other case.

    NOTE(review): previously an unconditional `return "*"` made everything
    below it unreachable and the strand test compared with `!=`,
    contradicting the "same ... orientation" comment. Confirm that
    emitting "-" for contiguous components is the intended output.
    """
    # No left component, obviously new
    if left_comp is None:
        return "*"
    # First check that the blocks have the same src (not just species) and
    # orientation
    if left_comp.src == right_comp.src and left_comp.strand == right_comp.strand:
        # Are they completely contiguous? Easy to call that a gap
        if left_comp.end == right_comp.start:
            return "-"
        # TODO: should be able to make some guesses about short insertions
        # here
    # All other cases we have no clue about
    return "*"
118
def remove_all_gap_columns( texts ):
    """
    Remove any columns containing only gaps from alignment texts.

    `texts` is a sequence of equal-length strings, one per alignment row.
    A column is dropped when every row holds one of the gap / fill
    characters '-', '#', '*', '=', 'X' or '@' at that position.

    Returns a new list of strings with those columns removed; an empty
    input yields an empty list.
    """
    if not texts:
        return []
    gap_chars = frozenset( "-#*=X@" )
    width = len( texts[0] )
    # Decide once which columns survive, then rebuild each row in a single
    # pass; deleting column by column from lists is quadratic in the width.
    keep = [ col for col in range( width )
             if not all( text[col] in gap_chars for text in texts ) ]
    return [ ''.join( [ text[col] for col in keep ] ) for text in texts ]
138
def _read_ref_sequence( seq_db, ref_src, start, length ):
    """Return `length` bases starting at `start` from the nib file listed for `ref_src`."""
    # nib is a binary format, so open in binary mode and always close the
    # handle once the requested chunk has been read.
    with open( seq_db[ ref_src ], "rb" ) as nib_handle:
        return bx.seq.nib.NibFile( nib_handle ).get( start, length )


def _ref_sequence_length( seq_db, ref_src ):
    """Return the total length recorded in the nib file listed for `ref_src`."""
    with open( seq_db[ ref_src ], "rb" ) as nib_handle:
        return bx.seq.nib.NibFile( nib_handle ).length


def do_interval( sources, index, out, ref_src, start, end, seq_db, missing_data, strand ):
    """
    Join together alignment blocks to create a semi human projected local
    alignment (small reference sequence deletions are kept as supported by
    the local alignment).

    sources      -- ordered list of names to tile; the part of sources[0]
                    before the first "." must equal that of `ref_src`
    index        -- object whose get( ref_src, start, end ) returns the
                    alignment blocks overlapping the interval
    out          -- writer whose write() receives the tiled alignment
    ref_src      -- source name of the reference component
    start, end   -- interval on the reference to tile
    seq_db       -- mapping from source name to nib file path, used to
                    fill reference sequence not covered by any block
    missing_data -- accepted for interface compatibility; not used here
    strand       -- if '-', the alignment is reverse complemented before
                    being written

    Raises AssertionError if the reference does not match sources[0], if
    blocks overlap or are out of order in the reference, if a reference
    component is not on the plus strand, or if a tiled row ends up with a
    length different from the reference row.
    """
    ref_src_size = None
    # Make sure the reference component is also the first in the source list
    assert sources[0].split('.')[0] == ref_src.split('.')[0], "%s != %s" \
        % ( sources[0].split('.')[0], ref_src.split('.')[0] )
    # Counter for the last reference species base we have processed
    last_stop = start
    # Rows in maf blocks come in in arbitrary order, we'll convert things
    # to the desired order of the tiled block
    source_to_index = dict( ( name, i ) for ( i, name ) in enumerate( sources ) )
    # This gets all the maf blocks overlapping our interval of interest
    # NOTE: Unlike maf_tile we're expecting things to be single coverage in
    # the reference species, so we won't sort by score and lay down.
    blocks = index.get( ref_src, start, end )
    # The last component seen for each species onto which we are tiling
    last_components = [ None ] * len( sources )
    last_status = [ None ] * len( sources )
    # Columns seen for each species that still await a fill character
    cols_needing_fill = [ 0 ] * len( sources )
    # The list of strings in which we build up the tiled alignment
    tiled_rows = [ "" ] * len( sources )
    # Walk the (ordered) list of blocks
    for block in blocks:
        # Check for overlap in reference species
        ref = block.get_component_by_src_start( ref_src )
        if ref.start < last_stop:
            if ref.end < last_stop:
                continue
            block = block.slice_by_component( ref, last_stop, min( end, ref.end ) )
            ref = block.get_component_by_src_start( ref_src )
        # Clip the block to the requested interval
        block = block.slice_by_component( ref, max( start, ref.start ), min( end, ref.end ) )
        ref = block.get_component_by_src_start( ref_src )
        assert last_components[0] is None or ref.start >= last_components[0].end, \
            "MAF must be sorted and single coverage in reference species!"
        assert ref.strand == "+", \
            "MAF must have all reference species blocks on the plus strand"
        # Store the size of the reference sequence for building fake block
        if ref_src_size is None:
            ref_src_size = ref.src_size
        # Handle the reference component separately, it has no synteny status
        # but we will try to fill in missing sequence
        if ref.start > last_stop:
            # Need to fill in some reference sequence
            chunk_len = ref.start - last_stop
            tiled_rows[0] += _read_ref_sequence( seq_db, ref_src, last_stop, chunk_len )
            for source in sources[1:]:
                cols_needing_fill[ source_to_index[ source ] ] += chunk_len
        # Do reference component
        chunk_len = len( ref.text )
        tiled_rows[0] += ref.text
        # Do each other component
        for source in sources[1:]:
            source_index = source_to_index[ source ]
            comp = block.get_component_by_src_start( source )
            if comp:
                # Only the status part of each synteny annotation is used
                if comp.synteny_left is None:
                    left_status = None
                else:
                    left_status = comp.synteny_left[0]
                if comp.synteny_right is None:
                    right_status = None
                else:
                    right_status = comp.synteny_right[0]
                # We have a component, do we need to do some filling?
                cols_to_fill = cols_needing_fill[ source_index ]
                if cols_to_fill > 0:
                    # NOTE: the right status of the previous component is not
                    # required to match this component's left status.
                    if left_status is None:
                        fill_char = guess_fill_char( last_components[ source_index ], comp )
                    else:
                        fill_char = get_fill_char( left_status )
                    tiled_rows[ source_index ] += ( fill_char * cols_to_fill )
                    cols_needing_fill[ source_index ] = 0
                # Okay, filled up to current position, now append the text
                tiled_rows[ source_index ] += comp.text
                assert len( tiled_rows[ source_index ] ) == len( tiled_rows[ 0 ] ), \
                    "length of tiled row should match reference row"
                last_components[ source_index ] = comp
                last_status[ source_index ] = right_status
            else:
                # No component, we'll have to fill this region when we know
                # the status
                cols_needing_fill[ source_index ] += chunk_len
        last_stop = ref.end
    # No more components, clean up the ends
    if last_stop < end:
        # Need to fill in some reference sequence
        chunk_len = end - last_stop
        tiled_rows[0] += _read_ref_sequence( seq_db, ref_src, last_stop, chunk_len )
        for source in sources[1:]:
            cols_needing_fill[ source_to_index[ source ] ] += chunk_len
    # Any final filling that needs to be done?
    for source in sources[1:]:
        source_index = source_to_index[ source ]
        fill_needed = cols_needing_fill[ source_index ]
        if fill_needed > 0:
            if last_components[ source_index ] is None:
                # Never saw any components for this source
                fill_char = '@'
            elif last_status[ source_index ] is None:
                fill_char = '*'
            else:
                fill_char = get_fill_char( last_status[ source_index ] )
            tiled_rows[ source_index ] += fill_char * fill_needed
            assert len( tiled_rows[ source_index ] ) == len( tiled_rows[ 0 ] ), \
                "length of tiled row should match reference row"
    # Okay, now make up the fake alignment from the tiled rows.
    tiled_rows = remove_all_gap_columns( tiled_rows )
    # No block supplied the reference size, so take it from the nib file
    if ref_src_size is None:
        ref_src_size = _ref_sequence_length( seq_db, ref_src )
    a = align.Alignment()
    for i, name in enumerate( sources ):
        text = tiled_rows[i]
        size = len( text ) - text.count( "-" )
        if i == 0:
            c = align.Component( ref_src, start, end - start, "+", ref_src_size, text )
        else:
            c = align.Component( name + ".fake", 0, size, "?", size, text )
        a.add_component( c )
    if strand == '-':
        a = a.reverse_complement()
    out.write( a )
273
# Only run the tiler when executed as a script, so that importing this
# module has no side effects.
if __name__ == "__main__":
    main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/EGG-INFO/scripts/maf_tile_2.py

異なるフォーマットでダウンロード: