Context Navigation

maf_tile.py

リビジョン 3, 4.4 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	#!/usr/bin/python2.6
2
3	"""
4	'Tile' the blocks of a maf file over each of a set of intervals. The
5	highest scoring block that covers any part of a region will be used, and
6	pieces not covered by any block filled with "-" or optionally "*". The list
7	of species to tile is specified by `tree` (either a tree or just a comma
8	separated list). The `seq_db` is a lookup table mapping chromosome names
9	to nib file for filling in the reference species. Maf files must be indexed.
10
11	NOTE: See maf_tile_2.py for a more sophisticated version of this program, I
12	think this one will be eliminated in the future.
13
14	usage: %prog tree maf_files...
15	-m, --missingData: Inserts wildcards for missing block rows instead of '-'
16	"""
17
18	import psyco_full
19
20	from bx.cookbook import doc_optparse
21
22	import bx.align.maf
23	import bx.align as align
24	from bx import misc
25	import bx.seq.nib
26	import os
27	import string
28	import sys
29
30	tree_tx = string.maketrans( "(),", " " )
31
32	def main():
33
34	options, args = doc_optparse.parse( __doc__ )
35	try:
36	sources = args[0].translate( tree_tx ).split()
37	seq_db = load_seq_db( args[1] )
38	index = bx.align.maf.MultiIndexed( args[2:] )
39
40	out = bx.align.maf.Writer( sys.stdout )
41	missing_data = bool(options.missingData)
42	except:
43	doc_optparse.exception()
44
45	for line in sys.stdin:
46	ref_src, start, end = line.split()[0:3]
47	do_interval( sources, index, out, ref_src, int( start ), int( end ), seq_db, missing_data )
48
49	out.close()
50
51	def load_seq_db( fname ):
52	db = {}
53	for line in open( fname ):
54	fields = line.split(',')
55	src = fields[1] + "." + fields[2]
56	seq = fields[4]
57	db[src]=seq.strip()
58	return db
59
60	def do_interval( sources, index, out, ref_src, start, end, seq_db, missing_data ):
61
62	assert sources[0].split('.')[0] == ref_src.split('.')[0], "%s != %s" % ( sources[0].split('.')[0], ref_src.split('.')[0] )
63
64	base_len = end - start
65
66	blocks = index.get( ref_src, start, end )
67	# From low to high score
68	blocks.sort( lambda a, b: cmp( a.score, b.score ) )
69
70	mask = [ -1 ] * base_len
71
72	# print len( blocks )
73	# print blocks[0]
74
75	ref_src_size = None
76	for i, block in enumerate( blocks ):
77	ref = block.get_component_by_src_start( ref_src )
78	ref_src_size = ref.src_size
79	assert ref.strand == "+"
80	slice_start = max( start, ref.start )
81	slice_end = min( end, ref.end )
82	for j in range( slice_start, slice_end ):
83	mask[j-start] = i
84
85	#print >>sys.stderr, mask
86
87	tiled = []
88	for i in range( len( sources ) ): tiled.append( [] )
89
90	for ss, ee, index in intervals_from_mask( mask ):
91	if index < 0:
92	tiled[0].append( bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).get( start+ss, ee-ss ) )
93	for row in tiled[1:]:
94	if missing_data:
95	row.append( "" ( ee - ss ) )
96	else:
97	row.append( "-" * ( ee - ss ) )
98	else:
99	slice_start = start + ss
100	slice_end = start + ee
101	block = blocks[index]
102	ref = block.get_component_by_src_start( ref_src )
103	sliced = block.slice_by_component( ref, slice_start, slice_end )
104	sliced = sliced.limit_to_species( sources )
105	sliced.remove_all_gap_columns()
106	for i, src in enumerate( sources ):
107	comp = sliced.get_component_by_src_start( src )
108	if comp:
109	tiled[i].append( comp.text )
110	else:
111	if missing_data: tiled[i].append( "" sliced.text_size )
112	else: tiled[i].append( "-" * sliced.text_size )
113
114	a = align.Alignment()
115	for i, name in enumerate( sources ):
116	text = "".join( tiled[i] )
117	size = len( text ) - text.count( "-" )
118	if i == 0:
119	if ref_src_size is None: ref_src_size = bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).length
120	c = align.Component( ref_src, start, end-start, "+", ref_src_size, text )
121	else:
122	c = align.Component( name + ".fake", 0, size, "?", size, text )
123	a.add_component( c )
124
125	out.write( a )
126
127
128	def intervals_from_mask( mask ):
129	start = 0
130	last = mask[0]
131	for i in range( 1, len( mask ) ):
132	if mask[i] != last:
133	yield start, i, last
134	start = i
135	last = mask[i]
136	yield start, len(mask), last
137
138	main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/EGG-INFO/scripts/maf_tile.py

異なるフォーマットでダウンロード: