| 1 | #!/usr/bin/python2.6 |
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | 'Tile' the blocks of a maf file over each of a set of intervals. The |
|---|
| 5 | highest scoring block that covers any part of a region will be used, and |
|---|
| 6 | pieces not covered by any block filled with "-" or optionally "*". The list |
|---|
| 7 | of species to tile is specified by `tree` (either a tree or just a comma |
|---|
| 8 | separated list). The `seq_db` is a lookup table mapping chromosome names |
|---|
| 9 | to nib file for filling in the reference species. Maf files must be indexed. |
|---|
| 10 | |
|---|
| 11 | NOTE: See maf_tile_2.py for a more sophisticated version of this program, I |
|---|
| 12 | think this one will be eliminated in the future. |
|---|
| 13 | |
|---|
| 14 | usage: %prog tree maf_files... |
|---|
| 15 | -m, --missingData: Inserts wildcards for missing block rows instead of '-' |
|---|
| 16 | """ |
|---|
| 17 | |
|---|
| 18 | import psyco_full |
|---|
| 19 | |
|---|
| 20 | from bx.cookbook import doc_optparse |
|---|
| 21 | |
|---|
| 22 | import bx.align.maf |
|---|
| 23 | import bx.align as align |
|---|
| 24 | from bx import misc |
|---|
| 25 | import bx.seq.nib |
|---|
| 26 | import os |
|---|
| 27 | import string |
|---|
| 28 | import sys |
|---|
| 29 | |
|---|
| 30 | tree_tx = string.maketrans( "(),", " " ) |
|---|
| 31 | |
|---|
| 32 | def main(): |
|---|
| 33 | |
|---|
| 34 | options, args = doc_optparse.parse( __doc__ ) |
|---|
| 35 | try: |
|---|
| 36 | sources = args[0].translate( tree_tx ).split() |
|---|
| 37 | seq_db = load_seq_db( args[1] ) |
|---|
| 38 | index = bx.align.maf.MultiIndexed( args[2:] ) |
|---|
| 39 | |
|---|
| 40 | out = bx.align.maf.Writer( sys.stdout ) |
|---|
| 41 | missing_data = bool(options.missingData) |
|---|
| 42 | except: |
|---|
| 43 | doc_optparse.exception() |
|---|
| 44 | |
|---|
| 45 | for line in sys.stdin: |
|---|
| 46 | ref_src, start, end = line.split()[0:3] |
|---|
| 47 | do_interval( sources, index, out, ref_src, int( start ), int( end ), seq_db, missing_data ) |
|---|
| 48 | |
|---|
| 49 | out.close() |
|---|
| 50 | |
|---|
| 51 | def load_seq_db( fname ): |
|---|
| 52 | db = {} |
|---|
| 53 | for line in open( fname ): |
|---|
| 54 | fields = line.split(',') |
|---|
| 55 | src = fields[1] + "." + fields[2] |
|---|
| 56 | seq = fields[4] |
|---|
| 57 | db[src]=seq.strip() |
|---|
| 58 | return db |
|---|
| 59 | |
|---|
| 60 | def do_interval( sources, index, out, ref_src, start, end, seq_db, missing_data ): |
|---|
| 61 | |
|---|
| 62 | assert sources[0].split('.')[0] == ref_src.split('.')[0], "%s != %s" % ( sources[0].split('.')[0], ref_src.split('.')[0] ) |
|---|
| 63 | |
|---|
| 64 | base_len = end - start |
|---|
| 65 | |
|---|
| 66 | blocks = index.get( ref_src, start, end ) |
|---|
| 67 | # From low to high score |
|---|
| 68 | blocks.sort( lambda a, b: cmp( a.score, b.score ) ) |
|---|
| 69 | |
|---|
| 70 | mask = [ -1 ] * base_len |
|---|
| 71 | |
|---|
| 72 | # print len( blocks ) |
|---|
| 73 | # print blocks[0] |
|---|
| 74 | |
|---|
| 75 | ref_src_size = None |
|---|
| 76 | for i, block in enumerate( blocks ): |
|---|
| 77 | ref = block.get_component_by_src_start( ref_src ) |
|---|
| 78 | ref_src_size = ref.src_size |
|---|
| 79 | assert ref.strand == "+" |
|---|
| 80 | slice_start = max( start, ref.start ) |
|---|
| 81 | slice_end = min( end, ref.end ) |
|---|
| 82 | for j in range( slice_start, slice_end ): |
|---|
| 83 | mask[j-start] = i |
|---|
| 84 | |
|---|
| 85 | #print >>sys.stderr, mask |
|---|
| 86 | |
|---|
| 87 | tiled = [] |
|---|
| 88 | for i in range( len( sources ) ): tiled.append( [] ) |
|---|
| 89 | |
|---|
| 90 | for ss, ee, index in intervals_from_mask( mask ): |
|---|
| 91 | if index < 0: |
|---|
| 92 | tiled[0].append( bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).get( start+ss, ee-ss ) ) |
|---|
| 93 | for row in tiled[1:]: |
|---|
| 94 | if missing_data: |
|---|
| 95 | row.append( "*" * ( ee - ss ) ) |
|---|
| 96 | else: |
|---|
| 97 | row.append( "-" * ( ee - ss ) ) |
|---|
| 98 | else: |
|---|
| 99 | slice_start = start + ss |
|---|
| 100 | slice_end = start + ee |
|---|
| 101 | block = blocks[index] |
|---|
| 102 | ref = block.get_component_by_src_start( ref_src ) |
|---|
| 103 | sliced = block.slice_by_component( ref, slice_start, slice_end ) |
|---|
| 104 | sliced = sliced.limit_to_species( sources ) |
|---|
| 105 | sliced.remove_all_gap_columns() |
|---|
| 106 | for i, src in enumerate( sources ): |
|---|
| 107 | comp = sliced.get_component_by_src_start( src ) |
|---|
| 108 | if comp: |
|---|
| 109 | tiled[i].append( comp.text ) |
|---|
| 110 | else: |
|---|
| 111 | if missing_data: tiled[i].append( "*" * sliced.text_size ) |
|---|
| 112 | else: tiled[i].append( "-" * sliced.text_size ) |
|---|
| 113 | |
|---|
| 114 | a = align.Alignment() |
|---|
| 115 | for i, name in enumerate( sources ): |
|---|
| 116 | text = "".join( tiled[i] ) |
|---|
| 117 | size = len( text ) - text.count( "-" ) |
|---|
| 118 | if i == 0: |
|---|
| 119 | if ref_src_size is None: ref_src_size = bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).length |
|---|
| 120 | c = align.Component( ref_src, start, end-start, "+", ref_src_size, text ) |
|---|
| 121 | else: |
|---|
| 122 | c = align.Component( name + ".fake", 0, size, "?", size, text ) |
|---|
| 123 | a.add_component( c ) |
|---|
| 124 | |
|---|
| 125 | out.write( a ) |
|---|
| 126 | |
|---|
| 127 | |
|---|
| 128 | def intervals_from_mask( mask ): |
|---|
| 129 | start = 0 |
|---|
| 130 | last = mask[0] |
|---|
| 131 | for i in range( 1, len( mask ) ): |
|---|
| 132 | if mask[i] != last: |
|---|
| 133 | yield start, i, last |
|---|
| 134 | start = i |
|---|
| 135 | last = mask[i] |
|---|
| 136 | yield start, len(mask), last |
|---|
| 137 | |
|---|
| 138 | main() |
|---|