[3] | 1 | """ |
---|
| 2 | Tools for tiling / projecting alignments onto an interval of a sequence. |
---|
| 3 | """ |
---|
| 4 | |
---|
| 5 | import bx.align as align |
---|
| 6 | from bx import misc |
---|
| 7 | import bx.seq.nib |
---|
| 8 | import os |
---|
| 9 | import string |
---|
| 10 | import sys |
---|
| 11 | |
---|
| 12 | |
---|
| 13 | def tile_interval( sources, index, ref_src, start, end, seq_db=None ): |
---|
| 14 | """ |
---|
| 15 | Tile maf blocks onto an interval. The resulting block will span the interval |
---|
| 16 | exactly and contain the column from the highest scoring alignment at each |
---|
| 17 | position. |
---|
| 18 | |
---|
| 19 | `sources`: list of sequence source names to include in final block |
---|
| 20 | `index`: an instnace that can return maf blocks overlapping intervals |
---|
| 21 | `ref_src`: source name of the interval (ie, hg17.chr7) |
---|
| 22 | `start`: start of interval |
---|
| 23 | `end`: end of interval |
---|
| 24 | `seq_db`: a mapping for source names in the reference species to nib files |
---|
| 25 | """ |
---|
| 26 | # First entry in sources should also be on the reference species |
---|
| 27 | assert sources[0].split('.')[0] == ref_src.split('.')[0], \ |
---|
| 28 | "%s != %s" % ( sources[0].split('.')[0], ref_src.split('.')[0] ) |
---|
| 29 | base_len = end - start |
---|
| 30 | blocks = index.get( ref_src, start, end ) |
---|
| 31 | # From low to high score |
---|
| 32 | blocks.sort( lambda a, b: cmp( a.score, b.score ) ) |
---|
| 33 | mask = [ -1 ] * base_len |
---|
| 34 | ref_src_size = None |
---|
| 35 | for i, block in enumerate( blocks ): |
---|
| 36 | ref = block.get_component_by_src_start( ref_src ) |
---|
| 37 | ref_src_size = ref.src_size |
---|
| 38 | assert ref.strand == "+" |
---|
| 39 | slice_start = max( start, ref.start ) |
---|
| 40 | slice_end = min( end, ref.end ) |
---|
| 41 | for j in range( slice_start, slice_end ): |
---|
| 42 | mask[j-start] = i |
---|
| 43 | tiled = [] |
---|
| 44 | for i in range( len( sources ) ): |
---|
| 45 | tiled.append( [] ) |
---|
| 46 | for ss, ee, index in intervals_from_mask( mask ): |
---|
| 47 | # Interval with no covering alignments |
---|
| 48 | if index < 0: |
---|
| 49 | # Get sequence if available, otherwise just use 'N' |
---|
| 50 | if seq_db: |
---|
| 51 | tiled[0].append( bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).get( start+ss, ee-ss ) ) |
---|
| 52 | else: |
---|
| 53 | tiled[0].append( "N" * (ee-ss) ) |
---|
| 54 | # Gaps in all other species |
---|
| 55 | for row in tiled[1:]: |
---|
| 56 | row.append( "-" * ( ee - ss ) ) |
---|
| 57 | else: |
---|
| 58 | slice_start = start + ss |
---|
| 59 | slice_end = start + ee |
---|
| 60 | block = blocks[index] |
---|
| 61 | ref = block.get_component_by_src_start( ref_src ) |
---|
| 62 | sliced = block.slice_by_component( ref, slice_start, slice_end ) |
---|
| 63 | sliced = sliced.limit_to_species( sources ) |
---|
| 64 | sliced.remove_all_gap_columns() |
---|
| 65 | for i, src in enumerate( sources ): |
---|
| 66 | comp = sliced.get_component_by_src_start( src ) |
---|
| 67 | if comp: |
---|
| 68 | tiled[i].append( comp.text ) |
---|
| 69 | else: |
---|
| 70 | tiled[i].append( "-" * sliced.text_size ) |
---|
| 71 | return [ "".join( t ) for t in tiled ] |
---|
| 72 | |
---|
| 73 | def intervals_from_mask( mask ): |
---|
| 74 | start = 0 |
---|
| 75 | last = mask[0] |
---|
| 76 | for i in range( 1, len( mask ) ): |
---|
| 77 | if mask[i] != last: |
---|
| 78 | yield start, i, last |
---|
| 79 | start = i |
---|
| 80 | last = mask[i] |
---|
| 81 | yield start, len(mask), last |
---|