1 | """ |
---|
2 | Tools for tiling / projecting alignments onto an interval of a sequence. |
---|
3 | """ |
---|
4 | |
---|
5 | import bx.align as align |
---|
6 | from bx import misc |
---|
7 | import bx.seq.nib |
---|
8 | import os |
---|
9 | import string |
---|
10 | import sys |
---|
11 | |
---|
12 | |
---|
13 | def tile_interval( sources, index, ref_src, start, end, seq_db=None ): |
---|
14 | """ |
---|
15 | Tile maf blocks onto an interval. The resulting block will span the interval |
---|
16 | exactly and contain the column from the highest scoring alignment at each |
---|
17 | position. |
---|
18 | |
---|
19 | `sources`: list of sequence source names to include in final block |
---|
20 | `index`: an instnace that can return maf blocks overlapping intervals |
---|
21 | `ref_src`: source name of the interval (ie, hg17.chr7) |
---|
22 | `start`: start of interval |
---|
23 | `end`: end of interval |
---|
24 | `seq_db`: a mapping for source names in the reference species to nib files |
---|
25 | """ |
---|
26 | # First entry in sources should also be on the reference species |
---|
27 | assert sources[0].split('.')[0] == ref_src.split('.')[0], \ |
---|
28 | "%s != %s" % ( sources[0].split('.')[0], ref_src.split('.')[0] ) |
---|
29 | base_len = end - start |
---|
30 | blocks = index.get( ref_src, start, end ) |
---|
31 | # From low to high score |
---|
32 | blocks.sort( lambda a, b: cmp( a.score, b.score ) ) |
---|
33 | mask = [ -1 ] * base_len |
---|
34 | ref_src_size = None |
---|
35 | for i, block in enumerate( blocks ): |
---|
36 | ref = block.get_component_by_src_start( ref_src ) |
---|
37 | ref_src_size = ref.src_size |
---|
38 | assert ref.strand == "+" |
---|
39 | slice_start = max( start, ref.start ) |
---|
40 | slice_end = min( end, ref.end ) |
---|
41 | for j in range( slice_start, slice_end ): |
---|
42 | mask[j-start] = i |
---|
43 | tiled = [] |
---|
44 | for i in range( len( sources ) ): |
---|
45 | tiled.append( [] ) |
---|
46 | for ss, ee, index in intervals_from_mask( mask ): |
---|
47 | # Interval with no covering alignments |
---|
48 | if index < 0: |
---|
49 | # Get sequence if available, otherwise just use 'N' |
---|
50 | if seq_db: |
---|
51 | tiled[0].append( bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).get( start+ss, ee-ss ) ) |
---|
52 | else: |
---|
53 | tiled[0].append( "N" * (ee-ss) ) |
---|
54 | # Gaps in all other species |
---|
55 | for row in tiled[1:]: |
---|
56 | row.append( "-" * ( ee - ss ) ) |
---|
57 | else: |
---|
58 | slice_start = start + ss |
---|
59 | slice_end = start + ee |
---|
60 | block = blocks[index] |
---|
61 | ref = block.get_component_by_src_start( ref_src ) |
---|
62 | sliced = block.slice_by_component( ref, slice_start, slice_end ) |
---|
63 | sliced = sliced.limit_to_species( sources ) |
---|
64 | sliced.remove_all_gap_columns() |
---|
65 | for i, src in enumerate( sources ): |
---|
66 | comp = sliced.get_component_by_src_start( src ) |
---|
67 | if comp: |
---|
68 | tiled[i].append( comp.text ) |
---|
69 | else: |
---|
70 | tiled[i].append( "-" * sliced.text_size ) |
---|
71 | return [ "".join( t ) for t in tiled ] |
---|
72 | |
---|
73 | def intervals_from_mask( mask ): |
---|
74 | start = 0 |
---|
75 | last = mask[0] |
---|
76 | for i in range( 1, len( mask ) ): |
---|
77 | if mask[i] != last: |
---|
78 | yield start, i, last |
---|
79 | start = i |
---|
80 | last = mask[i] |
---|
81 | yield start, len(mask), last |
---|