root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/tools/tile.py @ 3

リビジョン 3, 3.0 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Tools for tiling / projecting alignments onto an interval of a sequence.
3"""
4
5import bx.align as align
6from bx import misc
7import bx.seq.nib
8import os
9import string
10import sys
11
12
13def tile_interval( sources, index, ref_src, start, end, seq_db=None ):
14    """
15    Tile maf blocks onto an interval. The resulting block will span the interval
16    exactly and contain the column from the highest scoring alignment at each
17    position.
18   
19    `sources`: list of sequence source names to include in final block
20    `index`: an instnace that can return maf blocks overlapping intervals
21    `ref_src`: source name of the interval (ie, hg17.chr7)
22    `start`: start of interval
23    `end`: end of interval
24    `seq_db`: a mapping for source names in the reference species to nib files
25    """
26    # First entry in sources should also be on the reference species
27    assert sources[0].split('.')[0] == ref_src.split('.')[0], \
28        "%s != %s" % ( sources[0].split('.')[0], ref_src.split('.')[0] )
29    base_len = end - start
30    blocks = index.get( ref_src, start, end )
31    # From low to high score
32    blocks.sort( lambda a, b: cmp( a.score, b.score ) )
33    mask = [ -1 ] * base_len
34    ref_src_size = None
35    for i, block in enumerate( blocks ):
36        ref = block.get_component_by_src_start( ref_src )
37        ref_src_size = ref.src_size
38        assert ref.strand == "+"
39        slice_start = max( start, ref.start )
40        slice_end = min( end, ref.end )
41        for j in range( slice_start, slice_end ):
42            mask[j-start] = i
43    tiled = []
44    for i in range( len( sources ) ):
45        tiled.append( [] )
46    for ss, ee, index in intervals_from_mask( mask ):
47        # Interval with no covering alignments
48        if index < 0:
49            # Get sequence if available, otherwise just use 'N'
50            if seq_db:
51                tiled[0].append( bx.seq.nib.NibFile( open( seq_db[ ref_src ] ) ).get( start+ss, ee-ss ) )
52            else:
53                tiled[0].append( "N" * (ee-ss) )
54            # Gaps in all other species
55            for row in tiled[1:]:
56                row.append( "-" * ( ee - ss ) )
57        else:
58            slice_start = start + ss
59            slice_end = start + ee
60            block = blocks[index]
61            ref = block.get_component_by_src_start( ref_src )
62            sliced = block.slice_by_component( ref, slice_start, slice_end )
63            sliced = sliced.limit_to_species( sources )
64            sliced.remove_all_gap_columns()
65            for i, src in enumerate( sources ):
66                comp = sliced.get_component_by_src_start( src )
67                if comp:
68                    tiled[i].append( comp.text )
69                else:
70                    tiled[i].append( "-" * sliced.text_size )       
71    return [ "".join( t ) for t in tiled ]
72
73def intervals_from_mask( mask ):
74    start = 0
75    last = mask[0]
76    for i in range( 1, len( mask ) ):
77        if mask[i] != last:
78            yield start, i, last
79            start = i
80            last = mask[i]
81    yield start, len(mask), last
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。