root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/tools/thread.py @ 3

リビジョン 3, 3.9 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Tools for "threading" out specific species from alignments (removing other
3species and fixing alignment text).
4"""
5
6import sys
7from itertools import *
8from copy import deepcopy
9
10def thread( mafs, species ):
11    """
12    Restrict an list of alignments to a given list of species by:
13   
14        1) Removing components for any other species
15        2) Remove any columns containing all gaps
16     
17    Example:
18     
19    >>> import bx.align.maf
20     
21    >>> block1 = bx.align.maf.from_string( '''
22    ... a score=4964.0
23    ... s hg18.chr10                  52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC
24    ... s rheMac2.chr20            58163346 43 -  88221753 ATATTATCTTAACATTAAAGA-AGAACAGTAATTCTGGTCATAA
25    ... s panTro1.chrUn_random    208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT
26    ... s oryCun1.scaffold_175207     85970 22 +    212797 ----------------------AAAATATTAGTTATCACCATAT
27    ... s bosTau2.chr23            23894492 43 +  41602928 AAACTACCTTAATGTCACAGG-AAACAATGTATgctgctgctgc
28    ... ''' )
29   
30    >>> block2 = bx.align.maf.from_string( '''
31    ... a score=9151.0
32    ... s hg18.chr10                  52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAG-GAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTT-TGAC
33    ... s oryCun1.scaffold_175207     85992 71 +    212797 TCTAGTGCTCTCCAATAATATAATAGATTATAACTTCATATAATTATGTGAAATATAAGATTATTTATCAG
34    ... s panTro1.chrUn_random    208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAG-GGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTT-TGAT
35    ... s rheMac2.chr20            58163389 69 -  88221753 ACACATATTATTTCTTAACATGGAGGATTATATCTT-AAACATGTGTGCaaaatataaatatatat-tcaa
36    ... ''' )
37   
38    >>> mafs = [ block1, block2 ]
39   
40    >>> threaded = [ t for t in thread( mafs, [ "hg18", "panTro1" ] ) ]
41   
42    >>> len( threaded )
43    2
44   
45    >>> print threaded[0]
46    a score=0.0
47    s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC
48    s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT
49    <BLANKLINE>
50
51    >>> print threaded[1]
52    a score=0.0
53    s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAGGAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTTTGAC
54    s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAGGGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTTTGAT
55    <BLANKLINE>
56   
57    """
58    for m in mafs:
59        new_maf = deepcopy( m )
60        new_components = get_components_for_species( new_maf, species )
61        if new_components:
62            remove_all_gap_columns( new_components )         
63            new_maf.components = new_components
64            new_maf.score = 0.0
65            new_maf.text_size = len(new_components[0].text)
66            yield new_maf
67       
68def get_components_for_species( alignment, species ):
69    """Return the component for each species in the list `species` or None"""
70    # If the number of components in the alignment is less that the requested number
71    # of species we can immediately fail
72    if len( alignment.components ) < len( species ): return None
73    # Otherwise, build an index of components by species, then lookup
74    index = dict( [ ( c.src.split( '.' )[0], c ) for c in alignment.components ] )
75    try: return [ index[s] for s in species ]
76    except: return None
77
78def remove_all_gap_columns( components ):
79    """
80    Remove any columns containing only gaps from a set of alignment components,
81    text of components is modified IN PLACE.
82   
83    TODO: Optimize this with Pyrex.
84    """       
85    seqs = [ list( c.text ) for c in components ]
86    i = 0
87    text_size = len( seqs[0] )
88    while i < text_size:
89        all_gap = True
90        for seq in seqs:
91            if seq[i] != '-': all_gap = False
92        if all_gap:
93            for seq in seqs: del seq[i]
94            text_size -= 1
95        else:
96            i += 1
97    for i in range( len( components ) ):
98        components[i].text = ''.join( seqs[i] )
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。