1 | """ |
---|
2 | Tools for "threading" out specific species from alignments (removing other |
---|
3 | species and fixing alignment text). |
---|
4 | """ |
---|
5 | |
---|
6 | import sys |
---|
7 | from itertools import * |
---|
8 | from copy import deepcopy |
---|
9 | |
---|
10 | def thread( mafs, species ): |
---|
11 | """ |
---|
12 | Restrict an list of alignments to a given list of species by: |
---|
13 | |
---|
14 | 1) Removing components for any other species |
---|
15 | 2) Remove any columns containing all gaps |
---|
16 | |
---|
17 | Example: |
---|
18 | |
---|
19 | >>> import bx.align.maf |
---|
20 | |
---|
21 | >>> block1 = bx.align.maf.from_string( ''' |
---|
22 | ... a score=4964.0 |
---|
23 | ... s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC |
---|
24 | ... s rheMac2.chr20 58163346 43 - 88221753 ATATTATCTTAACATTAAAGA-AGAACAGTAATTCTGGTCATAA |
---|
25 | ... s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT |
---|
26 | ... s oryCun1.scaffold_175207 85970 22 + 212797 ----------------------AAAATATTAGTTATCACCATAT |
---|
27 | ... s bosTau2.chr23 23894492 43 + 41602928 AAACTACCTTAATGTCACAGG-AAACAATGTATgctgctgctgc |
---|
28 | ... ''' ) |
---|
29 | |
---|
30 | >>> block2 = bx.align.maf.from_string( ''' |
---|
31 | ... a score=9151.0 |
---|
32 | ... s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAG-GAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTT-TGAC |
---|
33 | ... s oryCun1.scaffold_175207 85992 71 + 212797 TCTAGTGCTCTCCAATAATATAATAGATTATAACTTCATATAATTATGTGAAATATAAGATTATTTATCAG |
---|
34 | ... s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAG-GGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTT-TGAT |
---|
35 | ... s rheMac2.chr20 58163389 69 - 88221753 ACACATATTATTTCTTAACATGGAGGATTATATCTT-AAACATGTGTGCaaaatataaatatatat-tcaa |
---|
36 | ... ''' ) |
---|
37 | |
---|
38 | >>> mafs = [ block1, block2 ] |
---|
39 | |
---|
40 | >>> threaded = [ t for t in thread( mafs, [ "hg18", "panTro1" ] ) ] |
---|
41 | |
---|
42 | >>> len( threaded ) |
---|
43 | 2 |
---|
44 | |
---|
45 | >>> print threaded[0] |
---|
46 | a score=0.0 |
---|
47 | s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC |
---|
48 | s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT |
---|
49 | <BLANKLINE> |
---|
50 | |
---|
51 | >>> print threaded[1] |
---|
52 | a score=0.0 |
---|
53 | s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAGGAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTTTGAC |
---|
54 | s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAGGGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTTTGAT |
---|
55 | <BLANKLINE> |
---|
56 | |
---|
57 | """ |
---|
58 | for m in mafs: |
---|
59 | new_maf = deepcopy( m ) |
---|
60 | new_components = get_components_for_species( new_maf, species ) |
---|
61 | if new_components: |
---|
62 | remove_all_gap_columns( new_components ) |
---|
63 | new_maf.components = new_components |
---|
64 | new_maf.score = 0.0 |
---|
65 | new_maf.text_size = len(new_components[0].text) |
---|
66 | yield new_maf |
---|
67 | |
---|
68 | def get_components_for_species( alignment, species ): |
---|
69 | """Return the component for each species in the list `species` or None""" |
---|
70 | # If the number of components in the alignment is less that the requested number |
---|
71 | # of species we can immediately fail |
---|
72 | if len( alignment.components ) < len( species ): return None |
---|
73 | # Otherwise, build an index of components by species, then lookup |
---|
74 | index = dict( [ ( c.src.split( '.' )[0], c ) for c in alignment.components ] ) |
---|
75 | try: return [ index[s] for s in species ] |
---|
76 | except: return None |
---|
77 | |
---|
78 | def remove_all_gap_columns( components ): |
---|
79 | """ |
---|
80 | Remove any columns containing only gaps from a set of alignment components, |
---|
81 | text of components is modified IN PLACE. |
---|
82 | |
---|
83 | TODO: Optimize this with Pyrex. |
---|
84 | """ |
---|
85 | seqs = [ list( c.text ) for c in components ] |
---|
86 | i = 0 |
---|
87 | text_size = len( seqs[0] ) |
---|
88 | while i < text_size: |
---|
89 | all_gap = True |
---|
90 | for seq in seqs: |
---|
91 | if seq[i] != '-': all_gap = False |
---|
92 | if all_gap: |
---|
93 | for seq in seqs: del seq[i] |
---|
94 | text_size -= 1 |
---|
95 | else: |
---|
96 | i += 1 |
---|
97 | for i in range( len( components ) ): |
---|
98 | components[i].text = ''.join( seqs[i] ) |
---|