| 1 | """ |
|---|
| 2 | Tools for "threading" out specific species from alignments (removing other |
|---|
| 3 | species and fixing alignment text). |
|---|
| 4 | """ |
|---|
| 5 | |
|---|
| 6 | import sys |
|---|
| 7 | from itertools import * |
|---|
| 8 | from copy import deepcopy |
|---|
| 9 | |
|---|
| 10 | def thread( mafs, species ): |
|---|
| 11 | """ |
|---|
| 12 | Restrict an list of alignments to a given list of species by: |
|---|
| 13 | |
|---|
| 14 | 1) Removing components for any other species |
|---|
| 15 | 2) Remove any columns containing all gaps |
|---|
| 16 | |
|---|
| 17 | Example: |
|---|
| 18 | |
|---|
| 19 | >>> import bx.align.maf |
|---|
| 20 | |
|---|
| 21 | >>> block1 = bx.align.maf.from_string( ''' |
|---|
| 22 | ... a score=4964.0 |
|---|
| 23 | ... s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC |
|---|
| 24 | ... s rheMac2.chr20 58163346 43 - 88221753 ATATTATCTTAACATTAAAGA-AGAACAGTAATTCTGGTCATAA |
|---|
| 25 | ... s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT |
|---|
| 26 | ... s oryCun1.scaffold_175207 85970 22 + 212797 ----------------------AAAATATTAGTTATCACCATAT |
|---|
| 27 | ... s bosTau2.chr23 23894492 43 + 41602928 AAACTACCTTAATGTCACAGG-AAACAATGTATgctgctgctgc |
|---|
| 28 | ... ''' ) |
|---|
| 29 | |
|---|
| 30 | >>> block2 = bx.align.maf.from_string( ''' |
|---|
| 31 | ... a score=9151.0 |
|---|
| 32 | ... s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAG-GAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTT-TGAC |
|---|
| 33 | ... s oryCun1.scaffold_175207 85992 71 + 212797 TCTAGTGCTCTCCAATAATATAATAGATTATAACTTCATATAATTATGTGAAATATAAGATTATTTATCAG |
|---|
| 34 | ... s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAG-GGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTT-TGAT |
|---|
| 35 | ... s rheMac2.chr20 58163389 69 - 88221753 ACACATATTATTTCTTAACATGGAGGATTATATCTT-AAACATGTGTGCaaaatataaatatatat-tcaa |
|---|
| 36 | ... ''' ) |
|---|
| 37 | |
|---|
| 38 | >>> mafs = [ block1, block2 ] |
|---|
| 39 | |
|---|
| 40 | >>> threaded = [ t for t in thread( mafs, [ "hg18", "panTro1" ] ) ] |
|---|
| 41 | |
|---|
| 42 | >>> len( threaded ) |
|---|
| 43 | 2 |
|---|
| 44 | |
|---|
| 45 | >>> print threaded[0] |
|---|
| 46 | a score=0.0 |
|---|
| 47 | s hg18.chr10 52686 44 + 135374737 GTGCTAACTTACTGCTCCACAGAAAACATCAATTCTGCTCATGC |
|---|
| 48 | s panTro1.chrUn_random 208115356 44 - 240967748 GTGCTAACTGACTGCTCCAGAGAAAACATCAATTCTGTTCATGT |
|---|
| 49 | <BLANKLINE> |
|---|
| 50 | |
|---|
| 51 | >>> print threaded[1] |
|---|
| 52 | a score=0.0 |
|---|
| 53 | s hg18.chr10 52730 69 + 135374737 GCAGGTACAATTCATCAAGAAAGGAATTACAACTTCAGAAATGTGTTCAAAATATATCCATACTTTGAC |
|---|
| 54 | s panTro1.chrUn_random 208115400 69 - 240967748 GCAGCTACTATTCATCAAGAAAGGGATTACAACTTCAGAAATGTGTTCAAAGTGTATCCATACTTTGAT |
|---|
| 55 | <BLANKLINE> |
|---|
| 56 | |
|---|
| 57 | """ |
|---|
| 58 | for m in mafs: |
|---|
| 59 | new_maf = deepcopy( m ) |
|---|
| 60 | new_components = get_components_for_species( new_maf, species ) |
|---|
| 61 | if new_components: |
|---|
| 62 | remove_all_gap_columns( new_components ) |
|---|
| 63 | new_maf.components = new_components |
|---|
| 64 | new_maf.score = 0.0 |
|---|
| 65 | new_maf.text_size = len(new_components[0].text) |
|---|
| 66 | yield new_maf |
|---|
| 67 | |
|---|
| 68 | def get_components_for_species( alignment, species ): |
|---|
| 69 | """Return the component for each species in the list `species` or None""" |
|---|
| 70 | # If the number of components in the alignment is less that the requested number |
|---|
| 71 | # of species we can immediately fail |
|---|
| 72 | if len( alignment.components ) < len( species ): return None |
|---|
| 73 | # Otherwise, build an index of components by species, then lookup |
|---|
| 74 | index = dict( [ ( c.src.split( '.' )[0], c ) for c in alignment.components ] ) |
|---|
| 75 | try: return [ index[s] for s in species ] |
|---|
| 76 | except: return None |
|---|
| 77 | |
|---|
| 78 | def remove_all_gap_columns( components ): |
|---|
| 79 | """ |
|---|
| 80 | Remove any columns containing only gaps from a set of alignment components, |
|---|
| 81 | text of components is modified IN PLACE. |
|---|
| 82 | |
|---|
| 83 | TODO: Optimize this with Pyrex. |
|---|
| 84 | """ |
|---|
| 85 | seqs = [ list( c.text ) for c in components ] |
|---|
| 86 | i = 0 |
|---|
| 87 | text_size = len( seqs[0] ) |
|---|
| 88 | while i < text_size: |
|---|
| 89 | all_gap = True |
|---|
| 90 | for seq in seqs: |
|---|
| 91 | if seq[i] != '-': all_gap = False |
|---|
| 92 | if all_gap: |
|---|
| 93 | for seq in seqs: del seq[i] |
|---|
| 94 | text_size -= 1 |
|---|
| 95 | else: |
|---|
| 96 | i += 1 |
|---|
| 97 | for i in range( len( components ) ): |
|---|
| 98 | components[i].text = ''.join( seqs[i] ) |
|---|