[3] | 1 | """ |
---|
| 2 | Classes for char-to-int mapping and int-to-int mapping. |
---|
| 3 | |
---|
| 4 | :Author: James Taylor (james@bx.psu.edu) |
---|
| 5 | |
---|
| 6 | The char-to-int mapping can be used to translate a list of strings |
---|
| 7 | over some alphabet to a single int array (example for encoding a multiple |
---|
| 8 | sequence alignment). |
---|
| 9 | |
---|
| 10 | The int-to-int mapping is particularly useful for creating partitions, |
---|
| 11 | and provides methods to merge/split symbols in the output mapping. |
---|
| 12 | |
---|
| 13 | The two forms of mapping can be combined, for example to encode a |
---|
| 14 | multiple sequence alignment in a reduced alphabet defined by a partition |
---|
| 15 | of alignment columns. Many of the helper methods provided are for |
---|
| 16 | solving such alignment oriented problems. |
---|
| 17 | |
---|
| 18 | This code was originally written for the `ESPERR`_ project which includes |
---|
| 19 | software for searching for alignment encodings that work well for specific |
---|
| 20 | classification problems using various Markov chain classifiers over the |
---|
| 21 | reduced encodings. |
---|
| 22 | |
---|
| 23 | Most of the core implementation is in the pyrex/C extension |
---|
| 24 | "_seqmapping.pyx" for performance reasons (specifically to avoid the |
---|
| 25 | excessive bounds checking that would make a sequence/array lookup heavy |
---|
| 26 | problem like this slow in pure python). |
---|
| 27 | |
---|
| 28 | .. _ESPERR: http://www.bx.psu.edu/projects/esperr/ |
---|
| 29 | """ |
---|
| 30 | |
---|
| 31 | from _seqmapping import * |
---|
| 32 | |
---|
| 33 | # Char->Int mapping for DNA characters with missing data |
---|
| 34 | |
---|
# Table of (characters, code) pairs: every character in the string is mapped
# to the integer code next to it. Upper- and lower-case nucleotides share a
# code; "-" and "*" each get a code of their own.
DNA = CharToIntArrayMapping()
for _chars, _code in ( ( "aA", 0 ), ( "cC", 1 ), ( "gG", 2 ), ( "tT", 3 ), ( "-", 4 ), ( "*", 5 ) ):
    for _char in _chars:
        DNA.set_mapping( _char, _code )
# Drop the loop temporaries so the module namespace only gains `DNA`.
del _chars, _char, _code
---|
| 46 | |
---|
| 47 | # Creating mappings |
---|
| 48 | |
---|
def alignment_mapping_from_file( f, char_mapping=DNA ):
    """
    Create a mapping from a file of alignment columns.

    Every line of `f` is split on whitespace into exactly two fields: a
    column string and a symbol, which is converted with `int`. The length
    of the first column string is taken as the alignment count, and the
    resulting `IntToIntMapping` is created with
    ``char_mapping.get_out_size() ** align_count`` as its size argument.

    Returns the tuple ``( align_count, mapping )``.

    Raises `ValueError` for a line that does not split into two fields or
    whose symbol is not an integer, and `IndexError` when `f` yields no
    lines at all.
    """
    # Read the whole file first so the alignment count is known before the
    # mapping is allocated.
    entries = []
    for raw_line in f:
        column_text, symbol_text = raw_line.split()
        entries.append( ( column_text, int( symbol_text ) ) )

    align_count = len( entries[0][0] )

    mapping = IntToIntMapping( char_mapping.get_out_size() ** align_count )

    for column_text, symbol in entries:
        # The column is passed as a list of its single characters; the first
        # element of the translated result is used as the input index
        # (presumably the combined code for the whole column -- confirm
        # against the `translate_list` implementation).
        translated = char_mapping.translate_list( list( column_text ) )
        mapping.set_mapping( translated[0], symbol )

    return align_count, mapping
---|
| 68 | |
---|
def second_mapping_from_file( f, first_mapping, char_mapping=DNA ):
    """
    Create a mapping over the outputs of `first_mapping` from a file of
    alignment columns.

    Every line of `f` is split on whitespace into exactly two fields: a
    column string and a symbol, which is converted with `int`. Each column
    is translated with `char_mapping`, the result is looked up in
    `first_mapping`, and when that lookup is non-negative the looked-up
    value is mapped to the line's symbol in the returned `IntToIntMapping`
    (created with ``first_mapping.get_out_size()`` as its size argument).
    Columns whose lookup in `first_mapping` is negative are skipped.

    A file with no lines yields a mapping on which nothing has been set.

    Raises `ValueError` for a line that does not split into two fields or
    whose symbol is not an integer.
    """
    # Parse the whole file before building anything, so malformed input is
    # reported before any mapping is created.
    columns, symbols = [], []
    for line in f:
        column, symbol = line.split()
        columns.append( column )
        symbols.append( int( symbol ) )

    mapping = IntToIntMapping( first_mapping.get_out_size() )

    for column, symbol in zip( columns, symbols ):
        index = char_mapping.translate_list( list( column ) )[0]
        # Look the first-level symbol up once and reuse it for both the
        # check and the assignment.
        first_symbol = first_mapping[index]
        if first_symbol >= 0:
            mapping.set_mapping( first_symbol, symbol )

    return mapping
---|
| 87 | |
---|
| 88 | |
---|
def identity_mapping( size ):
    """
    Create an `IntToIntMapping` of the given `size` in which every value in
    ``range( size )`` is mapped to itself.
    """
    identity = IntToIntMapping( size )
    assign = identity.set_mapping
    for value in range( size ):
        assign( value, value )
    return identity
---|
| 94 | |
---|