| 1 | """ |
|---|
| 2 | Classes for char-to-int mapping and int-to-int mapping. |
|---|
| 3 | |
|---|
| 4 | :Author: James Taylor (james@bx.psu.edu) |
|---|
| 5 | |
|---|
| 6 | The char-to-int mapping can be used to translate a list of strings |
|---|
| 7 | over some alphabet to a single int array (example for encoding a multiple |
|---|
| 8 | sequence alignment). |
|---|
| 9 | |
|---|
| 10 | The int-to-int mapping is particularly useful for creating partitions, |
|---|
| 11 | and provides methods to merge/split symbols in the output mapping. |
|---|
| 12 | |
|---|
| 13 | The two forms of mapping can be combined, for example to encode a |
|---|
| 14 | multiple sequence alignment in a reduced alphabet defined by a partition |
|---|
| 15 | of alignment columns. Many of the helper methods provided are for |
|---|
| 16 | solving such alignment oriented problems. |
|---|
| 17 | |
|---|
| 18 | This code was originally written for the `ESPERR`_ project which includes |
|---|
| 19 | software for searching for alignment encodings that work well for specific |
|---|
| 20 | classification problems using various Markov chain classifiers over the |
|---|
| 21 | reduced encodings. |
|---|
| 22 | |
|---|
| 23 | Most of the core implementation is in the pyrex/C extension |
|---|
| 24 | "_seqmapping.pyx" for performance reasons (specifically to avoid the |
|---|
| 25 | excessive bounds checking that would make a sequence/array lookup heavy |
|---|
| 26 | problem like this slow in pure python). |
|---|
| 27 | |
|---|
| 28 | .. _ESPERR: http://www.bx.psu.edu/projects/esperr/ |
|---|
| 29 | """ |
|---|
| 30 | |
|---|
| 31 | from _seqmapping import * |
|---|
| 32 | |
|---|
| 33 | # Char->Int mapping for DNA characters with missing data |
|---|
| 34 | |
|---|
# Table driven construction of the DNA mapping: the position of each entry
# is the output symbol, and every character in the entry maps to it. Both
# cases of a base share a symbol (0-3); "-" maps to 4 and "*" maps to 5.
# Lower case is registered before upper case for each base.
DNA = CharToIntArrayMapping()
for _symbol, _chars in enumerate( ( "aA", "cC", "gG", "tT", "-", "*" ) ):
    for _char in _chars:
        DNA.set_mapping( _char, _symbol )
# Do not leave the loop temporaries behind as module attributes
del _symbol, _chars, _char
|---|
| 46 | |
|---|
| 47 | # Creating mappings |
|---|
| 48 | |
|---|
def alignment_mapping_from_file( f, char_mapping=DNA ):
    """
    Create a mapping from a file of alignment columns.

    Each non-blank line of `f` must hold exactly two whitespace separated
    fields: an alignment column (a string with one character per aligned
    sequence) followed by the integer symbol that column maps to. Blank
    lines are ignored.

    :param f: iterable of text lines (for example an open file)
    :param char_mapping: char-to-int mapping used to encode the columns;
        it must provide `get_out_size()` and `translate_list()`
    :returns: tuple `( align_count, mapping )` where `align_count` is the
        length of the first column and `mapping` is an `IntToIntMapping`
        created with `char_mapping.get_out_size() ** align_count` inputs
    :raises ValueError: if a non-blank line does not have exactly two
        fields, if a symbol is not an integer, or if the columns do not
        all have the same length
    :raises IndexError: if `f` contains no columns at all
    """
    columns, symbols = [], []
    for line in f:
        fields = line.split()
        # Tolerate blank lines (e.g. an extra newline at the end of the file)
        if not fields:
            continue
        column, symbol = fields
        columns.append( column )
        symbols.append( int( symbol ) )

    align_count = len( columns[0] )

    # The mapping below is sized from the first column only, so a column of
    # any other length cannot be indexed consistently with it.
    for column in columns:
        if len( column ) != align_count:
            raise ValueError( "Column %r has length %d, expected %d"
                              % ( column, len( column ), align_count ) )

    mapping = IntToIntMapping( char_mapping.get_out_size() ** align_count )

    for column, symbol in zip( columns, symbols ):
        # translate_list is given one single character string per sequence;
        # the first element of its result is used as the column's index.
        index = char_mapping.translate_list( list( column ) )[0]
        mapping.set_mapping( index, symbol )

    return align_count, mapping
|---|
| 68 | |
|---|
def second_mapping_from_file( f, first_mapping, char_mapping=DNA ):
    """
    Create a mapping over the outputs of `first_mapping` from a file of
    alignment columns.

    Each non-blank line of `f` must hold exactly two whitespace separated
    fields: an alignment column followed by an integer symbol. A column is
    encoded with `char_mapping`, looked up in `first_mapping`, and the
    resulting value is mapped to the line's symbol. Columns for which
    `first_mapping` yields a negative value are skipped. Blank lines are
    ignored, and an input without any columns yields a mapping on which
    nothing has been set.

    :param f: iterable of text lines (for example an open file)
    :param first_mapping: mapping indexed by encoded column; it must
        provide `get_out_size()` and item lookup
    :param char_mapping: char-to-int mapping used to encode the columns;
        it must provide `translate_list()`
    :returns: an `IntToIntMapping` created with
        `first_mapping.get_out_size()` inputs
    :raises ValueError: if a non-blank line does not have exactly two
        fields or if a symbol is not an integer
    """
    columns, symbols = [], []
    for line in f:
        fields = line.split()
        # Tolerate blank lines (e.g. an extra newline at the end of the file)
        if not fields:
            continue
        column, symbol = fields
        columns.append( column )
        symbols.append( int( symbol ) )

    mapping = IntToIntMapping( first_mapping.get_out_size() )

    for column, symbol in zip( columns, symbols ):
        # translate_list is given one single character string per sequence;
        # the first element of its result is used as the column's index.
        index = char_mapping.translate_list( list( column ) )[0]
        # Look the column up once and reuse the result
        first_symbol = first_mapping[index]
        if first_symbol >= 0:
            mapping.set_mapping( first_symbol, symbol )

    return mapping
|---|
| 87 | |
|---|
| 88 | |
|---|
def identity_mapping( size ):
    """
    Return an `IntToIntMapping` created with `size` inputs in which every
    value in `range( size )` is mapped to itself.
    """
    identity = IntToIntMapping( size )
    for value in range( size ):
        identity.set_mapping( value, value )
    return identity
|---|
| 94 | |
|---|