1 | """ |
---|
2 | Classes for char-to-int mapping and int-to-int mapping. |
---|
3 | |
---|
4 | :Author: James Taylor (james@bx.psu.edu) |
---|
5 | |
---|
6 | The char-to-int mapping can be used to translate a list of strings |
---|
7 | over some alphabet to a single int array (for example, to encode a multiple |
---|
8 | sequence alignment). |
---|
9 | |
---|
10 | The int-to-int mapping is particularly useful for creating partitions, |
---|
11 | and provides methods to merge/split symbols in the output mapping. |
---|
12 | |
---|
13 | The two forms of mapping can be combined, for example to encode a |
---|
14 | multiple sequence alignment in a reduced alphabet defined by a partition |
---|
15 | of alignment columns. Many of the helper methods provided are for |
---|
16 | solving such alignment oriented problems. |
---|
17 | |
---|
18 | This code was originally written for the `ESPERR`_ project which includes |
---|
19 | software for searching for alignment encodings that work well for specific |
---|
20 | classification problems using various Markov chain classifiers over the |
---|
21 | reduced encodings. |
---|
22 | |
---|
23 | Most of the core implementation is in the pyrex/C extension |
---|
24 | "_seqmapping.pyx" for performance reasons (specifically to avoid the |
---|
25 | excessive bounds checking that would make a sequence/array lookup heavy |
---|
26 | problem like this slow in pure python). |
---|
27 | |
---|
28 | .. _ESPERR: http://www.bx.psu.edu/projects/esperr/ |
---|
29 | """ |
---|
30 | |
---|
31 | from _seqmapping import * |
---|
32 | |
---|
# Char->Int mapping for DNA characters with missing data.
# Upper- and lower-case forms of a nucleotide share one code; "-" and "*"
# each get a code of their own.

DNA = CharToIntArrayMapping()
for _code, _chars in enumerate( ( "aA", "cC", "gG", "tT", "-", "*" ) ):
    for _char in _chars:
        DNA.set_mapping( _char, _code )
# Drop the loop temporaries so the module namespace is unchanged.
del _code, _chars, _char
---|
46 | |
---|
47 | # Creating mappings |
---|
48 | |
---|
def alignment_mapping_from_file( f, char_mapping=DNA ):
    """
    Build an int-to-int mapping from a file of alignment columns.

    Each line of `f` must split into exactly two whitespace-separated
    fields: an alignment column and the integer symbol assigned to it.
    The whole input is read before the mapping is created.

    Every column is broken into single characters and passed to
    `char_mapping.translate_list`; the first element of the result is
    used as the index that gets mapped to the line's symbol.

    Returns a tuple `( align_count, mapping )`, where `align_count` is
    the length of the first column read and `mapping` is an
    `IntToIntMapping` of size `char_mapping.get_out_size() ** align_count`.
    """
    entries = []
    for line in f:
        column, symbol = line.split()
        entries.append( ( column, int( symbol ) ) )

    # The first column's width fixes the number of aligned sequences
    # (all columns are presumably the same width -- not checked here).
    first_column = entries[0][0]
    align_count = len( first_column )

    table_size = char_mapping.get_out_size() ** align_count
    mapping = IntToIntMapping( table_size )

    for column, symbol in entries:
        translated = char_mapping.translate_list( list( column ) )
        mapping.set_mapping( translated[0], symbol )

    return align_count, mapping
---|
68 | |
---|
def second_mapping_from_file( f, first_mapping, char_mapping=DNA ):
    """
    Create a mapping over the output symbols of `first_mapping` from a
    file of alignment columns.

    Each line of `f` must split into exactly two whitespace-separated
    fields: an alignment column and an integer symbol. The whole input
    is read before the mapping is created.

    Every column is broken into single characters, translated with
    `char_mapping.translate_list`, and the first element of the result is
    looked up in `first_mapping`. When that lookup is non-negative, the
    looked-up value is mapped to the line's symbol; columns for which
    `first_mapping` yields a negative value are skipped.

    Returns an `IntToIntMapping` of size `first_mapping.get_out_size()`.
    An input with no lines yields a mapping with no entries set.
    """
    entries = []
    for line in f:
        column, symbol = line.split()
        entries.append( ( column, int( symbol ) ) )

    mapping = IntToIntMapping( first_mapping.get_out_size() )

    for column, symbol in entries:
        index = char_mapping.translate_list( list( column ) )[0]
        # Look the column up once and reuse the result. A negative value
        # presumably means "not covered by the first mapping" -- confirm
        # against the _seqmapping implementation.
        first_symbol = first_mapping[index]
        if first_symbol >= 0:
            mapping.set_mapping( first_symbol, symbol )

    return mapping
---|
87 | |
---|
88 | |
---|
def identity_mapping( size ):
    """
    Return an `IntToIntMapping` of the given `size` in which every value
    in `range( size )` is mapped to itself.
    """
    identity = IntToIntMapping( size )
    for value in range( size ):
        identity.set_mapping( value, value )
    return identity
---|
94 | |
---|