1 | """ |
---|
2 | Classes for working with position specific matrices. |
---|
3 | """ |
---|
4 | |
---|
5 | from numpy import * |
---|
6 | from copy import copy |
---|
7 | |
---|
8 | import _pwm |
---|
9 | |
---|
10 | class BaseMatrix( object ): |
---|
11 | """ |
---|
12 | Base class for position specific matrices. |
---|
13 | """ |
---|
14 | def __init__( self, alphabet=None, sorted_alphabet=None, |
---|
15 | char_to_index=None, values=None ): |
---|
16 | self.alphabet = alphabet |
---|
17 | self.sorted_alphabet = sorted_alphabet |
---|
18 | self.char_to_index = char_to_index |
---|
19 | self.values = values |
---|
20 | |
---|
21 | @classmethod |
---|
22 | def from_rows( Class, alphabet, rows ): |
---|
23 | """ |
---|
24 | Create a new matrix for a sequence over alphabet `alphabet` taking |
---|
25 | values from `rows` which is a list whose length is the width of the |
---|
26 | matrix, and whose elements are lists of values associated with each |
---|
27 | character (in the order those characters appear in alphabet). |
---|
28 | """ |
---|
29 | # Sorted alphabet |
---|
30 | sorted_alphabet = sorted( alphabet ) |
---|
31 | # Character to index mapping (initialized to -1) |
---|
32 | char_to_index = zeros( (256), int16 ) - 1 |
---|
33 | for i, ch in enumerate( sorted_alphabet ): |
---|
34 | char_to_index[ ord(ch) ] = i |
---|
35 | # Array |
---|
36 | values = zeros( ( len( rows) , len( alphabet ) ), float32 ) |
---|
37 | for i, row in enumerate( rows ): |
---|
38 | assert len( row ) == len( alphabet ) |
---|
39 | for ch, val in zip( alphabet, row ): |
---|
40 | values[i, char_to_index[ord(ch)]] = val |
---|
41 | # Matrix |
---|
42 | matrix = Class() |
---|
43 | matrix.alphabet = alphabet |
---|
44 | matrix.sorted_alphabet = sorted_alphabet |
---|
45 | matrix.char_to_index = char_to_index |
---|
46 | matrix.values = values |
---|
47 | return matrix |
---|
48 | |
---|
49 | @classmethod |
---|
50 | def create_from_other( Class, other, values=None ): |
---|
51 | """ |
---|
52 | Create a new Matrix with attributes taken from `other` but with the |
---|
53 | values taken from `values` if provided |
---|
54 | """ |
---|
55 | m = Class() |
---|
56 | m.alphabet = other.alphabet |
---|
57 | m.sorted_alphabet = other.sorted_alphabet |
---|
58 | m.char_to_index = other.char_to_index |
---|
59 | if values is not None: |
---|
60 | m.values = values |
---|
61 | else: |
---|
62 | m.values = other.values |
---|
63 | return m |
---|
64 | |
---|
65 | @property |
---|
66 | def width( self ): |
---|
67 | """ |
---|
68 | Return the width (size along the sequence axis) of this matrix. |
---|
69 | """ |
---|
70 | return self.values.shape[0] |
---|
71 | |
---|
72 | def reverse_complement( self ): |
---|
73 | """ |
---|
74 | Create the reverse complement of this matrix. The result probably |
---|
75 | only makese sense if the alphabet is that of DNA ('A','C','G','T'). |
---|
76 | """ |
---|
77 | rval = copy( self ) |
---|
78 | # Conveniently enough, reversing rows and columns is exactly what we |
---|
79 | # want, since this results in A swapping with T and C swapping with G. |
---|
80 | rval.values = self.values[::-1,::-1].copy() |
---|
81 | return rval |
---|
82 | |
---|
83 | class FrequencyMatrix( BaseMatrix ): |
---|
84 | """ |
---|
85 | A position specific count/frequency matrix. |
---|
86 | """ |
---|
87 | |
---|
88 | DEFAULT_CORRECTION = 0.0000000001 |
---|
89 | """ |
---|
90 | Default value to use for correcting when dealing with counts of zero, |
---|
91 | chosen to produce scoring matrices that are the same as produced by CREAD. |
---|
92 | """ |
---|
93 | |
---|
94 | def to_logodds_scoring_matrix( self, background=None, correction=DEFAULT_CORRECTION ): |
---|
95 | """ |
---|
96 | Create a standard logodds scoring matrix. |
---|
97 | """ |
---|
98 | alphabet_size = len( self.alphabet ) |
---|
99 | if background is None: |
---|
100 | background = ones( alphabet_size, float32 ) / alphabet_size |
---|
101 | # Row totals as a one column array |
---|
102 | totals = sum( self.values, 1 )[:,newaxis] |
---|
103 | values = log2( maximum( self.values, correction ) ) \ |
---|
104 | - log2( totals ) \ |
---|
105 | - log2( maximum( background, correction ) ) |
---|
106 | return ScoringMatrix.create_from_other( self, values.astype( float32 ) ) |
---|
107 | |
---|
108 | def to_stormo_scoring_matrix( self, background=None ): |
---|
109 | """ |
---|
110 | Create a scoring matrix from this count matrix using the method from: |
---|
111 | |
---|
112 | Hertz, G.Z. and G.D. Stormo (1999). Identifying DNA and protein patterns with statistically |
---|
113 | significant alignments of multiple sequences. Bioinformatics 15(7): 563-577. |
---|
114 | """ |
---|
115 | alphabet_size = len( self.alphabet ) |
---|
116 | if background is None: |
---|
117 | background = ones( alphabet_size, float32 ) / alphabet_size |
---|
118 | # Row totals as a one column array |
---|
119 | totals = sum( self.values, 1 )[:,newaxis] |
---|
120 | values = log2( self.values + background ) \ |
---|
121 | - log2( totals + 1 ) - log2( background ) |
---|
122 | return ScoringMatrix.create_from_other( self, values.astype( float32 ) ) |
---|
123 | |
---|
124 | class ScoringMatrix( BaseMatrix ): |
---|
125 | """ |
---|
126 | A position specific matrix containing values that are suitable for |
---|
127 | scoring a sequence. |
---|
128 | """ |
---|
129 | |
---|
130 | def score_string( self, string ): |
---|
131 | """ |
---|
132 | Score each valid position in `string` using this scoring matrix. |
---|
133 | Positions which were not scored are set to nan. |
---|
134 | """ |
---|
135 | rval = zeros( len( string ), float32 ) |
---|
136 | rval[:] = nan |
---|
137 | _pwm.score_string( self.values, self.char_to_index, string, rval ) |
---|
138 | return rval |
---|
139 | |
---|
140 | def score_string_with_gaps( self, string ): |
---|
141 | """ |
---|
142 | Score each valid position in `string` using this scoring matrix. |
---|
143 | Positions which were not scored are set to nan. Gap characters are |
---|
144 | ignored (matrices score across them). |
---|
145 | """ |
---|
146 | rval = zeros( len( string ), float32 ) |
---|
147 | rval[:] = nan |
---|
148 | _pwm.score_string_with_gaps( self.values, self.char_to_index, string, rval ) |
---|
149 | return rval |
---|