[3] | 1 | """ |
---|
| 2 | Classes for working with position specific matrices. |
---|
| 3 | """ |
---|
| 4 | |
---|
| 5 | from numpy import * |
---|
| 6 | from copy import copy |
---|
| 7 | |
---|
| 8 | import _pwm |
---|
| 9 | |
---|
| 10 | class BaseMatrix( object ): |
---|
| 11 | """ |
---|
| 12 | Base class for position specific matrices. |
---|
| 13 | """ |
---|
| 14 | def __init__( self, alphabet=None, sorted_alphabet=None, |
---|
| 15 | char_to_index=None, values=None ): |
---|
| 16 | self.alphabet = alphabet |
---|
| 17 | self.sorted_alphabet = sorted_alphabet |
---|
| 18 | self.char_to_index = char_to_index |
---|
| 19 | self.values = values |
---|
| 20 | |
---|
| 21 | @classmethod |
---|
| 22 | def from_rows( Class, alphabet, rows ): |
---|
| 23 | """ |
---|
| 24 | Create a new matrix for a sequence over alphabet `alphabet` taking |
---|
| 25 | values from `rows` which is a list whose length is the width of the |
---|
| 26 | matrix, and whose elements are lists of values associated with each |
---|
| 27 | character (in the order those characters appear in alphabet). |
---|
| 28 | """ |
---|
| 29 | # Sorted alphabet |
---|
| 30 | sorted_alphabet = sorted( alphabet ) |
---|
| 31 | # Character to index mapping (initialized to -1) |
---|
| 32 | char_to_index = zeros( (256), int16 ) - 1 |
---|
| 33 | for i, ch in enumerate( sorted_alphabet ): |
---|
| 34 | char_to_index[ ord(ch) ] = i |
---|
| 35 | # Array |
---|
| 36 | values = zeros( ( len( rows) , len( alphabet ) ), float32 ) |
---|
| 37 | for i, row in enumerate( rows ): |
---|
| 38 | assert len( row ) == len( alphabet ) |
---|
| 39 | for ch, val in zip( alphabet, row ): |
---|
| 40 | values[i, char_to_index[ord(ch)]] = val |
---|
| 41 | # Matrix |
---|
| 42 | matrix = Class() |
---|
| 43 | matrix.alphabet = alphabet |
---|
| 44 | matrix.sorted_alphabet = sorted_alphabet |
---|
| 45 | matrix.char_to_index = char_to_index |
---|
| 46 | matrix.values = values |
---|
| 47 | return matrix |
---|
| 48 | |
---|
| 49 | @classmethod |
---|
| 50 | def create_from_other( Class, other, values=None ): |
---|
| 51 | """ |
---|
| 52 | Create a new Matrix with attributes taken from `other` but with the |
---|
| 53 | values taken from `values` if provided |
---|
| 54 | """ |
---|
| 55 | m = Class() |
---|
| 56 | m.alphabet = other.alphabet |
---|
| 57 | m.sorted_alphabet = other.sorted_alphabet |
---|
| 58 | m.char_to_index = other.char_to_index |
---|
| 59 | if values is not None: |
---|
| 60 | m.values = values |
---|
| 61 | else: |
---|
| 62 | m.values = other.values |
---|
| 63 | return m |
---|
| 64 | |
---|
| 65 | @property |
---|
| 66 | def width( self ): |
---|
| 67 | """ |
---|
| 68 | Return the width (size along the sequence axis) of this matrix. |
---|
| 69 | """ |
---|
| 70 | return self.values.shape[0] |
---|
| 71 | |
---|
| 72 | def reverse_complement( self ): |
---|
| 73 | """ |
---|
| 74 | Create the reverse complement of this matrix. The result probably |
---|
| 75 | only makese sense if the alphabet is that of DNA ('A','C','G','T'). |
---|
| 76 | """ |
---|
| 77 | rval = copy( self ) |
---|
| 78 | # Conveniently enough, reversing rows and columns is exactly what we |
---|
| 79 | # want, since this results in A swapping with T and C swapping with G. |
---|
| 80 | rval.values = self.values[::-1,::-1].copy() |
---|
| 81 | return rval |
---|
| 82 | |
---|
| 83 | class FrequencyMatrix( BaseMatrix ): |
---|
| 84 | """ |
---|
| 85 | A position specific count/frequency matrix. |
---|
| 86 | """ |
---|
| 87 | |
---|
| 88 | DEFAULT_CORRECTION = 0.0000000001 |
---|
| 89 | """ |
---|
| 90 | Default value to use for correcting when dealing with counts of zero, |
---|
| 91 | chosen to produce scoring matrices that are the same as produced by CREAD. |
---|
| 92 | """ |
---|
| 93 | |
---|
| 94 | def to_logodds_scoring_matrix( self, background=None, correction=DEFAULT_CORRECTION ): |
---|
| 95 | """ |
---|
| 96 | Create a standard logodds scoring matrix. |
---|
| 97 | """ |
---|
| 98 | alphabet_size = len( self.alphabet ) |
---|
| 99 | if background is None: |
---|
| 100 | background = ones( alphabet_size, float32 ) / alphabet_size |
---|
| 101 | # Row totals as a one column array |
---|
| 102 | totals = sum( self.values, 1 )[:,newaxis] |
---|
| 103 | values = log2( maximum( self.values, correction ) ) \ |
---|
| 104 | - log2( totals ) \ |
---|
| 105 | - log2( maximum( background, correction ) ) |
---|
| 106 | return ScoringMatrix.create_from_other( self, values.astype( float32 ) ) |
---|
| 107 | |
---|
| 108 | def to_stormo_scoring_matrix( self, background=None ): |
---|
| 109 | """ |
---|
| 110 | Create a scoring matrix from this count matrix using the method from: |
---|
| 111 | |
---|
| 112 | Hertz, G.Z. and G.D. Stormo (1999). Identifying DNA and protein patterns with statistically |
---|
| 113 | significant alignments of multiple sequences. Bioinformatics 15(7): 563-577. |
---|
| 114 | """ |
---|
| 115 | alphabet_size = len( self.alphabet ) |
---|
| 116 | if background is None: |
---|
| 117 | background = ones( alphabet_size, float32 ) / alphabet_size |
---|
| 118 | # Row totals as a one column array |
---|
| 119 | totals = sum( self.values, 1 )[:,newaxis] |
---|
| 120 | values = log2( self.values + background ) \ |
---|
| 121 | - log2( totals + 1 ) - log2( background ) |
---|
| 122 | return ScoringMatrix.create_from_other( self, values.astype( float32 ) ) |
---|
| 123 | |
---|
| 124 | class ScoringMatrix( BaseMatrix ): |
---|
| 125 | """ |
---|
| 126 | A position specific matrix containing values that are suitable for |
---|
| 127 | scoring a sequence. |
---|
| 128 | """ |
---|
| 129 | |
---|
| 130 | def score_string( self, string ): |
---|
| 131 | """ |
---|
| 132 | Score each valid position in `string` using this scoring matrix. |
---|
| 133 | Positions which were not scored are set to nan. |
---|
| 134 | """ |
---|
| 135 | rval = zeros( len( string ), float32 ) |
---|
| 136 | rval[:] = nan |
---|
| 137 | _pwm.score_string( self.values, self.char_to_index, string, rval ) |
---|
| 138 | return rval |
---|
| 139 | |
---|
| 140 | def score_string_with_gaps( self, string ): |
---|
| 141 | """ |
---|
| 142 | Score each valid position in `string` using this scoring matrix. |
---|
| 143 | Positions which were not scored are set to nan. Gap characters are |
---|
| 144 | ignored (matrices score across them). |
---|
| 145 | """ |
---|
| 146 | rval = zeros( len( string ), float32 ) |
---|
| 147 | rval[:] = nan |
---|
| 148 | _pwm.score_string_with_gaps( self.values, self.char_to_index, string, rval ) |
---|
| 149 | return rval |
---|