| 1 | """ |
|---|
| 2 | Classes for working with position specific matrices. |
|---|
| 3 | """ |
|---|
| 4 | |
|---|
| 5 | from numpy import * |
|---|
| 6 | from copy import copy |
|---|
| 7 | |
|---|
| 8 | import _pwm |
|---|
| 9 | |
|---|
| 10 | class BaseMatrix( object ): |
|---|
| 11 | """ |
|---|
| 12 | Base class for position specific matrices. |
|---|
| 13 | """ |
|---|
| 14 | def __init__( self, alphabet=None, sorted_alphabet=None, |
|---|
| 15 | char_to_index=None, values=None ): |
|---|
| 16 | self.alphabet = alphabet |
|---|
| 17 | self.sorted_alphabet = sorted_alphabet |
|---|
| 18 | self.char_to_index = char_to_index |
|---|
| 19 | self.values = values |
|---|
| 20 | |
|---|
| 21 | @classmethod |
|---|
| 22 | def from_rows( Class, alphabet, rows ): |
|---|
| 23 | """ |
|---|
| 24 | Create a new matrix for a sequence over alphabet `alphabet` taking |
|---|
| 25 | values from `rows` which is a list whose length is the width of the |
|---|
| 26 | matrix, and whose elements are lists of values associated with each |
|---|
| 27 | character (in the order those characters appear in alphabet). |
|---|
| 28 | """ |
|---|
| 29 | # Sorted alphabet |
|---|
| 30 | sorted_alphabet = sorted( alphabet ) |
|---|
| 31 | # Character to index mapping (initialized to -1) |
|---|
| 32 | char_to_index = zeros( (256), int16 ) - 1 |
|---|
| 33 | for i, ch in enumerate( sorted_alphabet ): |
|---|
| 34 | char_to_index[ ord(ch) ] = i |
|---|
| 35 | # Array |
|---|
| 36 | values = zeros( ( len( rows) , len( alphabet ) ), float32 ) |
|---|
| 37 | for i, row in enumerate( rows ): |
|---|
| 38 | assert len( row ) == len( alphabet ) |
|---|
| 39 | for ch, val in zip( alphabet, row ): |
|---|
| 40 | values[i, char_to_index[ord(ch)]] = val |
|---|
| 41 | # Matrix |
|---|
| 42 | matrix = Class() |
|---|
| 43 | matrix.alphabet = alphabet |
|---|
| 44 | matrix.sorted_alphabet = sorted_alphabet |
|---|
| 45 | matrix.char_to_index = char_to_index |
|---|
| 46 | matrix.values = values |
|---|
| 47 | return matrix |
|---|
| 48 | |
|---|
| 49 | @classmethod |
|---|
| 50 | def create_from_other( Class, other, values=None ): |
|---|
| 51 | """ |
|---|
| 52 | Create a new Matrix with attributes taken from `other` but with the |
|---|
| 53 | values taken from `values` if provided |
|---|
| 54 | """ |
|---|
| 55 | m = Class() |
|---|
| 56 | m.alphabet = other.alphabet |
|---|
| 57 | m.sorted_alphabet = other.sorted_alphabet |
|---|
| 58 | m.char_to_index = other.char_to_index |
|---|
| 59 | if values is not None: |
|---|
| 60 | m.values = values |
|---|
| 61 | else: |
|---|
| 62 | m.values = other.values |
|---|
| 63 | return m |
|---|
| 64 | |
|---|
| 65 | @property |
|---|
| 66 | def width( self ): |
|---|
| 67 | """ |
|---|
| 68 | Return the width (size along the sequence axis) of this matrix. |
|---|
| 69 | """ |
|---|
| 70 | return self.values.shape[0] |
|---|
| 71 | |
|---|
| 72 | def reverse_complement( self ): |
|---|
| 73 | """ |
|---|
| 74 | Create the reverse complement of this matrix. The result probably |
|---|
| 75 | only makese sense if the alphabet is that of DNA ('A','C','G','T'). |
|---|
| 76 | """ |
|---|
| 77 | rval = copy( self ) |
|---|
| 78 | # Conveniently enough, reversing rows and columns is exactly what we |
|---|
| 79 | # want, since this results in A swapping with T and C swapping with G. |
|---|
| 80 | rval.values = self.values[::-1,::-1].copy() |
|---|
| 81 | return rval |
|---|
| 82 | |
|---|
| 83 | class FrequencyMatrix( BaseMatrix ): |
|---|
| 84 | """ |
|---|
| 85 | A position specific count/frequency matrix. |
|---|
| 86 | """ |
|---|
| 87 | |
|---|
| 88 | DEFAULT_CORRECTION = 0.0000000001 |
|---|
| 89 | """ |
|---|
| 90 | Default value to use for correcting when dealing with counts of zero, |
|---|
| 91 | chosen to produce scoring matrices that are the same as produced by CREAD. |
|---|
| 92 | """ |
|---|
| 93 | |
|---|
| 94 | def to_logodds_scoring_matrix( self, background=None, correction=DEFAULT_CORRECTION ): |
|---|
| 95 | """ |
|---|
| 96 | Create a standard logodds scoring matrix. |
|---|
| 97 | """ |
|---|
| 98 | alphabet_size = len( self.alphabet ) |
|---|
| 99 | if background is None: |
|---|
| 100 | background = ones( alphabet_size, float32 ) / alphabet_size |
|---|
| 101 | # Row totals as a one column array |
|---|
| 102 | totals = sum( self.values, 1 )[:,newaxis] |
|---|
| 103 | values = log2( maximum( self.values, correction ) ) \ |
|---|
| 104 | - log2( totals ) \ |
|---|
| 105 | - log2( maximum( background, correction ) ) |
|---|
| 106 | return ScoringMatrix.create_from_other( self, values.astype( float32 ) ) |
|---|
| 107 | |
|---|
| 108 | def to_stormo_scoring_matrix( self, background=None ): |
|---|
| 109 | """ |
|---|
| 110 | Create a scoring matrix from this count matrix using the method from: |
|---|
| 111 | |
|---|
| 112 | Hertz, G.Z. and G.D. Stormo (1999). Identifying DNA and protein patterns with statistically |
|---|
| 113 | significant alignments of multiple sequences. Bioinformatics 15(7): 563-577. |
|---|
| 114 | """ |
|---|
| 115 | alphabet_size = len( self.alphabet ) |
|---|
| 116 | if background is None: |
|---|
| 117 | background = ones( alphabet_size, float32 ) / alphabet_size |
|---|
| 118 | # Row totals as a one column array |
|---|
| 119 | totals = sum( self.values, 1 )[:,newaxis] |
|---|
| 120 | values = log2( self.values + background ) \ |
|---|
| 121 | - log2( totals + 1 ) - log2( background ) |
|---|
| 122 | return ScoringMatrix.create_from_other( self, values.astype( float32 ) ) |
|---|
| 123 | |
|---|
| 124 | class ScoringMatrix( BaseMatrix ): |
|---|
| 125 | """ |
|---|
| 126 | A position specific matrix containing values that are suitable for |
|---|
| 127 | scoring a sequence. |
|---|
| 128 | """ |
|---|
| 129 | |
|---|
| 130 | def score_string( self, string ): |
|---|
| 131 | """ |
|---|
| 132 | Score each valid position in `string` using this scoring matrix. |
|---|
| 133 | Positions which were not scored are set to nan. |
|---|
| 134 | """ |
|---|
| 135 | rval = zeros( len( string ), float32 ) |
|---|
| 136 | rval[:] = nan |
|---|
| 137 | _pwm.score_string( self.values, self.char_to_index, string, rval ) |
|---|
| 138 | return rval |
|---|
| 139 | |
|---|
| 140 | def score_string_with_gaps( self, string ): |
|---|
| 141 | """ |
|---|
| 142 | Score each valid position in `string` using this scoring matrix. |
|---|
| 143 | Positions which were not scored are set to nan. Gap characters are |
|---|
| 144 | ignored (matrices score across them). |
|---|
| 145 | """ |
|---|
| 146 | rval = zeros( len( string ), float32 ) |
|---|
| 147 | rval[:] = nan |
|---|
| 148 | _pwm.score_string_with_gaps( self.values, self.char_to_index, string, rval ) |
|---|
| 149 | return rval |
|---|