| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | import sys |
|---|
| 4 | import math |
|---|
| 5 | import string |
|---|
| 6 | from numpy import * |
|---|
| 7 | from sets import * |
|---|
| 8 | |
|---|
| 9 | # This is the average of all species in the alignment outside of exons |
|---|
| 10 | # > mean(r) |
|---|
| 11 | # A T C G |
|---|
| 12 | # 0.2863776 0.2878264 0.2129560 0.2128400 |
|---|
| 13 | # > sd(r) |
|---|
| 14 | # A T C G |
|---|
| 15 | # 0.01316192 0.01371148 0.01293836 0.01386655 |
|---|
| 16 | |
|---|
| 17 | ENCODE_NONCODING_BACKGROUND = { 'A':0.2863776,'T':0.2878264,'G':0.2128400,'C':0.2129560} |
|---|
| 18 | |
|---|
| 19 | class Align(object): |
|---|
| 20 | def __init__ (self, seqrows, headers=None): |
|---|
| 21 | self.rows = seqrows |
|---|
| 22 | self.nrows = len(seqrows) |
|---|
| 23 | ncol = None |
|---|
| 24 | for rownum,row in enumerate(self.rows): |
|---|
| 25 | try: |
|---|
| 26 | if ncol == None: ncol = len(row) |
|---|
| 27 | elif ncol != len(row): |
|---|
| 28 | raise "Align: __init__:alignment block:row %d does not have %d columns, it has %d" % (rownum,ncol,len(row)) |
|---|
| 29 | except: |
|---|
| 30 | print row |
|---|
| 31 | raise '' |
|---|
| 32 | self.ncols = ncol |
|---|
| 33 | self.dims = (self.nrows,self.ncols) |
|---|
| 34 | self.headers = headers |
|---|
| 35 | def __str__ (self): |
|---|
| 36 | return "\n".join(self.rows) |
|---|
| 37 | |
|---|
| 38 | class AlignScoreMatrix (object): |
|---|
| 39 | def __init__(self,align): |
|---|
| 40 | nan = float('nan') |
|---|
| 41 | |
|---|
| 42 | matrix = zeros((align.nrows,align.ncols),float32) |
|---|
| 43 | |
|---|
| 44 | # set to nans |
|---|
| 45 | for ir in range( len(matrix) ): |
|---|
| 46 | for ic in range(len( matrix[ir] )): |
|---|
| 47 | matrix[ir][ic] = nan |
|---|
| 48 | self.matrix = matrix |
|---|
| 49 | |
|---|
| 50 | def __len__(self): |
|---|
| 51 | return shape(self.matrix)[1] |
|---|
| 52 | |
|---|
| 53 | def __str__(self): |
|---|
| 54 | print self.matrix |
|---|
| 55 | |
|---|
| 56 | def score_align_motif (align,motif,gapmask=None,byPosition=True): |
|---|
| 57 | |
|---|
| 58 | #dbg |
|---|
| 59 | #print >>sys.stderr, align.headers |
|---|
| 60 | chr,chr_start,chr_stop = align.headers[0] |
|---|
| 61 | |
|---|
| 62 | # a blank score matrix |
|---|
| 63 | nrows,ncols = align.dims |
|---|
| 64 | ascoremax = AlignScoreMatrix( align ) |
|---|
| 65 | scoremax = ascoremax.matrix |
|---|
| 66 | |
|---|
| 67 | minSeqLen = len( motif ) |
|---|
| 68 | for ir in range(nrows): |
|---|
| 69 | pass |
|---|
| 70 | |
|---|
| 71 | # row is missing data |
|---|
| 72 | if isnan(align.rows[ir][0]): continue |
|---|
| 73 | |
|---|
| 74 | for start in range(ncols): |
|---|
| 75 | |
|---|
| 76 | if align.rows[ir][start] == '-': continue |
|---|
| 77 | elif align.rows[ir][start] == 'n': continue |
|---|
| 78 | elif align.rows[ir][start] == 'N': continue |
|---|
| 79 | |
|---|
| 80 | # get enough sequence for the weight matrix |
|---|
| 81 | subseq = "" |
|---|
| 82 | end = 0 |
|---|
| 83 | ic = start |
|---|
| 84 | while len(subseq) < minSeqLen: |
|---|
| 85 | #for ic in range(start,ncols): |
|---|
| 86 | |
|---|
| 87 | if ic >= len(align.rows[ir]): break |
|---|
| 88 | char = align.rows[ir][ic].upper() |
|---|
| 89 | ic += 1 |
|---|
| 90 | if char == '-' or char == 'N': continue |
|---|
| 91 | else: subseq += char |
|---|
| 92 | |
|---|
| 93 | if len(subseq) == minSeqLen: |
|---|
| 94 | end = ic+1 |
|---|
| 95 | for_score = int( match_consensus(subseq,motif) ) |
|---|
| 96 | revseq = reverse_complement( subseq ) |
|---|
| 97 | rev_score = int( match_consensus(revseq,motif) ) |
|---|
| 98 | |
|---|
| 99 | score = max(for_score, rev_score) |
|---|
| 100 | #dbg |
|---|
| 101 | #if ir == 0: print >>sys.stderr, int(chr_start) + start - align.rows[ir].count('-',0,start), subseq, score |
|---|
| 102 | |
|---|
| 103 | # replace the alignment positions with the result |
|---|
| 104 | if byPosition: |
|---|
| 105 | scoremax[ir][start] = score |
|---|
| 106 | else: |
|---|
| 107 | # replace positions matching the width of the pwm |
|---|
| 108 | for i in range(start,end): |
|---|
| 109 | if isnan(scoremax[ir][i]): scoremax[ir][i] = score |
|---|
| 110 | elif score > scoremax[ir][i]: scoremax[ir][i] = score |
|---|
| 111 | #break |
|---|
| 112 | # mask gap characters |
|---|
| 113 | if gapmask == None: |
|---|
| 114 | gapmask = score_align_gaps(align) |
|---|
| 115 | putmask( scoremax, gapmask, float('nan') ) |
|---|
| 116 | return scoremax |
|---|
| 117 | |
|---|
| 118 | #----------- |
|---|
| 119 | # |
|---|
| 120 | # WeightMatrix-- |
|---|
| 121 | # A position weight matrix (PWM) representation of a motif. |
|---|
| 122 | # |
|---|
| 123 | #---------- |
|---|
| 124 | # construction arguments: |
|---|
| 125 | # id: id (name) of the motif |
|---|
| 126 | # rows: the matrix; each row is a hash from symbol to weight, with |
|---|
| 127 | # .. the weight in string form |
|---|
| 128 | # alphabet: symbols allowed |
|---|
| 129 | # background: hash from symbol to background probability of that symbol; if |
|---|
| 130 | # .. not specified, ENCODE_NONCODING_BACKGROUND is used |
|---|
| 131 | # internal fields: |
|---|
| 132 | # rows: the matrix; each row is a hash from symbol to log-odds score |
|---|
| 133 | # .. of that symbol for that row of the weight matrix |
|---|
| 134 | # counts: the matrix; count[row][sym] is the weight, as an integer |
|---|
| 135 | # probs: the matrix; probs[row][sym] is the weight, as an probability |
|---|
| 136 | #---------- |
|---|
| 137 | |
|---|
| 138 | class PositionWeightMatrix (object): |
|---|
| 139 | |
|---|
| 140 | complementMap = string.maketrans("ACGTacgt","TGCAtgca") |
|---|
| 141 | |
|---|
| 142 | # IUPAC-IUB |
|---|
| 143 | symbols = { |
|---|
| 144 | 'A':Set(['A']), |
|---|
| 145 | 'C':Set(['C']), |
|---|
| 146 | 'G':Set(['G']), |
|---|
| 147 | 'T':Set(['T']), |
|---|
| 148 | 'R':Set(['A','G']), |
|---|
| 149 | 'Y':Set(['C','T']), |
|---|
| 150 | 'M':Set(['A','C']), |
|---|
| 151 | 'K':Set(['G','T']), |
|---|
| 152 | 'S':Set(['G','C']), |
|---|
| 153 | 'W':Set(['A','T']), |
|---|
| 154 | 'H':Set(['A','C','T']), |
|---|
| 155 | 'B':Set(['G','T','C']), |
|---|
| 156 | 'V':Set(['G','C','A']), |
|---|
| 157 | 'D':Set(['G','T','A'])} |
|---|
| 158 | |
|---|
| 159 | def __init__ (self, id, rows, alphabet, background=None, score_correction=True): |
|---|
| 160 | |
|---|
| 161 | self.id = id |
|---|
| 162 | self.alphabet = alphabet |
|---|
| 163 | nsymbols = len(self.alphabet) |
|---|
| 164 | for i in range(len(self.alphabet)): |
|---|
| 165 | self.alphabet[ i ] = self.alphabet[ i ].upper() |
|---|
| 166 | if background != None: |
|---|
| 167 | self.background = background |
|---|
| 168 | else: |
|---|
| 169 | self.background = {} |
|---|
| 170 | sorted_alphabet = [] |
|---|
| 171 | sorted_alphabet[:] = self.alphabet[:] |
|---|
| 172 | sorted_alphabet.sort() |
|---|
| 173 | if ['A','C','G','T'] == sorted_alphabet: |
|---|
| 174 | self.background = ENCODE_NONCODING_BACKGROUND |
|---|
| 175 | else: |
|---|
| 176 | for x in self.alphabet: self.background[ x ] = float(1)/len(self.alphabet) |
|---|
| 177 | |
|---|
| 178 | if (score_correction == True): |
|---|
| 179 | self.score_correction = self.corrected_probability_score |
|---|
| 180 | else: |
|---|
| 181 | self.score_correction = self.simple_probability |
|---|
| 182 | |
|---|
| 183 | # partition counts from consensus symbol |
|---|
| 184 | # in order to properly handle scaling in the presense of non-integers, |
|---|
| 185 | # we prescan the matrix to figure out the largest scale factor, then go |
|---|
| 186 | # back through and scale 'em all (some rows may be integer counts, |
|---|
| 187 | # others may be probabilities) |
|---|
| 188 | |
|---|
| 189 | self.consensus = [] |
|---|
| 190 | scale = 1 |
|---|
| 191 | |
|---|
| 192 | for i in range(len(rows)): |
|---|
| 193 | |
|---|
| 194 | #try: |
|---|
| 195 | fields,consensus = rows[i][:nsymbols],rows[i][-1] |
|---|
| 196 | for x,count in enumerate(fields): |
|---|
| 197 | try: |
|---|
| 198 | (w,s) = self.parse_weight(count) |
|---|
| 199 | except ValueError: |
|---|
| 200 | raise "pwm row %s has bad weight %s" % (" ".join(fields),t) |
|---|
| 201 | |
|---|
| 202 | # replace row counts with (values,scale) |
|---|
| 203 | rows[i][x] = (w,s) |
|---|
| 204 | scale = max(s,scale) |
|---|
| 205 | |
|---|
| 206 | #except: |
|---|
| 207 | #print >>sys.stderr,rows |
|---|
| 208 | #raise ValueError |
|---|
| 209 | #raise ValueError, "pwm row %s has wrong field count" % " ".join(fields) |
|---|
| 210 | |
|---|
| 211 | self.consensus.append(consensus) |
|---|
| 212 | |
|---|
| 213 | |
|---|
| 214 | hashRows = [] |
|---|
| 215 | self.matrix_base_counts = {} # for pseudocounts |
|---|
| 216 | self.counts = [] # for scaled counts |
|---|
| 217 | self.probs = [] # for probabilities |
|---|
| 218 | |
|---|
| 219 | # scale counts to integers |
|---|
| 220 | for i in range(len(rows)): |
|---|
| 221 | hashRows.append(dict()) |
|---|
| 222 | for x,sym in enumerate(alphabet): |
|---|
| 223 | (w,s) = rows[i][x] |
|---|
| 224 | hashRows[i][sym] = w * scale/s |
|---|
| 225 | assert hashRows[i][sym] >= 0 |
|---|
| 226 | if sym not in self.matrix_base_counts: self.matrix_base_counts[sym] = 0 |
|---|
| 227 | self.matrix_base_counts[sym] += hashRows[i][sym] |
|---|
| 228 | self.counts.append( hashRows[i].copy() ) |
|---|
| 229 | self.probs.append( hashRows[i].copy() ) |
|---|
| 230 | totalWeight = float(sum(self.probs[i].values())) |
|---|
| 231 | for sym in self.probs[i]: |
|---|
| 232 | self.probs[i][sym] /= totalWeight |
|---|
| 233 | self.sites = sum ( hashRows[0].values() ) |
|---|
| 234 | |
|---|
| 235 | # scan pwm to pre-compute logs of probabilities and min and max log-odds |
|---|
| 236 | # scores (over the whole PWM) for scaling; note that the same min and max |
|---|
| 237 | # applies for scaling long-odds scores for quantum comparisions |
|---|
| 238 | self.information_content = [] |
|---|
| 239 | minSum = 0 |
|---|
| 240 | maxSum = 0 |
|---|
| 241 | |
|---|
| 242 | for i in range( len( hashRows )): |
|---|
| 243 | self.information_content.append( self.information_content_calculation( i, hashRows ) ) |
|---|
| 244 | newHashRow = {} |
|---|
| 245 | for base in self.alphabet: |
|---|
| 246 | newHashRow[base] = self.pwm_score(base, i, hashRows) |
|---|
| 247 | hashRows[i] = newHashRow |
|---|
| 248 | |
|---|
| 249 | minSum += min(hashRows[i].values()) |
|---|
| 250 | maxSum += max(hashRows[i].values()) |
|---|
| 251 | |
|---|
| 252 | self.minSum = minSum |
|---|
| 253 | self.maxSum = maxSum |
|---|
| 254 | self.rows = hashRows |
|---|
| 255 | |
|---|
| 256 | # Reference 1: Wasserman and Sandelin: Nat Rev Genet. 2004 Apr;5(4):276-87. |
|---|
| 257 | # Reference 2: Gertz et al.: Genome Res. 2005 Aug;15(8):1145-52. |
|---|
| 258 | def information_content_calculation(self, i, counts): |
|---|
| 259 | # Reference 1) |
|---|
| 260 | return 2 + sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] ) |
|---|
| 261 | |
|---|
| 262 | # Reference 2) |
|---|
| 263 | #return sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] ) |
|---|
| 264 | |
|---|
| 265 | def information_base_content(self, base, i, counts): |
|---|
| 266 | |
|---|
| 267 | # Reference 1) |
|---|
| 268 | #return self.score_correction(counts,base,i) * math.log ( self.score_correction(counts,base,i), 2) |
|---|
| 269 | |
|---|
| 270 | # Reference 2) |
|---|
| 271 | return self.score_correction(counts,base,i) * self.pwm_score(base, i, counts) |
|---|
| 272 | |
|---|
| 273 | def __call__ (self,seq): |
|---|
| 274 | return self.score_seq(seq) |
|---|
| 275 | |
|---|
| 276 | def __add__ (self,other): |
|---|
| 277 | |
|---|
| 278 | assert self.alphabet == other.alphabet |
|---|
| 279 | r,(p,q) = self.max_correlation(other) |
|---|
| 280 | |
|---|
| 281 | if p == q == 0: width = max( len(self),len(other) ) |
|---|
| 282 | elif p > 0: width = max( len(other)+p, len(self) ) |
|---|
| 283 | elif q > 0: width = max( len(self)+q, len(other) ) |
|---|
| 284 | |
|---|
| 285 | sumx = zeros( (width,len(self.alphabet)),dtype='int') |
|---|
| 286 | selfx = self.to_count_matrix() |
|---|
| 287 | otherx = other.to_count_matrix() |
|---|
| 288 | |
|---|
| 289 | if p == q == 0: |
|---|
| 290 | sumx[:len(self)] += selfx |
|---|
| 291 | sumx[:len(other)] += otherx |
|---|
| 292 | elif p > 0: |
|---|
| 293 | sumx[p:p+len(other)] += otherx |
|---|
| 294 | sumx[:len(self)] += selfx |
|---|
| 295 | else: |
|---|
| 296 | sumx[:len(other)] += otherx |
|---|
| 297 | sumx[q:q+len(self)] += selfx |
|---|
| 298 | |
|---|
| 299 | newRows = [] |
|---|
| 300 | for i,x in enumerate(sumx): |
|---|
| 301 | y = list(x) |
|---|
| 302 | y.append( consensus_symbol(y) ) |
|---|
| 303 | y = [ str(yi) for yi in y] |
|---|
| 304 | newRows.append( y ) |
|---|
| 305 | return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background) |
|---|
| 306 | |
|---|
| 307 | def __old_add__ (self,other,maxp=None): |
|---|
| 308 | |
|---|
| 309 | assert self.alphabet == other.alphabet |
|---|
| 310 | bigN = max(len(self),len(other)) |
|---|
| 311 | smallN = min(len(self),len(other)) |
|---|
| 312 | if not maxp: |
|---|
| 313 | prsq = self.correlation(other) |
|---|
| 314 | maxp = prsq.index( max(prsq) ) |
|---|
| 315 | |
|---|
| 316 | leftpad = ' ' * maxp |
|---|
| 317 | rightsize = bigN - smallN |
|---|
| 318 | rightpad = ' ' * rightsize |
|---|
| 319 | leftStrings = [] |
|---|
| 320 | rightStrings = [] |
|---|
| 321 | |
|---|
| 322 | if len(self) > len(other): |
|---|
| 323 | larger = self |
|---|
| 324 | smaller = other |
|---|
| 325 | leftStrings = self.consensus |
|---|
| 326 | rightStrings = list(leftpad) + other.consensus + list(rightpad) |
|---|
| 327 | else: |
|---|
| 328 | smaller = self |
|---|
| 329 | larger = other |
|---|
| 330 | leftStrings = list(leftpad) + self.consensus + list(rightpad) |
|---|
| 331 | rightStrings = other.consensus |
|---|
| 332 | |
|---|
| 333 | sumx = zeros([bigN,len(self.alphabet)]) |
|---|
| 334 | sumx += larger.to_count_matrix() |
|---|
| 335 | sumx[maxp:maxp+smallN] += smaller.to_count_matrix() |
|---|
| 336 | |
|---|
| 337 | newRows = [] |
|---|
| 338 | for i,x in enumerate(sumx): |
|---|
| 339 | y = list(x) |
|---|
| 340 | y.append( leftStrings[i] + rightStrings[i] ) |
|---|
| 341 | y = [ str(yi) for yi in y] |
|---|
| 342 | newRows.append( y ) |
|---|
| 343 | |
|---|
| 344 | #return PositionWeightMatrix(self.id+other.id,newRows[maxp:maxp+smallN],self.alphabet,self.background) |
|---|
| 345 | return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background) |
|---|
| 346 | |
|---|
| 347 | def to_matrix(self): |
|---|
| 348 | m = zeros([len(self),len(self.alphabet)]) |
|---|
| 349 | for i in range(len(self)): |
|---|
| 350 | for j,a in enumerate(self.alphabet): |
|---|
| 351 | m[i][j] = self[i][a] |
|---|
| 352 | return m |
|---|
| 353 | |
|---|
| 354 | def to_count_matrix(self): |
|---|
| 355 | m = zeros([len(self),len(self.alphabet)],dtype='int') |
|---|
| 356 | for i in range(len(self)): |
|---|
| 357 | for j,a in enumerate(self.alphabet): |
|---|
| 358 | m[i][j] = self.counts[i][a] |
|---|
| 359 | return m |
|---|
| 360 | |
|---|
| 361 | def max_correlation(self, otherwmx): |
|---|
| 362 | rsq,ixtuple = self.slide_correlation(otherwmx) |
|---|
| 363 | max_rsq = max(rsq) |
|---|
| 364 | maxp,maxq = ixtuple[rsq.index(max_rsq)] |
|---|
| 365 | return max_rsq,(maxp,maxq) |
|---|
| 366 | |
|---|
| 367 | def slide_correlation(self, other): |
|---|
| 368 | assert self.alphabet == other.alphabet |
|---|
| 369 | selfx = self.to_count_matrix() |
|---|
| 370 | otherx = other.to_count_matrix() |
|---|
| 371 | rsq = [] |
|---|
| 372 | ixtuple = [] |
|---|
| 373 | # self staggered over other, scan self backwards until flush |
|---|
| 374 | for q in range(len(other)-1,-1,-1): |
|---|
| 375 | r = 0 |
|---|
| 376 | n = 0 |
|---|
| 377 | for p in range(len(self)): |
|---|
| 378 | if q+p < len(other): |
|---|
| 379 | r += rsquared( list(selfx[p]), list(otherx[q+p]) ) |
|---|
| 380 | n += 1 |
|---|
| 381 | else: |
|---|
| 382 | n += 1 |
|---|
| 383 | rsq.append( r/n ) |
|---|
| 384 | ixtuple.append( (0,q) ) |
|---|
| 385 | # other staggered below self , scan other forward |
|---|
| 386 | for p in range(1,len(self)): |
|---|
| 387 | r = 0 |
|---|
| 388 | n = 0 |
|---|
| 389 | for q in range(len(other)): |
|---|
| 390 | if p+q < len(self): |
|---|
| 391 | r += rsquared( list(selfx[p+q]), list(otherx[q]) ) |
|---|
| 392 | n += 1 |
|---|
| 393 | else: |
|---|
| 394 | n += 1 |
|---|
| 395 | rsq.append( r/n ) |
|---|
| 396 | ixtuple.append( (p,0) ) |
|---|
| 397 | return rsq,ixtuple |
|---|
| 398 | |
|---|
| 399 | def correlation(self, otherwmx): |
|---|
| 400 | assert self.alphabet == otherwmx.alphabet |
|---|
| 401 | width = len(self.alphabet) |
|---|
| 402 | if len(self) > len(otherwmx): |
|---|
| 403 | larger = self.to_count_matrix() |
|---|
| 404 | smaller = otherwmx.to_count_matrix() |
|---|
| 405 | else: |
|---|
| 406 | smaller = self.to_count_matrix() |
|---|
| 407 | larger = otherwmx.to_count_matrix() |
|---|
| 408 | bigN = len(larger) |
|---|
| 409 | smallN = len(smaller) |
|---|
| 410 | position_rsq = [] |
|---|
| 411 | |
|---|
| 412 | # slide small over large, for ave rsq |
|---|
| 413 | for p in range(bigN): |
|---|
| 414 | if p+smallN <= bigN: |
|---|
| 415 | r = 0 |
|---|
| 416 | for q in range(smallN): |
|---|
| 417 | r += rsquared(list(smaller[q]),list(larger[p+q])) |
|---|
| 418 | position_rsq.append( r / smallN ) |
|---|
| 419 | return position_rsq |
|---|
| 420 | |
|---|
| 421 | def score_align (self,align,gapmask=None,byPosition=True): |
|---|
| 422 | |
|---|
| 423 | # a blank score matrix |
|---|
| 424 | nrows,ncols = align.dims |
|---|
| 425 | ascoremax = AlignScoreMatrix( align ) |
|---|
| 426 | scoremax = ascoremax.matrix |
|---|
| 427 | |
|---|
| 428 | minSeqLen = len( self ) |
|---|
| 429 | for ir in range(nrows): |
|---|
| 430 | |
|---|
| 431 | # row is missing data |
|---|
| 432 | if isnan(align.rows[ir][0]): continue |
|---|
| 433 | |
|---|
| 434 | for start in range(ncols): |
|---|
| 435 | if align.rows[ir][start] == '-': continue |
|---|
| 436 | elif align.rows[ir][start] == 'n': continue |
|---|
| 437 | elif align.rows[ir][start] == 'N': continue |
|---|
| 438 | |
|---|
| 439 | # get enough sequence for the weight matrix |
|---|
| 440 | subseq = "" |
|---|
| 441 | end = 0 |
|---|
| 442 | for ic in range(start,ncols): |
|---|
| 443 | |
|---|
| 444 | char = align.rows[ir][ic] |
|---|
| 445 | if char == '-' or char == 'N': continue |
|---|
| 446 | else: subseq += char |
|---|
| 447 | |
|---|
| 448 | if len(subseq) == minSeqLen: |
|---|
| 449 | end = ic+1 |
|---|
| 450 | |
|---|
| 451 | #forward |
|---|
| 452 | scores = self.score_seq( subseq ) |
|---|
| 453 | raw,forward_score = scores[0] |
|---|
| 454 | #reverse |
|---|
| 455 | scores = self.score_reverse_seq( subseq ) |
|---|
| 456 | raw,reverse_score = scores[0] |
|---|
| 457 | |
|---|
| 458 | score = max(forward_score, reverse_score) |
|---|
| 459 | |
|---|
| 460 | # replace the alignment positions with the result |
|---|
| 461 | if byPosition: |
|---|
| 462 | scoremax[ir][start] = score |
|---|
| 463 | else: |
|---|
| 464 | # replace positions matching the width of the pwm |
|---|
| 465 | for i in range(start,end): |
|---|
| 466 | if isnan(scoremax[ir][i]): scoremax[ir][i] = score |
|---|
| 467 | elif score > scoremax[ir][i]: scoremax[ir][i] = score |
|---|
| 468 | # mask gap characters |
|---|
| 469 | if gapmask == None: |
|---|
| 470 | gapmask = score_align_gaps(align) |
|---|
| 471 | putmask( scoremax, gapmask, float('nan') ) |
|---|
| 472 | return scoremax |
|---|
| 473 | |
|---|
| 474 | # seq can be a string, a list of characters, or a quantum sequence (a list |
|---|
| 475 | # of hashes from symbols to probability) |
|---|
| 476 | |
|---|
| 477 | def score_seq(self,seq): |
|---|
| 478 | if (type(seq[0]) == dict): |
|---|
| 479 | return self.score_quantum_seq(seq) |
|---|
| 480 | |
|---|
| 481 | scores = [] |
|---|
| 482 | for start in range( len(seq)): |
|---|
| 483 | if start + len(self) > len(seq): break |
|---|
| 484 | subseq = seq[ start:start+len(self) ] |
|---|
| 485 | raw = 0 |
|---|
| 486 | try: |
|---|
| 487 | for i,nt in enumerate(subseq): raw += self.rows[i][nt.upper()] |
|---|
| 488 | scaled = self.scaled( raw ) |
|---|
| 489 | except KeyError: |
|---|
| 490 | raw,scaled = float('nan'),float('nan') |
|---|
| 491 | scores.append( (raw, scaled) ) |
|---|
| 492 | return scores |
|---|
| 493 | |
|---|
| 494 | def score_quantum_seq(self,seq): |
|---|
| 495 | scores = [] |
|---|
| 496 | for start in range(len(seq)): |
|---|
| 497 | if (start + len(self) > len(seq)): break |
|---|
| 498 | subseq = seq[start:start+len(self)] |
|---|
| 499 | raw = 0 |
|---|
| 500 | try: |
|---|
| 501 | for i,nt in enumerate(subseq): |
|---|
| 502 | numer = sum([subseq[i][nt] * self.probs[i][nt] for nt in subseq[i]]) |
|---|
| 503 | denom = sum([subseq[i][nt] * self.background[nt] for nt in subseq[i]]) |
|---|
| 504 | raw += math.log(numer/denom,2) |
|---|
| 505 | scaled = self.scaled(raw) |
|---|
| 506 | except KeyError: |
|---|
| 507 | raw,scaled = float('nan'),float('nan') |
|---|
| 508 | except OverflowError,e: |
|---|
| 509 | raw,scaled = float('nan'),float('nan') |
|---|
| 510 | except ValueError,e: |
|---|
| 511 | raw,scaled = float('nan'),float('nan') |
|---|
| 512 | scores.append((raw,scaled)) |
|---|
| 513 | return scores |
|---|
| 514 | |
|---|
| 515 | def score_reverse_seq(self,seq): |
|---|
| 516 | revSeq = reverse_complement( seq ) |
|---|
| 517 | scores = self.score_seq( revSeq ) |
|---|
| 518 | scores.reverse() |
|---|
| 519 | return scores |
|---|
| 520 | |
|---|
| 521 | def scaled(self,val): |
|---|
| 522 | return ( val - self.minSum ) / (self.maxSum - self.minSum) |
|---|
| 523 | |
|---|
| 524 | def pseudocount(self, base=None): |
|---|
| 525 | f = lambda count: math.sqrt( count + 1 ) |
|---|
| 526 | if base in self.alphabet: |
|---|
| 527 | return f( self.matrix_base_counts[base] ) |
|---|
| 528 | elif base == None: |
|---|
| 529 | return f ( self.sites ) |
|---|
| 530 | else: |
|---|
| 531 | return float("nan") |
|---|
| 532 | |
|---|
| 533 | def simple_probability (self,freq, base, i): |
|---|
| 534 | # p(base,i) = f(base,i) |
|---|
| 535 | # ---------------------- |
|---|
| 536 | # sum(f(base,{A,C,G,T})) |
|---|
| 537 | |
|---|
| 538 | return float( freq[i][base] ) / sum([freq[i][nt] for nt in self.alphabet]) |
|---|
| 539 | |
|---|
| 540 | def corrected_probability_score (self,freq, base, i): |
|---|
| 541 | # p(base,i) = f(base,i) + s(base) |
|---|
| 542 | # -------------------- |
|---|
| 543 | # N + sum(s(A,C,T,G)) |
|---|
| 544 | |
|---|
| 545 | f = float( freq[i][base] ) |
|---|
| 546 | s = self.pseudocount(base) |
|---|
| 547 | N = self.sites |
|---|
| 548 | #print >>sys.stderr, "f:%.3f + s:%.3f = %.3f" % (f,s,f + s) |
|---|
| 549 | #print >>sys.stderr, "-------------------------" |
|---|
| 550 | #print >>sys.stderr, "N:%d + %d = %d" % (N,self.pseudocount(), N + self.pseudocount()) |
|---|
| 551 | #print >>sys.stderr, "\t\t %.3f\n" % ((f + s) / (N + self.pseudocount())) |
|---|
| 552 | |
|---|
| 553 | assert (f + s) > 0 |
|---|
| 554 | return (f + s) / (N + self.pseudocount()) |
|---|
| 555 | |
|---|
| 556 | def pwm_score (self,base,i,freq,background=None): |
|---|
| 557 | if background == None: background = self.background |
|---|
| 558 | p = self.score_correction(freq,base,i) |
|---|
| 559 | #print >>sys.stderr, p |
|---|
| 560 | #print >>sys.stderr, "k %d %c" % (i,base),freq[i][base] |
|---|
| 561 | b = background[ base ] |
|---|
| 562 | try: |
|---|
| 563 | return math.log( p/b, 2) |
|---|
| 564 | except OverflowError,e: |
|---|
| 565 | ## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b) |
|---|
| 566 | ## print >>sys.stderr,self.id |
|---|
| 567 | return float('nan') |
|---|
| 568 | except ValueError,e: |
|---|
| 569 | ## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b) |
|---|
| 570 | ## print >>sys.stderr,self.id |
|---|
| 571 | return float('nan') |
|---|
| 572 | |
|---|
| 573 | def parse_weight (self, weightString): |
|---|
| 574 | |
|---|
| 575 | fields = weightString.split(".") |
|---|
| 576 | if (len(fields) > 2): raise ValueError |
|---|
| 577 | |
|---|
| 578 | w = int(fields[0]) |
|---|
| 579 | s = 1 |
|---|
| 580 | |
|---|
| 581 | if (len(fields) == 2): |
|---|
| 582 | for cnt in range(0,len(fields[1])): s *= 10 |
|---|
| 583 | w = s*w + int(fields[1]) |
|---|
| 584 | |
|---|
| 585 | return (w,s) # w = the weight |
|---|
| 586 | # s = the scale used (a power of 10) |
|---|
| 587 | |
|---|
| 588 | def __str__ (self): |
|---|
| 589 | lines = [self.id] |
|---|
| 590 | headers = ["%s" % nt for nt in self.alphabet] |
|---|
| 591 | lines.append("P0\t" + "\t".join(headers)) |
|---|
| 592 | for ix in range(0,len(self.rows)): |
|---|
| 593 | weights = ["%d" % self.counts[ix][nt] for nt in self.alphabet] |
|---|
| 594 | #lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + self.consensus[ix]) |
|---|
| 595 | lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + str(sum(self.counts[ix].values())) + "\t" + self.consensus[ix]) |
|---|
| 596 | |
|---|
| 597 | return "\n".join(lines) |
|---|
| 598 | |
|---|
| 599 | def __getitem__ (self,key): |
|---|
| 600 | return self.rows[key] |
|---|
| 601 | |
|---|
| 602 | def __setitem__ (self,key,value): |
|---|
| 603 | self.rows[key] = value |
|---|
| 604 | |
|---|
| 605 | def __len__ (self): |
|---|
| 606 | return len( self.rows ) |
|---|
| 607 | |
|---|
| 608 | def score_align_gaps (align): |
|---|
| 609 | # a blank score matrix |
|---|
| 610 | nrows,ncols = align.dims |
|---|
| 611 | scoremax = AlignScoreMatrix( align ).matrix |
|---|
| 612 | for ir in range(nrows): |
|---|
| 613 | # row is missing data |
|---|
| 614 | if isnan(align.rows[ir][0]): continue |
|---|
| 615 | # scan for gaps |
|---|
| 616 | for pos in range(ncols): |
|---|
| 617 | if align.rows[ir][pos] == '-': scoremax[ir][pos] = 1 |
|---|
| 618 | else: scoremax[ir][pos] = 0 |
|---|
| 619 | return scoremax |
|---|
| 620 | |
|---|
| 621 | #----------- |
|---|
| 622 | # |
|---|
| 623 | # WeightMatrix Reader-- |
|---|
| 624 | # Read position weight matrices (PWM) from a file. |
|---|
| 625 | # |
|---|
| 626 | #----------- |
|---|
| 627 | |
|---|
| 628 | class Reader (object): |
|---|
| 629 | """Iterate over all interesting weight matrices in a file""" |
|---|
| 630 | |
|---|
| 631 | def __init__ (self,file,tfIds=None,name=None,format='basic',background=None,score_correction=True): |
|---|
| 632 | self.tfIds = tfIds |
|---|
| 633 | self.file = file |
|---|
| 634 | self.name = name |
|---|
| 635 | self.lineNumber = 0 |
|---|
| 636 | self.format = format |
|---|
| 637 | self.background = background |
|---|
| 638 | self.score_correction = score_correction |
|---|
| 639 | |
|---|
| 640 | |
|---|
| 641 | def close (self): |
|---|
| 642 | self.file.close() |
|---|
| 643 | |
|---|
| 644 | |
|---|
| 645 | def where (self): |
|---|
| 646 | if (self.name == None): |
|---|
| 647 | return "line %d" % self.lineNumber |
|---|
| 648 | else: |
|---|
| 649 | return "line %d in %s" % (self.lineNumber,self.name) |
|---|
| 650 | |
|---|
| 651 | |
|---|
| 652 | def __iter__ (self): |
|---|
| 653 | if self.format == 'basic': |
|---|
| 654 | return self.read_as_basic() |
|---|
| 655 | elif self.format == 'transfac': |
|---|
| 656 | return self.read_as_transfac() |
|---|
| 657 | else: |
|---|
| 658 | raise "unknown weight matrix file format: '%s'" % self.format |
|---|
| 659 | |
|---|
| 660 | def read_as_basic(self): |
|---|
| 661 | tfId = None |
|---|
| 662 | pwmRows = None |
|---|
| 663 | |
|---|
| 664 | alphabet = ['A','C','G','T'] |
|---|
| 665 | while (True): |
|---|
| 666 | line = self.file.readline() |
|---|
| 667 | if (not line): break |
|---|
| 668 | line = line.strip() |
|---|
| 669 | self.lineNumber += 1 |
|---|
| 670 | if line.startswith(">"): |
|---|
| 671 | if pwmRows != None: |
|---|
| 672 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background) |
|---|
| 673 | #try: |
|---|
| 674 | #yield PositionWeightMatrix(tfId,pwmRows,alphabet) |
|---|
| 675 | #except: |
|---|
| 676 | #print >>sys.stderr, "Failed to read", tfId |
|---|
| 677 | tfId = line.strip()[1:] |
|---|
| 678 | pwmRows = [] |
|---|
| 679 | elif line[0].isdigit(): |
|---|
| 680 | tokens = line.strip().split() |
|---|
| 681 | tokens.append( consensus_symbol(line) ) |
|---|
| 682 | vals = [float(v) for v in tokens[:-1]] |
|---|
| 683 | #print >>sys.stderr,[ "%.2f" % (float(v)/sum(vals)) for v in vals], tokens[-1] |
|---|
| 684 | pwmRows.append( tokens ) |
|---|
| 685 | if pwmRows != None: # we've finished collecting a desired matrix |
|---|
| 686 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction) |
|---|
| 687 | |
|---|
| 688 | def read_as_transfac(self): |
|---|
| 689 | self.tfToPwm = {} |
|---|
| 690 | tfId = None |
|---|
| 691 | pwmRows = None |
|---|
| 692 | |
|---|
| 693 | while (True): |
|---|
| 694 | line = self.file.readline() |
|---|
| 695 | if (not line): break |
|---|
| 696 | line = line.strip() |
|---|
| 697 | self.lineNumber += 1 |
|---|
| 698 | # handle an ID line |
|---|
| 699 | if line.startswith("ID"): |
|---|
| 700 | if pwmRows != None: # we've finished collecting a desired matrix |
|---|
| 701 | try: |
|---|
| 702 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction) |
|---|
| 703 | except: |
|---|
| 704 | print >>sys.stderr, "Failed to read", tfId |
|---|
| 705 | tfId = None |
|---|
| 706 | pwmRows = None |
|---|
| 707 | |
|---|
| 708 | tokens = line.split (None, 2) |
|---|
| 709 | if len(tokens) != 2: |
|---|
| 710 | raise ValueError, "bad line, need two fields (%s)" % self.where() |
|---|
| 711 | tfId = tokens[1] |
|---|
| 712 | if self.tfIds != None and (not tfId in self.tfIds): |
|---|
| 713 | continue # ignore it, this isn't a desired matrix |
|---|
| 714 | if tfId in self.tfToPwm: |
|---|
| 715 | raise ValueError, "transcription factor %s appears twice (%s)" \ |
|---|
| 716 | % (tfId,self.where()) |
|---|
| 717 | pwmRows = [] # start collecting a desired matrix |
|---|
| 718 | continue |
|---|
| 719 | |
|---|
| 720 | # if we're not collecting, skip this line |
|---|
| 721 | if pwmRows == None: continue |
|---|
| 722 | if len(line) < 1: continue |
|---|
| 723 | |
|---|
| 724 | # name, if present, added to ID |
|---|
| 725 | if line.startswith('NA'): |
|---|
| 726 | words = line.strip().split() |
|---|
| 727 | tfId = tfId + "\t" + " ".join(words[1:]) |
|---|
| 728 | |
|---|
| 729 | # handle a P0 line |
|---|
| 730 | if line.startswith("P0"): |
|---|
| 731 | alphabet = line.split()[1:] |
|---|
| 732 | if len(alphabet) < 2: |
|---|
| 733 | raise ValueError, "bad line, need more dna (%s)" % self.where() |
|---|
| 734 | continue |
|---|
| 735 | |
|---|
| 736 | # handle a 01,02,etc. line |
|---|
| 737 | if line[0].isdigit(): |
|---|
| 738 | tokens = line.split () |
|---|
| 739 | try: |
|---|
| 740 | index = int(tokens[0]) |
|---|
| 741 | if index != len(pwmRows)+1: raise ValueError |
|---|
| 742 | except: |
|---|
| 743 | raise ValueError,"bad line, bad index (%s)" % self.where() |
|---|
| 744 | pwmRows.append(tokens[1:]) |
|---|
| 745 | continue |
|---|
| 746 | # skip low quality entries |
|---|
| 747 | if line.startswith("CC TRANSFAC Sites of quality"): |
|---|
| 748 | print >>sys.stderr, line.strip(), tfId |
|---|
| 749 | pwmRows = None |
|---|
| 750 | continue |
|---|
| 751 | if pwmRows != None: # we've finished collecting a desired matrix |
|---|
| 752 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction) |
|---|
| 753 | # clean up |
|---|
| 754 | self.tfToPwm = None |
|---|
| 755 | |
|---|
| 756 | def isnan(x): |
|---|
| 757 | #return ieeespecial.isnan(x) |
|---|
| 758 | if x==x: return False |
|---|
| 759 | return True |
|---|
| 760 | |
|---|
| 761 | def reverse_complement (nukes): |
|---|
| 762 | return nukes[::-1].translate(PositionWeightMatrix.complementMap) |
|---|
| 763 | |
|---|
| 764 | def rsquared( x, y ): |
|---|
| 765 | try: |
|---|
| 766 | return sum_of_squares(x,y)**2 / (sum_of_squares(x) * sum_of_squares(y)) |
|---|
| 767 | except ZeroDivisionError: |
|---|
| 768 | #return float('nan') |
|---|
| 769 | return 0 |
|---|
| 770 | |
|---|
| 771 | def sum_of_squares( x,y=None ): |
|---|
| 772 | if not y: y = x |
|---|
| 773 | xmean = float(sum( x )) / len( x ) |
|---|
| 774 | ymean = float(sum( y )) / len( y ) |
|---|
| 775 | assert len(x) == len(y) |
|---|
| 776 | return sum([ float(xi)*float(yi) for xi,yi in zip(x,y)]) - len(x)*xmean*ymean |
|---|
| 777 | |
|---|
| 778 | def match_consensus(sequence,pattern): |
|---|
| 779 | |
|---|
| 780 | return c_match_consensus( sequence, pattern, len(sequence)) |
|---|
| 781 | |
|---|
| 782 | #for s,p in zip(sequence,pattern): |
|---|
| 783 | #if p == 'N': continue |
|---|
| 784 | #if not s in PositionWeightMatrix.symbols[p]: return False |
|---|
| 785 | |
|---|
| 786 | #return True |
|---|
| 787 | |
|---|
| 788 | def consensus_symbol( pattern ): |
|---|
| 789 | |
|---|
| 790 | if type(pattern)==type(""): |
|---|
| 791 | try: |
|---|
| 792 | pattern = [int(x) for x in pattern.split()] |
|---|
| 793 | except ValueError,e: |
|---|
| 794 | print >>sys.stderr, pattern |
|---|
| 795 | raise ValueError,e |
|---|
| 796 | |
|---|
| 797 | # IUPAC-IUB nomenclature for wobblers |
|---|
| 798 | wobblers = { |
|---|
| 799 | 'R':Set(['A','G']), |
|---|
| 800 | 'Y':Set(['C','T']), |
|---|
| 801 | 'M':Set(['A','C']), |
|---|
| 802 | 'K':Set(['G','T']), |
|---|
| 803 | 'S':Set(['G','C']), |
|---|
| 804 | 'W':Set(['A','T']), |
|---|
| 805 | 'H':Set(['A','C','T']), |
|---|
| 806 | 'B':Set(['G','T','C']), |
|---|
| 807 | 'V':Set(['G','C','A']), |
|---|
| 808 | 'D':Set(['G','T','A'])} |
|---|
| 809 | |
|---|
| 810 | symbols = ['A','C','G','T'] |
|---|
| 811 | |
|---|
| 812 | if type(pattern)==type({}): |
|---|
| 813 | pattern = [pattern[u] for u in symbols] |
|---|
| 814 | |
|---|
| 815 | total = sum(pattern) |
|---|
| 816 | f = [ (space/1e5)+(float(x)/total) for space,x in enumerate(pattern) ] |
|---|
| 817 | copy = [] |
|---|
| 818 | copy[:] = f[:] |
|---|
| 819 | copy.sort() |
|---|
| 820 | |
|---|
| 821 | # http://www.genomatix.de/online_help/help_matinspector/matrix_help.html -- |
|---|
| 822 | # url says consensus must be greater than 50%, and at least twice the freq |
|---|
| 823 | # of the second-most frequent. A double-degenerate symbol can be used |
|---|
| 824 | # if the top two account for 75% or more of the nt, if each is less than 50% |
|---|
| 825 | # Otherwise, N is used in the consensus. |
|---|
| 826 | tops = copy[-2:] |
|---|
| 827 | if tops[1] > 0.5 and tops[1] >= 2 * tops[0]: return symbols[f.index(tops[1])] |
|---|
| 828 | elif tops[0] < 0.5 and sum(tops) >= 0.75: |
|---|
| 829 | degen = Set([ symbols[f.index(v)] for v in tops ]) |
|---|
| 830 | for degenSymbol,wobbles in wobblers.items(): |
|---|
| 831 | #print >>sys.stderr,wobbles |
|---|
| 832 | if degen == wobbles: |
|---|
| 833 | return degenSymbol |
|---|
| 834 | else: return 'N' |
|---|
| 835 | print >>sys.stderr,pattern |
|---|
| 836 | raise '?' |
|---|
| 837 | |
|---|
| 838 | # import C extensions |
|---|
| 839 | try: |
|---|
| 840 | from _position_weight_matrix import c_match_consensus |
|---|
| 841 | ## print >>sys.stderr, "C match_consensus used" |
|---|
| 842 | except: |
|---|
| 843 | ## print >>sys.stderr, "python match_consensus used" |
|---|
| 844 | def match_consensus(sequence, pattern, size): |
|---|
| 845 | for s,p in zip(sequence,pattern): |
|---|
| 846 | if p == 'N': continue |
|---|
| 847 | if not s in PositionWeightMatrix.symbols[p]: return False |
|---|
| 848 | |
|---|
| 849 | return True |
|---|