1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | import sys |
---|
4 | import math |
---|
5 | import string |
---|
6 | from numpy import * |
---|
7 | from sets import * |
---|
8 | |
---|
9 | # This is the average of all species in the alignment outside of exons |
---|
10 | # > mean(r) |
---|
11 | # A T C G |
---|
12 | # 0.2863776 0.2878264 0.2129560 0.2128400 |
---|
13 | # > sd(r) |
---|
14 | # A T C G |
---|
15 | # 0.01316192 0.01371148 0.01293836 0.01386655 |
---|
16 | |
---|
17 | ENCODE_NONCODING_BACKGROUND = { 'A':0.2863776,'T':0.2878264,'G':0.2128400,'C':0.2129560} |
---|
18 | |
---|
19 | class Align(object): |
---|
20 | def __init__ (self, seqrows, headers=None): |
---|
21 | self.rows = seqrows |
---|
22 | self.nrows = len(seqrows) |
---|
23 | ncol = None |
---|
24 | for rownum,row in enumerate(self.rows): |
---|
25 | try: |
---|
26 | if ncol == None: ncol = len(row) |
---|
27 | elif ncol != len(row): |
---|
28 | raise "Align: __init__:alignment block:row %d does not have %d columns, it has %d" % (rownum,ncol,len(row)) |
---|
29 | except: |
---|
30 | print row |
---|
31 | raise '' |
---|
32 | self.ncols = ncol |
---|
33 | self.dims = (self.nrows,self.ncols) |
---|
34 | self.headers = headers |
---|
35 | def __str__ (self): |
---|
36 | return "\n".join(self.rows) |
---|
37 | |
---|
38 | class AlignScoreMatrix (object): |
---|
39 | def __init__(self,align): |
---|
40 | nan = float('nan') |
---|
41 | |
---|
42 | matrix = zeros((align.nrows,align.ncols),float32) |
---|
43 | |
---|
44 | # set to nans |
---|
45 | for ir in range( len(matrix) ): |
---|
46 | for ic in range(len( matrix[ir] )): |
---|
47 | matrix[ir][ic] = nan |
---|
48 | self.matrix = matrix |
---|
49 | |
---|
50 | def __len__(self): |
---|
51 | return shape(self.matrix)[1] |
---|
52 | |
---|
53 | def __str__(self): |
---|
54 | print self.matrix |
---|
55 | |
---|
56 | def score_align_motif (align,motif,gapmask=None,byPosition=True): |
---|
57 | |
---|
58 | #dbg |
---|
59 | #print >>sys.stderr, align.headers |
---|
60 | chr,chr_start,chr_stop = align.headers[0] |
---|
61 | |
---|
62 | # a blank score matrix |
---|
63 | nrows,ncols = align.dims |
---|
64 | ascoremax = AlignScoreMatrix( align ) |
---|
65 | scoremax = ascoremax.matrix |
---|
66 | |
---|
67 | minSeqLen = len( motif ) |
---|
68 | for ir in range(nrows): |
---|
69 | pass |
---|
70 | |
---|
71 | # row is missing data |
---|
72 | if isnan(align.rows[ir][0]): continue |
---|
73 | |
---|
74 | for start in range(ncols): |
---|
75 | |
---|
76 | if align.rows[ir][start] == '-': continue |
---|
77 | elif align.rows[ir][start] == 'n': continue |
---|
78 | elif align.rows[ir][start] == 'N': continue |
---|
79 | |
---|
80 | # get enough sequence for the weight matrix |
---|
81 | subseq = "" |
---|
82 | end = 0 |
---|
83 | ic = start |
---|
84 | while len(subseq) < minSeqLen: |
---|
85 | #for ic in range(start,ncols): |
---|
86 | |
---|
87 | if ic >= len(align.rows[ir]): break |
---|
88 | char = align.rows[ir][ic].upper() |
---|
89 | ic += 1 |
---|
90 | if char == '-' or char == 'N': continue |
---|
91 | else: subseq += char |
---|
92 | |
---|
93 | if len(subseq) == minSeqLen: |
---|
94 | end = ic+1 |
---|
95 | for_score = int( match_consensus(subseq,motif) ) |
---|
96 | revseq = reverse_complement( subseq ) |
---|
97 | rev_score = int( match_consensus(revseq,motif) ) |
---|
98 | |
---|
99 | score = max(for_score, rev_score) |
---|
100 | #dbg |
---|
101 | #if ir == 0: print >>sys.stderr, int(chr_start) + start - align.rows[ir].count('-',0,start), subseq, score |
---|
102 | |
---|
103 | # replace the alignment positions with the result |
---|
104 | if byPosition: |
---|
105 | scoremax[ir][start] = score |
---|
106 | else: |
---|
107 | # replace positions matching the width of the pwm |
---|
108 | for i in range(start,end): |
---|
109 | if isnan(scoremax[ir][i]): scoremax[ir][i] = score |
---|
110 | elif score > scoremax[ir][i]: scoremax[ir][i] = score |
---|
111 | #break |
---|
112 | # mask gap characters |
---|
113 | if gapmask == None: |
---|
114 | gapmask = score_align_gaps(align) |
---|
115 | putmask( scoremax, gapmask, float('nan') ) |
---|
116 | return scoremax |
---|
117 | |
---|
118 | #----------- |
---|
119 | # |
---|
120 | # WeightMatrix-- |
---|
121 | # A position weight matrix (PWM) representation of a motif. |
---|
122 | # |
---|
123 | #---------- |
---|
124 | # construction arguments: |
---|
125 | # id: id (name) of the motif |
---|
126 | # rows: the matrix; each row is a hash from symbol to weight, with |
---|
127 | # .. the weight in string form |
---|
128 | # alphabet: symbols allowed |
---|
129 | # background: hash from symbol to background probability of that symbol; if |
---|
130 | # .. not specified, ENCODE_NONCODING_BACKGROUND is used |
---|
131 | # internal fields: |
---|
132 | # rows: the matrix; each row is a hash from symbol to log-odds score |
---|
133 | # .. of that symbol for that row of the weight matrix |
---|
134 | # counts: the matrix; count[row][sym] is the weight, as an integer |
---|
135 | # probs: the matrix; probs[row][sym] is the weight, as an probability |
---|
136 | #---------- |
---|
137 | |
---|
138 | class PositionWeightMatrix (object): |
---|
139 | |
---|
140 | complementMap = string.maketrans("ACGTacgt","TGCAtgca") |
---|
141 | |
---|
142 | # IUPAC-IUB |
---|
143 | symbols = { |
---|
144 | 'A':Set(['A']), |
---|
145 | 'C':Set(['C']), |
---|
146 | 'G':Set(['G']), |
---|
147 | 'T':Set(['T']), |
---|
148 | 'R':Set(['A','G']), |
---|
149 | 'Y':Set(['C','T']), |
---|
150 | 'M':Set(['A','C']), |
---|
151 | 'K':Set(['G','T']), |
---|
152 | 'S':Set(['G','C']), |
---|
153 | 'W':Set(['A','T']), |
---|
154 | 'H':Set(['A','C','T']), |
---|
155 | 'B':Set(['G','T','C']), |
---|
156 | 'V':Set(['G','C','A']), |
---|
157 | 'D':Set(['G','T','A'])} |
---|
158 | |
---|
159 | def __init__ (self, id, rows, alphabet, background=None, score_correction=True): |
---|
160 | |
---|
161 | self.id = id |
---|
162 | self.alphabet = alphabet |
---|
163 | nsymbols = len(self.alphabet) |
---|
164 | for i in range(len(self.alphabet)): |
---|
165 | self.alphabet[ i ] = self.alphabet[ i ].upper() |
---|
166 | if background != None: |
---|
167 | self.background = background |
---|
168 | else: |
---|
169 | self.background = {} |
---|
170 | sorted_alphabet = [] |
---|
171 | sorted_alphabet[:] = self.alphabet[:] |
---|
172 | sorted_alphabet.sort() |
---|
173 | if ['A','C','G','T'] == sorted_alphabet: |
---|
174 | self.background = ENCODE_NONCODING_BACKGROUND |
---|
175 | else: |
---|
176 | for x in self.alphabet: self.background[ x ] = float(1)/len(self.alphabet) |
---|
177 | |
---|
178 | if (score_correction == True): |
---|
179 | self.score_correction = self.corrected_probability_score |
---|
180 | else: |
---|
181 | self.score_correction = self.simple_probability |
---|
182 | |
---|
183 | # partition counts from consensus symbol |
---|
184 | # in order to properly handle scaling in the presense of non-integers, |
---|
185 | # we prescan the matrix to figure out the largest scale factor, then go |
---|
186 | # back through and scale 'em all (some rows may be integer counts, |
---|
187 | # others may be probabilities) |
---|
188 | |
---|
189 | self.consensus = [] |
---|
190 | scale = 1 |
---|
191 | |
---|
192 | for i in range(len(rows)): |
---|
193 | |
---|
194 | #try: |
---|
195 | fields,consensus = rows[i][:nsymbols],rows[i][-1] |
---|
196 | for x,count in enumerate(fields): |
---|
197 | try: |
---|
198 | (w,s) = self.parse_weight(count) |
---|
199 | except ValueError: |
---|
200 | raise "pwm row %s has bad weight %s" % (" ".join(fields),t) |
---|
201 | |
---|
202 | # replace row counts with (values,scale) |
---|
203 | rows[i][x] = (w,s) |
---|
204 | scale = max(s,scale) |
---|
205 | |
---|
206 | #except: |
---|
207 | #print >>sys.stderr,rows |
---|
208 | #raise ValueError |
---|
209 | #raise ValueError, "pwm row %s has wrong field count" % " ".join(fields) |
---|
210 | |
---|
211 | self.consensus.append(consensus) |
---|
212 | |
---|
213 | |
---|
214 | hashRows = [] |
---|
215 | self.matrix_base_counts = {} # for pseudocounts |
---|
216 | self.counts = [] # for scaled counts |
---|
217 | self.probs = [] # for probabilities |
---|
218 | |
---|
219 | # scale counts to integers |
---|
220 | for i in range(len(rows)): |
---|
221 | hashRows.append(dict()) |
---|
222 | for x,sym in enumerate(alphabet): |
---|
223 | (w,s) = rows[i][x] |
---|
224 | hashRows[i][sym] = w * scale/s |
---|
225 | assert hashRows[i][sym] >= 0 |
---|
226 | if sym not in self.matrix_base_counts: self.matrix_base_counts[sym] = 0 |
---|
227 | self.matrix_base_counts[sym] += hashRows[i][sym] |
---|
228 | self.counts.append( hashRows[i].copy() ) |
---|
229 | self.probs.append( hashRows[i].copy() ) |
---|
230 | totalWeight = float(sum(self.probs[i].values())) |
---|
231 | for sym in self.probs[i]: |
---|
232 | self.probs[i][sym] /= totalWeight |
---|
233 | self.sites = sum ( hashRows[0].values() ) |
---|
234 | |
---|
235 | # scan pwm to pre-compute logs of probabilities and min and max log-odds |
---|
236 | # scores (over the whole PWM) for scaling; note that the same min and max |
---|
237 | # applies for scaling long-odds scores for quantum comparisions |
---|
238 | self.information_content = [] |
---|
239 | minSum = 0 |
---|
240 | maxSum = 0 |
---|
241 | |
---|
242 | for i in range( len( hashRows )): |
---|
243 | self.information_content.append( self.information_content_calculation( i, hashRows ) ) |
---|
244 | newHashRow = {} |
---|
245 | for base in self.alphabet: |
---|
246 | newHashRow[base] = self.pwm_score(base, i, hashRows) |
---|
247 | hashRows[i] = newHashRow |
---|
248 | |
---|
249 | minSum += min(hashRows[i].values()) |
---|
250 | maxSum += max(hashRows[i].values()) |
---|
251 | |
---|
252 | self.minSum = minSum |
---|
253 | self.maxSum = maxSum |
---|
254 | self.rows = hashRows |
---|
255 | |
---|
256 | # Reference 1: Wasserman and Sandelin: Nat Rev Genet. 2004 Apr;5(4):276-87. |
---|
257 | # Reference 2: Gertz et al.: Genome Res. 2005 Aug;15(8):1145-52. |
---|
258 | def information_content_calculation(self, i, counts): |
---|
259 | # Reference 1) |
---|
260 | return 2 + sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] ) |
---|
261 | |
---|
262 | # Reference 2) |
---|
263 | #return sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] ) |
---|
264 | |
---|
265 | def information_base_content(self, base, i, counts): |
---|
266 | |
---|
267 | # Reference 1) |
---|
268 | #return self.score_correction(counts,base,i) * math.log ( self.score_correction(counts,base,i), 2) |
---|
269 | |
---|
270 | # Reference 2) |
---|
271 | return self.score_correction(counts,base,i) * self.pwm_score(base, i, counts) |
---|
272 | |
---|
273 | def __call__ (self,seq): |
---|
274 | return self.score_seq(seq) |
---|
275 | |
---|
276 | def __add__ (self,other): |
---|
277 | |
---|
278 | assert self.alphabet == other.alphabet |
---|
279 | r,(p,q) = self.max_correlation(other) |
---|
280 | |
---|
281 | if p == q == 0: width = max( len(self),len(other) ) |
---|
282 | elif p > 0: width = max( len(other)+p, len(self) ) |
---|
283 | elif q > 0: width = max( len(self)+q, len(other) ) |
---|
284 | |
---|
285 | sumx = zeros( (width,len(self.alphabet)),dtype='int') |
---|
286 | selfx = self.to_count_matrix() |
---|
287 | otherx = other.to_count_matrix() |
---|
288 | |
---|
289 | if p == q == 0: |
---|
290 | sumx[:len(self)] += selfx |
---|
291 | sumx[:len(other)] += otherx |
---|
292 | elif p > 0: |
---|
293 | sumx[p:p+len(other)] += otherx |
---|
294 | sumx[:len(self)] += selfx |
---|
295 | else: |
---|
296 | sumx[:len(other)] += otherx |
---|
297 | sumx[q:q+len(self)] += selfx |
---|
298 | |
---|
299 | newRows = [] |
---|
300 | for i,x in enumerate(sumx): |
---|
301 | y = list(x) |
---|
302 | y.append( consensus_symbol(y) ) |
---|
303 | y = [ str(yi) for yi in y] |
---|
304 | newRows.append( y ) |
---|
305 | return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background) |
---|
306 | |
---|
307 | def __old_add__ (self,other,maxp=None): |
---|
308 | |
---|
309 | assert self.alphabet == other.alphabet |
---|
310 | bigN = max(len(self),len(other)) |
---|
311 | smallN = min(len(self),len(other)) |
---|
312 | if not maxp: |
---|
313 | prsq = self.correlation(other) |
---|
314 | maxp = prsq.index( max(prsq) ) |
---|
315 | |
---|
316 | leftpad = ' ' * maxp |
---|
317 | rightsize = bigN - smallN |
---|
318 | rightpad = ' ' * rightsize |
---|
319 | leftStrings = [] |
---|
320 | rightStrings = [] |
---|
321 | |
---|
322 | if len(self) > len(other): |
---|
323 | larger = self |
---|
324 | smaller = other |
---|
325 | leftStrings = self.consensus |
---|
326 | rightStrings = list(leftpad) + other.consensus + list(rightpad) |
---|
327 | else: |
---|
328 | smaller = self |
---|
329 | larger = other |
---|
330 | leftStrings = list(leftpad) + self.consensus + list(rightpad) |
---|
331 | rightStrings = other.consensus |
---|
332 | |
---|
333 | sumx = zeros([bigN,len(self.alphabet)]) |
---|
334 | sumx += larger.to_count_matrix() |
---|
335 | sumx[maxp:maxp+smallN] += smaller.to_count_matrix() |
---|
336 | |
---|
337 | newRows = [] |
---|
338 | for i,x in enumerate(sumx): |
---|
339 | y = list(x) |
---|
340 | y.append( leftStrings[i] + rightStrings[i] ) |
---|
341 | y = [ str(yi) for yi in y] |
---|
342 | newRows.append( y ) |
---|
343 | |
---|
344 | #return PositionWeightMatrix(self.id+other.id,newRows[maxp:maxp+smallN],self.alphabet,self.background) |
---|
345 | return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background) |
---|
346 | |
---|
347 | def to_matrix(self): |
---|
348 | m = zeros([len(self),len(self.alphabet)]) |
---|
349 | for i in range(len(self)): |
---|
350 | for j,a in enumerate(self.alphabet): |
---|
351 | m[i][j] = self[i][a] |
---|
352 | return m |
---|
353 | |
---|
354 | def to_count_matrix(self): |
---|
355 | m = zeros([len(self),len(self.alphabet)],dtype='int') |
---|
356 | for i in range(len(self)): |
---|
357 | for j,a in enumerate(self.alphabet): |
---|
358 | m[i][j] = self.counts[i][a] |
---|
359 | return m |
---|
360 | |
---|
361 | def max_correlation(self, otherwmx): |
---|
362 | rsq,ixtuple = self.slide_correlation(otherwmx) |
---|
363 | max_rsq = max(rsq) |
---|
364 | maxp,maxq = ixtuple[rsq.index(max_rsq)] |
---|
365 | return max_rsq,(maxp,maxq) |
---|
366 | |
---|
367 | def slide_correlation(self, other): |
---|
368 | assert self.alphabet == other.alphabet |
---|
369 | selfx = self.to_count_matrix() |
---|
370 | otherx = other.to_count_matrix() |
---|
371 | rsq = [] |
---|
372 | ixtuple = [] |
---|
373 | # self staggered over other, scan self backwards until flush |
---|
374 | for q in range(len(other)-1,-1,-1): |
---|
375 | r = 0 |
---|
376 | n = 0 |
---|
377 | for p in range(len(self)): |
---|
378 | if q+p < len(other): |
---|
379 | r += rsquared( list(selfx[p]), list(otherx[q+p]) ) |
---|
380 | n += 1 |
---|
381 | else: |
---|
382 | n += 1 |
---|
383 | rsq.append( r/n ) |
---|
384 | ixtuple.append( (0,q) ) |
---|
385 | # other staggered below self , scan other forward |
---|
386 | for p in range(1,len(self)): |
---|
387 | r = 0 |
---|
388 | n = 0 |
---|
389 | for q in range(len(other)): |
---|
390 | if p+q < len(self): |
---|
391 | r += rsquared( list(selfx[p+q]), list(otherx[q]) ) |
---|
392 | n += 1 |
---|
393 | else: |
---|
394 | n += 1 |
---|
395 | rsq.append( r/n ) |
---|
396 | ixtuple.append( (p,0) ) |
---|
397 | return rsq,ixtuple |
---|
398 | |
---|
399 | def correlation(self, otherwmx): |
---|
400 | assert self.alphabet == otherwmx.alphabet |
---|
401 | width = len(self.alphabet) |
---|
402 | if len(self) > len(otherwmx): |
---|
403 | larger = self.to_count_matrix() |
---|
404 | smaller = otherwmx.to_count_matrix() |
---|
405 | else: |
---|
406 | smaller = self.to_count_matrix() |
---|
407 | larger = otherwmx.to_count_matrix() |
---|
408 | bigN = len(larger) |
---|
409 | smallN = len(smaller) |
---|
410 | position_rsq = [] |
---|
411 | |
---|
412 | # slide small over large, for ave rsq |
---|
413 | for p in range(bigN): |
---|
414 | if p+smallN <= bigN: |
---|
415 | r = 0 |
---|
416 | for q in range(smallN): |
---|
417 | r += rsquared(list(smaller[q]),list(larger[p+q])) |
---|
418 | position_rsq.append( r / smallN ) |
---|
419 | return position_rsq |
---|
420 | |
---|
421 | def score_align (self,align,gapmask=None,byPosition=True): |
---|
422 | |
---|
423 | # a blank score matrix |
---|
424 | nrows,ncols = align.dims |
---|
425 | ascoremax = AlignScoreMatrix( align ) |
---|
426 | scoremax = ascoremax.matrix |
---|
427 | |
---|
428 | minSeqLen = len( self ) |
---|
429 | for ir in range(nrows): |
---|
430 | |
---|
431 | # row is missing data |
---|
432 | if isnan(align.rows[ir][0]): continue |
---|
433 | |
---|
434 | for start in range(ncols): |
---|
435 | if align.rows[ir][start] == '-': continue |
---|
436 | elif align.rows[ir][start] == 'n': continue |
---|
437 | elif align.rows[ir][start] == 'N': continue |
---|
438 | |
---|
439 | # get enough sequence for the weight matrix |
---|
440 | subseq = "" |
---|
441 | end = 0 |
---|
442 | for ic in range(start,ncols): |
---|
443 | |
---|
444 | char = align.rows[ir][ic] |
---|
445 | if char == '-' or char == 'N': continue |
---|
446 | else: subseq += char |
---|
447 | |
---|
448 | if len(subseq) == minSeqLen: |
---|
449 | end = ic+1 |
---|
450 | |
---|
451 | #forward |
---|
452 | scores = self.score_seq( subseq ) |
---|
453 | raw,forward_score = scores[0] |
---|
454 | #reverse |
---|
455 | scores = self.score_reverse_seq( subseq ) |
---|
456 | raw,reverse_score = scores[0] |
---|
457 | |
---|
458 | score = max(forward_score, reverse_score) |
---|
459 | |
---|
460 | # replace the alignment positions with the result |
---|
461 | if byPosition: |
---|
462 | scoremax[ir][start] = score |
---|
463 | else: |
---|
464 | # replace positions matching the width of the pwm |
---|
465 | for i in range(start,end): |
---|
466 | if isnan(scoremax[ir][i]): scoremax[ir][i] = score |
---|
467 | elif score > scoremax[ir][i]: scoremax[ir][i] = score |
---|
468 | # mask gap characters |
---|
469 | if gapmask == None: |
---|
470 | gapmask = score_align_gaps(align) |
---|
471 | putmask( scoremax, gapmask, float('nan') ) |
---|
472 | return scoremax |
---|
473 | |
---|
474 | # seq can be a string, a list of characters, or a quantum sequence (a list |
---|
475 | # of hashes from symbols to probability) |
---|
476 | |
---|
477 | def score_seq(self,seq): |
---|
478 | if (type(seq[0]) == dict): |
---|
479 | return self.score_quantum_seq(seq) |
---|
480 | |
---|
481 | scores = [] |
---|
482 | for start in range( len(seq)): |
---|
483 | if start + len(self) > len(seq): break |
---|
484 | subseq = seq[ start:start+len(self) ] |
---|
485 | raw = 0 |
---|
486 | try: |
---|
487 | for i,nt in enumerate(subseq): raw += self.rows[i][nt.upper()] |
---|
488 | scaled = self.scaled( raw ) |
---|
489 | except KeyError: |
---|
490 | raw,scaled = float('nan'),float('nan') |
---|
491 | scores.append( (raw, scaled) ) |
---|
492 | return scores |
---|
493 | |
---|
494 | def score_quantum_seq(self,seq): |
---|
495 | scores = [] |
---|
496 | for start in range(len(seq)): |
---|
497 | if (start + len(self) > len(seq)): break |
---|
498 | subseq = seq[start:start+len(self)] |
---|
499 | raw = 0 |
---|
500 | try: |
---|
501 | for i,nt in enumerate(subseq): |
---|
502 | numer = sum([subseq[i][nt] * self.probs[i][nt] for nt in subseq[i]]) |
---|
503 | denom = sum([subseq[i][nt] * self.background[nt] for nt in subseq[i]]) |
---|
504 | raw += math.log(numer/denom,2) |
---|
505 | scaled = self.scaled(raw) |
---|
506 | except KeyError: |
---|
507 | raw,scaled = float('nan'),float('nan') |
---|
508 | except OverflowError,e: |
---|
509 | raw,scaled = float('nan'),float('nan') |
---|
510 | except ValueError,e: |
---|
511 | raw,scaled = float('nan'),float('nan') |
---|
512 | scores.append((raw,scaled)) |
---|
513 | return scores |
---|
514 | |
---|
515 | def score_reverse_seq(self,seq): |
---|
516 | revSeq = reverse_complement( seq ) |
---|
517 | scores = self.score_seq( revSeq ) |
---|
518 | scores.reverse() |
---|
519 | return scores |
---|
520 | |
---|
521 | def scaled(self,val): |
---|
522 | return ( val - self.minSum ) / (self.maxSum - self.minSum) |
---|
523 | |
---|
524 | def pseudocount(self, base=None): |
---|
525 | f = lambda count: math.sqrt( count + 1 ) |
---|
526 | if base in self.alphabet: |
---|
527 | return f( self.matrix_base_counts[base] ) |
---|
528 | elif base == None: |
---|
529 | return f ( self.sites ) |
---|
530 | else: |
---|
531 | return float("nan") |
---|
532 | |
---|
533 | def simple_probability (self,freq, base, i): |
---|
534 | # p(base,i) = f(base,i) |
---|
535 | # ---------------------- |
---|
536 | # sum(f(base,{A,C,G,T})) |
---|
537 | |
---|
538 | return float( freq[i][base] ) / sum([freq[i][nt] for nt in self.alphabet]) |
---|
539 | |
---|
540 | def corrected_probability_score (self,freq, base, i): |
---|
541 | # p(base,i) = f(base,i) + s(base) |
---|
542 | # -------------------- |
---|
543 | # N + sum(s(A,C,T,G)) |
---|
544 | |
---|
545 | f = float( freq[i][base] ) |
---|
546 | s = self.pseudocount(base) |
---|
547 | N = self.sites |
---|
548 | #print >>sys.stderr, "f:%.3f + s:%.3f = %.3f" % (f,s,f + s) |
---|
549 | #print >>sys.stderr, "-------------------------" |
---|
550 | #print >>sys.stderr, "N:%d + %d = %d" % (N,self.pseudocount(), N + self.pseudocount()) |
---|
551 | #print >>sys.stderr, "\t\t %.3f\n" % ((f + s) / (N + self.pseudocount())) |
---|
552 | |
---|
553 | assert (f + s) > 0 |
---|
554 | return (f + s) / (N + self.pseudocount()) |
---|
555 | |
---|
556 | def pwm_score (self,base,i,freq,background=None): |
---|
557 | if background == None: background = self.background |
---|
558 | p = self.score_correction(freq,base,i) |
---|
559 | #print >>sys.stderr, p |
---|
560 | #print >>sys.stderr, "k %d %c" % (i,base),freq[i][base] |
---|
561 | b = background[ base ] |
---|
562 | try: |
---|
563 | return math.log( p/b, 2) |
---|
564 | except OverflowError,e: |
---|
565 | ## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b) |
---|
566 | ## print >>sys.stderr,self.id |
---|
567 | return float('nan') |
---|
568 | except ValueError,e: |
---|
569 | ## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b) |
---|
570 | ## print >>sys.stderr,self.id |
---|
571 | return float('nan') |
---|
572 | |
---|
573 | def parse_weight (self, weightString): |
---|
574 | |
---|
575 | fields = weightString.split(".") |
---|
576 | if (len(fields) > 2): raise ValueError |
---|
577 | |
---|
578 | w = int(fields[0]) |
---|
579 | s = 1 |
---|
580 | |
---|
581 | if (len(fields) == 2): |
---|
582 | for cnt in range(0,len(fields[1])): s *= 10 |
---|
583 | w = s*w + int(fields[1]) |
---|
584 | |
---|
585 | return (w,s) # w = the weight |
---|
586 | # s = the scale used (a power of 10) |
---|
587 | |
---|
588 | def __str__ (self): |
---|
589 | lines = [self.id] |
---|
590 | headers = ["%s" % nt for nt in self.alphabet] |
---|
591 | lines.append("P0\t" + "\t".join(headers)) |
---|
592 | for ix in range(0,len(self.rows)): |
---|
593 | weights = ["%d" % self.counts[ix][nt] for nt in self.alphabet] |
---|
594 | #lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + self.consensus[ix]) |
---|
595 | lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + str(sum(self.counts[ix].values())) + "\t" + self.consensus[ix]) |
---|
596 | |
---|
597 | return "\n".join(lines) |
---|
598 | |
---|
599 | def __getitem__ (self,key): |
---|
600 | return self.rows[key] |
---|
601 | |
---|
602 | def __setitem__ (self,key,value): |
---|
603 | self.rows[key] = value |
---|
604 | |
---|
605 | def __len__ (self): |
---|
606 | return len( self.rows ) |
---|
607 | |
---|
608 | def score_align_gaps (align): |
---|
609 | # a blank score matrix |
---|
610 | nrows,ncols = align.dims |
---|
611 | scoremax = AlignScoreMatrix( align ).matrix |
---|
612 | for ir in range(nrows): |
---|
613 | # row is missing data |
---|
614 | if isnan(align.rows[ir][0]): continue |
---|
615 | # scan for gaps |
---|
616 | for pos in range(ncols): |
---|
617 | if align.rows[ir][pos] == '-': scoremax[ir][pos] = 1 |
---|
618 | else: scoremax[ir][pos] = 0 |
---|
619 | return scoremax |
---|
620 | |
---|
621 | #----------- |
---|
622 | # |
---|
623 | # WeightMatrix Reader-- |
---|
624 | # Read position weight matrices (PWM) from a file. |
---|
625 | # |
---|
626 | #----------- |
---|
627 | |
---|
628 | class Reader (object): |
---|
629 | """Iterate over all interesting weight matrices in a file""" |
---|
630 | |
---|
631 | def __init__ (self,file,tfIds=None,name=None,format='basic',background=None,score_correction=True): |
---|
632 | self.tfIds = tfIds |
---|
633 | self.file = file |
---|
634 | self.name = name |
---|
635 | self.lineNumber = 0 |
---|
636 | self.format = format |
---|
637 | self.background = background |
---|
638 | self.score_correction = score_correction |
---|
639 | |
---|
640 | |
---|
641 | def close (self): |
---|
642 | self.file.close() |
---|
643 | |
---|
644 | |
---|
645 | def where (self): |
---|
646 | if (self.name == None): |
---|
647 | return "line %d" % self.lineNumber |
---|
648 | else: |
---|
649 | return "line %d in %s" % (self.lineNumber,self.name) |
---|
650 | |
---|
651 | |
---|
652 | def __iter__ (self): |
---|
653 | if self.format == 'basic': |
---|
654 | return self.read_as_basic() |
---|
655 | elif self.format == 'transfac': |
---|
656 | return self.read_as_transfac() |
---|
657 | else: |
---|
658 | raise "unknown weight matrix file format: '%s'" % self.format |
---|
659 | |
---|
660 | def read_as_basic(self): |
---|
661 | tfId = None |
---|
662 | pwmRows = None |
---|
663 | |
---|
664 | alphabet = ['A','C','G','T'] |
---|
665 | while (True): |
---|
666 | line = self.file.readline() |
---|
667 | if (not line): break |
---|
668 | line = line.strip() |
---|
669 | self.lineNumber += 1 |
---|
670 | if line.startswith(">"): |
---|
671 | if pwmRows != None: |
---|
672 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background) |
---|
673 | #try: |
---|
674 | #yield PositionWeightMatrix(tfId,pwmRows,alphabet) |
---|
675 | #except: |
---|
676 | #print >>sys.stderr, "Failed to read", tfId |
---|
677 | tfId = line.strip()[1:] |
---|
678 | pwmRows = [] |
---|
679 | elif line[0].isdigit(): |
---|
680 | tokens = line.strip().split() |
---|
681 | tokens.append( consensus_symbol(line) ) |
---|
682 | vals = [float(v) for v in tokens[:-1]] |
---|
683 | #print >>sys.stderr,[ "%.2f" % (float(v)/sum(vals)) for v in vals], tokens[-1] |
---|
684 | pwmRows.append( tokens ) |
---|
685 | if pwmRows != None: # we've finished collecting a desired matrix |
---|
686 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction) |
---|
687 | |
---|
688 | def read_as_transfac(self): |
---|
689 | self.tfToPwm = {} |
---|
690 | tfId = None |
---|
691 | pwmRows = None |
---|
692 | |
---|
693 | while (True): |
---|
694 | line = self.file.readline() |
---|
695 | if (not line): break |
---|
696 | line = line.strip() |
---|
697 | self.lineNumber += 1 |
---|
698 | # handle an ID line |
---|
699 | if line.startswith("ID"): |
---|
700 | if pwmRows != None: # we've finished collecting a desired matrix |
---|
701 | try: |
---|
702 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction) |
---|
703 | except: |
---|
704 | print >>sys.stderr, "Failed to read", tfId |
---|
705 | tfId = None |
---|
706 | pwmRows = None |
---|
707 | |
---|
708 | tokens = line.split (None, 2) |
---|
709 | if len(tokens) != 2: |
---|
710 | raise ValueError, "bad line, need two fields (%s)" % self.where() |
---|
711 | tfId = tokens[1] |
---|
712 | if self.tfIds != None and (not tfId in self.tfIds): |
---|
713 | continue # ignore it, this isn't a desired matrix |
---|
714 | if tfId in self.tfToPwm: |
---|
715 | raise ValueError, "transcription factor %s appears twice (%s)" \ |
---|
716 | % (tfId,self.where()) |
---|
717 | pwmRows = [] # start collecting a desired matrix |
---|
718 | continue |
---|
719 | |
---|
720 | # if we're not collecting, skip this line |
---|
721 | if pwmRows == None: continue |
---|
722 | if len(line) < 1: continue |
---|
723 | |
---|
724 | # name, if present, added to ID |
---|
725 | if line.startswith('NA'): |
---|
726 | words = line.strip().split() |
---|
727 | tfId = tfId + "\t" + " ".join(words[1:]) |
---|
728 | |
---|
729 | # handle a P0 line |
---|
730 | if line.startswith("P0"): |
---|
731 | alphabet = line.split()[1:] |
---|
732 | if len(alphabet) < 2: |
---|
733 | raise ValueError, "bad line, need more dna (%s)" % self.where() |
---|
734 | continue |
---|
735 | |
---|
736 | # handle a 01,02,etc. line |
---|
737 | if line[0].isdigit(): |
---|
738 | tokens = line.split () |
---|
739 | try: |
---|
740 | index = int(tokens[0]) |
---|
741 | if index != len(pwmRows)+1: raise ValueError |
---|
742 | except: |
---|
743 | raise ValueError,"bad line, bad index (%s)" % self.where() |
---|
744 | pwmRows.append(tokens[1:]) |
---|
745 | continue |
---|
746 | # skip low quality entries |
---|
747 | if line.startswith("CC TRANSFAC Sites of quality"): |
---|
748 | print >>sys.stderr, line.strip(), tfId |
---|
749 | pwmRows = None |
---|
750 | continue |
---|
751 | if pwmRows != None: # we've finished collecting a desired matrix |
---|
752 | yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction) |
---|
753 | # clean up |
---|
754 | self.tfToPwm = None |
---|
755 | |
---|
756 | def isnan(x): |
---|
757 | #return ieeespecial.isnan(x) |
---|
758 | if x==x: return False |
---|
759 | return True |
---|
760 | |
---|
761 | def reverse_complement (nukes): |
---|
762 | return nukes[::-1].translate(PositionWeightMatrix.complementMap) |
---|
763 | |
---|
764 | def rsquared( x, y ): |
---|
765 | try: |
---|
766 | return sum_of_squares(x,y)**2 / (sum_of_squares(x) * sum_of_squares(y)) |
---|
767 | except ZeroDivisionError: |
---|
768 | #return float('nan') |
---|
769 | return 0 |
---|
770 | |
---|
771 | def sum_of_squares( x,y=None ): |
---|
772 | if not y: y = x |
---|
773 | xmean = float(sum( x )) / len( x ) |
---|
774 | ymean = float(sum( y )) / len( y ) |
---|
775 | assert len(x) == len(y) |
---|
776 | return sum([ float(xi)*float(yi) for xi,yi in zip(x,y)]) - len(x)*xmean*ymean |
---|
777 | |
---|
778 | def match_consensus(sequence,pattern): |
---|
779 | |
---|
780 | return c_match_consensus( sequence, pattern, len(sequence)) |
---|
781 | |
---|
782 | #for s,p in zip(sequence,pattern): |
---|
783 | #if p == 'N': continue |
---|
784 | #if not s in PositionWeightMatrix.symbols[p]: return False |
---|
785 | |
---|
786 | #return True |
---|
787 | |
---|
788 | def consensus_symbol( pattern ): |
---|
789 | |
---|
790 | if type(pattern)==type(""): |
---|
791 | try: |
---|
792 | pattern = [int(x) for x in pattern.split()] |
---|
793 | except ValueError,e: |
---|
794 | print >>sys.stderr, pattern |
---|
795 | raise ValueError,e |
---|
796 | |
---|
797 | # IUPAC-IUB nomenclature for wobblers |
---|
798 | wobblers = { |
---|
799 | 'R':Set(['A','G']), |
---|
800 | 'Y':Set(['C','T']), |
---|
801 | 'M':Set(['A','C']), |
---|
802 | 'K':Set(['G','T']), |
---|
803 | 'S':Set(['G','C']), |
---|
804 | 'W':Set(['A','T']), |
---|
805 | 'H':Set(['A','C','T']), |
---|
806 | 'B':Set(['G','T','C']), |
---|
807 | 'V':Set(['G','C','A']), |
---|
808 | 'D':Set(['G','T','A'])} |
---|
809 | |
---|
810 | symbols = ['A','C','G','T'] |
---|
811 | |
---|
812 | if type(pattern)==type({}): |
---|
813 | pattern = [pattern[u] for u in symbols] |
---|
814 | |
---|
815 | total = sum(pattern) |
---|
816 | f = [ (space/1e5)+(float(x)/total) for space,x in enumerate(pattern) ] |
---|
817 | copy = [] |
---|
818 | copy[:] = f[:] |
---|
819 | copy.sort() |
---|
820 | |
---|
821 | # http://www.genomatix.de/online_help/help_matinspector/matrix_help.html -- |
---|
822 | # url says consensus must be greater than 50%, and at least twice the freq |
---|
823 | # of the second-most frequent. A double-degenerate symbol can be used |
---|
824 | # if the top two account for 75% or more of the nt, if each is less than 50% |
---|
825 | # Otherwise, N is used in the consensus. |
---|
826 | tops = copy[-2:] |
---|
827 | if tops[1] > 0.5 and tops[1] >= 2 * tops[0]: return symbols[f.index(tops[1])] |
---|
828 | elif tops[0] < 0.5 and sum(tops) >= 0.75: |
---|
829 | degen = Set([ symbols[f.index(v)] for v in tops ]) |
---|
830 | for degenSymbol,wobbles in wobblers.items(): |
---|
831 | #print >>sys.stderr,wobbles |
---|
832 | if degen == wobbles: |
---|
833 | return degenSymbol |
---|
834 | else: return 'N' |
---|
835 | print >>sys.stderr,pattern |
---|
836 | raise '?' |
---|
837 | |
---|
838 | # import C extensions |
---|
839 | try: |
---|
840 | from _position_weight_matrix import c_match_consensus |
---|
841 | ## print >>sys.stderr, "C match_consensus used" |
---|
842 | except: |
---|
843 | ## print >>sys.stderr, "python match_consensus used" |
---|
844 | def match_consensus(sequence, pattern, size): |
---|
845 | for s,p in zip(sequence,pattern): |
---|
846 | if p == 'N': continue |
---|
847 | if not s in PositionWeightMatrix.symbols[p]: return False |
---|
848 | |
---|
849 | return True |
---|