root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/pwm/position_weight_matrix.py

リビジョン 3, 29.3 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1#!/usr/bin/env python
2
3import sys
4import math
5import string
6from numpy import *
7from sets import *
8
9# This is the average of all species in the alignment outside of exons
10#        > mean(r)
11#        A         T         C         G
12#        0.2863776 0.2878264 0.2129560 0.2128400
13#        > sd(r)
14#        A          T          C          G
15#        0.01316192 0.01371148 0.01293836 0.01386655
16
17ENCODE_NONCODING_BACKGROUND = { 'A':0.2863776,'T':0.2878264,'G':0.2128400,'C':0.2129560}
18
19class Align(object):
20    def __init__ (self, seqrows, headers=None):
21        self.rows = seqrows
22        self.nrows = len(seqrows)
23        ncol = None
24        for rownum,row in enumerate(self.rows):
25            try:
26                if ncol == None: ncol = len(row)
27                elif ncol != len(row):
28                    raise "Align: __init__:alignment block:row %d does not have %d columns, it has %d" % (rownum,ncol,len(row))
29            except:
30                print row
31                raise ''
32        self.ncols = ncol
33        self.dims = (self.nrows,self.ncols)
34        self.headers = headers
35    def __str__ (self):
36        return "\n".join(self.rows)
37
38class AlignScoreMatrix (object):
39    def __init__(self,align):
40        nan = float('nan')
41
42        matrix = zeros((align.nrows,align.ncols),float32)
43       
44        # set to nans
45        for ir in range( len(matrix) ):
46            for ic in range(len( matrix[ir] )):
47                matrix[ir][ic] = nan
48        self.matrix = matrix
49
50    def __len__(self):
51        return shape(self.matrix)[1]
52
53    def __str__(self):
54        print self.matrix
55
56def score_align_motif (align,motif,gapmask=None,byPosition=True):
57
58    #dbg
59    #print >>sys.stderr, align.headers
60    chr,chr_start,chr_stop = align.headers[0]
61
62    # a blank score matrix
63    nrows,ncols = align.dims
64    ascoremax = AlignScoreMatrix( align )
65    scoremax = ascoremax.matrix
66
67    minSeqLen = len( motif )
68    for ir in range(nrows):
69        pass
70
71        # row is missing data
72        if isnan(align.rows[ir][0]): continue
73
74        for start in range(ncols):
75
76            if align.rows[ir][start] == '-': continue
77            elif align.rows[ir][start] == 'n': continue
78            elif align.rows[ir][start] == 'N': continue
79
80            # get enough sequence for the weight matrix
81            subseq = ""
82            end = 0
83            ic = start
84            while len(subseq) < minSeqLen:
85            #for ic in range(start,ncols):
86
87                if ic >= len(align.rows[ir]): break
88                char = align.rows[ir][ic].upper()
89                ic += 1
90                if char == '-' or char == 'N': continue
91                else: subseq += char
92
93            if len(subseq) == minSeqLen:
94                end = ic+1
95                for_score = int( match_consensus(subseq,motif) )
96                revseq = reverse_complement( subseq )
97                rev_score = int( match_consensus(revseq,motif) )
98
99                score = max(for_score, rev_score)
100                #dbg
101                #if ir == 0: print >>sys.stderr, int(chr_start) + start - align.rows[ir].count('-',0,start), subseq, score
102
103                # replace the alignment positions with the result
104                if byPosition:
105                    scoremax[ir][start] = score
106                else:
107                # replace positions matching the width of the pwm
108                    for i in range(start,end):
109                        if isnan(scoremax[ir][i]): scoremax[ir][i] = score
110                        elif score > scoremax[ir][i]: scoremax[ir][i] = score
111                #break
112    # mask gap characters
113    if gapmask == None:
114        gapmask = score_align_gaps(align)
115    putmask( scoremax, gapmask, float('nan') )
116    return scoremax
117
118#-----------
119#
120# WeightMatrix--
121#    A position weight matrix (PWM) representation of a motif.
122#
123#----------
124# construction arguments:
125#   id:         id (name) of the motif
126#   rows:       the matrix;  each row is a hash from symbol to weight, with
127#               .. the weight in string form
128#   alphabet:   symbols allowed
129#   background: hash from symbol to background probability of that symbol;  if
130#               .. not specified, ENCODE_NONCODING_BACKGROUND is used
131# internal fields:
132#   rows:       the matrix;  each row is a hash from symbol to log-odds score
133#               .. of that symbol for that row of the weight matrix
134#   counts:     the matrix;  count[row][sym] is the weight, as an integer
135#   probs:      the matrix;  probs[row][sym] is the weight, as an probability
136#----------
137
138class PositionWeightMatrix (object):
139
140    complementMap = string.maketrans("ACGTacgt","TGCAtgca")
141
142    # IUPAC-IUB
143    symbols = {
144        'A':Set(['A']),
145        'C':Set(['C']),
146        'G':Set(['G']),
147        'T':Set(['T']),
148        'R':Set(['A','G']),
149        'Y':Set(['C','T']),
150        'M':Set(['A','C']),
151        'K':Set(['G','T']),
152        'S':Set(['G','C']),
153        'W':Set(['A','T']),
154        'H':Set(['A','C','T']),
155        'B':Set(['G','T','C']),
156        'V':Set(['G','C','A']),
157        'D':Set(['G','T','A'])}
158
159    def __init__ (self, id, rows, alphabet, background=None, score_correction=True):
160
161        self.id       = id
162        self.alphabet = alphabet
163        nsymbols = len(self.alphabet)
164        for i in range(len(self.alphabet)):
165            self.alphabet[ i ] = self.alphabet[ i ].upper()
166        if background != None:
167            self.background = background
168        else:
169            self.background = {}
170            sorted_alphabet = []
171            sorted_alphabet[:] = self.alphabet[:]
172            sorted_alphabet.sort()
173            if ['A','C','G','T'] == sorted_alphabet:
174                self.background = ENCODE_NONCODING_BACKGROUND
175            else:
176                for x in self.alphabet: self.background[ x ] = float(1)/len(self.alphabet)
177
178        if (score_correction == True):
179            self.score_correction = self.corrected_probability_score
180        else:
181            self.score_correction = self.simple_probability
182
183        # partition counts from consensus symbol
184        # in order to properly handle scaling in the presense of non-integers,
185        # we prescan the matrix to figure out the largest scale factor, then go
186        # back through and scale 'em all (some rows may be integer counts,
187        # others may be probabilities)
188
189        self.consensus = []
190        scale = 1
191
192        for i in range(len(rows)):
193
194            #try:
195            fields,consensus = rows[i][:nsymbols],rows[i][-1]
196            for x,count in enumerate(fields):
197                try:
198                    (w,s) = self.parse_weight(count)
199                except ValueError:
200                    raise "pwm row %s has bad weight %s" % (" ".join(fields),t)
201
202                # replace row counts with (values,scale)
203                rows[i][x] = (w,s)
204                scale = max(s,scale)
205
206            #except:
207                #print >>sys.stderr,rows
208                #raise ValueError
209                #raise ValueError, "pwm row %s has wrong field count" % " ".join(fields)
210
211            self.consensus.append(consensus)
212
213
214        hashRows = []
215        self.matrix_base_counts = {} # for pseudocounts
216        self.counts = [] # for scaled counts
217        self.probs = [] # for probabilities
218
219        # scale counts to integers
220        for i in range(len(rows)):
221            hashRows.append(dict())
222            for x,sym in enumerate(alphabet):
223                (w,s) = rows[i][x]
224                hashRows[i][sym] = w * scale/s
225                assert hashRows[i][sym] >= 0
226                if sym not in self.matrix_base_counts: self.matrix_base_counts[sym] = 0
227                self.matrix_base_counts[sym] += hashRows[i][sym]
228            self.counts.append( hashRows[i].copy() )
229            self.probs.append( hashRows[i].copy() )
230            totalWeight = float(sum(self.probs[i].values()))
231            for sym in self.probs[i]:
232                self.probs[i][sym] /= totalWeight
233        self.sites = sum ( hashRows[0].values() )
234
235        # scan pwm to pre-compute logs of probabilities and min and max log-odds
236        # scores (over the whole PWM) for scaling;  note that the same min and max
237        # applies for scaling long-odds scores for quantum comparisions
238        self.information_content = []
239        minSum = 0
240        maxSum = 0
241
242        for i in range( len( hashRows )):
243            self.information_content.append( self.information_content_calculation( i, hashRows ) )
244            newHashRow = {}
245            for base in self.alphabet:
246                newHashRow[base] = self.pwm_score(base, i, hashRows)
247            hashRows[i] = newHashRow
248
249            minSum += min(hashRows[i].values())
250            maxSum += max(hashRows[i].values())
251
252        self.minSum = minSum
253        self.maxSum = maxSum
254        self.rows = hashRows
255
256    # Reference 1: Wasserman and Sandelin: Nat Rev Genet. 2004 Apr;5(4):276-87.
257    # Reference 2: Gertz et al.: Genome Res. 2005 Aug;15(8):1145-52.
258    def information_content_calculation(self, i, counts):
259        # Reference 1)
260        return 2 + sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] )
261
262        # Reference 2)
263        #return sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] )
264
265    def information_base_content(self, base, i, counts):
266
267        # Reference 1)
268        #return self.score_correction(counts,base,i) * math.log ( self.score_correction(counts,base,i), 2)
269
270        # Reference 2)
271        return self.score_correction(counts,base,i) * self.pwm_score(base, i, counts)
272
273    def __call__ (self,seq):
274        return self.score_seq(seq)
275
276    def __add__ (self,other):
277
278        assert self.alphabet == other.alphabet
279        r,(p,q) = self.max_correlation(other)
280
281        if p == q == 0: width = max( len(self),len(other) )
282        elif p > 0: width = max( len(other)+p, len(self) )
283        elif q > 0: width = max( len(self)+q, len(other) )
284
285        sumx = zeros( (width,len(self.alphabet)),dtype='int')
286        selfx = self.to_count_matrix()
287        otherx = other.to_count_matrix()
288
289        if p == q == 0:
290            sumx[:len(self)] += selfx
291            sumx[:len(other)] += otherx
292        elif p > 0:
293            sumx[p:p+len(other)] += otherx
294            sumx[:len(self)] += selfx
295        else:
296            sumx[:len(other)] += otherx
297            sumx[q:q+len(self)] += selfx
298
299        newRows = []
300        for i,x in enumerate(sumx):
301            y = list(x)
302            y.append( consensus_symbol(y) )
303            y = [ str(yi) for yi in y]
304            newRows.append( y )
305        return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background)
306
307    def __old_add__ (self,other,maxp=None):
308
309        assert self.alphabet == other.alphabet
310        bigN = max(len(self),len(other))
311        smallN = min(len(self),len(other))
312        if not maxp:
313            prsq = self.correlation(other)
314            maxp = prsq.index( max(prsq) )
315
316        leftpad = ' ' * maxp
317        rightsize = bigN - smallN
318        rightpad = ' ' * rightsize
319        leftStrings = []
320        rightStrings = []
321
322        if len(self) > len(other):
323            larger = self
324            smaller = other
325            leftStrings = self.consensus
326            rightStrings = list(leftpad) + other.consensus + list(rightpad)
327        else:
328            smaller = self
329            larger = other
330            leftStrings = list(leftpad) + self.consensus + list(rightpad)
331            rightStrings = other.consensus
332
333        sumx = zeros([bigN,len(self.alphabet)])
334        sumx += larger.to_count_matrix()
335        sumx[maxp:maxp+smallN] += smaller.to_count_matrix()
336
337        newRows = []
338        for i,x in enumerate(sumx):
339            y = list(x)
340            y.append( leftStrings[i] + rightStrings[i] )
341            y = [ str(yi) for yi in y]
342            newRows.append( y )
343
344        #return PositionWeightMatrix(self.id+other.id,newRows[maxp:maxp+smallN],self.alphabet,self.background)
345        return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background)
346
347    def to_matrix(self):
348        m = zeros([len(self),len(self.alphabet)])
349        for i in range(len(self)):
350            for j,a in enumerate(self.alphabet):
351                m[i][j] = self[i][a]
352        return m
353
354    def to_count_matrix(self):
355        m = zeros([len(self),len(self.alphabet)],dtype='int')
356        for i in range(len(self)):
357            for j,a in enumerate(self.alphabet):
358                m[i][j] = self.counts[i][a]
359        return m
360
361    def max_correlation(self, otherwmx):
362        rsq,ixtuple = self.slide_correlation(otherwmx)
363        max_rsq = max(rsq)
364        maxp,maxq = ixtuple[rsq.index(max_rsq)]
365        return max_rsq,(maxp,maxq)
366
367    def slide_correlation(self, other):
368        assert self.alphabet == other.alphabet
369        selfx = self.to_count_matrix()
370        otherx = other.to_count_matrix()
371        rsq = []
372        ixtuple = []
373        # self staggered over other, scan self backwards until flush
374        for q in range(len(other)-1,-1,-1):
375            r = 0
376            n = 0
377            for p in range(len(self)):
378                if q+p < len(other):
379                    r += rsquared( list(selfx[p]), list(otherx[q+p]) )
380                    n += 1
381                else:
382                    n += 1
383            rsq.append( r/n )
384            ixtuple.append( (0,q) )
385        # other staggered below self , scan other forward
386        for p in range(1,len(self)):
387            r = 0
388            n = 0
389            for q in range(len(other)):
390                if p+q < len(self):
391                    r += rsquared( list(selfx[p+q]), list(otherx[q]) )
392                    n += 1
393                else:
394                    n += 1
395            rsq.append( r/n )
396            ixtuple.append( (p,0) )
397        return rsq,ixtuple
398
399    def correlation(self, otherwmx):
400        assert self.alphabet == otherwmx.alphabet
401        width = len(self.alphabet)
402        if len(self) > len(otherwmx):
403            larger = self.to_count_matrix()
404            smaller = otherwmx.to_count_matrix()
405        else:
406            smaller = self.to_count_matrix()
407            larger = otherwmx.to_count_matrix()
408        bigN = len(larger)
409        smallN = len(smaller)
410        position_rsq = []
411
412        # slide small over large, for ave rsq
413        for p in range(bigN):
414            if p+smallN <= bigN:
415                r = 0
416                for q in range(smallN):
417                    r += rsquared(list(smaller[q]),list(larger[p+q]))
418                position_rsq.append( r / smallN )
419        return position_rsq
420
421    def score_align (self,align,gapmask=None,byPosition=True):
422
423        # a blank score matrix
424        nrows,ncols = align.dims
425        ascoremax = AlignScoreMatrix( align )
426        scoremax = ascoremax.matrix
427
428        minSeqLen = len( self )
429        for ir in range(nrows):
430
431            # row is missing data
432            if isnan(align.rows[ir][0]): continue
433
434            for start in range(ncols):
435                if align.rows[ir][start] == '-': continue
436                elif align.rows[ir][start] == 'n': continue
437                elif align.rows[ir][start] == 'N': continue
438
439                # get enough sequence for the weight matrix
440                subseq = ""
441                end = 0
442                for ic in range(start,ncols):
443
444                    char = align.rows[ir][ic]
445                    if char == '-' or char == 'N': continue
446                    else: subseq += char
447
448                    if len(subseq) == minSeqLen:
449                        end = ic+1
450
451                        #forward
452                        scores = self.score_seq( subseq )
453                        raw,forward_score = scores[0]
454                        #reverse
455                        scores = self.score_reverse_seq( subseq )
456                        raw,reverse_score = scores[0]
457
458                        score = max(forward_score, reverse_score)
459
460                        # replace the alignment positions with the result
461                        if byPosition:
462                            scoremax[ir][start] = score
463                        else:
464                        # replace positions matching the width of the pwm
465                            for i in range(start,end):
466                                if isnan(scoremax[ir][i]): scoremax[ir][i] = score
467                                elif score > scoremax[ir][i]: scoremax[ir][i] = score
468        # mask gap characters
469        if gapmask == None:
470            gapmask = score_align_gaps(align)
471        putmask( scoremax, gapmask, float('nan') )
472        return scoremax
473
474    # seq can be a string, a list of characters, or a quantum sequence (a list
475    # of hashes from symbols to probability)
476
477    def score_seq(self,seq):
478        if (type(seq[0]) == dict):
479            return self.score_quantum_seq(seq)
480 
481        scores = []
482        for start in range( len(seq)):
483            if start + len(self) > len(seq): break
484            subseq = seq[ start:start+len(self) ]
485            raw = 0
486            try:
487                for i,nt in enumerate(subseq): raw += self.rows[i][nt.upper()]
488                scaled = self.scaled( raw )
489            except KeyError:
490                raw,scaled = float('nan'),float('nan')
491            scores.append( (raw, scaled) )
492        return scores
493
494    def score_quantum_seq(self,seq):
495        scores = []
496        for start in range(len(seq)):
497            if (start + len(self) > len(seq)): break
498            subseq = seq[start:start+len(self)]
499            raw = 0
500            try:
501                for i,nt in enumerate(subseq):
502                    numer = sum([subseq[i][nt] * self.probs[i][nt]   for nt in subseq[i]])
503                    denom = sum([subseq[i][nt] * self.background[nt] for nt in subseq[i]])
504                    raw += math.log(numer/denom,2)
505                scaled = self.scaled(raw)
506            except KeyError:
507                raw,scaled = float('nan'),float('nan')
508            except OverflowError,e:
509                raw,scaled = float('nan'),float('nan')
510            except ValueError,e:
511                raw,scaled = float('nan'),float('nan')
512            scores.append((raw,scaled))
513        return scores
514
515    def score_reverse_seq(self,seq):
516        revSeq = reverse_complement( seq )
517        scores = self.score_seq( revSeq )
518        scores.reverse()
519        return scores
520
521    def scaled(self,val):
522        return ( val - self.minSum ) / (self.maxSum - self.minSum)
523
524    def pseudocount(self, base=None):
525        f = lambda count: math.sqrt( count + 1 )
526        if base in self.alphabet:
527            return f( self.matrix_base_counts[base] )
528        elif base == None:
529            return f ( self.sites )
530        else:
531            return float("nan")
532
533    def simple_probability (self,freq, base, i):
534        # p(base,i) = f(base,i)
535        #             ----------------------
536        #             sum(f(base,{A,C,G,T}))
537
538        return float( freq[i][base] ) / sum([freq[i][nt] for nt in self.alphabet])
539
540    def corrected_probability_score (self,freq, base, i):
541        # p(base,i) = f(base,i) + s(base)
542        #             --------------------
543        #              N + sum(s(A,C,T,G))
544
545        f = float( freq[i][base] )
546        s = self.pseudocount(base)
547        N = self.sites
548        #print >>sys.stderr, "f:%.3f + s:%.3f = %.3f" % (f,s,f + s)
549        #print >>sys.stderr, "-------------------------"
550        #print >>sys.stderr, "N:%d + %d = %d" % (N,self.pseudocount(), N + self.pseudocount())
551        #print >>sys.stderr, "\t\t %.3f\n" % ((f + s) / (N + self.pseudocount()))
552
553        assert (f + s) > 0
554        return (f + s) / (N + self.pseudocount())
555
556    def pwm_score (self,base,i,freq,background=None):
557        if background == None: background = self.background
558        p = self.score_correction(freq,base,i)
559        #print >>sys.stderr, p
560        #print >>sys.stderr, "k %d %c" % (i,base),freq[i][base]
561        b = background[ base ]
562        try:
563            return math.log( p/b, 2)
564        except OverflowError,e:
565            ## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b)
566            ## print >>sys.stderr,self.id
567            return float('nan')
568        except ValueError,e:
569            ## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b)
570            ## print >>sys.stderr,self.id
571            return float('nan')
572
573    def parse_weight (self, weightString):
574
575        fields = weightString.split(".")
576        if (len(fields) > 2): raise ValueError
577
578        w = int(fields[0])
579        s = 1
580
581        if (len(fields) == 2):
582            for cnt in range(0,len(fields[1])): s *= 10
583            w = s*w + int(fields[1])
584
585        return (w,s)    # w = the weight
586                        # s = the scale used (a power of 10)
587
588    def __str__ (self):
589        lines = [self.id]
590        headers = ["%s" % nt for nt in self.alphabet]
591        lines.append("P0\t" + "\t".join(headers))
592        for ix in range(0,len(self.rows)):
593            weights = ["%d" % self.counts[ix][nt] for nt in self.alphabet]
594            #lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + self.consensus[ix])
595            lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + str(sum(self.counts[ix].values())) + "\t" + self.consensus[ix])
596
597        return "\n".join(lines)
598
599    def __getitem__ (self,key):
600        return self.rows[key]
601
602    def __setitem__ (self,key,value):
603        self.rows[key] = value
604
605    def __len__ (self):
606        return len( self.rows )
607
608def score_align_gaps (align):
609    # a blank score matrix
610    nrows,ncols = align.dims
611    scoremax = AlignScoreMatrix( align ).matrix
612    for ir in range(nrows):
613        # row is missing data
614        if isnan(align.rows[ir][0]): continue
615        # scan for gaps
616        for pos in range(ncols):
617            if align.rows[ir][pos] == '-': scoremax[ir][pos] = 1
618            else: scoremax[ir][pos] = 0
619    return scoremax
620
621#-----------
622#
623# WeightMatrix Reader--
624#    Read position weight matrices (PWM) from a file.
625#
626#-----------
627
628class Reader (object):
629    """Iterate over all interesting weight matrices in a file"""
630
631    def __init__ (self,file,tfIds=None,name=None,format='basic',background=None,score_correction=True):
632        self.tfIds      = tfIds
633        self.file       = file
634        self.name       = name
635        self.lineNumber = 0
636        self.format     = format
637        self.background = background
638        self.score_correction = score_correction
639
640
641    def close (self):
642        self.file.close()
643
644
645    def where (self):
646        if (self.name == None):
647            return "line %d" % self.lineNumber
648        else:
649            return "line %d in %s" % (self.lineNumber,self.name)
650
651
652    def __iter__ (self):
653        if self.format == 'basic':
654            return self.read_as_basic()
655        elif self.format == 'transfac':
656            return self.read_as_transfac()
657        else:
658            raise "unknown weight matrix file format: '%s'" % self.format
659
660    def read_as_basic(self):
661        tfId    = None
662        pwmRows = None
663   
664        alphabet = ['A','C','G','T']
665        while (True):
666            line = self.file.readline()
667            if (not line): break
668            line = line.strip()
669            self.lineNumber += 1
670            if line.startswith(">"):
671                if pwmRows != None:
672                    yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background)
673                    #try:
674                        #yield PositionWeightMatrix(tfId,pwmRows,alphabet)
675                    #except:
676                        #print >>sys.stderr, "Failed to read", tfId
677                tfId = line.strip()[1:]
678                pwmRows = []
679            elif line[0].isdigit():
680                tokens = line.strip().split()
681                tokens.append( consensus_symbol(line) )
682                vals = [float(v) for v in tokens[:-1]]
683                #print >>sys.stderr,[ "%.2f" % (float(v)/sum(vals)) for v in vals], tokens[-1]
684                pwmRows.append( tokens )
685        if pwmRows != None: # we've finished collecting a desired matrix
686            yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction)
687   
688    def read_as_transfac(self):
689        self.tfToPwm = {}
690        tfId    = None
691        pwmRows = None
692   
693        while (True):
694            line = self.file.readline()
695            if (not line): break
696            line = line.strip()
697            self.lineNumber += 1
698            # handle an ID line
699            if line.startswith("ID"):
700                if pwmRows != None: # we've finished collecting a desired matrix
701                    try:
702                        yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction)
703                    except:
704                        print >>sys.stderr, "Failed to read", tfId
705                    tfId    = None
706                    pwmRows = None
707   
708                tokens = line.split (None, 2)
709                if len(tokens) != 2:
710                    raise ValueError, "bad line, need two fields (%s)" % self.where()
711                tfId = tokens[1]
712                if self.tfIds != None and (not tfId in self.tfIds):
713                    continue          # ignore it, this isn't a desired matrix
714                if tfId in self.tfToPwm:
715                    raise ValueError, "transcription factor %s appears twice (%s)" \
716                        % (tfId,self.where())
717                pwmRows = []          # start collecting a desired matrix
718                continue
719   
720            # if we're not collecting, skip this line
721            if pwmRows == None: continue
722            if len(line) < 1:   continue
723
724            # name, if present, added to ID
725            if line.startswith('NA'):
726                words = line.strip().split()
727                tfId =  tfId + "\t" + " ".join(words[1:])
728   
729            # handle a P0 line
730            if line.startswith("P0"):
731                alphabet = line.split()[1:]
732                if len(alphabet) < 2:
733                    raise ValueError, "bad line, need more dna (%s)" % self.where()
734                continue
735   
736            # handle a 01,02,etc. line
737            if line[0].isdigit():
738                tokens = line.split ()
739                try:
740                    index = int(tokens[0])
741                    if index != len(pwmRows)+1: raise ValueError
742                except:
743                    raise ValueError,"bad line, bad index (%s)" % self.where()
744                pwmRows.append(tokens[1:])
745                continue
746            # skip low quality entries
747            if line.startswith("CC  TRANSFAC Sites of quality"):
748                print >>sys.stderr, line.strip(), tfId
749                pwmRows = None
750                continue
751        if pwmRows != None: # we've finished collecting a desired matrix
752            yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction)
753        # clean up
754        self.tfToPwm = None
755
756def isnan(x):
757    #return ieeespecial.isnan(x)
758    if x==x: return False
759    return True
760
761def reverse_complement (nukes):
762    return nukes[::-1].translate(PositionWeightMatrix.complementMap)
763
764def rsquared( x, y ):
765    try:
766        return sum_of_squares(x,y)**2 / (sum_of_squares(x) * sum_of_squares(y))
767    except ZeroDivisionError:
768        #return float('nan')
769        return 0
770
771def sum_of_squares( x,y=None ):
772    if not y: y = x
773    xmean = float(sum( x )) / len( x )
774    ymean = float(sum( y )) / len( y )
775    assert len(x) == len(y)
776    return sum([ float(xi)*float(yi) for xi,yi in zip(x,y)]) - len(x)*xmean*ymean
777
778def match_consensus(sequence,pattern):
779
780    return c_match_consensus( sequence, pattern, len(sequence))
781
782    #for s,p in zip(sequence,pattern):
783        #if p == 'N': continue
784        #if not s in PositionWeightMatrix.symbols[p]: return False
785
786    #return True
787
788def consensus_symbol( pattern ):
789
790    if type(pattern)==type(""):
791        try:
792            pattern = [int(x) for x in pattern.split()]
793        except ValueError,e:
794            print >>sys.stderr, pattern
795            raise ValueError,e
796
797    # IUPAC-IUB nomenclature for wobblers
798    wobblers = {
799        'R':Set(['A','G']),
800        'Y':Set(['C','T']),
801        'M':Set(['A','C']),
802        'K':Set(['G','T']),
803        'S':Set(['G','C']),
804        'W':Set(['A','T']),
805        'H':Set(['A','C','T']),
806        'B':Set(['G','T','C']),
807        'V':Set(['G','C','A']),
808        'D':Set(['G','T','A'])}
809
810    symbols = ['A','C','G','T']
811
812    if type(pattern)==type({}):
813        pattern = [pattern[u] for u in symbols]
814
815    total = sum(pattern)
816    f = [ (space/1e5)+(float(x)/total) for space,x in enumerate(pattern) ]
817    copy = []
818    copy[:] = f[:]
819    copy.sort()
820
821    # http://www.genomatix.de/online_help/help_matinspector/matrix_help.html --
822    # url says consensus must be greater than 50%, and at least twice the freq
823    # of the second-most frequent.  A double-degenerate symbol can be used
824    # if the top two account for 75% or more of the nt, if each is less than 50%
825    # Otherwise, N is used in the consensus.
826    tops = copy[-2:]
827    if tops[1] > 0.5 and tops[1] >= 2 * tops[0]: return symbols[f.index(tops[1])]
828    elif tops[0] < 0.5 and sum(tops) >= 0.75:
829        degen = Set([ symbols[f.index(v)] for v in tops ])
830        for degenSymbol,wobbles in wobblers.items():
831            #print >>sys.stderr,wobbles
832            if degen == wobbles:
833                return degenSymbol
834    else: return 'N'
835    print >>sys.stderr,pattern
836    raise '?'
837
838# import C extensions
839try:
840    from _position_weight_matrix import c_match_consensus
841    ## print >>sys.stderr, "C match_consensus used"
842except:
843    ## print >>sys.stderr, "python match_consensus used"
844    def match_consensus(sequence, pattern, size):
845        for s,p in zip(sequence,pattern):
846            if p == 'N': continue
847            if not s in PositionWeightMatrix.symbols[p]: return False
848
849        return True
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。