root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/pwm/pwm_score_maf.py @ 3

リビジョン 3, 7.8 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1#!/usr/bin/python2.4
2import sys,os
3from bx.align import maf as align_maf
4import bx.pwm.position_weight_matrix as pwmx
5
6def isnan(x):
7    #return ieeespecial.isnan(x)
8    if x==x: return False
9    return True
10
11NaN = float('nan')
12#NaN = ieeespecial.nan
13#Inf = ieeespecial.plus_inf
14#NInf = ieeespecial.minus_inf
15
16def main():
17
18    pwm_file = sys.argv[1]
19    splist = sys.argv[2]
20    if len(sys.argv) ==4:
21        inmaf = open(sys.argv[3])
22    else:
23        inmaf = sys.stdin
24
25    # read alignment species
26    species = []
27    for sp in splist.split(','):
28        species.append( sp )
29
30    # read weight matrices
31    pwm = {}
32    for wm in pwmx.Reader(open( pwm_file ), format='basic'):
33        pwm[ wm.id ] = wm
34
35    fbunch = {}
36    for scoremax,index,headers in MafScorer(pwm, species, inmaf):
37        #print >>sys.stderr, index
38        for k,matrix in scoremax.items():
39            fname = k + '.mx'
40            if fname not in fbunch:
41                fbunch[fname] = open(fname,'w')
42                print >>sys.stderr,"Writing",fname
43
44            for i in range( len(matrix)):
45                for j in range( len(matrix[i])):
46                    print >>fbunch[fname], "%.2f" % matrix[i][j],
47                print >>fbunch[fname]
48
49    for file in fbunch.values():
50        file.close()
51
52def MafScorer(pwm,species,inmaf):
53
54    index = 0
55    scoremax,width = None,None
56    for maf in align_maf.Reader( inmaf ):
57        #try:
58        if True:
59            val = MafBlockScorer(pwm,species,maf)
60            for scoremax,width,headers in val: yield scoremax,index,headers
61            #scoremax,width,headers = MafBlockScorer(pwm,species,maf)
62        try: pass
63        except:
64            print >>sys.stderr, "Failed on:"
65            syserr = align_maf.Writer( sys.stderr )
66            syserr.write( maf )
67            #print >>sys.stderr,headers
68            if width: print >>sys.stderr,width
69            if scoremax: print >>sys.stderr,len(scoremax)
70            syserr.close()
71            sys.exit(1)
72        index += width
73        yield scoremax,index,headers
74
75def MafMotifSelect(mafblock,pwm,motif=None,threshold=0):
76
77    if motif != None and len(motif) != len(pwm):
78        raise "pwm and motif must be the same length"
79    # generic alignment
80    alignlist = [ c.text for c in mafblock.components ]
81    align = pwmx.Align( alignlist )
82    nrows,ncols = align.dims
83    #chr,chr_start,chr_stop = align.headers[0]
84    # required sequence length
85    minSeqLen = len( motif )
86    # record the text sizes from the alignment rows
87    align_match_lens = []
88
89    for start in range(ncols - minSeqLen):
90        if align.rows[0][start] == '-': continue
91        subseq = ""
92        pwm_score_vec = []
93        motif_score_vec = []
94        max_cols = 0
95        for ir in range(nrows):
96            expanded = align.rows[ir].count( '-', start, minSeqLen)
97            subtext = align.rows[ir][ start : minSeqLen+expanded ]
98            max_cols = max( len(subtext), max_cols )
99            subseq = subtext.replace('-','')
100            revseq = pwmx.reverse_complement(subseq)
101            # pwm score
102            nill,f_score = pwm.score_seq( subseq )[0]
103            r_score, nill = pwm.score_seq( revseq )[0]
104            pwm_score_vec.append( max(f_score, r_score) )
105            # consensus score
106            if motif is not None:
107                for_score = int( pwmx.match_consensus(subseq,motif) )
108                rev_score = int( pwmx.match_consensus(revseq,motif) )
109                motif_score_vec.append( max(for_score, rev_score) )
110        #check threshold
111        try:
112            assert not isnan(max(pwm_score_vec) )
113            assert not isnan(max(motif_score_vec) )
114        except:
115            print >>sys.stderr, pwm_score_vec, motif_score_vec
116            print >>sys.stderr, len(subseq), len(pwm)
117        if max(pwm_score_vec) < threshold: continue
118        if max(motif_score_vec) < threshold: continue
119        # chop block
120        col_start = start
121        col_end = max_cols + 1
122        motifmaf = mafblock.slice( col_start, col_end )
123        yield motifmaf, pwm_score_vec, motif_score_vec
124               
125    """
126    for ir in range(nrows):
127        # scan alignment row for motif subsequences
128        for start in range(ncols):
129            if align.rows[ir][start] == '-': continue
130            elif align.rows[ir][start] == 'n': continue
131            elif align.rows[ir][start] == 'N': continue
132            # gather enough subseq for motif
133            for ic in range(start,ncols):
134                char = align.rows[ir][ic].upper()
135                if char == '-' or char == 'N': continue
136                else: subseq += char
137                if len(subseq) == minSeqLen:
138                    revseq = pwmx.reverse_complement( subseq )
139                    align_match_lens.append( ic )
140                    # pwm score
141                    nill,f_score = pwm.score_seq( subseq )[0]
142                    r_score, nill = pwm.score_seq( revseq )[0]
143                    pwm_score_vec.append( max(f_score, r_score) )
144                    # consensus score
145                    if motif is not None:
146                        for_score = int( pwmx.match_consensus(subseq,motif) )
147                        rev_score = int( pwmx.match_consensus(revseq,motif) )
148                        motif_score_vec.append( max(for_score, rev_score) )
149                    #check threshold
150                    try:
151                        assert not isnan(max(pwm_score_vec) )
152                        assert not isnan(max(motif_score_vec) )
153                    except:
154                        print >>sys.stderr, pwm_score_vec, motif_score_vec
155                        print >>sys.stderr, len(subseq), len(pwm)
156                    if max(pwm_score_vec) < threshold: continue
157                    if max(motif_score_vec) < threshold: continue
158                    # chop block
159                    col_start = start
160                    col_end = max( align_match_lens ) + 1
161                    motifmaf = mafblock.slice( col_start, col_end )
162
163                    print subseq,revseq,ic
164                    print align_match_lens
165                    yield motifmaf, pwm_score_vec, motif_score_vec
166        """
167
168def MafBlockScorer(pwm,species,maf):
169    width = len(maf.components[0].text)
170    headers = [ (c.src,c.start,c.end) for c in maf.components]
171
172    # expand block rows to full
173    mafBlockSpecies = [specName.src.split('.')[0] for specName in maf.components]
174    alignlist = []
175    for sp in species:
176        try:
177            i = mafBlockSpecies.index( sp )
178            alignlist.append( maf.components[i].text )
179        except ValueError:
180            alignlist.append( [ NaN for n in range( width ) ] )
181    alignrows = pwmx.Align( alignlist )
182    scoremax = {}
183    # record gap positions
184    filter = pwmx.score_align_gaps( alignrows )
185    # score pwm models
186    for model in pwm.keys():
187        #print >>sys.stderr,"%s_%d_%d" % headers[0],width,model
188        scoremax[model] = pwm[model].score_align( alignrows, filter )
189    yield scoremax,width,headers
190
191def MafMotifScorer(species,maf,motifs):
192    width = len(maf.components[0].text)
193    headers = [ (c.src,c.start,c.end) for c in maf.components]
194
195    # expand block rows to full
196    mafBlockSpecies = [specName.src.split('.')[0] for specName in maf.components]
197    alignlist = []
198    for sp in species:
199        try:
200            i = mafBlockSpecies.index( sp )
201            alignlist.append( maf.components[i].text )
202        except ValueError:
203            alignlist.append( [ NaN for n in range( width ) ] )
204
205    alignrows = pwmx.Align( alignlist, headers )
206    # record gap positions
207    filter = pwmx.score_align_gaps( alignrows )
208    # score motif
209    #print >>sys.stderr, headers
210    if isinstance( motifs, list):
211        scoremax = {}
212        for string in motifs:
213            scoremax[string] = pwmx.score_align_motif( alignrows, string, filter )
214    else:
215        scoremax = pwmx.score_align_motif( alignrows, motif, filter )
216    yield scoremax,width,headers
217
218if __name__ == '__main__': main()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。