1 | #!/usr/bin/python2.4 |
---|
2 | import sys,os |
---|
3 | from bx.align import maf as align_maf |
---|
4 | import bx.pwm.position_weight_matrix as pwmx |
---|
5 | |
---|
6 | def isnan(x): |
---|
7 | #return ieeespecial.isnan(x) |
---|
8 | if x==x: return False |
---|
9 | return True |
---|
10 | |
---|
11 | NaN = float('nan') |
---|
12 | #NaN = ieeespecial.nan |
---|
13 | #Inf = ieeespecial.plus_inf |
---|
14 | #NInf = ieeespecial.minus_inf |
---|
15 | |
---|
16 | def main(): |
---|
17 | |
---|
18 | pwm_file = sys.argv[1] |
---|
19 | splist = sys.argv[2] |
---|
20 | if len(sys.argv) ==4: |
---|
21 | inmaf = open(sys.argv[3]) |
---|
22 | else: |
---|
23 | inmaf = sys.stdin |
---|
24 | |
---|
25 | # read alignment species |
---|
26 | species = [] |
---|
27 | for sp in splist.split(','): |
---|
28 | species.append( sp ) |
---|
29 | |
---|
30 | # read weight matrices |
---|
31 | pwm = {} |
---|
32 | for wm in pwmx.Reader(open( pwm_file ), format='basic'): |
---|
33 | pwm[ wm.id ] = wm |
---|
34 | |
---|
35 | fbunch = {} |
---|
36 | for scoremax,index,headers in MafScorer(pwm, species, inmaf): |
---|
37 | #print >>sys.stderr, index |
---|
38 | for k,matrix in scoremax.items(): |
---|
39 | fname = k + '.mx' |
---|
40 | if fname not in fbunch: |
---|
41 | fbunch[fname] = open(fname,'w') |
---|
42 | print >>sys.stderr,"Writing",fname |
---|
43 | |
---|
44 | for i in range( len(matrix)): |
---|
45 | for j in range( len(matrix[i])): |
---|
46 | print >>fbunch[fname], "%.2f" % matrix[i][j], |
---|
47 | print >>fbunch[fname] |
---|
48 | |
---|
49 | for file in fbunch.values(): |
---|
50 | file.close() |
---|
51 | |
---|
52 | def MafScorer(pwm,species,inmaf): |
---|
53 | |
---|
54 | index = 0 |
---|
55 | scoremax,width = None,None |
---|
56 | for maf in align_maf.Reader( inmaf ): |
---|
57 | #try: |
---|
58 | if True: |
---|
59 | val = MafBlockScorer(pwm,species,maf) |
---|
60 | for scoremax,width,headers in val: yield scoremax,index,headers |
---|
61 | #scoremax,width,headers = MafBlockScorer(pwm,species,maf) |
---|
62 | try: pass |
---|
63 | except: |
---|
64 | print >>sys.stderr, "Failed on:" |
---|
65 | syserr = align_maf.Writer( sys.stderr ) |
---|
66 | syserr.write( maf ) |
---|
67 | #print >>sys.stderr,headers |
---|
68 | if width: print >>sys.stderr,width |
---|
69 | if scoremax: print >>sys.stderr,len(scoremax) |
---|
70 | syserr.close() |
---|
71 | sys.exit(1) |
---|
72 | index += width |
---|
73 | yield scoremax,index,headers |
---|
74 | |
---|
75 | def MafMotifSelect(mafblock,pwm,motif=None,threshold=0): |
---|
76 | |
---|
77 | if motif != None and len(motif) != len(pwm): |
---|
78 | raise "pwm and motif must be the same length" |
---|
79 | # generic alignment |
---|
80 | alignlist = [ c.text for c in mafblock.components ] |
---|
81 | align = pwmx.Align( alignlist ) |
---|
82 | nrows,ncols = align.dims |
---|
83 | #chr,chr_start,chr_stop = align.headers[0] |
---|
84 | # required sequence length |
---|
85 | minSeqLen = len( motif ) |
---|
86 | # record the text sizes from the alignment rows |
---|
87 | align_match_lens = [] |
---|
88 | |
---|
89 | for start in range(ncols - minSeqLen): |
---|
90 | if align.rows[0][start] == '-': continue |
---|
91 | subseq = "" |
---|
92 | pwm_score_vec = [] |
---|
93 | motif_score_vec = [] |
---|
94 | max_cols = 0 |
---|
95 | for ir in range(nrows): |
---|
96 | expanded = align.rows[ir].count( '-', start, minSeqLen) |
---|
97 | subtext = align.rows[ir][ start : minSeqLen+expanded ] |
---|
98 | max_cols = max( len(subtext), max_cols ) |
---|
99 | subseq = subtext.replace('-','') |
---|
100 | revseq = pwmx.reverse_complement(subseq) |
---|
101 | # pwm score |
---|
102 | nill,f_score = pwm.score_seq( subseq )[0] |
---|
103 | r_score, nill = pwm.score_seq( revseq )[0] |
---|
104 | pwm_score_vec.append( max(f_score, r_score) ) |
---|
105 | # consensus score |
---|
106 | if motif is not None: |
---|
107 | for_score = int( pwmx.match_consensus(subseq,motif) ) |
---|
108 | rev_score = int( pwmx.match_consensus(revseq,motif) ) |
---|
109 | motif_score_vec.append( max(for_score, rev_score) ) |
---|
110 | #check threshold |
---|
111 | try: |
---|
112 | assert not isnan(max(pwm_score_vec) ) |
---|
113 | assert not isnan(max(motif_score_vec) ) |
---|
114 | except: |
---|
115 | print >>sys.stderr, pwm_score_vec, motif_score_vec |
---|
116 | print >>sys.stderr, len(subseq), len(pwm) |
---|
117 | if max(pwm_score_vec) < threshold: continue |
---|
118 | if max(motif_score_vec) < threshold: continue |
---|
119 | # chop block |
---|
120 | col_start = start |
---|
121 | col_end = max_cols + 1 |
---|
122 | motifmaf = mafblock.slice( col_start, col_end ) |
---|
123 | yield motifmaf, pwm_score_vec, motif_score_vec |
---|
124 | |
---|
125 | """ |
---|
126 | for ir in range(nrows): |
---|
127 | # scan alignment row for motif subsequences |
---|
128 | for start in range(ncols): |
---|
129 | if align.rows[ir][start] == '-': continue |
---|
130 | elif align.rows[ir][start] == 'n': continue |
---|
131 | elif align.rows[ir][start] == 'N': continue |
---|
132 | # gather enough subseq for motif |
---|
133 | for ic in range(start,ncols): |
---|
134 | char = align.rows[ir][ic].upper() |
---|
135 | if char == '-' or char == 'N': continue |
---|
136 | else: subseq += char |
---|
137 | if len(subseq) == minSeqLen: |
---|
138 | revseq = pwmx.reverse_complement( subseq ) |
---|
139 | align_match_lens.append( ic ) |
---|
140 | # pwm score |
---|
141 | nill,f_score = pwm.score_seq( subseq )[0] |
---|
142 | r_score, nill = pwm.score_seq( revseq )[0] |
---|
143 | pwm_score_vec.append( max(f_score, r_score) ) |
---|
144 | # consensus score |
---|
145 | if motif is not None: |
---|
146 | for_score = int( pwmx.match_consensus(subseq,motif) ) |
---|
147 | rev_score = int( pwmx.match_consensus(revseq,motif) ) |
---|
148 | motif_score_vec.append( max(for_score, rev_score) ) |
---|
149 | #check threshold |
---|
150 | try: |
---|
151 | assert not isnan(max(pwm_score_vec) ) |
---|
152 | assert not isnan(max(motif_score_vec) ) |
---|
153 | except: |
---|
154 | print >>sys.stderr, pwm_score_vec, motif_score_vec |
---|
155 | print >>sys.stderr, len(subseq), len(pwm) |
---|
156 | if max(pwm_score_vec) < threshold: continue |
---|
157 | if max(motif_score_vec) < threshold: continue |
---|
158 | # chop block |
---|
159 | col_start = start |
---|
160 | col_end = max( align_match_lens ) + 1 |
---|
161 | motifmaf = mafblock.slice( col_start, col_end ) |
---|
162 | |
---|
163 | print subseq,revseq,ic |
---|
164 | print align_match_lens |
---|
165 | yield motifmaf, pwm_score_vec, motif_score_vec |
---|
166 | """ |
---|
167 | |
---|
168 | def MafBlockScorer(pwm,species,maf): |
---|
169 | width = len(maf.components[0].text) |
---|
170 | headers = [ (c.src,c.start,c.end) for c in maf.components] |
---|
171 | |
---|
172 | # expand block rows to full |
---|
173 | mafBlockSpecies = [specName.src.split('.')[0] for specName in maf.components] |
---|
174 | alignlist = [] |
---|
175 | for sp in species: |
---|
176 | try: |
---|
177 | i = mafBlockSpecies.index( sp ) |
---|
178 | alignlist.append( maf.components[i].text ) |
---|
179 | except ValueError: |
---|
180 | alignlist.append( [ NaN for n in range( width ) ] ) |
---|
181 | alignrows = pwmx.Align( alignlist ) |
---|
182 | scoremax = {} |
---|
183 | # record gap positions |
---|
184 | filter = pwmx.score_align_gaps( alignrows ) |
---|
185 | # score pwm models |
---|
186 | for model in pwm.keys(): |
---|
187 | #print >>sys.stderr,"%s_%d_%d" % headers[0],width,model |
---|
188 | scoremax[model] = pwm[model].score_align( alignrows, filter ) |
---|
189 | yield scoremax,width,headers |
---|
190 | |
---|
191 | def MafMotifScorer(species,maf,motifs): |
---|
192 | width = len(maf.components[0].text) |
---|
193 | headers = [ (c.src,c.start,c.end) for c in maf.components] |
---|
194 | |
---|
195 | # expand block rows to full |
---|
196 | mafBlockSpecies = [specName.src.split('.')[0] for specName in maf.components] |
---|
197 | alignlist = [] |
---|
198 | for sp in species: |
---|
199 | try: |
---|
200 | i = mafBlockSpecies.index( sp ) |
---|
201 | alignlist.append( maf.components[i].text ) |
---|
202 | except ValueError: |
---|
203 | alignlist.append( [ NaN for n in range( width ) ] ) |
---|
204 | |
---|
205 | alignrows = pwmx.Align( alignlist, headers ) |
---|
206 | # record gap positions |
---|
207 | filter = pwmx.score_align_gaps( alignrows ) |
---|
208 | # score motif |
---|
209 | #print >>sys.stderr, headers |
---|
210 | if isinstance( motifs, list): |
---|
211 | scoremax = {} |
---|
212 | for string in motifs: |
---|
213 | scoremax[string] = pwmx.score_align_motif( alignrows, string, filter ) |
---|
214 | else: |
---|
215 | scoremax = pwmx.score_align_motif( alignrows, motif, filter ) |
---|
216 | yield scoremax,width,headers |
---|
217 | |
---|
218 | if __name__ == '__main__': main() |
---|