Context Navigation

position_weight_matrix.py

リビジョン 3, 29.3 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	#!/usr/bin/env python
2
3	import sys
4	import math
5	import string
6	from numpy import *
7	from sets import *
8
9	# This is the average of all species in the alignment outside of exons
10	# > mean(r)
11	# A T C G
12	# 0.2863776 0.2878264 0.2129560 0.2128400
13	# > sd(r)
14	# A T C G
15	# 0.01316192 0.01371148 0.01293836 0.01386655
16
17	ENCODE_NONCODING_BACKGROUND = { 'A':0.2863776,'T':0.2878264,'G':0.2128400,'C':0.2129560}
18
19	class Align(object):
20	def __init__ (self, seqrows, headers=None):
21	self.rows = seqrows
22	self.nrows = len(seqrows)
23	ncol = None
24	for rownum,row in enumerate(self.rows):
25	try:
26	if ncol == None: ncol = len(row)
27	elif ncol != len(row):
28	raise "Align: __init__:alignment block:row %d does not have %d columns, it has %d" % (rownum,ncol,len(row))
29	except:
30	print row
31	raise ''
32	self.ncols = ncol
33	self.dims = (self.nrows,self.ncols)
34	self.headers = headers
35	def __str__ (self):
36	return "\n".join(self.rows)
37
38	class AlignScoreMatrix (object):
39	def __init__(self,align):
40	nan = float('nan')
41
42	matrix = zeros((align.nrows,align.ncols),float32)
43
44	# set to nans
45	for ir in range( len(matrix) ):
46	for ic in range(len( matrix[ir] )):
47	matrix[ir][ic] = nan
48	self.matrix = matrix
49
50	def __len__(self):
51	return shape(self.matrix)[1]
52
53	def __str__(self):
54	print self.matrix
55
56	def score_align_motif (align,motif,gapmask=None,byPosition=True):
57
58	#dbg
59	#print >>sys.stderr, align.headers
60	chr,chr_start,chr_stop = align.headers[0]
61
62	# a blank score matrix
63	nrows,ncols = align.dims
64	ascoremax = AlignScoreMatrix( align )
65	scoremax = ascoremax.matrix
66
67	minSeqLen = len( motif )
68	for ir in range(nrows):
69	pass
70
71	# row is missing data
72	if isnan(align.rows[ir][0]): continue
73
74	for start in range(ncols):
75
76	if align.rows[ir][start] == '-': continue
77	elif align.rows[ir][start] == 'n': continue
78	elif align.rows[ir][start] == 'N': continue
79
80	# get enough sequence for the weight matrix
81	subseq = ""
82	end = 0
83	ic = start
84	while len(subseq) < minSeqLen:
85	#for ic in range(start,ncols):
86
87	if ic >= len(align.rows[ir]): break
88	char = align.rows[ir][ic].upper()
89	ic += 1
90	if char == '-' or char == 'N': continue
91	else: subseq += char
92
93	if len(subseq) == minSeqLen:
94	end = ic+1
95	for_score = int( match_consensus(subseq,motif) )
96	revseq = reverse_complement( subseq )
97	rev_score = int( match_consensus(revseq,motif) )
98
99	score = max(for_score, rev_score)
100	#dbg
101	#if ir == 0: print >>sys.stderr, int(chr_start) + start - align.rows[ir].count('-',0,start), subseq, score
102
103	# replace the alignment positions with the result
104	if byPosition:
105	scoremax[ir][start] = score
106	else:
107	# replace positions matching the width of the pwm
108	for i in range(start,end):
109	if isnan(scoremax[ir][i]): scoremax[ir][i] = score
110	elif score > scoremax[ir][i]: scoremax[ir][i] = score
111	#break
112	# mask gap characters
113	if gapmask == None:
114	gapmask = score_align_gaps(align)
115	putmask( scoremax, gapmask, float('nan') )
116	return scoremax
117
118	#-----------
119	#
120	# WeightMatrix--
121	# A position weight matrix (PWM) representation of a motif.
122	#
123	#----------
124	# construction arguments:
125	# id: id (name) of the motif
126	# rows: the matrix; each row is a hash from symbol to weight, with
127	# .. the weight in string form
128	# alphabet: symbols allowed
129	# background: hash from symbol to background probability of that symbol; if
130	# .. not specified, ENCODE_NONCODING_BACKGROUND is used
131	# internal fields:
132	# rows: the matrix; each row is a hash from symbol to log-odds score
133	# .. of that symbol for that row of the weight matrix
134	# counts: the matrix; count[row][sym] is the weight, as an integer
135	# probs: the matrix; probs[row][sym] is the weight, as an probability
136	#----------
137
138	class PositionWeightMatrix (object):
139
140	complementMap = string.maketrans("ACGTacgt","TGCAtgca")
141
142	# IUPAC-IUB
143	symbols = {
144	'A':Set(['A']),
145	'C':Set(['C']),
146	'G':Set(['G']),
147	'T':Set(['T']),
148	'R':Set(['A','G']),
149	'Y':Set(['C','T']),
150	'M':Set(['A','C']),
151	'K':Set(['G','T']),
152	'S':Set(['G','C']),
153	'W':Set(['A','T']),
154	'H':Set(['A','C','T']),
155	'B':Set(['G','T','C']),
156	'V':Set(['G','C','A']),
157	'D':Set(['G','T','A'])}
158
159	def __init__ (self, id, rows, alphabet, background=None, score_correction=True):
160
161	self.id = id
162	self.alphabet = alphabet
163	nsymbols = len(self.alphabet)
164	for i in range(len(self.alphabet)):
165	self.alphabet[ i ] = self.alphabet[ i ].upper()
166	if background != None:
167	self.background = background
168	else:
169	self.background = {}
170	sorted_alphabet = []
171	sorted_alphabet[:] = self.alphabet[:]
172	sorted_alphabet.sort()
173	if ['A','C','G','T'] == sorted_alphabet:
174	self.background = ENCODE_NONCODING_BACKGROUND
175	else:
176	for x in self.alphabet: self.background[ x ] = float(1)/len(self.alphabet)
177
178	if (score_correction == True):
179	self.score_correction = self.corrected_probability_score
180	else:
181	self.score_correction = self.simple_probability
182
183	# partition counts from consensus symbol
184	# in order to properly handle scaling in the presense of non-integers,
185	# we prescan the matrix to figure out the largest scale factor, then go
186	# back through and scale 'em all (some rows may be integer counts,
187	# others may be probabilities)
188
189	self.consensus = []
190	scale = 1
191
192	for i in range(len(rows)):
193
194	#try:
195	fields,consensus = rows[i][:nsymbols],rows[i][-1]
196	for x,count in enumerate(fields):
197	try:
198	(w,s) = self.parse_weight(count)
199	except ValueError:
200	raise "pwm row %s has bad weight %s" % (" ".join(fields),t)
201
202	# replace row counts with (values,scale)
203	rows[i][x] = (w,s)
204	scale = max(s,scale)
205
206	#except:
207	#print >>sys.stderr,rows
208	#raise ValueError
209	#raise ValueError, "pwm row %s has wrong field count" % " ".join(fields)
210
211	self.consensus.append(consensus)
212
213
214	hashRows = []
215	self.matrix_base_counts = {} # for pseudocounts
216	self.counts = [] # for scaled counts
217	self.probs = [] # for probabilities
218
219	# scale counts to integers
220	for i in range(len(rows)):
221	hashRows.append(dict())
222	for x,sym in enumerate(alphabet):
223	(w,s) = rows[i][x]
224	hashRows[i][sym] = w * scale/s
225	assert hashRows[i][sym] >= 0
226	if sym not in self.matrix_base_counts: self.matrix_base_counts[sym] = 0
227	self.matrix_base_counts[sym] += hashRows[i][sym]
228	self.counts.append( hashRows[i].copy() )
229	self.probs.append( hashRows[i].copy() )
230	totalWeight = float(sum(self.probs[i].values()))
231	for sym in self.probs[i]:
232	self.probs[i][sym] /= totalWeight
233	self.sites = sum ( hashRows[0].values() )
234
235	# scan pwm to pre-compute logs of probabilities and min and max log-odds
236	# scores (over the whole PWM) for scaling; note that the same min and max
237	# applies for scaling long-odds scores for quantum comparisions
238	self.information_content = []
239	minSum = 0
240	maxSum = 0
241
242	for i in range( len( hashRows )):
243	self.information_content.append( self.information_content_calculation( i, hashRows ) )
244	newHashRow = {}
245	for base in self.alphabet:
246	newHashRow[base] = self.pwm_score(base, i, hashRows)
247	hashRows[i] = newHashRow
248
249	minSum += min(hashRows[i].values())
250	maxSum += max(hashRows[i].values())
251
252	self.minSum = minSum
253	self.maxSum = maxSum
254	self.rows = hashRows
255
256	# Reference 1: Wasserman and Sandelin: Nat Rev Genet. 2004 Apr;5(4):276-87.
257	# Reference 2: Gertz et al.: Genome Res. 2005 Aug;15(8):1145-52.
258	def information_content_calculation(self, i, counts):
259	# Reference 1)
260	return 2 + sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] )
261
262	# Reference 2)
263	#return sum( [ self.information_base_content(base,i,counts) for base in self.alphabet ] )
264
265	def information_base_content(self, base, i, counts):
266
267	# Reference 1)
268	#return self.score_correction(counts,base,i) * math.log ( self.score_correction(counts,base,i), 2)
269
270	# Reference 2)
271	return self.score_correction(counts,base,i) * self.pwm_score(base, i, counts)
272
273	def __call__ (self,seq):
274	return self.score_seq(seq)
275
276	def __add__ (self,other):
277
278	assert self.alphabet == other.alphabet
279	r,(p,q) = self.max_correlation(other)
280
281	if p == q == 0: width = max( len(self),len(other) )
282	elif p > 0: width = max( len(other)+p, len(self) )
283	elif q > 0: width = max( len(self)+q, len(other) )
284
285	sumx = zeros( (width,len(self.alphabet)),dtype='int')
286	selfx = self.to_count_matrix()
287	otherx = other.to_count_matrix()
288
289	if p == q == 0:
290	sumx[:len(self)] += selfx
291	sumx[:len(other)] += otherx
292	elif p > 0:
293	sumx[p:p+len(other)] += otherx
294	sumx[:len(self)] += selfx
295	else:
296	sumx[:len(other)] += otherx
297	sumx[q:q+len(self)] += selfx
298
299	newRows = []
300	for i,x in enumerate(sumx):
301	y = list(x)
302	y.append( consensus_symbol(y) )
303	y = [ str(yi) for yi in y]
304	newRows.append( y )
305	return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background)
306
307	def __old_add__ (self,other,maxp=None):
308
309	assert self.alphabet == other.alphabet
310	bigN = max(len(self),len(other))
311	smallN = min(len(self),len(other))
312	if not maxp:
313	prsq = self.correlation(other)
314	maxp = prsq.index( max(prsq) )
315
316	leftpad = ' ' * maxp
317	rightsize = bigN - smallN
318	rightpad = ' ' * rightsize
319	leftStrings = []
320	rightStrings = []
321
322	if len(self) > len(other):
323	larger = self
324	smaller = other
325	leftStrings = self.consensus
326	rightStrings = list(leftpad) + other.consensus + list(rightpad)
327	else:
328	smaller = self
329	larger = other
330	leftStrings = list(leftpad) + self.consensus + list(rightpad)
331	rightStrings = other.consensus
332
333	sumx = zeros([bigN,len(self.alphabet)])
334	sumx += larger.to_count_matrix()
335	sumx[maxp:maxp+smallN] += smaller.to_count_matrix()
336
337	newRows = []
338	for i,x in enumerate(sumx):
339	y = list(x)
340	y.append( leftStrings[i] + rightStrings[i] )
341	y = [ str(yi) for yi in y]
342	newRows.append( y )
343
344	#return PositionWeightMatrix(self.id+other.id,newRows[maxp:maxp+smallN],self.alphabet,self.background)
345	return PositionWeightMatrix(self.id+other.id,newRows,self.alphabet,self.background)
346
347	def to_matrix(self):
348	m = zeros([len(self),len(self.alphabet)])
349	for i in range(len(self)):
350	for j,a in enumerate(self.alphabet):
351	m[i][j] = self[i][a]
352	return m
353
354	def to_count_matrix(self):
355	m = zeros([len(self),len(self.alphabet)],dtype='int')
356	for i in range(len(self)):
357	for j,a in enumerate(self.alphabet):
358	m[i][j] = self.counts[i][a]
359	return m
360
361	def max_correlation(self, otherwmx):
362	rsq,ixtuple = self.slide_correlation(otherwmx)
363	max_rsq = max(rsq)
364	maxp,maxq = ixtuple[rsq.index(max_rsq)]
365	return max_rsq,(maxp,maxq)
366
367	def slide_correlation(self, other):
368	assert self.alphabet == other.alphabet
369	selfx = self.to_count_matrix()
370	otherx = other.to_count_matrix()
371	rsq = []
372	ixtuple = []
373	# self staggered over other, scan self backwards until flush
374	for q in range(len(other)-1,-1,-1):
375	r = 0
376	n = 0
377	for p in range(len(self)):
378	if q+p < len(other):
379	r += rsquared( list(selfx[p]), list(otherx[q+p]) )
380	n += 1
381	else:
382	n += 1
383	rsq.append( r/n )
384	ixtuple.append( (0,q) )
385	# other staggered below self , scan other forward
386	for p in range(1,len(self)):
387	r = 0
388	n = 0
389	for q in range(len(other)):
390	if p+q < len(self):
391	r += rsquared( list(selfx[p+q]), list(otherx[q]) )
392	n += 1
393	else:
394	n += 1
395	rsq.append( r/n )
396	ixtuple.append( (p,0) )
397	return rsq,ixtuple
398
399	def correlation(self, otherwmx):
400	assert self.alphabet == otherwmx.alphabet
401	width = len(self.alphabet)
402	if len(self) > len(otherwmx):
403	larger = self.to_count_matrix()
404	smaller = otherwmx.to_count_matrix()
405	else:
406	smaller = self.to_count_matrix()
407	larger = otherwmx.to_count_matrix()
408	bigN = len(larger)
409	smallN = len(smaller)
410	position_rsq = []
411
412	# slide small over large, for ave rsq
413	for p in range(bigN):
414	if p+smallN <= bigN:
415	r = 0
416	for q in range(smallN):
417	r += rsquared(list(smaller[q]),list(larger[p+q]))
418	position_rsq.append( r / smallN )
419	return position_rsq
420
421	def score_align (self,align,gapmask=None,byPosition=True):
422
423	# a blank score matrix
424	nrows,ncols = align.dims
425	ascoremax = AlignScoreMatrix( align )
426	scoremax = ascoremax.matrix
427
428	minSeqLen = len( self )
429	for ir in range(nrows):
430
431	# row is missing data
432	if isnan(align.rows[ir][0]): continue
433
434	for start in range(ncols):
435	if align.rows[ir][start] == '-': continue
436	elif align.rows[ir][start] == 'n': continue
437	elif align.rows[ir][start] == 'N': continue
438
439	# get enough sequence for the weight matrix
440	subseq = ""
441	end = 0
442	for ic in range(start,ncols):
443
444	char = align.rows[ir][ic]
445	if char == '-' or char == 'N': continue
446	else: subseq += char
447
448	if len(subseq) == minSeqLen:
449	end = ic+1
450
451	#forward
452	scores = self.score_seq( subseq )
453	raw,forward_score = scores[0]
454	#reverse
455	scores = self.score_reverse_seq( subseq )
456	raw,reverse_score = scores[0]
457
458	score = max(forward_score, reverse_score)
459
460	# replace the alignment positions with the result
461	if byPosition:
462	scoremax[ir][start] = score
463	else:
464	# replace positions matching the width of the pwm
465	for i in range(start,end):
466	if isnan(scoremax[ir][i]): scoremax[ir][i] = score
467	elif score > scoremax[ir][i]: scoremax[ir][i] = score
468	# mask gap characters
469	if gapmask == None:
470	gapmask = score_align_gaps(align)
471	putmask( scoremax, gapmask, float('nan') )
472	return scoremax
473
474	# seq can be a string, a list of characters, or a quantum sequence (a list
475	# of hashes from symbols to probability)
476
477	def score_seq(self,seq):
478	if (type(seq[0]) == dict):
479	return self.score_quantum_seq(seq)
480
481	scores = []
482	for start in range( len(seq)):
483	if start + len(self) > len(seq): break
484	subseq = seq[ start:start+len(self) ]
485	raw = 0
486	try:
487	for i,nt in enumerate(subseq): raw += self.rows[i][nt.upper()]
488	scaled = self.scaled( raw )
489	except KeyError:
490	raw,scaled = float('nan'),float('nan')
491	scores.append( (raw, scaled) )
492	return scores
493
494	def score_quantum_seq(self,seq):
495	scores = []
496	for start in range(len(seq)):
497	if (start + len(self) > len(seq)): break
498	subseq = seq[start:start+len(self)]
499	raw = 0
500	try:
501	for i,nt in enumerate(subseq):
502	numer = sum([subseq[i][nt] * self.probs[i][nt] for nt in subseq[i]])
503	denom = sum([subseq[i][nt] * self.background[nt] for nt in subseq[i]])
504	raw += math.log(numer/denom,2)
505	scaled = self.scaled(raw)
506	except KeyError:
507	raw,scaled = float('nan'),float('nan')
508	except OverflowError,e:
509	raw,scaled = float('nan'),float('nan')
510	except ValueError,e:
511	raw,scaled = float('nan'),float('nan')
512	scores.append((raw,scaled))
513	return scores
514
515	def score_reverse_seq(self,seq):
516	revSeq = reverse_complement( seq )
517	scores = self.score_seq( revSeq )
518	scores.reverse()
519	return scores
520
521	def scaled(self,val):
522	return ( val - self.minSum ) / (self.maxSum - self.minSum)
523
524	def pseudocount(self, base=None):
525	f = lambda count: math.sqrt( count + 1 )
526	if base in self.alphabet:
527	return f( self.matrix_base_counts[base] )
528	elif base == None:
529	return f ( self.sites )
530	else:
531	return float("nan")
532
533	def simple_probability (self,freq, base, i):
534	# p(base,i) = f(base,i)
535	# ----------------------
536	# sum(f(base,{A,C,G,T}))
537
538	return float( freq[i][base] ) / sum([freq[i][nt] for nt in self.alphabet])
539
540	def corrected_probability_score (self,freq, base, i):
541	# p(base,i) = f(base,i) + s(base)
542	# --------------------
543	# N + sum(s(A,C,T,G))
544
545	f = float( freq[i][base] )
546	s = self.pseudocount(base)
547	N = self.sites
548	#print >>sys.stderr, "f:%.3f + s:%.3f = %.3f" % (f,s,f + s)
549	#print >>sys.stderr, "-------------------------"
550	#print >>sys.stderr, "N:%d + %d = %d" % (N,self.pseudocount(), N + self.pseudocount())
551	#print >>sys.stderr, "\t\t %.3f\n" % ((f + s) / (N + self.pseudocount()))
552
553	assert (f + s) > 0
554	return (f + s) / (N + self.pseudocount())
555
556	def pwm_score (self,base,i,freq,background=None):
557	if background == None: background = self.background
558	p = self.score_correction(freq,base,i)
559	#print >>sys.stderr, p
560	#print >>sys.stderr, "k %d %c" % (i,base),freq[i][base]
561	b = background[ base ]
562	try:
563	return math.log( p/b, 2)
564	except OverflowError,e:
565	## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b)
566	## print >>sys.stderr,self.id
567	return float('nan')
568	except ValueError,e:
569	## print >>sys.stderr,"base=%c, math.log(%.3f / %.3f)" % (base,p,b)
570	## print >>sys.stderr,self.id
571	return float('nan')
572
573	def parse_weight (self, weightString):
574
575	fields = weightString.split(".")
576	if (len(fields) > 2): raise ValueError
577
578	w = int(fields[0])
579	s = 1
580
581	if (len(fields) == 2):
582	for cnt in range(0,len(fields[1])): s *= 10
583	w = s*w + int(fields[1])
584
585	return (w,s) # w = the weight
586	# s = the scale used (a power of 10)
587
588	def __str__ (self):
589	lines = [self.id]
590	headers = ["%s" % nt for nt in self.alphabet]
591	lines.append("P0\t" + "\t".join(headers))
592	for ix in range(0,len(self.rows)):
593	weights = ["%d" % self.counts[ix][nt] for nt in self.alphabet]
594	#lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + self.consensus[ix])
595	lines.append(("%02d\t" % ix) + "\t".join(weights) + "\t" + str(sum(self.counts[ix].values())) + "\t" + self.consensus[ix])
596
597	return "\n".join(lines)
598
599	def __getitem__ (self,key):
600	return self.rows[key]
601
602	def __setitem__ (self,key,value):
603	self.rows[key] = value
604
605	def __len__ (self):
606	return len( self.rows )
607
608	def score_align_gaps (align):
609	# a blank score matrix
610	nrows,ncols = align.dims
611	scoremax = AlignScoreMatrix( align ).matrix
612	for ir in range(nrows):
613	# row is missing data
614	if isnan(align.rows[ir][0]): continue
615	# scan for gaps
616	for pos in range(ncols):
617	if align.rows[ir][pos] == '-': scoremax[ir][pos] = 1
618	else: scoremax[ir][pos] = 0
619	return scoremax
620
621	#-----------
622	#
623	# WeightMatrix Reader--
624	# Read position weight matrices (PWM) from a file.
625	#
626	#-----------
627
628	class Reader (object):
629	"""Iterate over all interesting weight matrices in a file"""
630
631	def __init__ (self,file,tfIds=None,name=None,format='basic',background=None,score_correction=True):
632	self.tfIds = tfIds
633	self.file = file
634	self.name = name
635	self.lineNumber = 0
636	self.format = format
637	self.background = background
638	self.score_correction = score_correction
639
640
641	def close (self):
642	self.file.close()
643
644
645	def where (self):
646	if (self.name == None):
647	return "line %d" % self.lineNumber
648	else:
649	return "line %d in %s" % (self.lineNumber,self.name)
650
651
652	def __iter__ (self):
653	if self.format == 'basic':
654	return self.read_as_basic()
655	elif self.format == 'transfac':
656	return self.read_as_transfac()
657	else:
658	raise "unknown weight matrix file format: '%s'" % self.format
659
660	def read_as_basic(self):
661	tfId = None
662	pwmRows = None
663
664	alphabet = ['A','C','G','T']
665	while (True):
666	line = self.file.readline()
667	if (not line): break
668	line = line.strip()
669	self.lineNumber += 1
670	if line.startswith(">"):
671	if pwmRows != None:
672	yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background)
673	#try:
674	#yield PositionWeightMatrix(tfId,pwmRows,alphabet)
675	#except:
676	#print >>sys.stderr, "Failed to read", tfId
677	tfId = line.strip()[1:]
678	pwmRows = []
679	elif line[0].isdigit():
680	tokens = line.strip().split()
681	tokens.append( consensus_symbol(line) )
682	vals = [float(v) for v in tokens[:-1]]
683	#print >>sys.stderr,[ "%.2f" % (float(v)/sum(vals)) for v in vals], tokens[-1]
684	pwmRows.append( tokens )
685	if pwmRows != None: # we've finished collecting a desired matrix
686	yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction)
687
688	def read_as_transfac(self):
689	self.tfToPwm = {}
690	tfId = None
691	pwmRows = None
692
693	while (True):
694	line = self.file.readline()
695	if (not line): break
696	line = line.strip()
697	self.lineNumber += 1
698	# handle an ID line
699	if line.startswith("ID"):
700	if pwmRows != None: # we've finished collecting a desired matrix
701	try:
702	yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction)
703	except:
704	print >>sys.stderr, "Failed to read", tfId
705	tfId = None
706	pwmRows = None
707
708	tokens = line.split (None, 2)
709	if len(tokens) != 2:
710	raise ValueError, "bad line, need two fields (%s)" % self.where()
711	tfId = tokens[1]
712	if self.tfIds != None and (not tfId in self.tfIds):
713	continue # ignore it, this isn't a desired matrix
714	if tfId in self.tfToPwm:
715	raise ValueError, "transcription factor %s appears twice (%s)" \
716	% (tfId,self.where())
717	pwmRows = [] # start collecting a desired matrix
718	continue
719
720	# if we're not collecting, skip this line
721	if pwmRows == None: continue
722	if len(line) < 1: continue
723
724	# name, if present, added to ID
725	if line.startswith('NA'):
726	words = line.strip().split()
727	tfId = tfId + "\t" + " ".join(words[1:])
728
729	# handle a P0 line
730	if line.startswith("P0"):
731	alphabet = line.split()[1:]
732	if len(alphabet) < 2:
733	raise ValueError, "bad line, need more dna (%s)" % self.where()
734	continue
735
736	# handle a 01,02,etc. line
737	if line[0].isdigit():
738	tokens = line.split ()
739	try:
740	index = int(tokens[0])
741	if index != len(pwmRows)+1: raise ValueError
742	except:
743	raise ValueError,"bad line, bad index (%s)" % self.where()
744	pwmRows.append(tokens[1:])
745	continue
746	# skip low quality entries
747	if line.startswith("CC TRANSFAC Sites of quality"):
748	print >>sys.stderr, line.strip(), tfId
749	pwmRows = None
750	continue
751	if pwmRows != None: # we've finished collecting a desired matrix
752	yield PositionWeightMatrix(tfId,pwmRows,alphabet,background=self.background,score_correction=self.score_correction)
753	# clean up
754	self.tfToPwm = None
755
756	def isnan(x):
757	#return ieeespecial.isnan(x)
758	if x==x: return False
759	return True
760
761	def reverse_complement (nukes):
762	return nukes[::-1].translate(PositionWeightMatrix.complementMap)
763
764	def rsquared( x, y ):
765	try:
766	return sum_of_squares(x,y)*2 / (sum_of_squares(x) sum_of_squares(y))
767	except ZeroDivisionError:
768	#return float('nan')
769	return 0
770
771	def sum_of_squares( x,y=None ):
772	if not y: y = x
773	xmean = float(sum( x )) / len( x )
774	ymean = float(sum( y )) / len( y )
775	assert len(x) == len(y)
776	return sum([ float(xi)float(yi) for xi,yi in zip(x,y)]) - len(x)xmean*ymean
777
778	def match_consensus(sequence,pattern):
779
780	return c_match_consensus( sequence, pattern, len(sequence))
781
782	#for s,p in zip(sequence,pattern):
783	#if p == 'N': continue
784	#if not s in PositionWeightMatrix.symbols[p]: return False
785
786	#return True
787
788	def consensus_symbol( pattern ):
789
790	if type(pattern)==type(""):
791	try:
792	pattern = [int(x) for x in pattern.split()]
793	except ValueError,e:
794	print >>sys.stderr, pattern
795	raise ValueError,e
796
797	# IUPAC-IUB nomenclature for wobblers
798	wobblers = {
799	'R':Set(['A','G']),
800	'Y':Set(['C','T']),
801	'M':Set(['A','C']),
802	'K':Set(['G','T']),
803	'S':Set(['G','C']),
804	'W':Set(['A','T']),
805	'H':Set(['A','C','T']),
806	'B':Set(['G','T','C']),
807	'V':Set(['G','C','A']),
808	'D':Set(['G','T','A'])}
809
810	symbols = ['A','C','G','T']
811
812	if type(pattern)==type({}):
813	pattern = [pattern[u] for u in symbols]
814
815	total = sum(pattern)
816	f = [ (space/1e5)+(float(x)/total) for space,x in enumerate(pattern) ]
817	copy = []
818	copy[:] = f[:]
819	copy.sort()
820
821	# http://www.genomatix.de/online_help/help_matinspector/matrix_help.html --
822	# url says consensus must be greater than 50%, and at least twice the freq
823	# of the second-most frequent. A double-degenerate symbol can be used
824	# if the top two account for 75% or more of the nt, if each is less than 50%
825	# Otherwise, N is used in the consensus.
826	tops = copy[-2:]
827	if tops[1] > 0.5 and tops[1] >= 2 * tops[0]: return symbols[f.index(tops[1])]
828	elif tops[0] < 0.5 and sum(tops) >= 0.75:
829	degen = Set([ symbols[f.index(v)] for v in tops ])
830	for degenSymbol,wobbles in wobblers.items():
831	#print >>sys.stderr,wobbles
832	if degen == wobbles:
833	return degenSymbol
834	else: return 'N'
835	print >>sys.stderr,pattern
836	raise '?'
837
838	# import C extensions
839	try:
840	from _position_weight_matrix import c_match_consensus
841	## print >>sys.stderr, "C match_consensus used"
842	except:
843	## print >>sys.stderr, "python match_consensus used"
844	def match_consensus(sequence, pattern, size):
845	for s,p in zip(sequence,pattern):
846	if p == 'N': continue
847	if not s in PositionWeightMatrix.symbols[p]: return False
848
849	return True

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/pwm/position_weight_matrix.py

異なるフォーマットでダウンロード: