Context Navigation

qdna.py @ 3

リビジョン 3, 10.2 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	"""
2	Classes to support "quantum-DNA" files.
3
4	:Author: Bob Harris (rsharris@bx.psu.edu)
5
6	A quantum DNA sequence is a sequence of bytes, each representing a probability
7	distribution (vector) over A, C, G, T. The QdnaFile class encapsulates the
8	sequence of bytes, while the mapping from byte value to probability vector is
9	encapsulated by the QdnaCodebook class.
10
11	qdna file format
12	~~~~~~~~~~~~~~~~
13
14	Fields can be in big- or little-endian format; they must match the endianess
15	of the magic number.
16
17	============ =========== ======================================================
18	offset 0x00: C4 B4 71 97 big endian magic number (97 71 B4 C4 => little endian)
19	offset 0x04: 00 00 02 00 version 2.0 (fourth byte is sub version)
20	offset 0x08: 00 00 00 14 header length (in bytes, including this field)
21	offset 0x0C: xx xx xx xx S, offset (from file start) to data sequence
22	offset 0x10: xx xx xx xx N, offset to name, 0 indicates no name
23	offset 0x14: xx xx xx xx length of data sequence (counted in 'items')
24	offset 0x18: xx xx xx xx (for version >= 2.0) P, offset to named
25	.. properties, 0 indicates no properties
26	offset N: ... name (zero-terminated string)
27	offset S: ... data sequence
28	offset P: ... named properties (see below)
29	============ =========== ======================================================
30
31	The named properties section consists of a list of pairs of zero-terminated
32	strings. The list itself is terminated by an empty string (i.e. another
33	zero). In each pair, the first is the name of the property and the second
34	is its value. Some names are recognized and handled in some specific manner
35	(see list below this paragraph). Any unrecognized name is simply added as
36	an instance variable with that name, as long as it is not already an instance
37	variable (in which case it is an error).
38
39	Recognized properties (at present only one):
40	- codebook: A string in qdna code file format (see QdnaCodebook class for details).
41	"""
42
43	from bx.seq.seq import SeqFile,SeqReader
44	import sys, struct, string
45	from StringIO import StringIO
46
47	qdnaMagic = 0xC4B47197L # big endian magic number for qdna files
48	qdnaMagicSwap = 0x9771B4C4L
49
50	class QdnaFile(SeqFile):
51
52	def __init__(self, file, revcomp=False, name="", gap=None, codebook=None):
53	SeqFile.__init__(self,file,revcomp,name,gap)
54	if (gap == None): self.gap = chr(0)
55	assert (revcomp == False), "reverse complement is not supported for qdna files"
56	self.codebook = codebook
57
58	self.byte_order = ">"
59	magic = struct.unpack(">L", file.read(4))[0]
60	if (magic != qdnaMagic):
61	if (magic == qdnaMagicSwap):
62	self.byte_order = "<"
63	else:
64	raise "not a quantum-dna file (magic=%08X)" % magic
65
66	self.magic = magic
67
68	# process header
69
70	self.version = struct.unpack("%sL" % self.byte_order,
71	self.file.read(4))[0]
72	if (self.version not in [0x100,0x200]):
73	raise "unsupported quantum-dna (version=%08X)" % self.version
74
75	self.headerLength = struct.unpack("%sL" % self.byte_order,
76	self.file.read(4))[0]
77	if (self.headerLength < 0x10):
78	raise "unsupported quantum-dna (header len=%08X)" % self.headerLength
79	if (self.version == 0x100) and (self.headerLength != 0x10):
80	raise "unsupported quantum-dna (version 1.0 header len=%08X)" % self.headerLength
81
82	self.seqOffset = struct.unpack("%sL" % self.byte_order,
83	self.file.read(4))[0]
84	self.nameOffset = struct.unpack("%sL" % self.byte_order,
85	self.file.read(4))[0]
86	self.length = struct.unpack("%sL" % self.byte_order,
87	self.file.read(4))[0]
88
89	self.propOffset = 0
90	if (self.headerLength >= 0x14):
91	self.propOffset = struct.unpack("%sL" % self.byte_order,
92	self.file.read(4))[0]
93
94	self.name = ""
95	if (self.nameOffset != 0):
96	self.file.seek(self.nameOffset)
97	self.name = self.read_string()
98
99	if (self.propOffset != 0):
100	self.file.seek(self.propOffset)
101	while (True):
102	name = self.read_string()
103	if (len(name) == 0): break
104	value = self.read_string()
105	self.set_property(name,value)
106
107
108	def set_property(self,name,value):
109	if (name == "codebook"):
110	self.codebook = QdnaCodebook(StringIO(value))
111	else:
112	raise "named properties as instance variables are not implemented yet"
113	# $$$ do this by adding a properties dict and __getitem__/__setitem__
114	# $$$ also need to write properties in QdnaWriter.write()
115
116
117	def read_string(self):
118	s = ""
119	while (True):
120	ch = self.file.read(1)
121	if (ch == chr(0)): break
122	s += ch
123	return s
124
125
126	def raw_fetch(self, start, length):
127	self.file.seek(self.seqOffset + start)
128	return self.file.read(length)
129
130
131	def get_quantum(self, start, length):
132	assert (self.codebook != None), \
133	"qdna sequence %s has no code book" % self.name
134	return [self.codebook[codeNum] for codeNum in self.raw_fetch(start,length)]
135
136
137	class QdnaReader(SeqReader):
138
139	def __init__(self, file, revcomp=False, name="", gap=None, codebook=None):
140	SeqReader.__init__(self,file,revcomp,name,gap)
141	self.codebook = codebook
142
143	def next(self):
144	if (self.seqs_read != 0): return # qdna files have just one sequence
145	seq = QdnaFile(self.file,self.revcomp,self.name,self.gap,self.codebook)
146	self.seqs_read += 1
147	return seq
148
149	"""
150	A QdnaCodebook maps code numbers to the corresponding probability vector. The
151	latter is a hash from symbols (usually "A", "C", "G", or "T") to the
152	corresponsing probability. Note that code numbers are of type string.
153
154	qdna code file format:
155
156	The file is ascii text and looks something like what's shown below. Lines
157	beginning with # are comments, and columns are assumed to represent A, C, G
158	and T (in that order). Anything other than five columns is an error. Note
159	that code number zero is usually reserved for gaps in quantum sequences, and
160	thus usually won't appear in a code file. Note that code numbers are
161	two-digit hexadecimal (to match the textual displays of quantum sequences).
162
163	01 0.111002 0.072588 0.127196 0.689214
164	02 0.081057 0.023799 0.098657 0.796487
165	03 0.000260 0.003823 0.000336 0.995581
166	... more lines, usually a total of 255 ...
167	FF 0.465900 0.008602 0.482301 0.043197
168	"""
169
170	class QdnaCodebook(object):
171
172	def __init__(self,file):
173	(self.alphabet,self.codeToProbs) = self.read_codebook(file)
174
175
176	def __str__(self):
177	codeSet = [codeNum for codeNum in self.codeToProbs]
178	codeSet.sort()
179	return "\n".join([self.vector_text(codeNum) for codeNum in codeSet])
180
181	def vector_text(self,codeNum):
182	if (codeNum in self.codeToProbs): vec = self.codeToProbs[codeNum]
183	else: vec = {}
184	for sym in self.alphabet:
185	if (sym not in vec):
186	vec[sym] = 0.0
187	return ("%02X\t" % ord(codeNum)) \
188	+ "\t".join(["%.6f" % vec[sym] for sym in self.alphabet])
189
190
191	def __getitem__ (self,codeNum):
192	return self.codeToProbs[codeNum]
193
194
195	def __setitem__ (self,codeNum,value):
196	self.codeToProbs[codeNum] = value # value should be hash from symbol
197	# .. to probability
198
199
200	def read_codebook(self,codeF):
201	alphabet = "ACGT"
202	codeToProbs = {}
203
204	for (lineNum,line) in enumerate (codeF):
205	lineNum += 1
206	line = line.rstrip()
207	stripped = line.strip()
208	if (stripped == "") or (stripped.startswith("#")):
209	continue
210
211	fields = line.split(None)
212	if (len(fields) != 5):
213	raise "wrong vector size (line %d)" % lineNum
214
215	try:
216	codeNum = int(fields[0],16)
217	except:
218	raise "bad character code %s (line %d)" \
219	% (fields[0],lineNum)
220
221	if (not 0 <= codeNum <= 255):
222	raise "character code %s is outside the valid range (line %d)" \
223	% (fields[0],lineNum)
224
225	if (chr(codeNum) in codeToProbs):
226	raise "character code %s appears more than once (line %d)" \
227	% (fields[0],lineNum)
228
229	try:
230	vec = {}
231	for ix in range(1,5):
232	p = float(fields[ix])
233	if (p < 0) or (p > 1): raise ValueError
234	vec[alphabet[ix-1]] = p
235	except:
236	raise "%s is a bad probability value (line %d)" \
237	% (fields[ix],lineNum)
238
239	codeToProbs[chr(codeNum)] = vec
240
241	return (alphabet,codeToProbs)
242
243
244	class QdnaWriter(object):
245
246	def __init__(self,file):
247	self.file = file
248
249	def write(self,seq):
250	text = seq.text
251	if (text == None): text = ""
252
253	version = 0x200
254	headerLen = 0x014
255	offset = headerLen + 8
256
257	nameOffset = 0
258	if (seq.name != None) and (seq.name != ""):
259	nameOffset = 0x01C
260	offset += len(seq.name) + 1
261	name = seq.name + chr(0)
262
263	dataOffset = offset
264	offset += len(text)
265
266	assert (seq.codebook == None), \
267	"QdnaWriter.write() does not support codebooks yet"
268	propOffset = 0
269
270	self.file.write(struct.pack("%sL" % seq.byte_order,qdnaMagic))
271	self.file.write(struct.pack("%sL" % seq.byte_order,version))
272	self.file.write(struct.pack("%sL" % seq.byte_order,headerLen))
273	self.file.write(struct.pack("%sL" % seq.byte_order,dataOffset))
274	self.file.write(struct.pack("%sL" % seq.byte_order,nameOffset))
275	self.file.write(struct.pack("%sL" % seq.byte_order,len(text)))
276	self.file.write(struct.pack("%sL" % seq.byte_order,propOffset))
277	if (nameOffset != 0): self.file.write(name)
278	self.file.write(text)
279
280
281	def close(self):
282	self.file.close()
283

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/seq/qdna.py @ 3

異なるフォーマットでダウンロード: