root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/seq/qdna.py @ 3

リビジョン 3, 10.2 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Classes to support "quantum-DNA" files.
3
4:Author: Bob Harris (rsharris@bx.psu.edu)
5
6A quantum DNA sequence is a sequence of bytes, each representing a probability
7distribution (vector) over A, C, G, T.  The QdnaFile class encapsulates the
8sequence of bytes, while the mapping from byte value to probability vector is
9encapsulated by the QdnaCodebook class.
10
11qdna file format
12~~~~~~~~~~~~~~~~
13
14Fields can be in big- or little-endian format;  they must match the endianess
15of the magic number.
16
17============ ===========   ======================================================
18offset 0x00: C4 B4 71 97   big endian magic number (97 71 B4 C4 => little endian)
19offset 0x04: 00 00 02 00   version 2.0 (fourth byte is sub version)
20offset 0x08: 00 00 00 14   header length (in bytes, including this field)
21offset 0x0C: xx xx xx xx   S, offset (from file start) to data sequence
22offset 0x10: xx xx xx xx   N, offset to name, 0 indicates no name
23offset 0x14: xx xx xx xx   length of data sequence (counted in 'items')
24offset 0x18: xx xx xx xx   (for version >= 2.0) P, offset to named
25                           .. properties, 0 indicates no properties
26offset    N: ...           name (zero-terminated string)
27offset    S: ...           data sequence
28offset    P: ...           named properties (see below)
29============ ===========   ======================================================
30
31The named properties section consists of a list of pairs of zero-terminated
32strings.  The list itself is terminated by an empty string (i.e. another
33zero).  In each pair, the first is the name of the property and the second
34is its value.  Some names are recognized and handled in some specific manner
35(see list below this paragraph).  Any unrecognized name is simply added as
36an instance variable with that name, as long as it is not already an instance
37variable (in which case it is an error).
38
39Recognized properties (at present only one):
40  - codebook: A string in qdna code file format (see QdnaCodebook class for details).
41"""
42
43from bx.seq.seq import SeqFile,SeqReader
44import sys, struct, string
45from StringIO import StringIO
46
47qdnaMagic     = 0xC4B47197L    # big endian magic number for qdna files
48qdnaMagicSwap = 0x9771B4C4L
49
50class QdnaFile(SeqFile):
51
52    def __init__(self, file, revcomp=False, name="", gap=None, codebook=None):
53        SeqFile.__init__(self,file,revcomp,name,gap)
54        if (gap == None): self.gap = chr(0)
55        assert (revcomp == False), "reverse complement is not supported for qdna files"
56        self.codebook = codebook
57
58        self.byte_order = ">"
59        magic = struct.unpack(">L", file.read(4))[0]
60        if (magic != qdnaMagic):
61            if (magic == qdnaMagicSwap):
62                self.byte_order = "<"
63            else:
64                raise "not a quantum-dna file (magic=%08X)" % magic
65
66        self.magic = magic
67
68        # process header
69
70        self.version = struct.unpack("%sL" % self.byte_order,
71                                     self.file.read(4))[0]
72        if (self.version not in [0x100,0x200]):
73            raise "unsupported quantum-dna (version=%08X)" % self.version
74
75        self.headerLength = struct.unpack("%sL" % self.byte_order,
76                                          self.file.read(4))[0]
77        if (self.headerLength < 0x10):
78            raise "unsupported quantum-dna (header len=%08X)" % self.headerLength
79        if (self.version == 0x100) and (self.headerLength != 0x10):
80            raise "unsupported quantum-dna (version 1.0 header len=%08X)" % self.headerLength
81
82        self.seqOffset  = struct.unpack("%sL" % self.byte_order,
83                                        self.file.read(4))[0]
84        self.nameOffset = struct.unpack("%sL" % self.byte_order,
85                                        self.file.read(4))[0]
86        self.length     = struct.unpack("%sL" % self.byte_order,
87                                        self.file.read(4))[0]
88
89        self.propOffset = 0
90        if (self.headerLength >= 0x14):
91            self.propOffset = struct.unpack("%sL" % self.byte_order,
92                                            self.file.read(4))[0]
93
94        self.name = ""
95        if (self.nameOffset != 0):
96            self.file.seek(self.nameOffset)
97            self.name = self.read_string()
98
99        if (self.propOffset != 0):
100            self.file.seek(self.propOffset)
101            while (True):
102                name  = self.read_string()
103                if (len(name) == 0): break
104                value = self.read_string()
105                self.set_property(name,value)
106
107
108    def set_property(self,name,value):
109        if (name == "codebook"):
110            self.codebook = QdnaCodebook(StringIO(value))
111        else:
112            raise "named properties as instance variables are not implemented yet"
113            # $$$ do this by adding a properties dict and __getitem__/__setitem__
114            # $$$ also need to write properties in QdnaWriter.write()
115
116
117    def read_string(self):
118        s = ""
119        while (True):
120            ch = self.file.read(1)
121            if (ch == chr(0)): break
122            s += ch
123        return s
124
125
126    def raw_fetch(self, start, length):
127        self.file.seek(self.seqOffset + start)
128        return self.file.read(length)
129
130
131    def get_quantum(self, start, length):
132        assert (self.codebook != None), \
133                "qdna sequence %s has no code book" % self.name
134        return [self.codebook[codeNum] for codeNum in self.raw_fetch(start,length)]
135
136
137class QdnaReader(SeqReader):
138
139    def __init__(self, file, revcomp=False, name="", gap=None, codebook=None):
140        SeqReader.__init__(self,file,revcomp,name,gap)
141        self.codebook = codebook
142
143    def next(self):
144        if (self.seqs_read != 0): return  # qdna files have just one sequence
145        seq = QdnaFile(self.file,self.revcomp,self.name,self.gap,self.codebook)
146        self.seqs_read += 1
147        return seq
148
149"""
150A QdnaCodebook maps code numbers to the corresponding probability vector.  The
151latter is a hash from symbols (usually "A", "C", "G", or "T") to the
152corresponsing probability.  Note that code numbers are of type string.
153
154qdna code file format:
155
156   The file is ascii text and looks something like what's shown below.  Lines
157   beginning with # are comments, and columns are assumed to represent A, C, G
158   and T (in that order).  Anything other than five columns is an error.  Note
159   that code number zero is usually reserved for gaps in quantum sequences, and
160   thus usually won't appear in a code file.  Note that code numbers are
161   two-digit hexadecimal (to match the textual displays of quantum sequences).
162
163      01  0.111002  0.072588  0.127196  0.689214
164      02  0.081057  0.023799  0.098657  0.796487
165      03  0.000260  0.003823  0.000336  0.995581
166       ... more lines, usually a total of 255 ...
167      FF  0.465900  0.008602  0.482301  0.043197
168"""
169
170class QdnaCodebook(object):
171
172    def __init__(self,file):
173        (self.alphabet,self.codeToProbs) = self.read_codebook(file)
174
175
176    def __str__(self):
177        codeSet = [codeNum for codeNum in self.codeToProbs]
178        codeSet.sort()
179        return "\n".join([self.vector_text(codeNum) for codeNum in codeSet])
180
181    def vector_text(self,codeNum):
182        if (codeNum in self.codeToProbs): vec = self.codeToProbs[codeNum]
183        else:                             vec = {}
184        for sym in self.alphabet:
185            if (sym not in vec):
186                vec[sym] = 0.0
187        return ("%02X\t" % ord(codeNum)) \
188            + "\t".join(["%.6f" % vec[sym] for sym in self.alphabet])
189
190
191    def __getitem__ (self,codeNum):
192        return self.codeToProbs[codeNum]
193
194
195    def __setitem__ (self,codeNum,value):
196        self.codeToProbs[codeNum] = value # value should be hash from symbol
197                                          # .. to probability
198
199
200    def read_codebook(self,codeF):
201        alphabet = "ACGT"
202        codeToProbs = {}
203
204        for (lineNum,line) in enumerate (codeF):
205            lineNum += 1
206            line = line.rstrip()
207            stripped = line.strip()
208            if (stripped == "") or (stripped.startswith("#")):
209                continue
210
211            fields = line.split(None)
212            if (len(fields) != 5):
213                raise "wrong vector size (line %d)" % lineNum
214
215            try:
216                codeNum = int(fields[0],16)
217            except:
218                raise "bad character code %s (line %d)" \
219                    % (fields[0],lineNum)
220
221            if (not 0 <= codeNum <= 255):
222                raise "character code %s is outside the valid range (line %d)" \
223                     % (fields[0],lineNum)
224
225            if (chr(codeNum) in codeToProbs):
226                raise "character code %s appears more than once (line %d)" \
227                     % (fields[0],lineNum)
228
229            try:
230                vec = {}
231                for ix in range(1,5):
232                    p = float(fields[ix])
233                    if (p < 0) or (p > 1): raise ValueError
234                    vec[alphabet[ix-1]] = p
235            except:
236                raise "%s is a bad probability value (line %d)" \
237                     % (fields[ix],lineNum)
238
239            codeToProbs[chr(codeNum)] = vec
240
241        return (alphabet,codeToProbs)
242
243
244class QdnaWriter(object):
245
246    def __init__(self,file):
247        self.file = file
248
249    def write(self,seq):
250        text = seq.text
251        if (text == None): text = ""
252
253        version   = 0x200
254        headerLen = 0x014
255        offset    = headerLen + 8
256
257        nameOffset = 0
258        if (seq.name != None) and (seq.name != ""):
259            nameOffset =  0x01C
260            offset     += len(seq.name) + 1
261            name       =  seq.name + chr(0)
262
263        dataOffset =  offset
264        offset     += len(text)
265
266        assert (seq.codebook == None), \
267               "QdnaWriter.write() does not support codebooks yet"
268        propOffset = 0
269
270        self.file.write(struct.pack("%sL" % seq.byte_order,qdnaMagic))
271        self.file.write(struct.pack("%sL" % seq.byte_order,version))
272        self.file.write(struct.pack("%sL" % seq.byte_order,headerLen))
273        self.file.write(struct.pack("%sL" % seq.byte_order,dataOffset))
274        self.file.write(struct.pack("%sL" % seq.byte_order,nameOffset))
275        self.file.write(struct.pack("%sL" % seq.byte_order,len(text)))
276        self.file.write(struct.pack("%sL" % seq.byte_order,propOffset))
277        if (nameOffset != 0): self.file.write(name)
278        self.file.write(text)
279
280
281    def close(self):
282        self.file.close()
283
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。