| 1 | """ |
|---|
| 2 | Classes to support nib files. |
|---|
| 3 | |
|---|
| 4 | :Author: James Taylor (james@bx.psu.edu), Bob Harris (rsharris@bx.psu.edu) |
|---|
| 5 | |
|---|
| 6 | A nib sequence is a sequence of DNA, using the 10 character alphabet A,C,G,T,N |
|---|
| 7 | (upper and lower case). The file is packed as 4 bits per character. |
|---|
| 8 | |
|---|
| 9 | nib file format |
|---|
| 10 | --------------- |
|---|
| 11 | |
|---|
| 12 | Fields can be in big- or little-endian format; they must match the endianness |
|---|
| 13 | of the magic number. |
|---|
| 14 | |
|---|
| 15 | ============ =========== ====================================================== |
|---|
| 16 | offset 0x00: 6B E9 3D 3A big endian magic number (3A 3D E9 6B => little endian) |
|---|
| 17 | offset 0x04: xx xx xx xx length of data sequence (counted in characters) |
|---|
| 18 | offset 0x08: ... data sequence; most significant nybble in each |
|---|
| 19 | byte is first in sequence |
|---|
| 20 | ============ =========== ====================================================== |
|---|
| 21 | """ |
|---|
| 22 | |
|---|
| 23 | from __future__ import division |
|---|
| 24 | |
|---|
| 25 | from bx.seq.seq import SeqFile,SeqReader |
|---|
| 26 | import sys, struct, string, math |
|---|
| 27 | |
|---|
| 28 | import _nib |
|---|
| 29 | |
|---|
| 30 | NIB_MAGIC_NUMBER = 0x6BE93D3A |
|---|
| 31 | NIB_MAGIC_NUMBER_SWAP = 0x3A3DE96B |
|---|
| 32 | NIB_MAGIC_SIZE = 4 |
|---|
| 33 | NIB_LENGTH_SIZE = 4 |
|---|
| 34 | |
|---|
class NibFile(SeqFile):
    """
    A single DNA sequence stored in nib format (4 bits per character).

    The header is parsed on construction: a 4-byte magic number, whose byte
    order determines how the rest of the header is decoded, followed by a
    4-byte sequence length counted in characters.
    """

    def __init__(self, file, revcomp=False, name="", gap=None):
        """
        Read and validate the nib header from `file`.

        `file` is read with `file.read()` and decoded with `struct`, so it
        must be a binary file-like object.  `revcomp`, `name` and `gap` are
        passed unchanged to `SeqFile.__init__`.

        Sets `self.byte_order` (">" or "<"), `self.magic` (the magic number
        as decoded big-endian) and `self.length` (sequence length in
        characters).

        Raises ValueError if the magic number matches neither byte order.
        """
        SeqFile.__init__(self, file, revcomp, name, gap)

        self.byte_order = ">"
        # The magic number is always decoded as big-endian; seeing the
        # byte-swapped constant means the file was written little-endian.
        magic = struct.unpack(">L", file.read(NIB_MAGIC_SIZE))[0]
        if magic != NIB_MAGIC_NUMBER:
            if magic == NIB_MAGIC_NUMBER_SWAP:
                self.byte_order = "<"
            else:
                # Raising a plain string is rejected by the interpreter
                # (TypeError) on Python >= 2.6, so a real exception class is
                # required for the message to reach the caller.
                raise ValueError("Not a NIB file")
        self.magic = magic
        self.length = struct.unpack("%sL" % self.byte_order, file.read(NIB_LENGTH_SIZE))[0]

    def raw_fetch(self, start, length):
        """
        Fetch `length` characters beginning at zero-based position `start`.

        Reads the smallest run of bytes covering the requested interval (two
        characters are packed per byte) and hands it to
        `_nib.translate_raw_data` for unpacking; the value returned is
        whatever that helper returns (presumably the decoded character
        string -- confirm against `_nib`).

        The seek offset is absolute, so the nib data is assumed to begin at
        offset 0 of `self.file`.

        Raises AssertionError if `start` or `length` is negative or the
        interval extends past the end of the sequence.
        """
        # Check parameters
        assert start >= 0, "Start must be non-negative"
        assert length >= 0, "Length must be non-negative"
        assert start + length <= self.length, "Interval beyond end of sequence"
        # Read block of bytes containing sequence; character i lives in
        # byte floor(i / 2) of the data section.
        block_start = int(math.floor(start / 2))
        block_end = int(math.floor((start + length - 1) / 2))
        block_len = block_end + 1 - block_start
        # Skip the header (magic number + length field) to reach the data.
        self.file.seek(NIB_MAGIC_SIZE + NIB_LENGTH_SIZE + block_start)
        raw = self.file.read(block_len)
        # Unpack compressed block into a character string and return
        return _nib.translate_raw_data(raw, start, length)
|---|
| 61 | |
|---|
class NibReader(SeqReader):
    """
    Reader for nib files, which contain exactly one sequence.
    """

    def __init__(self, file, revcomp=False, name="", gap=None):
        """Pass all arguments straight through to `SeqReader.__init__`."""
        SeqReader.__init__(self, file, revcomp, name, gap)

    def next(self):
        """
        Return a `NibFile` for the sequence on the first call, and None on
        every call after one has been returned.

        Relies on `self.seqs_read`, `self.file`, `self.revcomp`, `self.name`
        and `self.gap`, which are presumably set up by `SeqReader.__init__`.
        """
        if self.seqs_read == 0:
            # nib files have just one sequence, so this branch runs once.
            nib_seq = NibFile(self.file, self.revcomp, self.name, self.gap)
            self.seqs_read += 1
            return nib_seq
        return None
|---|
| 72 | |
|---|
| 73 | |
|---|
class NibWriter(object):
    """
    Placeholder writer for nib files; writing is not implemented.
    """

    def __init__(self, file):
        """Remember `file`, the file-like object that `close()` will close."""
        self.file = file

    def write(self, seq):
        """
        Not implemented; always raises AssertionError.

        The exception is raised explicitly rather than through an `assert`
        statement, because asserts are stripped under `python -O` and the
        call would then silently do nothing.
        """
        raise AssertionError("NibWriter.write() is not implemented yet")

    def close(self):
        """Close the underlying file."""
        self.file.close()
|---|
| 84 | |
|---|