1 | """ |
---|
2 | Support for the `MAF`_ multiple sequence alignment format used by `multiz`_. |
---|
3 | |
---|
4 | .. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5 |
---|
5 | .. _multiz: http://www.bx.psu.edu/miller_lab/ |
---|
6 | """ |
---|
7 | |
---|
8 | from bx.align import * |
---|
9 | |
---|
10 | from StringIO import StringIO |
---|
11 | import os |
---|
12 | |
---|
13 | import itertools |
---|
14 | from bx import interval_index_file |
---|
15 | |
---|
16 | from bx.misc.seekbzip2 import SeekableBzip2File |
---|
17 | |
---|
18 | MAF_INVERSE_STATUS = 'V' |
---|
19 | MAF_INSERT_STATUS = 'I' |
---|
20 | MAF_CONTIG_STATUS = 'C' |
---|
21 | MAF_CONTIG_NESTED_STATUS = 'c' |
---|
22 | MAF_NEW_STATUS = 'N' |
---|
23 | MAF_NEW_NESTED_STATUS = 'n' |
---|
24 | MAF_MAYBE_NEW_STATUS = 'S' |
---|
25 | MAF_MAYBE_NEW_NESTED_STATUS = 's' |
---|
26 | MAF_MISSING_STATUS = 'M' |
---|
27 | |
---|
28 | class MAFIndexedAccess( interval_index_file.AbstractIndexedAccess ): |
---|
29 | """ |
---|
30 | Indexed access to a MAF file. |
---|
31 | """ |
---|
32 | def read_at_current_offset( self, file, **kwargs ): |
---|
33 | """ |
---|
34 | Read the MAF block at the current position in `file` and return an |
---|
35 | instance of `Alignment`. |
---|
36 | """ |
---|
37 | return read_next_maf( file, **kwargs ) |
---|
38 | |
---|
39 | class MAFMultiIndexedAccess( interval_index_file.AbstractMultiIndexedAccess ): |
---|
40 | """ |
---|
41 | Indexed access to multiple MAF files. |
---|
42 | """ |
---|
43 | indexed_access_class = MAFIndexedAccess |
---|
44 | |
---|
45 | Indexed = MAFIndexedAccess |
---|
46 | """Deprecated: `MAFIndexedAccess` is also available under the name `Indexed`.""" |
---|
47 | |
---|
48 | MultiIndexed = MAFMultiIndexedAccess |
---|
49 | """Deprecated: `MAFMultiIndexedAccess` is also available under the name `MultiIndexed`.""" |
---|
50 | |
---|
51 | class Reader( object ): |
---|
52 | """ |
---|
53 | Iterate over all maf blocks in a file in order |
---|
54 | """ |
---|
55 | def __init__( self, file, **kwargs ): |
---|
56 | self.file = file |
---|
57 | self.maf_kwargs = kwargs |
---|
58 | # Read and verify maf header, store any attributes |
---|
59 | fields = self.file.readline().split() |
---|
60 | if fields[0] != '##maf': raise "File does not have MAF header" |
---|
61 | self.attributes = parse_attributes( fields[1:] ) |
---|
62 | |
---|
63 | def next( self ): |
---|
64 | return read_next_maf( self.file, **self.maf_kwargs ) |
---|
65 | |
---|
66 | def __iter__( self ): |
---|
67 | return ReaderIter( self ) |
---|
68 | |
---|
69 | def close( self ): |
---|
70 | self.file.close() |
---|
71 | |
---|
72 | class ReaderIter( object ): |
---|
73 | """ |
---|
74 | Adapts a `Reader` to the iterator protocol. |
---|
75 | """ |
---|
76 | def __init__( self, reader ): |
---|
77 | self.reader = reader |
---|
78 | def __iter__( self ): |
---|
79 | return self |
---|
80 | def next( self ): |
---|
81 | v = self.reader.next() |
---|
82 | if not v: raise StopIteration |
---|
83 | return v |
---|
84 | |
---|
85 | class Writer( object ): |
---|
86 | |
---|
87 | def __init__( self, file, attributes={} ): |
---|
88 | self.file = file |
---|
89 | # Write header, Webb's maf code wants version first, we accomodate |
---|
90 | if not attributes.has_key('version'): attributes['version'] = 1 |
---|
91 | self.file.write( "##maf version=%s" % attributes['version'] ) |
---|
92 | for key in attributes: |
---|
93 | if key == 'version': continue |
---|
94 | self.file.writelines( " %s=%s" % ( key, attributes[key] ) ) |
---|
95 | self.file.write( "\n" ) |
---|
96 | |
---|
97 | def write( self, alignment ): |
---|
98 | self.file.write( "a score=" + str( alignment.score ) ) |
---|
99 | for key in alignment.attributes: |
---|
100 | self.file.write( " %s=%s" % ( key, alignment.attributes[key] ) ) |
---|
101 | self.file.write( "\n" ) |
---|
102 | # Components |
---|
103 | rows = [] |
---|
104 | for c in alignment.components: |
---|
105 | # "Empty component" generates an 'e' row |
---|
106 | if c.empty: |
---|
107 | rows.append( ( "e", c.src, str( c.start ), str( c.size ), c.strand, str( c.src_size ), c.synteny_empty ) ) |
---|
108 | continue |
---|
109 | # Regular component |
---|
110 | rows.append( ( "s", c.src, str( c.start ), str( c.size ), c.strand, str( c.src_size ), c.text ) ) |
---|
111 | # If component has quality, write a q row |
---|
112 | if c.quality is not None: |
---|
113 | rows.append( ( "q", c.src, "", "", "", "", c.quality ) ) |
---|
114 | # If component has synteny follow up with an 'i' row |
---|
115 | if c.synteny_left and c.synteny_right: |
---|
116 | rows.append( ( "i", c.src, "", "", "", "", " ".join( map( str, c.synteny_left + c.synteny_right ) ) ) ) |
---|
117 | self.file.write( format_tabular( rows, "llrrrrl" ) ) |
---|
118 | self.file.write( "\n" ) |
---|
119 | |
---|
120 | def close( self ): |
---|
121 | self.file.close() |
---|
122 | |
---|
123 | # ---- Helper methods ------------------------------------------------------- |
---|
124 | |
---|
125 | def from_string( string, **kwargs ): |
---|
126 | return read_next_maf( StringIO( string ), **kwargs ) |
---|
127 | |
---|
128 | def read_next_maf( file, species_to_lengths=None, parse_e_rows=False ): |
---|
129 | """ |
---|
130 | Read the next MAF block from `file` and return as an `Alignment` |
---|
131 | instance. If `parse_i_rows` is true, empty components will be created |
---|
132 | when e rows are encountered. |
---|
133 | """ |
---|
134 | alignment = Alignment(species_to_lengths=species_to_lengths) |
---|
135 | # Attributes line |
---|
136 | line = readline( file, skip_blank=True ) |
---|
137 | if not line: return None |
---|
138 | fields = line.split() |
---|
139 | if fields[0] != 'a': raise "Expected 'a ...' line" |
---|
140 | alignment.attributes = parse_attributes( fields[1:] ) |
---|
141 | if 'score' in alignment.attributes: |
---|
142 | alignment.score = alignment.attributes['score'] |
---|
143 | del alignment.attributes['score'] |
---|
144 | else: |
---|
145 | alignment.score = 0 |
---|
146 | # Sequence lines |
---|
147 | last_component = None |
---|
148 | while 1: |
---|
149 | line = readline( file ) |
---|
150 | # EOF or Blank line terminates alignment components |
---|
151 | if not line or line.isspace(): break |
---|
152 | if line.isspace(): break |
---|
153 | # Parse row |
---|
154 | fields = line.split() |
---|
155 | if fields[0] == 's': |
---|
156 | # An 's' row contains sequence for a component |
---|
157 | component = Component() |
---|
158 | component.src = fields[1] |
---|
159 | component.start = int( fields[2] ) |
---|
160 | component.size = int( fields[3] ) |
---|
161 | component.strand = fields[4] |
---|
162 | component.src_size = int( fields[5] ) |
---|
163 | if len(fields) > 6: component.text = fields[6].strip() |
---|
164 | # Add to set |
---|
165 | alignment.add_component( component ) |
---|
166 | last_component = component |
---|
167 | elif fields[0] == 'e': |
---|
168 | # An 'e' row, when no bases align for a given species this tells |
---|
169 | # us something about the synteny |
---|
170 | if parse_e_rows: |
---|
171 | component = Component() |
---|
172 | component.empty = True |
---|
173 | component.src = fields[1] |
---|
174 | component.start = int( fields[2] ) |
---|
175 | component.size = int( fields[3] ) |
---|
176 | component.strand = fields[4] |
---|
177 | component.src_size = int( fields[5] ) |
---|
178 | component.text = None |
---|
179 | synteny = fields[6].strip() |
---|
180 | assert len( synteny ) == 1, \ |
---|
181 | "Synteny status in 'e' rows should be denoted with a single character code" |
---|
182 | component.synteny_empty = synteny |
---|
183 | alignment.add_component( component ) |
---|
184 | last_component = component |
---|
185 | elif fields[0] == 'i': |
---|
186 | # An 'i' row, indicates left and right synteny status for the |
---|
187 | # previous component, we hope ;) |
---|
188 | assert fields[1] == last_component.src, "'i' row does not follow matching 's' row" |
---|
189 | last_component.synteny_left = ( fields[2], int( fields[3] ) ) |
---|
190 | last_component.synteny_right = ( fields[4], int( fields[5] ) ) |
---|
191 | elif fields[0] == 'q': |
---|
192 | assert fields[1] == last_component.src, "'q' row does not follow matching 's' row" |
---|
193 | # TODO: Should convert this to an integer array? |
---|
194 | last_component.quality = fields[2] |
---|
195 | |
---|
196 | return alignment |
---|
197 | |
---|
198 | def readline( file, skip_blank=False ): |
---|
199 | """Read a line from provided file, skipping any blank or comment lines""" |
---|
200 | while 1: |
---|
201 | line = file.readline() |
---|
202 | #print "every line: %r" % line |
---|
203 | if not line: return None |
---|
204 | if line[0] != '#' and not ( skip_blank and line.isspace() ): |
---|
205 | return line |
---|
206 | |
---|
207 | def parse_attributes( fields ): |
---|
208 | """Parse list of key=value strings into a dict""" |
---|
209 | attributes = {} |
---|
210 | for field in fields: |
---|
211 | pair = field.split( '=' ) |
---|
212 | attributes[ pair[0] ] = pair[1] |
---|
213 | return attributes |
---|
214 | |
---|
215 | def format_tabular( rows, align=None ): |
---|
216 | if len( rows ) == 0: return "" |
---|
217 | lengths = [ len( col ) for col in rows[ 0 ] ] |
---|
218 | for row in rows[1:]: |
---|
219 | for i in range( 0, len( row ) ): |
---|
220 | lengths[ i ] = max( lengths[ i ], len( row[ i ] ) ) |
---|
221 | rval = "" |
---|
222 | for row in rows: |
---|
223 | for i in range( 0, len( row ) ): |
---|
224 | if align and align[ i ] == "l": |
---|
225 | rval += row[ i ].ljust( lengths[ i ] ) |
---|
226 | else: |
---|
227 | rval += row[ i ].rjust( lengths[ i ] ) |
---|
228 | rval += " " |
---|
229 | rval += "\n" |
---|
230 | return rval |
---|
231 | |
---|