[3] | 1 | """ |
---|
| 2 | Support for the `MAF`_ multiple sequence alignment format used by `multiz`_. |
---|
| 3 | |
---|
| 4 | .. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5 |
---|
| 5 | .. _multiz: http://www.bx.psu.edu/miller_lab/ |
---|
| 6 | """ |
---|
| 7 | |
---|
| 8 | from bx.align import * |
---|
| 9 | |
---|
| 10 | from StringIO import StringIO |
---|
| 11 | import os |
---|
| 12 | |
---|
| 13 | import itertools |
---|
| 14 | from bx import interval_index_file |
---|
| 15 | |
---|
| 16 | from bx.misc.seekbzip2 import SeekableBzip2File |
---|
| 17 | |
---|
| 18 | MAF_INVERSE_STATUS = 'V' |
---|
| 19 | MAF_INSERT_STATUS = 'I' |
---|
| 20 | MAF_CONTIG_STATUS = 'C' |
---|
| 21 | MAF_CONTIG_NESTED_STATUS = 'c' |
---|
| 22 | MAF_NEW_STATUS = 'N' |
---|
| 23 | MAF_NEW_NESTED_STATUS = 'n' |
---|
| 24 | MAF_MAYBE_NEW_STATUS = 'S' |
---|
| 25 | MAF_MAYBE_NEW_NESTED_STATUS = 's' |
---|
| 26 | MAF_MISSING_STATUS = 'M' |
---|
| 27 | |
---|
| 28 | class MAFIndexedAccess( interval_index_file.AbstractIndexedAccess ): |
---|
| 29 | """ |
---|
| 30 | Indexed access to a MAF file. |
---|
| 31 | """ |
---|
| 32 | def read_at_current_offset( self, file, **kwargs ): |
---|
| 33 | """ |
---|
| 34 | Read the MAF block at the current position in `file` and return an |
---|
| 35 | instance of `Alignment`. |
---|
| 36 | """ |
---|
| 37 | return read_next_maf( file, **kwargs ) |
---|
| 38 | |
---|
| 39 | class MAFMultiIndexedAccess( interval_index_file.AbstractMultiIndexedAccess ): |
---|
| 40 | """ |
---|
| 41 | Indexed access to multiple MAF files. |
---|
| 42 | """ |
---|
| 43 | indexed_access_class = MAFIndexedAccess |
---|
| 44 | |
---|
| 45 | Indexed = MAFIndexedAccess |
---|
| 46 | """Deprecated: `MAFIndexedAccess` is also available under the name `Indexed`.""" |
---|
| 47 | |
---|
| 48 | MultiIndexed = MAFMultiIndexedAccess |
---|
| 49 | """Deprecated: `MAFMultiIndexedAccess` is also available under the name `MultiIndexed`.""" |
---|
| 50 | |
---|
| 51 | class Reader( object ): |
---|
| 52 | """ |
---|
| 53 | Iterate over all maf blocks in a file in order |
---|
| 54 | """ |
---|
| 55 | def __init__( self, file, **kwargs ): |
---|
| 56 | self.file = file |
---|
| 57 | self.maf_kwargs = kwargs |
---|
| 58 | # Read and verify maf header, store any attributes |
---|
| 59 | fields = self.file.readline().split() |
---|
| 60 | if fields[0] != '##maf': raise "File does not have MAF header" |
---|
| 61 | self.attributes = parse_attributes( fields[1:] ) |
---|
| 62 | |
---|
| 63 | def next( self ): |
---|
| 64 | return read_next_maf( self.file, **self.maf_kwargs ) |
---|
| 65 | |
---|
| 66 | def __iter__( self ): |
---|
| 67 | return ReaderIter( self ) |
---|
| 68 | |
---|
| 69 | def close( self ): |
---|
| 70 | self.file.close() |
---|
| 71 | |
---|
| 72 | class ReaderIter( object ): |
---|
| 73 | """ |
---|
| 74 | Adapts a `Reader` to the iterator protocol. |
---|
| 75 | """ |
---|
| 76 | def __init__( self, reader ): |
---|
| 77 | self.reader = reader |
---|
| 78 | def __iter__( self ): |
---|
| 79 | return self |
---|
| 80 | def next( self ): |
---|
| 81 | v = self.reader.next() |
---|
| 82 | if not v: raise StopIteration |
---|
| 83 | return v |
---|
| 84 | |
---|
| 85 | class Writer( object ): |
---|
| 86 | |
---|
| 87 | def __init__( self, file, attributes={} ): |
---|
| 88 | self.file = file |
---|
| 89 | # Write header, Webb's maf code wants version first, we accomodate |
---|
| 90 | if not attributes.has_key('version'): attributes['version'] = 1 |
---|
| 91 | self.file.write( "##maf version=%s" % attributes['version'] ) |
---|
| 92 | for key in attributes: |
---|
| 93 | if key == 'version': continue |
---|
| 94 | self.file.writelines( " %s=%s" % ( key, attributes[key] ) ) |
---|
| 95 | self.file.write( "\n" ) |
---|
| 96 | |
---|
| 97 | def write( self, alignment ): |
---|
| 98 | self.file.write( "a score=" + str( alignment.score ) ) |
---|
| 99 | for key in alignment.attributes: |
---|
| 100 | self.file.write( " %s=%s" % ( key, alignment.attributes[key] ) ) |
---|
| 101 | self.file.write( "\n" ) |
---|
| 102 | # Components |
---|
| 103 | rows = [] |
---|
| 104 | for c in alignment.components: |
---|
| 105 | # "Empty component" generates an 'e' row |
---|
| 106 | if c.empty: |
---|
| 107 | rows.append( ( "e", c.src, str( c.start ), str( c.size ), c.strand, str( c.src_size ), c.synteny_empty ) ) |
---|
| 108 | continue |
---|
| 109 | # Regular component |
---|
| 110 | rows.append( ( "s", c.src, str( c.start ), str( c.size ), c.strand, str( c.src_size ), c.text ) ) |
---|
| 111 | # If component has quality, write a q row |
---|
| 112 | if c.quality is not None: |
---|
| 113 | rows.append( ( "q", c.src, "", "", "", "", c.quality ) ) |
---|
| 114 | # If component has synteny follow up with an 'i' row |
---|
| 115 | if c.synteny_left and c.synteny_right: |
---|
| 116 | rows.append( ( "i", c.src, "", "", "", "", " ".join( map( str, c.synteny_left + c.synteny_right ) ) ) ) |
---|
| 117 | self.file.write( format_tabular( rows, "llrrrrl" ) ) |
---|
| 118 | self.file.write( "\n" ) |
---|
| 119 | |
---|
| 120 | def close( self ): |
---|
| 121 | self.file.close() |
---|
| 122 | |
---|
| 123 | # ---- Helper methods ------------------------------------------------------- |
---|
| 124 | |
---|
| 125 | def from_string( string, **kwargs ): |
---|
| 126 | return read_next_maf( StringIO( string ), **kwargs ) |
---|
| 127 | |
---|
| 128 | def read_next_maf( file, species_to_lengths=None, parse_e_rows=False ): |
---|
| 129 | """ |
---|
| 130 | Read the next MAF block from `file` and return as an `Alignment` |
---|
| 131 | instance. If `parse_i_rows` is true, empty components will be created |
---|
| 132 | when e rows are encountered. |
---|
| 133 | """ |
---|
| 134 | alignment = Alignment(species_to_lengths=species_to_lengths) |
---|
| 135 | # Attributes line |
---|
| 136 | line = readline( file, skip_blank=True ) |
---|
| 137 | if not line: return None |
---|
| 138 | fields = line.split() |
---|
| 139 | if fields[0] != 'a': raise "Expected 'a ...' line" |
---|
| 140 | alignment.attributes = parse_attributes( fields[1:] ) |
---|
| 141 | if 'score' in alignment.attributes: |
---|
| 142 | alignment.score = alignment.attributes['score'] |
---|
| 143 | del alignment.attributes['score'] |
---|
| 144 | else: |
---|
| 145 | alignment.score = 0 |
---|
| 146 | # Sequence lines |
---|
| 147 | last_component = None |
---|
| 148 | while 1: |
---|
| 149 | line = readline( file ) |
---|
| 150 | # EOF or Blank line terminates alignment components |
---|
| 151 | if not line or line.isspace(): break |
---|
| 152 | if line.isspace(): break |
---|
| 153 | # Parse row |
---|
| 154 | fields = line.split() |
---|
| 155 | if fields[0] == 's': |
---|
| 156 | # An 's' row contains sequence for a component |
---|
| 157 | component = Component() |
---|
| 158 | component.src = fields[1] |
---|
| 159 | component.start = int( fields[2] ) |
---|
| 160 | component.size = int( fields[3] ) |
---|
| 161 | component.strand = fields[4] |
---|
| 162 | component.src_size = int( fields[5] ) |
---|
| 163 | if len(fields) > 6: component.text = fields[6].strip() |
---|
| 164 | # Add to set |
---|
| 165 | alignment.add_component( component ) |
---|
| 166 | last_component = component |
---|
| 167 | elif fields[0] == 'e': |
---|
| 168 | # An 'e' row, when no bases align for a given species this tells |
---|
| 169 | # us something about the synteny |
---|
| 170 | if parse_e_rows: |
---|
| 171 | component = Component() |
---|
| 172 | component.empty = True |
---|
| 173 | component.src = fields[1] |
---|
| 174 | component.start = int( fields[2] ) |
---|
| 175 | component.size = int( fields[3] ) |
---|
| 176 | component.strand = fields[4] |
---|
| 177 | component.src_size = int( fields[5] ) |
---|
| 178 | component.text = None |
---|
| 179 | synteny = fields[6].strip() |
---|
| 180 | assert len( synteny ) == 1, \ |
---|
| 181 | "Synteny status in 'e' rows should be denoted with a single character code" |
---|
| 182 | component.synteny_empty = synteny |
---|
| 183 | alignment.add_component( component ) |
---|
| 184 | last_component = component |
---|
| 185 | elif fields[0] == 'i': |
---|
| 186 | # An 'i' row, indicates left and right synteny status for the |
---|
| 187 | # previous component, we hope ;) |
---|
| 188 | assert fields[1] == last_component.src, "'i' row does not follow matching 's' row" |
---|
| 189 | last_component.synteny_left = ( fields[2], int( fields[3] ) ) |
---|
| 190 | last_component.synteny_right = ( fields[4], int( fields[5] ) ) |
---|
| 191 | elif fields[0] == 'q': |
---|
| 192 | assert fields[1] == last_component.src, "'q' row does not follow matching 's' row" |
---|
| 193 | # TODO: Should convert this to an integer array? |
---|
| 194 | last_component.quality = fields[2] |
---|
| 195 | |
---|
| 196 | return alignment |
---|
| 197 | |
---|
| 198 | def readline( file, skip_blank=False ): |
---|
| 199 | """Read a line from provided file, skipping any blank or comment lines""" |
---|
| 200 | while 1: |
---|
| 201 | line = file.readline() |
---|
| 202 | #print "every line: %r" % line |
---|
| 203 | if not line: return None |
---|
| 204 | if line[0] != '#' and not ( skip_blank and line.isspace() ): |
---|
| 205 | return line |
---|
| 206 | |
---|
| 207 | def parse_attributes( fields ): |
---|
| 208 | """Parse list of key=value strings into a dict""" |
---|
| 209 | attributes = {} |
---|
| 210 | for field in fields: |
---|
| 211 | pair = field.split( '=' ) |
---|
| 212 | attributes[ pair[0] ] = pair[1] |
---|
| 213 | return attributes |
---|
| 214 | |
---|
| 215 | def format_tabular( rows, align=None ): |
---|
| 216 | if len( rows ) == 0: return "" |
---|
| 217 | lengths = [ len( col ) for col in rows[ 0 ] ] |
---|
| 218 | for row in rows[1:]: |
---|
| 219 | for i in range( 0, len( row ) ): |
---|
| 220 | lengths[ i ] = max( lengths[ i ], len( row[ i ] ) ) |
---|
| 221 | rval = "" |
---|
| 222 | for row in rows: |
---|
| 223 | for i in range( 0, len( row ) ): |
---|
| 224 | if align and align[ i ] == "l": |
---|
| 225 | rval += row[ i ].ljust( lengths[ i ] ) |
---|
| 226 | else: |
---|
| 227 | rval += row[ i ].rjust( lengths[ i ] ) |
---|
| 228 | rval += " " |
---|
| 229 | rval += "\n" |
---|
| 230 | return rval |
---|
| 231 | |
---|