[3] | 1 | """ |
---|
| 2 | Classes that represent alignments between multiple sequences. |
---|
| 3 | """ |
---|
| 4 | |
---|
| 5 | import random |
---|
| 6 | import string |
---|
| 7 | import weakref |
---|
| 8 | from bx.misc.readlengths import read_lengths_file |
---|
| 9 | |
---|
| 10 | # DNA reverse complement table |
---|
| 11 | ## DNA_COMP = " - " \ |
---|
| 12 | ## " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " \ |
---|
| 13 | ## " " \ |
---|
| 14 | ## " " |
---|
| 15 | |
---|
| 16 | DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) |
---|
| 17 | |
---|
| 18 | class Alignment( object ): |
---|
| 19 | |
---|
| 20 | def __init__( self, score=0, attributes={}, species_to_lengths=None ): |
---|
| 21 | # species_to_lengths is needed only for file formats that don't provide |
---|
| 22 | # chromosome lengths; it maps each species name to one of these: |
---|
| 23 | # - the name of a file that contains a list of chromosome length pairs |
---|
| 24 | # - a dict mapping chromosome names to their length |
---|
| 25 | # - a single length value (useful when we just have one sequence and no chromosomes) |
---|
| 26 | # internally a file name is replaced by a dict, but only on an "as |
---|
| 27 | # needed" basis |
---|
| 28 | self.score = score |
---|
| 29 | self.text_size = 0 |
---|
| 30 | self.attributes = attributes |
---|
| 31 | if species_to_lengths == None: self.species_to_lengths = {} |
---|
| 32 | else: self.species_to_lengths = species_to_lengths |
---|
| 33 | self.components = [] |
---|
| 34 | |
---|
| 35 | def add_component( self, component ): |
---|
| 36 | component._alignment = weakref.ref( self ) |
---|
| 37 | self.components.append( component ) |
---|
| 38 | if component.text is not None: |
---|
| 39 | if self.text_size == 0: |
---|
| 40 | self.text_size = len( component.text ) |
---|
| 41 | elif self.text_size != len( component.text ): |
---|
| 42 | raise Exception( "Components must have same text length" ) |
---|
| 43 | |
---|
| 44 | def get_score( self ): |
---|
| 45 | return self.__score |
---|
| 46 | def set_score( self,score ): |
---|
| 47 | if type( score ) == str: |
---|
| 48 | try: |
---|
| 49 | score = int(score) |
---|
| 50 | except: |
---|
| 51 | try: |
---|
| 52 | score = float(score) |
---|
| 53 | except: |
---|
| 54 | pass |
---|
| 55 | self.__score = score |
---|
| 56 | score = property( fget=get_score,fset=set_score ) |
---|
| 57 | |
---|
| 58 | def __str__( self ): |
---|
| 59 | s = "a score=" + str( self.score ) |
---|
| 60 | for key in self.attributes: |
---|
| 61 | s += " %s=%s" % ( key, self.attributes[key] ) |
---|
| 62 | s += "\n" |
---|
| 63 | # Components |
---|
| 64 | for c in self.components: |
---|
| 65 | s += str( c ) |
---|
| 66 | s += "\n" |
---|
| 67 | return s |
---|
| 68 | |
---|
| 69 | def src_size( self, src ): |
---|
| 70 | species,chrom = src_split( src ) |
---|
| 71 | if species in self.species_to_lengths: |
---|
| 72 | chrom_to_length = self.species_to_lengths[species] |
---|
| 73 | elif chrom in self.species_to_lengths: |
---|
| 74 | chrom_to_length = self.species_to_lengths |
---|
| 75 | else: |
---|
| 76 | raise "no src_size (no length file for %s)" % species |
---|
| 77 | if type( chrom_to_length ) == int: # (if it's a single length) |
---|
| 78 | return chrom_to_length |
---|
| 79 | if type( chrom_to_length ) == type( "" ): # (if it's a file name) |
---|
| 80 | chrom_to_length = read_lengths_file( chrom_to_length ) |
---|
| 81 | self.species_to_lengths[species] = chrom_to_length |
---|
| 82 | if chrom not in chrom_to_length: raise "no src_size (%s has no length for %s)" % ( species, chrom ) |
---|
| 83 | return chrom_to_length[chrom] |
---|
| 84 | |
---|
| 85 | def get_component_by_src( self, src ): |
---|
| 86 | for c in self.components: |
---|
| 87 | if c.src == src: return c |
---|
| 88 | return None |
---|
| 89 | |
---|
| 90 | def get_component_by_src_start( self, src ): |
---|
| 91 | for c in self.components: |
---|
| 92 | if c.src.startswith( src ): return c |
---|
| 93 | return None |
---|
| 94 | |
---|
| 95 | def slice( self, start, end ): |
---|
| 96 | new = Alignment( score=self.score, attributes=self.attributes ) |
---|
| 97 | for component in self.components: |
---|
| 98 | # FIXME: Is this the right solution? |
---|
| 99 | if component.empty: |
---|
| 100 | continue |
---|
| 101 | new.components.append( component.slice( start, end ) ) |
---|
| 102 | new.text_size = end - start |
---|
| 103 | return new |
---|
| 104 | |
---|
| 105 | def reverse_complement( self ): |
---|
| 106 | new = Alignment( score=self.score, attributes=self.attributes ) |
---|
| 107 | for component in self.components: |
---|
| 108 | new.components.append( component.reverse_complement() ) |
---|
| 109 | new.text_size = self.text_size |
---|
| 110 | return new |
---|
| 111 | |
---|
| 112 | def slice_by_component( self, component_index, start, end ): |
---|
| 113 | """ |
---|
| 114 | Return a slice of the alignment, corresponding to an coordinate interval in a specific component. |
---|
| 115 | |
---|
| 116 | component_index is one of |
---|
| 117 | an integer offset into the components list |
---|
| 118 | a string indicating the src of the desired component |
---|
| 119 | a component |
---|
| 120 | |
---|
| 121 | start and end are relative to the + strand, regardless of the component's strand. |
---|
| 122 | |
---|
| 123 | """ |
---|
| 124 | if type( component_index ) == type( 0 ): |
---|
| 125 | ref = self.components[ component_index ] |
---|
| 126 | elif type( component_index ) == type( "" ): |
---|
| 127 | ref = self.get_component_by_src( component_index ) |
---|
| 128 | elif type( component_index ) == Component: |
---|
| 129 | ref = component_index |
---|
| 130 | else: |
---|
| 131 | raise ValueError( "can't figure out what to do" ) |
---|
| 132 | start_col = ref.coord_to_col( start ) |
---|
| 133 | end_col = ref.coord_to_col( end ) |
---|
| 134 | if (ref.strand == '-'): |
---|
| 135 | (start_col,end_col) = (end_col,start_col) |
---|
| 136 | return self.slice( start_col, end_col ) |
---|
| 137 | |
---|
| 138 | def column_iter( self ): |
---|
| 139 | for i in range( self.text_size ): |
---|
| 140 | yield [ c.text[i] for c in self.components ] |
---|
| 141 | |
---|
| 142 | def limit_to_species( self, species ): |
---|
| 143 | new = Alignment( score=self.score, attributes=self.attributes ) |
---|
| 144 | new.text_size = self.text_size |
---|
| 145 | for component in self.components: |
---|
| 146 | if component.src.split('.')[0] in species: |
---|
| 147 | new.add_component( component ) |
---|
| 148 | return new |
---|
| 149 | |
---|
| 150 | def remove_all_gap_columns( self ): |
---|
| 151 | """ |
---|
| 152 | Remove any columns containing only gaps from alignment components, |
---|
| 153 | text of components is modified IN PLACE. |
---|
| 154 | """ |
---|
| 155 | seqs = [] |
---|
| 156 | for c in self.components: |
---|
| 157 | try: |
---|
| 158 | seqs.append( list( c.text ) ) |
---|
| 159 | except TypeError: |
---|
| 160 | seqs.append( None ) |
---|
| 161 | i = 0 |
---|
| 162 | text_size = self.text_size |
---|
| 163 | while i < text_size: |
---|
| 164 | all_gap = True |
---|
| 165 | for seq in seqs: |
---|
| 166 | if seq is None: continue |
---|
| 167 | if seq[i] != '-': all_gap = False |
---|
| 168 | if all_gap: |
---|
| 169 | for seq in seqs: |
---|
| 170 | if seq is None: continue |
---|
| 171 | del seq[i] |
---|
| 172 | text_size -= 1 |
---|
| 173 | else: |
---|
| 174 | i += 1 |
---|
| 175 | for i in range( len( self.components ) ): |
---|
| 176 | if seqs[i] is None: continue |
---|
| 177 | self.components[i].text = ''.join( seqs[i] ) |
---|
| 178 | self.text_size = text_size |
---|
| 179 | |
---|
| 180 | def __eq__( self, other ): |
---|
| 181 | if other is None or type( other ) != type( self ): |
---|
| 182 | return False |
---|
| 183 | if self.score != other.score: |
---|
| 184 | return False |
---|
| 185 | if self.attributes != other.attributes: |
---|
| 186 | return False |
---|
| 187 | if len( self.components ) != len( other.components ): |
---|
| 188 | return False |
---|
| 189 | for c1, c2 in zip( self.components, other.components ): |
---|
| 190 | if c1 != c2: |
---|
| 191 | return False |
---|
| 192 | return True |
---|
| 193 | |
---|
| 194 | def __ne__( self, other ): |
---|
| 195 | return not( self.__eq__( other ) ) |
---|
| 196 | |
---|
| 197 | def __deepcopy__( self, memo ): |
---|
| 198 | from copy import deepcopy |
---|
| 199 | new = Alignment( score=self.score, attributes=deepcopy( self.attributes ), species_to_lengths=deepcopy( self.species_to_lengths ) ) |
---|
| 200 | for component in self.components: |
---|
| 201 | new.add_component( deepcopy( component ) ) |
---|
| 202 | return new |
---|
| 203 | |
---|
| 204 | class Component( object ): |
---|
| 205 | |
---|
| 206 | def __init__( self, src='', start=0, size=0, strand=None, src_size=None, text='' ): |
---|
| 207 | self._alignment = None |
---|
| 208 | self.src = src |
---|
| 209 | self.start = start # Nota Bene: start,size,strand are as they |
---|
| 210 | self.size = size # .. appear in a MAF file-- origin-zero, end |
---|
| 211 | self.strand = strand # .. excluded, and minus strand counts from |
---|
| 212 | self._src_size = src_size # .. end of sequence |
---|
| 213 | self.text = text |
---|
| 214 | self.quality = None |
---|
| 215 | # Optional fields to keep track of synteny status (only makes sense |
---|
| 216 | # when the alignment is part of an ordered set) |
---|
| 217 | self.synteny_left = None |
---|
| 218 | self.synteny_right = None |
---|
| 219 | self.synteny_empty = None |
---|
| 220 | # If true, this component actually represents a non-aligning region, |
---|
| 221 | # and has no text. |
---|
| 222 | self.empty = False |
---|
| 223 | # Index maps a coordinate (distance along + strand from + start) to alignment column |
---|
| 224 | self.index = None |
---|
| 225 | |
---|
| 226 | def __str__( self ): |
---|
| 227 | if self.empty: |
---|
| 228 | rval = "e %s %d %d %s %d %s" % ( self.src, self.start, |
---|
| 229 | self.size, self.strand, |
---|
| 230 | self.src_size, self.synteny_empty ) |
---|
| 231 | else: |
---|
| 232 | rval = "s %s %d %d %s %d %s" % ( self.src, self.start, |
---|
| 233 | self.size, self.strand, |
---|
| 234 | self.src_size, self.text ) |
---|
| 235 | if self.synteny_left and self.synteny_right: |
---|
| 236 | rval += "\ni %s %s %d %s %d" % ( self.src, |
---|
| 237 | self.synteny_left[0], self.synteny_left[1], |
---|
| 238 | self.synteny_right[0], self.synteny_right[1] ) |
---|
| 239 | return rval |
---|
| 240 | |
---|
| 241 | def get_end( self ): |
---|
| 242 | return self.start + self.size |
---|
| 243 | end = property( fget=get_end ) |
---|
| 244 | |
---|
| 245 | def get_src_size( self ): |
---|
| 246 | if self._src_size == None: |
---|
| 247 | if self._alignment == None: raise "component has no src_size" |
---|
| 248 | self._src_size = self._alignment().src_size( self.src ) |
---|
| 249 | return self._src_size |
---|
| 250 | def set_src_size( self,src_size ): |
---|
| 251 | self._src_size = src_size |
---|
| 252 | src_size = property( fget=get_src_size, fset=set_src_size ) |
---|
| 253 | |
---|
| 254 | def get_forward_strand_start( self ): |
---|
| 255 | if self.strand == '-': return self.src_size - self.end |
---|
| 256 | else: return self.start |
---|
| 257 | forward_strand_start = property( fget=get_forward_strand_start ) |
---|
| 258 | |
---|
| 259 | def get_forward_strand_end( self ): |
---|
| 260 | if self.strand == '-': return self.src_size - self.start |
---|
| 261 | else: return self.end |
---|
| 262 | forward_strand_end = property( fget=get_forward_strand_end) |
---|
| 263 | |
---|
| 264 | def reverse_complement( self ): |
---|
| 265 | start = self.src_size - self.end |
---|
| 266 | if self.strand == "+": strand = "-" |
---|
| 267 | else: strand = "+" |
---|
| 268 | comp = [ch for ch in self.text.translate(DNA_COMP)] |
---|
| 269 | comp.reverse() |
---|
| 270 | text = "".join(comp) |
---|
| 271 | new = Component( self.src, start, self.size, strand, self._src_size, text ) |
---|
| 272 | new._alignment = self._alignment |
---|
| 273 | return new |
---|
| 274 | |
---|
| 275 | def slice( self, start, end ): |
---|
| 276 | new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self._src_size ) |
---|
| 277 | new._alignment = self._alignment |
---|
| 278 | new.text = self.text[start:end] |
---|
| 279 | |
---|
| 280 | #for i in range( 0, start ): |
---|
| 281 | # if self.text[i] != '-': new.start += 1 |
---|
| 282 | #for c in new.text: |
---|
| 283 | # if c != '-': new.size += 1 |
---|
| 284 | new.start += start - self.text.count( '-', 0, start ) |
---|
| 285 | new.size = len( new.text ) - new.text.count( '-' ) |
---|
| 286 | |
---|
| 287 | # FIXME: This annotation probably means nothing after slicing if |
---|
| 288 | # one of the ends changes. In general the 'i' rows of a MAF only |
---|
| 289 | # make sense in context (relative to the previous and next alignments |
---|
| 290 | # in a stream, slicing breaks that). |
---|
| 291 | new.synteny_left = self.synteny_left |
---|
| 292 | new.synteny_right = self.synteny_right |
---|
| 293 | |
---|
| 294 | return new |
---|
| 295 | |
---|
| 296 | def slice_by_coord( self, start, end ): |
---|
| 297 | """ |
---|
| 298 | Return the slice of the component corresponding to a coordinate interval. |
---|
| 299 | |
---|
| 300 | start and end are relative to the + strand, regardless of the component's strand. |
---|
| 301 | |
---|
| 302 | """ |
---|
| 303 | start_col = self.coord_to_col( start ) |
---|
| 304 | end_col = self.coord_to_col( end ) |
---|
| 305 | if (self.strand == '-'): |
---|
| 306 | (start_col,end_col) = (end_col,start_col) |
---|
| 307 | return self.slice( start_col, end_col ) |
---|
| 308 | |
---|
| 309 | def coord_to_col( self, pos ): |
---|
| 310 | """ |
---|
| 311 | Return the alignment column index corresponding to coordinate pos. |
---|
| 312 | |
---|
| 313 | pos is relative to the + strand, regardless of the component's strand. |
---|
| 314 | |
---|
| 315 | """ |
---|
| 316 | start,end = self.get_forward_strand_start(),self.get_forward_strand_end() |
---|
| 317 | if pos < start or pos > end: |
---|
| 318 | raise "Range error: %d not in %d-%d" % ( pos, start, end ) |
---|
| 319 | if not self.index: |
---|
| 320 | self.index = list() |
---|
| 321 | if (self.strand == '-'): |
---|
| 322 | # nota bene: for - strand self.index[x] maps to one column |
---|
| 323 | # higher than is actually associated with the position; thus |
---|
| 324 | # when slice_by_component() and slice_by_coord() flip the ends, |
---|
| 325 | # the resulting slice is correct |
---|
| 326 | for x in range( len(self.text)-1,-1,-1 ): |
---|
| 327 | if not self.text[x] == '-': |
---|
| 328 | self.index.append( x + 1 ) |
---|
| 329 | self.index.append( 0 ) |
---|
| 330 | else: |
---|
| 331 | for x in range( len(self.text) ): |
---|
| 332 | if not self.text[x] == '-': |
---|
| 333 | self.index.append(x) |
---|
| 334 | self.index.append( len(self.text) ) |
---|
| 335 | x = None |
---|
| 336 | try: |
---|
| 337 | x = self.index[ pos - start ] |
---|
| 338 | except: |
---|
| 339 | raise "Error in index." |
---|
| 340 | return x |
---|
| 341 | |
---|
| 342 | |
---|
| 343 | def __eq__( self, other ): |
---|
| 344 | if other is None or type( other ) != type( self ): |
---|
| 345 | return False |
---|
| 346 | return ( self.src == other.src |
---|
| 347 | and self.start == other.start |
---|
| 348 | and self.size == other.size |
---|
| 349 | and self.strand == other.strand |
---|
| 350 | and self._src_size == other._src_size |
---|
| 351 | and self.text == other.text |
---|
| 352 | and self.synteny_left == other.synteny_left |
---|
| 353 | and self.synteny_right == other.synteny_right |
---|
| 354 | and self.synteny_empty == other.synteny_empty |
---|
| 355 | and self.empty == other.empty ) |
---|
| 356 | |
---|
| 357 | def __ne__( self, other ): |
---|
| 358 | return not( self.__eq__( other ) ) |
---|
| 359 | |
---|
| 360 | def __deepcopy__( self, memo ): |
---|
| 361 | new = Component( src=self.src, start=self.start, size=self.size, strand=self.strand, src_size=self._src_size, text=self.text ) |
---|
| 362 | new._alignment = self._alignment |
---|
| 363 | new.quality = self.quality |
---|
| 364 | new.synteny_left = self.synteny_left |
---|
| 365 | new.synteny_right = self.synteny_right |
---|
| 366 | new.synteny_empty = self.synteny_empty |
---|
| 367 | new.empty = self.empty |
---|
| 368 | new.index = self.index |
---|
| 369 | return new |
---|
| 370 | |
---|
| 371 | def get_reader( format, infile, species_to_lengths=None ): |
---|
| 372 | import bx.align.maf, bx.align.axt, bx.align.lav |
---|
| 373 | if format == "maf": return bx.align.maf.Reader( infile, species_to_lengths ) |
---|
| 374 | elif format == "axt": return bx.align.axt.Reader( infile, species_to_lengths ) |
---|
| 375 | elif format == "lav": return bx.align.lav.Reader( infile ) |
---|
| 376 | else: raise "Unknown alignment format %s" % format |
---|
| 377 | |
---|
| 378 | def get_writer( format, outfile, attributes={} ): |
---|
| 379 | import bx.align.maf, bx.align.axt, bx.align.lav |
---|
| 380 | if format == "maf": return bx.align.maf.Writer( outfile, attributes ) |
---|
| 381 | elif format == "axt": return bx.align.axt.Writer( outfile, attributes ) |
---|
| 382 | elif format == "lav": return bx.align.lav.Writer( outfile, attributes ) |
---|
| 383 | else: raise "Unknown alignment format %s" % format |
---|
| 384 | |
---|
| 385 | def get_indexed( format, filename, index_filename=None, keep_open=False, species_to_lengths=None ): |
---|
| 386 | import bx.align.maf, bx.align.axt, bx.align.lav |
---|
| 387 | if format == "maf": return bx.align.maf.Indexed( filename, index_filename, keep_open, species_to_lengths ) |
---|
| 388 | elif format == "axt": return bx.align.axt.Indexed( filename, index_filename, keep_open, species_to_lengths ) |
---|
| 389 | elif format == "lav": raise "LAV support for Indexed has not been implemented" |
---|
| 390 | else: raise "Unknown alignment format %s" % format |
---|
| 391 | |
---|
| 392 | def shuffle_columns( a ): |
---|
| 393 | """Randomize the columns of an alignment""" |
---|
| 394 | mask = range( a.text_size ) |
---|
| 395 | random.shuffle( mask ) |
---|
| 396 | for c in a.components: |
---|
| 397 | c.text = ''.join( [ c.text[i] for i in mask ] ) |
---|
| 398 | |
---|
| 399 | def src_split( src ): # splits src into species,chrom |
---|
| 400 | dot = src.rfind( "." ) |
---|
| 401 | if dot == -1: return None,src |
---|
| 402 | else: return src[:dot],src[dot+1:] |
---|
| 403 | |
---|
| 404 | def src_merge( species,chrom,contig=None ): # creates src (inverse of src_split) |
---|
| 405 | if species == None: src = chrom |
---|
| 406 | else: src = species + "." + chrom |
---|
| 407 | if contig != None: src += "[%s]" % contig |
---|
| 408 | return src |
---|
| 409 | |
---|
| 410 | # ---- Read C extension if available --------------------------------------- |
---|
| 411 | |
---|
| 412 | try: |
---|
| 413 | from _core import coord_to_col |
---|
| 414 | except: |
---|
| 415 | def coord_to_col( start, text, pos ): |
---|
| 416 | col = 0 |
---|
| 417 | while start < pos: |
---|
| 418 | if text[col] != '-': |
---|
| 419 | start += 1 |
---|
| 420 | col += 1 |
---|
| 421 | return col |
---|