| 1 | """ |
|---|
| 2 | Classes that represent alignments between multiple sequences. |
|---|
| 3 | """ |
|---|
| 4 | |
|---|
| 5 | import random |
|---|
| 6 | import string |
|---|
| 7 | import weakref |
|---|
| 8 | from bx.misc.readlengths import read_lengths_file |
|---|
| 9 | |
|---|
| 10 | # DNA reverse complement table |
|---|
| 11 | ## DNA_COMP = " - " \ |
|---|
| 12 | ## " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " \ |
|---|
| 13 | ## " " \ |
|---|
| 14 | ## " " |
|---|
| 15 | |
|---|
| 16 | DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) |
|---|
| 17 | |
|---|
| 18 | class Alignment( object ): |
|---|
| 19 | |
|---|
| 20 | def __init__( self, score=0, attributes={}, species_to_lengths=None ): |
|---|
| 21 | # species_to_lengths is needed only for file formats that don't provide |
|---|
| 22 | # chromosome lengths; it maps each species name to one of these: |
|---|
| 23 | # - the name of a file that contains a list of chromosome length pairs |
|---|
| 24 | # - a dict mapping chromosome names to their length |
|---|
| 25 | # - a single length value (useful when we just have one sequence and no chromosomes) |
|---|
| 26 | # internally a file name is replaced by a dict, but only on an "as |
|---|
| 27 | # needed" basis |
|---|
| 28 | self.score = score |
|---|
| 29 | self.text_size = 0 |
|---|
| 30 | self.attributes = attributes |
|---|
| 31 | if species_to_lengths == None: self.species_to_lengths = {} |
|---|
| 32 | else: self.species_to_lengths = species_to_lengths |
|---|
| 33 | self.components = [] |
|---|
| 34 | |
|---|
| 35 | def add_component( self, component ): |
|---|
| 36 | component._alignment = weakref.ref( self ) |
|---|
| 37 | self.components.append( component ) |
|---|
| 38 | if component.text is not None: |
|---|
| 39 | if self.text_size == 0: |
|---|
| 40 | self.text_size = len( component.text ) |
|---|
| 41 | elif self.text_size != len( component.text ): |
|---|
| 42 | raise Exception( "Components must have same text length" ) |
|---|
| 43 | |
|---|
| 44 | def get_score( self ): |
|---|
| 45 | return self.__score |
|---|
| 46 | def set_score( self,score ): |
|---|
| 47 | if type( score ) == str: |
|---|
| 48 | try: |
|---|
| 49 | score = int(score) |
|---|
| 50 | except: |
|---|
| 51 | try: |
|---|
| 52 | score = float(score) |
|---|
| 53 | except: |
|---|
| 54 | pass |
|---|
| 55 | self.__score = score |
|---|
| 56 | score = property( fget=get_score,fset=set_score ) |
|---|
| 57 | |
|---|
| 58 | def __str__( self ): |
|---|
| 59 | s = "a score=" + str( self.score ) |
|---|
| 60 | for key in self.attributes: |
|---|
| 61 | s += " %s=%s" % ( key, self.attributes[key] ) |
|---|
| 62 | s += "\n" |
|---|
| 63 | # Components |
|---|
| 64 | for c in self.components: |
|---|
| 65 | s += str( c ) |
|---|
| 66 | s += "\n" |
|---|
| 67 | return s |
|---|
| 68 | |
|---|
| 69 | def src_size( self, src ): |
|---|
| 70 | species,chrom = src_split( src ) |
|---|
| 71 | if species in self.species_to_lengths: |
|---|
| 72 | chrom_to_length = self.species_to_lengths[species] |
|---|
| 73 | elif chrom in self.species_to_lengths: |
|---|
| 74 | chrom_to_length = self.species_to_lengths |
|---|
| 75 | else: |
|---|
| 76 | raise "no src_size (no length file for %s)" % species |
|---|
| 77 | if type( chrom_to_length ) == int: # (if it's a single length) |
|---|
| 78 | return chrom_to_length |
|---|
| 79 | if type( chrom_to_length ) == type( "" ): # (if it's a file name) |
|---|
| 80 | chrom_to_length = read_lengths_file( chrom_to_length ) |
|---|
| 81 | self.species_to_lengths[species] = chrom_to_length |
|---|
| 82 | if chrom not in chrom_to_length: raise "no src_size (%s has no length for %s)" % ( species, chrom ) |
|---|
| 83 | return chrom_to_length[chrom] |
|---|
| 84 | |
|---|
| 85 | def get_component_by_src( self, src ): |
|---|
| 86 | for c in self.components: |
|---|
| 87 | if c.src == src: return c |
|---|
| 88 | return None |
|---|
| 89 | |
|---|
| 90 | def get_component_by_src_start( self, src ): |
|---|
| 91 | for c in self.components: |
|---|
| 92 | if c.src.startswith( src ): return c |
|---|
| 93 | return None |
|---|
| 94 | |
|---|
| 95 | def slice( self, start, end ): |
|---|
| 96 | new = Alignment( score=self.score, attributes=self.attributes ) |
|---|
| 97 | for component in self.components: |
|---|
| 98 | # FIXME: Is this the right solution? |
|---|
| 99 | if component.empty: |
|---|
| 100 | continue |
|---|
| 101 | new.components.append( component.slice( start, end ) ) |
|---|
| 102 | new.text_size = end - start |
|---|
| 103 | return new |
|---|
| 104 | |
|---|
| 105 | def reverse_complement( self ): |
|---|
| 106 | new = Alignment( score=self.score, attributes=self.attributes ) |
|---|
| 107 | for component in self.components: |
|---|
| 108 | new.components.append( component.reverse_complement() ) |
|---|
| 109 | new.text_size = self.text_size |
|---|
| 110 | return new |
|---|
| 111 | |
|---|
| 112 | def slice_by_component( self, component_index, start, end ): |
|---|
| 113 | """ |
|---|
| 114 | Return a slice of the alignment, corresponding to an coordinate interval in a specific component. |
|---|
| 115 | |
|---|
| 116 | component_index is one of |
|---|
| 117 | an integer offset into the components list |
|---|
| 118 | a string indicating the src of the desired component |
|---|
| 119 | a component |
|---|
| 120 | |
|---|
| 121 | start and end are relative to the + strand, regardless of the component's strand. |
|---|
| 122 | |
|---|
| 123 | """ |
|---|
| 124 | if type( component_index ) == type( 0 ): |
|---|
| 125 | ref = self.components[ component_index ] |
|---|
| 126 | elif type( component_index ) == type( "" ): |
|---|
| 127 | ref = self.get_component_by_src( component_index ) |
|---|
| 128 | elif type( component_index ) == Component: |
|---|
| 129 | ref = component_index |
|---|
| 130 | else: |
|---|
| 131 | raise ValueError( "can't figure out what to do" ) |
|---|
| 132 | start_col = ref.coord_to_col( start ) |
|---|
| 133 | end_col = ref.coord_to_col( end ) |
|---|
| 134 | if (ref.strand == '-'): |
|---|
| 135 | (start_col,end_col) = (end_col,start_col) |
|---|
| 136 | return self.slice( start_col, end_col ) |
|---|
| 137 | |
|---|
| 138 | def column_iter( self ): |
|---|
| 139 | for i in range( self.text_size ): |
|---|
| 140 | yield [ c.text[i] for c in self.components ] |
|---|
| 141 | |
|---|
| 142 | def limit_to_species( self, species ): |
|---|
| 143 | new = Alignment( score=self.score, attributes=self.attributes ) |
|---|
| 144 | new.text_size = self.text_size |
|---|
| 145 | for component in self.components: |
|---|
| 146 | if component.src.split('.')[0] in species: |
|---|
| 147 | new.add_component( component ) |
|---|
| 148 | return new |
|---|
| 149 | |
|---|
| 150 | def remove_all_gap_columns( self ): |
|---|
| 151 | """ |
|---|
| 152 | Remove any columns containing only gaps from alignment components, |
|---|
| 153 | text of components is modified IN PLACE. |
|---|
| 154 | """ |
|---|
| 155 | seqs = [] |
|---|
| 156 | for c in self.components: |
|---|
| 157 | try: |
|---|
| 158 | seqs.append( list( c.text ) ) |
|---|
| 159 | except TypeError: |
|---|
| 160 | seqs.append( None ) |
|---|
| 161 | i = 0 |
|---|
| 162 | text_size = self.text_size |
|---|
| 163 | while i < text_size: |
|---|
| 164 | all_gap = True |
|---|
| 165 | for seq in seqs: |
|---|
| 166 | if seq is None: continue |
|---|
| 167 | if seq[i] != '-': all_gap = False |
|---|
| 168 | if all_gap: |
|---|
| 169 | for seq in seqs: |
|---|
| 170 | if seq is None: continue |
|---|
| 171 | del seq[i] |
|---|
| 172 | text_size -= 1 |
|---|
| 173 | else: |
|---|
| 174 | i += 1 |
|---|
| 175 | for i in range( len( self.components ) ): |
|---|
| 176 | if seqs[i] is None: continue |
|---|
| 177 | self.components[i].text = ''.join( seqs[i] ) |
|---|
| 178 | self.text_size = text_size |
|---|
| 179 | |
|---|
| 180 | def __eq__( self, other ): |
|---|
| 181 | if other is None or type( other ) != type( self ): |
|---|
| 182 | return False |
|---|
| 183 | if self.score != other.score: |
|---|
| 184 | return False |
|---|
| 185 | if self.attributes != other.attributes: |
|---|
| 186 | return False |
|---|
| 187 | if len( self.components ) != len( other.components ): |
|---|
| 188 | return False |
|---|
| 189 | for c1, c2 in zip( self.components, other.components ): |
|---|
| 190 | if c1 != c2: |
|---|
| 191 | return False |
|---|
| 192 | return True |
|---|
| 193 | |
|---|
| 194 | def __ne__( self, other ): |
|---|
| 195 | return not( self.__eq__( other ) ) |
|---|
| 196 | |
|---|
| 197 | def __deepcopy__( self, memo ): |
|---|
| 198 | from copy import deepcopy |
|---|
| 199 | new = Alignment( score=self.score, attributes=deepcopy( self.attributes ), species_to_lengths=deepcopy( self.species_to_lengths ) ) |
|---|
| 200 | for component in self.components: |
|---|
| 201 | new.add_component( deepcopy( component ) ) |
|---|
| 202 | return new |
|---|
| 203 | |
|---|
| 204 | class Component( object ): |
|---|
| 205 | |
|---|
| 206 | def __init__( self, src='', start=0, size=0, strand=None, src_size=None, text='' ): |
|---|
| 207 | self._alignment = None |
|---|
| 208 | self.src = src |
|---|
| 209 | self.start = start # Nota Bene: start,size,strand are as they |
|---|
| 210 | self.size = size # .. appear in a MAF file-- origin-zero, end |
|---|
| 211 | self.strand = strand # .. excluded, and minus strand counts from |
|---|
| 212 | self._src_size = src_size # .. end of sequence |
|---|
| 213 | self.text = text |
|---|
| 214 | self.quality = None |
|---|
| 215 | # Optional fields to keep track of synteny status (only makes sense |
|---|
| 216 | # when the alignment is part of an ordered set) |
|---|
| 217 | self.synteny_left = None |
|---|
| 218 | self.synteny_right = None |
|---|
| 219 | self.synteny_empty = None |
|---|
| 220 | # If true, this component actually represents a non-aligning region, |
|---|
| 221 | # and has no text. |
|---|
| 222 | self.empty = False |
|---|
| 223 | # Index maps a coordinate (distance along + strand from + start) to alignment column |
|---|
| 224 | self.index = None |
|---|
| 225 | |
|---|
| 226 | def __str__( self ): |
|---|
| 227 | if self.empty: |
|---|
| 228 | rval = "e %s %d %d %s %d %s" % ( self.src, self.start, |
|---|
| 229 | self.size, self.strand, |
|---|
| 230 | self.src_size, self.synteny_empty ) |
|---|
| 231 | else: |
|---|
| 232 | rval = "s %s %d %d %s %d %s" % ( self.src, self.start, |
|---|
| 233 | self.size, self.strand, |
|---|
| 234 | self.src_size, self.text ) |
|---|
| 235 | if self.synteny_left and self.synteny_right: |
|---|
| 236 | rval += "\ni %s %s %d %s %d" % ( self.src, |
|---|
| 237 | self.synteny_left[0], self.synteny_left[1], |
|---|
| 238 | self.synteny_right[0], self.synteny_right[1] ) |
|---|
| 239 | return rval |
|---|
| 240 | |
|---|
| 241 | def get_end( self ): |
|---|
| 242 | return self.start + self.size |
|---|
| 243 | end = property( fget=get_end ) |
|---|
| 244 | |
|---|
| 245 | def get_src_size( self ): |
|---|
| 246 | if self._src_size == None: |
|---|
| 247 | if self._alignment == None: raise "component has no src_size" |
|---|
| 248 | self._src_size = self._alignment().src_size( self.src ) |
|---|
| 249 | return self._src_size |
|---|
| 250 | def set_src_size( self,src_size ): |
|---|
| 251 | self._src_size = src_size |
|---|
| 252 | src_size = property( fget=get_src_size, fset=set_src_size ) |
|---|
| 253 | |
|---|
| 254 | def get_forward_strand_start( self ): |
|---|
| 255 | if self.strand == '-': return self.src_size - self.end |
|---|
| 256 | else: return self.start |
|---|
| 257 | forward_strand_start = property( fget=get_forward_strand_start ) |
|---|
| 258 | |
|---|
| 259 | def get_forward_strand_end( self ): |
|---|
| 260 | if self.strand == '-': return self.src_size - self.start |
|---|
| 261 | else: return self.end |
|---|
| 262 | forward_strand_end = property( fget=get_forward_strand_end) |
|---|
| 263 | |
|---|
| 264 | def reverse_complement( self ): |
|---|
| 265 | start = self.src_size - self.end |
|---|
| 266 | if self.strand == "+": strand = "-" |
|---|
| 267 | else: strand = "+" |
|---|
| 268 | comp = [ch for ch in self.text.translate(DNA_COMP)] |
|---|
| 269 | comp.reverse() |
|---|
| 270 | text = "".join(comp) |
|---|
| 271 | new = Component( self.src, start, self.size, strand, self._src_size, text ) |
|---|
| 272 | new._alignment = self._alignment |
|---|
| 273 | return new |
|---|
| 274 | |
|---|
| 275 | def slice( self, start, end ): |
|---|
| 276 | new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self._src_size ) |
|---|
| 277 | new._alignment = self._alignment |
|---|
| 278 | new.text = self.text[start:end] |
|---|
| 279 | |
|---|
| 280 | #for i in range( 0, start ): |
|---|
| 281 | # if self.text[i] != '-': new.start += 1 |
|---|
| 282 | #for c in new.text: |
|---|
| 283 | # if c != '-': new.size += 1 |
|---|
| 284 | new.start += start - self.text.count( '-', 0, start ) |
|---|
| 285 | new.size = len( new.text ) - new.text.count( '-' ) |
|---|
| 286 | |
|---|
| 287 | # FIXME: This annotation probably means nothing after slicing if |
|---|
| 288 | # one of the ends changes. In general the 'i' rows of a MAF only |
|---|
| 289 | # make sense in context (relative to the previous and next alignments |
|---|
| 290 | # in a stream, slicing breaks that). |
|---|
| 291 | new.synteny_left = self.synteny_left |
|---|
| 292 | new.synteny_right = self.synteny_right |
|---|
| 293 | |
|---|
| 294 | return new |
|---|
| 295 | |
|---|
| 296 | def slice_by_coord( self, start, end ): |
|---|
| 297 | """ |
|---|
| 298 | Return the slice of the component corresponding to a coordinate interval. |
|---|
| 299 | |
|---|
| 300 | start and end are relative to the + strand, regardless of the component's strand. |
|---|
| 301 | |
|---|
| 302 | """ |
|---|
| 303 | start_col = self.coord_to_col( start ) |
|---|
| 304 | end_col = self.coord_to_col( end ) |
|---|
| 305 | if (self.strand == '-'): |
|---|
| 306 | (start_col,end_col) = (end_col,start_col) |
|---|
| 307 | return self.slice( start_col, end_col ) |
|---|
| 308 | |
|---|
| 309 | def coord_to_col( self, pos ): |
|---|
| 310 | """ |
|---|
| 311 | Return the alignment column index corresponding to coordinate pos. |
|---|
| 312 | |
|---|
| 313 | pos is relative to the + strand, regardless of the component's strand. |
|---|
| 314 | |
|---|
| 315 | """ |
|---|
| 316 | start,end = self.get_forward_strand_start(),self.get_forward_strand_end() |
|---|
| 317 | if pos < start or pos > end: |
|---|
| 318 | raise "Range error: %d not in %d-%d" % ( pos, start, end ) |
|---|
| 319 | if not self.index: |
|---|
| 320 | self.index = list() |
|---|
| 321 | if (self.strand == '-'): |
|---|
| 322 | # nota bene: for - strand self.index[x] maps to one column |
|---|
| 323 | # higher than is actually associated with the position; thus |
|---|
| 324 | # when slice_by_component() and slice_by_coord() flip the ends, |
|---|
| 325 | # the resulting slice is correct |
|---|
| 326 | for x in range( len(self.text)-1,-1,-1 ): |
|---|
| 327 | if not self.text[x] == '-': |
|---|
| 328 | self.index.append( x + 1 ) |
|---|
| 329 | self.index.append( 0 ) |
|---|
| 330 | else: |
|---|
| 331 | for x in range( len(self.text) ): |
|---|
| 332 | if not self.text[x] == '-': |
|---|
| 333 | self.index.append(x) |
|---|
| 334 | self.index.append( len(self.text) ) |
|---|
| 335 | x = None |
|---|
| 336 | try: |
|---|
| 337 | x = self.index[ pos - start ] |
|---|
| 338 | except: |
|---|
| 339 | raise "Error in index." |
|---|
| 340 | return x |
|---|
| 341 | |
|---|
| 342 | |
|---|
| 343 | def __eq__( self, other ): |
|---|
| 344 | if other is None or type( other ) != type( self ): |
|---|
| 345 | return False |
|---|
| 346 | return ( self.src == other.src |
|---|
| 347 | and self.start == other.start |
|---|
| 348 | and self.size == other.size |
|---|
| 349 | and self.strand == other.strand |
|---|
| 350 | and self._src_size == other._src_size |
|---|
| 351 | and self.text == other.text |
|---|
| 352 | and self.synteny_left == other.synteny_left |
|---|
| 353 | and self.synteny_right == other.synteny_right |
|---|
| 354 | and self.synteny_empty == other.synteny_empty |
|---|
| 355 | and self.empty == other.empty ) |
|---|
| 356 | |
|---|
| 357 | def __ne__( self, other ): |
|---|
| 358 | return not( self.__eq__( other ) ) |
|---|
| 359 | |
|---|
| 360 | def __deepcopy__( self, memo ): |
|---|
| 361 | new = Component( src=self.src, start=self.start, size=self.size, strand=self.strand, src_size=self._src_size, text=self.text ) |
|---|
| 362 | new._alignment = self._alignment |
|---|
| 363 | new.quality = self.quality |
|---|
| 364 | new.synteny_left = self.synteny_left |
|---|
| 365 | new.synteny_right = self.synteny_right |
|---|
| 366 | new.synteny_empty = self.synteny_empty |
|---|
| 367 | new.empty = self.empty |
|---|
| 368 | new.index = self.index |
|---|
| 369 | return new |
|---|
| 370 | |
|---|
| 371 | def get_reader( format, infile, species_to_lengths=None ): |
|---|
| 372 | import bx.align.maf, bx.align.axt, bx.align.lav |
|---|
| 373 | if format == "maf": return bx.align.maf.Reader( infile, species_to_lengths ) |
|---|
| 374 | elif format == "axt": return bx.align.axt.Reader( infile, species_to_lengths ) |
|---|
| 375 | elif format == "lav": return bx.align.lav.Reader( infile ) |
|---|
| 376 | else: raise "Unknown alignment format %s" % format |
|---|
| 377 | |
|---|
| 378 | def get_writer( format, outfile, attributes={} ): |
|---|
| 379 | import bx.align.maf, bx.align.axt, bx.align.lav |
|---|
| 380 | if format == "maf": return bx.align.maf.Writer( outfile, attributes ) |
|---|
| 381 | elif format == "axt": return bx.align.axt.Writer( outfile, attributes ) |
|---|
| 382 | elif format == "lav": return bx.align.lav.Writer( outfile, attributes ) |
|---|
| 383 | else: raise "Unknown alignment format %s" % format |
|---|
| 384 | |
|---|
| 385 | def get_indexed( format, filename, index_filename=None, keep_open=False, species_to_lengths=None ): |
|---|
| 386 | import bx.align.maf, bx.align.axt, bx.align.lav |
|---|
| 387 | if format == "maf": return bx.align.maf.Indexed( filename, index_filename, keep_open, species_to_lengths ) |
|---|
| 388 | elif format == "axt": return bx.align.axt.Indexed( filename, index_filename, keep_open, species_to_lengths ) |
|---|
| 389 | elif format == "lav": raise "LAV support for Indexed has not been implemented" |
|---|
| 390 | else: raise "Unknown alignment format %s" % format |
|---|
| 391 | |
|---|
| 392 | def shuffle_columns( a ): |
|---|
| 393 | """Randomize the columns of an alignment""" |
|---|
| 394 | mask = range( a.text_size ) |
|---|
| 395 | random.shuffle( mask ) |
|---|
| 396 | for c in a.components: |
|---|
| 397 | c.text = ''.join( [ c.text[i] for i in mask ] ) |
|---|
| 398 | |
|---|
| 399 | def src_split( src ): # splits src into species,chrom |
|---|
| 400 | dot = src.rfind( "." ) |
|---|
| 401 | if dot == -1: return None,src |
|---|
| 402 | else: return src[:dot],src[dot+1:] |
|---|
| 403 | |
|---|
| 404 | def src_merge( species,chrom,contig=None ): # creates src (inverse of src_split) |
|---|
| 405 | if species == None: src = chrom |
|---|
| 406 | else: src = species + "." + chrom |
|---|
| 407 | if contig != None: src += "[%s]" % contig |
|---|
| 408 | return src |
|---|
| 409 | |
|---|
| 410 | # ---- Read C extension if available --------------------------------------- |
|---|
| 411 | |
|---|
| 412 | try: |
|---|
| 413 | from _core import coord_to_col |
|---|
| 414 | except: |
|---|
| 415 | def coord_to_col( start, text, pos ): |
|---|
| 416 | col = 0 |
|---|
| 417 | while start < pos: |
|---|
| 418 | if text[col] != '-': |
|---|
| 419 | start += 1 |
|---|
| 420 | col += 1 |
|---|
| 421 | return col |
|---|