1 | """ |
---|
2 | Classes that represent alignments between multiple sequences. |
---|
3 | """ |
---|
4 | |
---|
5 | import random |
---|
6 | import string |
---|
7 | import weakref |
---|
8 | from bx.misc.readlengths import read_lengths_file |
---|
9 | |
---|
10 | # DNA reverse complement table |
---|
11 | ## DNA_COMP = " - " \ |
---|
12 | ## " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " \ |
---|
13 | ## " " \ |
---|
14 | ## " " |
---|
15 | |
---|
16 | DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) |
---|
17 | |
---|
18 | class Alignment( object ): |
---|
19 | |
---|
20 | def __init__( self, score=0, attributes={}, species_to_lengths=None ): |
---|
21 | # species_to_lengths is needed only for file formats that don't provide |
---|
22 | # chromosome lengths; it maps each species name to one of these: |
---|
23 | # - the name of a file that contains a list of chromosome length pairs |
---|
24 | # - a dict mapping chromosome names to their length |
---|
25 | # - a single length value (useful when we just have one sequence and no chromosomes) |
---|
26 | # internally a file name is replaced by a dict, but only on an "as |
---|
27 | # needed" basis |
---|
28 | self.score = score |
---|
29 | self.text_size = 0 |
---|
30 | self.attributes = attributes |
---|
31 | if species_to_lengths == None: self.species_to_lengths = {} |
---|
32 | else: self.species_to_lengths = species_to_lengths |
---|
33 | self.components = [] |
---|
34 | |
---|
35 | def add_component( self, component ): |
---|
36 | component._alignment = weakref.ref( self ) |
---|
37 | self.components.append( component ) |
---|
38 | if component.text is not None: |
---|
39 | if self.text_size == 0: |
---|
40 | self.text_size = len( component.text ) |
---|
41 | elif self.text_size != len( component.text ): |
---|
42 | raise Exception( "Components must have same text length" ) |
---|
43 | |
---|
44 | def get_score( self ): |
---|
45 | return self.__score |
---|
46 | def set_score( self,score ): |
---|
47 | if type( score ) == str: |
---|
48 | try: |
---|
49 | score = int(score) |
---|
50 | except: |
---|
51 | try: |
---|
52 | score = float(score) |
---|
53 | except: |
---|
54 | pass |
---|
55 | self.__score = score |
---|
56 | score = property( fget=get_score,fset=set_score ) |
---|
57 | |
---|
58 | def __str__( self ): |
---|
59 | s = "a score=" + str( self.score ) |
---|
60 | for key in self.attributes: |
---|
61 | s += " %s=%s" % ( key, self.attributes[key] ) |
---|
62 | s += "\n" |
---|
63 | # Components |
---|
64 | for c in self.components: |
---|
65 | s += str( c ) |
---|
66 | s += "\n" |
---|
67 | return s |
---|
68 | |
---|
69 | def src_size( self, src ): |
---|
70 | species,chrom = src_split( src ) |
---|
71 | if species in self.species_to_lengths: |
---|
72 | chrom_to_length = self.species_to_lengths[species] |
---|
73 | elif chrom in self.species_to_lengths: |
---|
74 | chrom_to_length = self.species_to_lengths |
---|
75 | else: |
---|
76 | raise "no src_size (no length file for %s)" % species |
---|
77 | if type( chrom_to_length ) == int: # (if it's a single length) |
---|
78 | return chrom_to_length |
---|
79 | if type( chrom_to_length ) == type( "" ): # (if it's a file name) |
---|
80 | chrom_to_length = read_lengths_file( chrom_to_length ) |
---|
81 | self.species_to_lengths[species] = chrom_to_length |
---|
82 | if chrom not in chrom_to_length: raise "no src_size (%s has no length for %s)" % ( species, chrom ) |
---|
83 | return chrom_to_length[chrom] |
---|
84 | |
---|
85 | def get_component_by_src( self, src ): |
---|
86 | for c in self.components: |
---|
87 | if c.src == src: return c |
---|
88 | return None |
---|
89 | |
---|
90 | def get_component_by_src_start( self, src ): |
---|
91 | for c in self.components: |
---|
92 | if c.src.startswith( src ): return c |
---|
93 | return None |
---|
94 | |
---|
95 | def slice( self, start, end ): |
---|
96 | new = Alignment( score=self.score, attributes=self.attributes ) |
---|
97 | for component in self.components: |
---|
98 | # FIXME: Is this the right solution? |
---|
99 | if component.empty: |
---|
100 | continue |
---|
101 | new.components.append( component.slice( start, end ) ) |
---|
102 | new.text_size = end - start |
---|
103 | return new |
---|
104 | |
---|
105 | def reverse_complement( self ): |
---|
106 | new = Alignment( score=self.score, attributes=self.attributes ) |
---|
107 | for component in self.components: |
---|
108 | new.components.append( component.reverse_complement() ) |
---|
109 | new.text_size = self.text_size |
---|
110 | return new |
---|
111 | |
---|
112 | def slice_by_component( self, component_index, start, end ): |
---|
113 | """ |
---|
114 | Return a slice of the alignment, corresponding to an coordinate interval in a specific component. |
---|
115 | |
---|
116 | component_index is one of |
---|
117 | an integer offset into the components list |
---|
118 | a string indicating the src of the desired component |
---|
119 | a component |
---|
120 | |
---|
121 | start and end are relative to the + strand, regardless of the component's strand. |
---|
122 | |
---|
123 | """ |
---|
124 | if type( component_index ) == type( 0 ): |
---|
125 | ref = self.components[ component_index ] |
---|
126 | elif type( component_index ) == type( "" ): |
---|
127 | ref = self.get_component_by_src( component_index ) |
---|
128 | elif type( component_index ) == Component: |
---|
129 | ref = component_index |
---|
130 | else: |
---|
131 | raise ValueError( "can't figure out what to do" ) |
---|
132 | start_col = ref.coord_to_col( start ) |
---|
133 | end_col = ref.coord_to_col( end ) |
---|
134 | if (ref.strand == '-'): |
---|
135 | (start_col,end_col) = (end_col,start_col) |
---|
136 | return self.slice( start_col, end_col ) |
---|
137 | |
---|
138 | def column_iter( self ): |
---|
139 | for i in range( self.text_size ): |
---|
140 | yield [ c.text[i] for c in self.components ] |
---|
141 | |
---|
142 | def limit_to_species( self, species ): |
---|
143 | new = Alignment( score=self.score, attributes=self.attributes ) |
---|
144 | new.text_size = self.text_size |
---|
145 | for component in self.components: |
---|
146 | if component.src.split('.')[0] in species: |
---|
147 | new.add_component( component ) |
---|
148 | return new |
---|
149 | |
---|
150 | def remove_all_gap_columns( self ): |
---|
151 | """ |
---|
152 | Remove any columns containing only gaps from alignment components, |
---|
153 | text of components is modified IN PLACE. |
---|
154 | """ |
---|
155 | seqs = [] |
---|
156 | for c in self.components: |
---|
157 | try: |
---|
158 | seqs.append( list( c.text ) ) |
---|
159 | except TypeError: |
---|
160 | seqs.append( None ) |
---|
161 | i = 0 |
---|
162 | text_size = self.text_size |
---|
163 | while i < text_size: |
---|
164 | all_gap = True |
---|
165 | for seq in seqs: |
---|
166 | if seq is None: continue |
---|
167 | if seq[i] != '-': all_gap = False |
---|
168 | if all_gap: |
---|
169 | for seq in seqs: |
---|
170 | if seq is None: continue |
---|
171 | del seq[i] |
---|
172 | text_size -= 1 |
---|
173 | else: |
---|
174 | i += 1 |
---|
175 | for i in range( len( self.components ) ): |
---|
176 | if seqs[i] is None: continue |
---|
177 | self.components[i].text = ''.join( seqs[i] ) |
---|
178 | self.text_size = text_size |
---|
179 | |
---|
180 | def __eq__( self, other ): |
---|
181 | if other is None or type( other ) != type( self ): |
---|
182 | return False |
---|
183 | if self.score != other.score: |
---|
184 | return False |
---|
185 | if self.attributes != other.attributes: |
---|
186 | return False |
---|
187 | if len( self.components ) != len( other.components ): |
---|
188 | return False |
---|
189 | for c1, c2 in zip( self.components, other.components ): |
---|
190 | if c1 != c2: |
---|
191 | return False |
---|
192 | return True |
---|
193 | |
---|
194 | def __ne__( self, other ): |
---|
195 | return not( self.__eq__( other ) ) |
---|
196 | |
---|
197 | def __deepcopy__( self, memo ): |
---|
198 | from copy import deepcopy |
---|
199 | new = Alignment( score=self.score, attributes=deepcopy( self.attributes ), species_to_lengths=deepcopy( self.species_to_lengths ) ) |
---|
200 | for component in self.components: |
---|
201 | new.add_component( deepcopy( component ) ) |
---|
202 | return new |
---|
203 | |
---|
204 | class Component( object ): |
---|
205 | |
---|
206 | def __init__( self, src='', start=0, size=0, strand=None, src_size=None, text='' ): |
---|
207 | self._alignment = None |
---|
208 | self.src = src |
---|
209 | self.start = start # Nota Bene: start,size,strand are as they |
---|
210 | self.size = size # .. appear in a MAF file-- origin-zero, end |
---|
211 | self.strand = strand # .. excluded, and minus strand counts from |
---|
212 | self._src_size = src_size # .. end of sequence |
---|
213 | self.text = text |
---|
214 | self.quality = None |
---|
215 | # Optional fields to keep track of synteny status (only makes sense |
---|
216 | # when the alignment is part of an ordered set) |
---|
217 | self.synteny_left = None |
---|
218 | self.synteny_right = None |
---|
219 | self.synteny_empty = None |
---|
220 | # If true, this component actually represents a non-aligning region, |
---|
221 | # and has no text. |
---|
222 | self.empty = False |
---|
223 | # Index maps a coordinate (distance along + strand from + start) to alignment column |
---|
224 | self.index = None |
---|
225 | |
---|
226 | def __str__( self ): |
---|
227 | if self.empty: |
---|
228 | rval = "e %s %d %d %s %d %s" % ( self.src, self.start, |
---|
229 | self.size, self.strand, |
---|
230 | self.src_size, self.synteny_empty ) |
---|
231 | else: |
---|
232 | rval = "s %s %d %d %s %d %s" % ( self.src, self.start, |
---|
233 | self.size, self.strand, |
---|
234 | self.src_size, self.text ) |
---|
235 | if self.synteny_left and self.synteny_right: |
---|
236 | rval += "\ni %s %s %d %s %d" % ( self.src, |
---|
237 | self.synteny_left[0], self.synteny_left[1], |
---|
238 | self.synteny_right[0], self.synteny_right[1] ) |
---|
239 | return rval |
---|
240 | |
---|
241 | def get_end( self ): |
---|
242 | return self.start + self.size |
---|
243 | end = property( fget=get_end ) |
---|
244 | |
---|
245 | def get_src_size( self ): |
---|
246 | if self._src_size == None: |
---|
247 | if self._alignment == None: raise "component has no src_size" |
---|
248 | self._src_size = self._alignment().src_size( self.src ) |
---|
249 | return self._src_size |
---|
250 | def set_src_size( self,src_size ): |
---|
251 | self._src_size = src_size |
---|
252 | src_size = property( fget=get_src_size, fset=set_src_size ) |
---|
253 | |
---|
254 | def get_forward_strand_start( self ): |
---|
255 | if self.strand == '-': return self.src_size - self.end |
---|
256 | else: return self.start |
---|
257 | forward_strand_start = property( fget=get_forward_strand_start ) |
---|
258 | |
---|
259 | def get_forward_strand_end( self ): |
---|
260 | if self.strand == '-': return self.src_size - self.start |
---|
261 | else: return self.end |
---|
262 | forward_strand_end = property( fget=get_forward_strand_end) |
---|
263 | |
---|
264 | def reverse_complement( self ): |
---|
265 | start = self.src_size - self.end |
---|
266 | if self.strand == "+": strand = "-" |
---|
267 | else: strand = "+" |
---|
268 | comp = [ch for ch in self.text.translate(DNA_COMP)] |
---|
269 | comp.reverse() |
---|
270 | text = "".join(comp) |
---|
271 | new = Component( self.src, start, self.size, strand, self._src_size, text ) |
---|
272 | new._alignment = self._alignment |
---|
273 | return new |
---|
274 | |
---|
275 | def slice( self, start, end ): |
---|
276 | new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self._src_size ) |
---|
277 | new._alignment = self._alignment |
---|
278 | new.text = self.text[start:end] |
---|
279 | |
---|
280 | #for i in range( 0, start ): |
---|
281 | # if self.text[i] != '-': new.start += 1 |
---|
282 | #for c in new.text: |
---|
283 | # if c != '-': new.size += 1 |
---|
284 | new.start += start - self.text.count( '-', 0, start ) |
---|
285 | new.size = len( new.text ) - new.text.count( '-' ) |
---|
286 | |
---|
287 | # FIXME: This annotation probably means nothing after slicing if |
---|
288 | # one of the ends changes. In general the 'i' rows of a MAF only |
---|
289 | # make sense in context (relative to the previous and next alignments |
---|
290 | # in a stream, slicing breaks that). |
---|
291 | new.synteny_left = self.synteny_left |
---|
292 | new.synteny_right = self.synteny_right |
---|
293 | |
---|
294 | return new |
---|
295 | |
---|
296 | def slice_by_coord( self, start, end ): |
---|
297 | """ |
---|
298 | Return the slice of the component corresponding to a coordinate interval. |
---|
299 | |
---|
300 | start and end are relative to the + strand, regardless of the component's strand. |
---|
301 | |
---|
302 | """ |
---|
303 | start_col = self.coord_to_col( start ) |
---|
304 | end_col = self.coord_to_col( end ) |
---|
305 | if (self.strand == '-'): |
---|
306 | (start_col,end_col) = (end_col,start_col) |
---|
307 | return self.slice( start_col, end_col ) |
---|
308 | |
---|
309 | def coord_to_col( self, pos ): |
---|
310 | """ |
---|
311 | Return the alignment column index corresponding to coordinate pos. |
---|
312 | |
---|
313 | pos is relative to the + strand, regardless of the component's strand. |
---|
314 | |
---|
315 | """ |
---|
316 | start,end = self.get_forward_strand_start(),self.get_forward_strand_end() |
---|
317 | if pos < start or pos > end: |
---|
318 | raise "Range error: %d not in %d-%d" % ( pos, start, end ) |
---|
319 | if not self.index: |
---|
320 | self.index = list() |
---|
321 | if (self.strand == '-'): |
---|
322 | # nota bene: for - strand self.index[x] maps to one column |
---|
323 | # higher than is actually associated with the position; thus |
---|
324 | # when slice_by_component() and slice_by_coord() flip the ends, |
---|
325 | # the resulting slice is correct |
---|
326 | for x in range( len(self.text)-1,-1,-1 ): |
---|
327 | if not self.text[x] == '-': |
---|
328 | self.index.append( x + 1 ) |
---|
329 | self.index.append( 0 ) |
---|
330 | else: |
---|
331 | for x in range( len(self.text) ): |
---|
332 | if not self.text[x] == '-': |
---|
333 | self.index.append(x) |
---|
334 | self.index.append( len(self.text) ) |
---|
335 | x = None |
---|
336 | try: |
---|
337 | x = self.index[ pos - start ] |
---|
338 | except: |
---|
339 | raise "Error in index." |
---|
340 | return x |
---|
341 | |
---|
342 | |
---|
343 | def __eq__( self, other ): |
---|
344 | if other is None or type( other ) != type( self ): |
---|
345 | return False |
---|
346 | return ( self.src == other.src |
---|
347 | and self.start == other.start |
---|
348 | and self.size == other.size |
---|
349 | and self.strand == other.strand |
---|
350 | and self._src_size == other._src_size |
---|
351 | and self.text == other.text |
---|
352 | and self.synteny_left == other.synteny_left |
---|
353 | and self.synteny_right == other.synteny_right |
---|
354 | and self.synteny_empty == other.synteny_empty |
---|
355 | and self.empty == other.empty ) |
---|
356 | |
---|
357 | def __ne__( self, other ): |
---|
358 | return not( self.__eq__( other ) ) |
---|
359 | |
---|
360 | def __deepcopy__( self, memo ): |
---|
361 | new = Component( src=self.src, start=self.start, size=self.size, strand=self.strand, src_size=self._src_size, text=self.text ) |
---|
362 | new._alignment = self._alignment |
---|
363 | new.quality = self.quality |
---|
364 | new.synteny_left = self.synteny_left |
---|
365 | new.synteny_right = self.synteny_right |
---|
366 | new.synteny_empty = self.synteny_empty |
---|
367 | new.empty = self.empty |
---|
368 | new.index = self.index |
---|
369 | return new |
---|
370 | |
---|
371 | def get_reader( format, infile, species_to_lengths=None ): |
---|
372 | import bx.align.maf, bx.align.axt, bx.align.lav |
---|
373 | if format == "maf": return bx.align.maf.Reader( infile, species_to_lengths ) |
---|
374 | elif format == "axt": return bx.align.axt.Reader( infile, species_to_lengths ) |
---|
375 | elif format == "lav": return bx.align.lav.Reader( infile ) |
---|
376 | else: raise "Unknown alignment format %s" % format |
---|
377 | |
---|
378 | def get_writer( format, outfile, attributes={} ): |
---|
379 | import bx.align.maf, bx.align.axt, bx.align.lav |
---|
380 | if format == "maf": return bx.align.maf.Writer( outfile, attributes ) |
---|
381 | elif format == "axt": return bx.align.axt.Writer( outfile, attributes ) |
---|
382 | elif format == "lav": return bx.align.lav.Writer( outfile, attributes ) |
---|
383 | else: raise "Unknown alignment format %s" % format |
---|
384 | |
---|
385 | def get_indexed( format, filename, index_filename=None, keep_open=False, species_to_lengths=None ): |
---|
386 | import bx.align.maf, bx.align.axt, bx.align.lav |
---|
387 | if format == "maf": return bx.align.maf.Indexed( filename, index_filename, keep_open, species_to_lengths ) |
---|
388 | elif format == "axt": return bx.align.axt.Indexed( filename, index_filename, keep_open, species_to_lengths ) |
---|
389 | elif format == "lav": raise "LAV support for Indexed has not been implemented" |
---|
390 | else: raise "Unknown alignment format %s" % format |
---|
391 | |
---|
392 | def shuffle_columns( a ): |
---|
393 | """Randomize the columns of an alignment""" |
---|
394 | mask = range( a.text_size ) |
---|
395 | random.shuffle( mask ) |
---|
396 | for c in a.components: |
---|
397 | c.text = ''.join( [ c.text[i] for i in mask ] ) |
---|
398 | |
---|
399 | def src_split( src ): # splits src into species,chrom |
---|
400 | dot = src.rfind( "." ) |
---|
401 | if dot == -1: return None,src |
---|
402 | else: return src[:dot],src[dot+1:] |
---|
403 | |
---|
404 | def src_merge( species,chrom,contig=None ): # creates src (inverse of src_split) |
---|
405 | if species == None: src = chrom |
---|
406 | else: src = species + "." + chrom |
---|
407 | if contig != None: src += "[%s]" % contig |
---|
408 | return src |
---|
409 | |
---|
410 | # ---- Read C extension if available --------------------------------------- |
---|
411 | |
---|
412 | try: |
---|
413 | from _core import coord_to_col |
---|
414 | except: |
---|
415 | def coord_to_col( start, text, pos ): |
---|
416 | col = 0 |
---|
417 | while start < pos: |
---|
418 | if text[col] != '-': |
---|
419 | start += 1 |
---|
420 | col += 1 |
---|
421 | return col |
---|