[2] | 1 | #!/usr/bin/env python
|
---|
| 2 | """
|
---|
| 3 | Provides wrappers and utilities for working with MAF files and alignments.
|
---|
| 4 | """
|
---|
| 5 | #Dan Blankenberg
|
---|
| 6 | import pkg_resources; pkg_resources.require( "bx-python" )
|
---|
| 7 | import bx.align.maf
|
---|
| 8 | import bx.intervals
|
---|
| 9 | import bx.interval_index_file
|
---|
| 10 | import sys, os, string, tempfile |
---|
| 11 | import logging |
---|
| 12 | from copy import deepcopy
|
---|
| 13 |
|
---|
#This module targets Python 2 (2.4+): it relies on py2-only constructs
#below (print >>, string.maketrans, sys.maxint, except-comma syntax).
assert sys.version_info[:2] >= ( 2, 4 )

#module-level logger, used for MAF index build diagnostics
log = logging.getLogger(__name__)


#characters that represent a gap position in an alignment text
GAP_CHARS = [ '-' ]
#separator between species and chromosome in a MAF src field, e.g. "hg18.chr1"
SRC_SPLIT_CHAR = '.'
---|
| 21 | |
---|
def src_split( src ):
    """Split a MAF src field into ( species, chromosome ).

    When no separator is present, the single token serves as both the
    species and the chromosome name.
    """
    parts = src.split( SRC_SPLIT_CHAR, 1 )
    spec = parts[0]
    if len( parts ) == 2:
        chrom = parts[1]
    else:
        chrom = spec
    return spec, chrom
|
---|
| 30 | |
---|
def src_merge( spec, chrom, contig = None ):
    """Merge species/chrom (and optional contig) back into a MAF src string.

    If either spec or chrom is None, the non-None value (or None) is used
    for both before delegating to bx.
    """
    if spec is None or chrom is None:
        spec = chrom = spec or chrom
    return bx.align.maf.src_merge( spec, chrom, contig )
---|
| 35 | |
---|
def get_species_in_block( block ):
    """Return the unique species names in a block, in order of first appearance."""
    seen = []
    for component in block.components:
        spec = src_split( component.src )[0]
        if spec not in seen:
            seen.append( spec )
    return seen
---|
| 43 | |
---|
def tool_fail( msg = "Unknown Error" ):
    """Write a fatal error message to stderr and terminate via sys.exit()."""
    sys.stderr.write( "Fatal Error: %s\n" % msg )
    sys.exit()
---|
| 47 | |
---|
| 48 | #an object corresponding to a reference layered alignment
|
---|
class RegionAlignment( object ):
    """Layered alignment over a fixed-size reference region.

    Each species' sequence is held in its own temporary file, initialized
    to all gaps ('-') and overwritten in place as alignment data is layered
    on via set_range()/set_position().
    """

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
    MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed

    def __init__( self, size, species = None ):
        #None (not []) as default avoids the shared-mutable-default pitfall
        if species is None:
            species = []
        assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
        self.size = size
        self.sequences = {}
        if not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.add_species( spec )

    #add a species to the alignment
    def add_species( self, species ):
        #make temporary sequence file, pre-filled with all-gap text
        self.sequences[species] = tempfile.TemporaryFile()
        self.sequences[species].write( "-" * self.size )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = None ):
        if skip is None:
            skip = []
        elif not isinstance( skip, list ):
            skip = [skip]
        names = self.sequences.keys()
        for name in skip:
            try:
                names.remove( name )
            except ValueError: #name not present; nothing to skip
                pass
        return names

    #returns the sequence for a species
    def get_sequence( self, species ):
        self.sequences[species].seek( 0 )
        return self.sequences[species].read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #sets a position for a species
    def set_position( self, index, species, base ):
        if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
        return self.set_range( index, species, base )

    #sets a range for a species; unknown species are added on first write
    def set_range( self, index, species, bases ):
        if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
        if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
        if species not in self.sequences.keys(): self.add_species( species )
        self.sequences[species].seek( index )
        self.sequences[species].write( bases )

    #Flush temp file of specified species, or all species
    def flush( self, species = None ):
        if species is None:
            species = self.sequences.keys()
        elif not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.sequences[spec].flush()
|
---|
| 109 |
|
---|
class GenomicRegionAlignment( RegionAlignment ):
    """RegionAlignment anchored to absolute genomic coordinates [start, end)."""

    def __init__( self, start, end, species = None ):
        #None (not []) as default avoids the shared-mutable-default pitfall
        if species is None:
            species = []
        RegionAlignment.__init__( self, end - start, species )
        self.start = start
        self.end = end
|
---|
| 116 |
|
---|
class SplicedAlignment( object ):
    """Alignment over a spliced (multi-exon) region.

    Holds one GenomicRegionAlignment per exon; the sequence accessors
    concatenate exon sequences, padding missing species with gaps.
    """

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )

    def __init__( self, exon_starts, exon_ends, species = None ):
        #None (not []) as default avoids the shared-mutable-default pitfall
        if species is None:
            species = []
        if not isinstance( exon_starts, list ):
            exon_starts = [exon_starts]
        if not isinstance( exon_ends, list ):
            exon_ends = [exon_ends]
        assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes."
        self.exons = []
        for i in range( len( exon_starts ) ):
            self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = None ):
        if skip is None:
            skip = []
        elif not isinstance( skip, list ):
            skip = [skip]
        names = []
        for exon in self.exons:
            for name in exon.get_species_names( skip = skip ):
                if name not in names:
                    names.append( name )
        return names

    #returns the sequence for a species, concatenated across all exons
    def get_sequence( self, species ):
        sequence = tempfile.TemporaryFile()
        for exon in self.exons:
            if species in exon.get_species_names():
                sequence.write( exon.get_sequence( species ) )
            else:
                #species absent from this exon: pad with gaps
                sequence.write( "-" * exon.size )
        sequence.seek( 0 )
        return sequence.read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #Start and end of coding region
    @property
    def start( self ):
        return self.exons[0].start
    @property
    def end( self ):
        return self.exons[-1].end
|
---|
| 165 |
|
---|
| 166 | #Open a MAF index using a UID
|
---|
| 167 | def maf_index_by_uid( maf_uid, index_location_file ):
|
---|
| 168 | for line in open( index_location_file ):
|
---|
| 169 | try:
|
---|
| 170 | #read each line, if not enough fields, go to next line
|
---|
| 171 | if line[0:1] == "#" : continue
|
---|
| 172 | fields = line.split('\t')
|
---|
| 173 | if maf_uid == fields[1]:
|
---|
| 174 | try:
|
---|
| 175 | maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
|
---|
| 176 | return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
|
---|
| 177 | except Exception, e:
|
---|
| 178 | raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
|
---|
| 179 | except:
|
---|
| 180 | pass
|
---|
| 181 | return None
|
---|
| 182 | |
---|
| 183 | #return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created |
---|
def open_or_build_maf_index( maf_file, index_filename, species = None ):
    """Open an existing MAF index as ( index, None ), or build a temporary one.

    Falls back to build_maf_index() when the pre-built index cannot be
    opened; that path returns ( index, temp_index_filename ).
    """
    try:
        index = bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False )
    except:
        #no usable pre-built index; build a temporary one instead
        return build_maf_index( maf_file, species = species )
    return ( index, None )
---|
| 189 | |
---|
| 190 | #*** ANYCHANGE TO THIS METHOD HERE OR IN galaxy.datatypes.sequences MUST BE PROPAGATED *** |
---|
| 191 | def build_maf_index_species_chromosomes( filename, index_species = None ): |
---|
| 192 | species = [] |
---|
| 193 | species_chromosomes = {} |
---|
| 194 | indexes = bx.interval_index_file.Indexes()
|
---|
| 195 | blocks = 0 |
---|
| 196 | try: |
---|
| 197 | maf_reader = bx.align.maf.Reader( open( filename ) ) |
---|
| 198 | while True: |
---|
| 199 | pos = maf_reader.file.tell() |
---|
| 200 | block = maf_reader.next() |
---|
| 201 | if block is None:
|
---|
| 202 | break
|
---|
| 203 | blocks += 1 |
---|
| 204 | for c in block.components: |
---|
| 205 | spec = c.src |
---|
| 206 | chrom = None |
---|
| 207 | if "." in spec: |
---|
| 208 | spec, chrom = spec.split( ".", 1 ) |
---|
| 209 | if spec not in species: |
---|
| 210 | species.append( spec ) |
---|
| 211 | species_chromosomes[spec] = [] |
---|
| 212 | if chrom and chrom not in species_chromosomes[spec]: |
---|
| 213 | species_chromosomes[spec].append( chrom ) |
---|
| 214 | if index_species is None or spec in index_species: |
---|
| 215 | forward_strand_start = c.forward_strand_start |
---|
| 216 | forward_strand_end = c.forward_strand_end |
---|
| 217 | try: |
---|
| 218 | forward_strand_start = int( forward_strand_start ) |
---|
| 219 | forward_strand_end = int( forward_strand_end ) |
---|
| 220 | except ValueError: |
---|
| 221 | continue #start and end are not integers, can't add component to index, goto next component |
---|
| 222 | #this likely only occurs when parse_e_rows is True? |
---|
| 223 | #could a species exist as only e rows? should the |
---|
| 224 | if forward_strand_end > forward_strand_start: |
---|
| 225 | #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed |
---|
| 226 | indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) |
---|
| 227 | except Exception, e: |
---|
| 228 | #most likely a bad MAF |
---|
| 229 | log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) |
---|
| 230 | return ( None, [], {} ) |
---|
| 231 | return ( indexes, species, species_chromosomes, blocks ) |
---|
| 232 |
|
---|
| 233 | #builds and returns ( index, index_filename ) for specified maf_file
|
---|
def build_maf_index( maf_file, species = None ):
    """Build a temporary index for maf_file.

    Returns ( bx Indexed object, temp_index_filename ); the caller is
    responsible for removing the temp file.  Returns ( None, None ) when
    scanning the MAF fails.
    """
    #index the first element only: the failure path of
    #build_maf_index_species_chromosomes historically returned a shorter
    #tuple, so avoid a fixed-arity unpack here
    scan_result = build_maf_index_species_chromosomes( maf_file, species )
    indexes = scan_result[0]
    if indexes is None:
        return ( None, None )
    fd, index_filename = tempfile.mkstemp()
    out = os.fdopen( fd, 'w' )
    try:
        indexes.write( out )
    finally:
        out.close() #close the descriptor even if writing the index fails
    maf_index = bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False )
    return ( maf_index, index_filename )
|
---|
| 243 | |
---|
def component_overlaps_region( c, region ):
    """True when component c covers at least one position of region.

    Uses the component's forward-strand coordinates; a None component
    never overlaps.
    """
    if c is None:
        return False
    comp_start = c.get_forward_strand_start()
    comp_end = c.get_forward_strand_end()
    #half-open intervals: disjoint when one ends before the other begins
    return not ( region.start >= comp_end or region.end <= comp_start )
---|
| 250 | |
---|
def chop_block_by_region( block, src, region, species = None, mincols = 0 ):
    """Slice a block down to the alignment columns overlapping region on src.

    Returns the chopped block (optionally limited to the given species list,
    with all-gap columns removed), or None when no slice longer than mincols
    overlaps the region.
    """
    # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
    # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
    # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency
    # comments welcome
    slice_start = block.text_size #max for the min()
    slice_end = 0 #min for the max()
    old_score = block.score #save old score for later use
    # We no longer assume only one occurance of src per block, so we need to check them all
    for c in iter_components_by_src( block, src ):
        if component_overlaps_region( c, region ):
            if c.text is not None:
                rev_strand = False
                if c.strand == "-":
                    #We want our coord_to_col coordinates to be returned from positive stranded component
                    rev_strand = True
                    c = c.reverse_complement()
                start = max( region.start, c.start )
                end = min( region.end, c.end )
                start = c.coord_to_col( start )
                end = c.coord_to_col( end )
                if rev_strand:
                    #need to orient slice coordinates to the original block direction
                    slice_len = end - start
                    end = len( c.text ) - start
                    start = end - slice_len
                #grow the slice window to cover every overlapping src component
                slice_start = min( start, slice_start )
                slice_end = max( end, slice_end )

    if slice_start < slice_end:
        block = block.slice( slice_start, slice_end )
        if block.text_size > mincols:
            # restore old score, may not be accurate, but it is better than 0 for everything?
            block.score = old_score
            if species is not None:
                block = block.limit_to_species( species )
            block.remove_all_gap_columns()
            return block
    return None
---|
| 290 |
|
---|
def orient_block_by_region( block, src, region, force_strand = None ):
    """Orient block so that src matches region.strand (or force_strand).

    Collects the strands of all components for src that overlap region; if
    none of them already matches the desired strand, the whole block is
    reverse-complemented.
    """
    #loop through components matching src,
    #make sure each of these components overlap region
    #cache strand for each of overlaping regions
    #if force_strand / region.strand not in strand cache, reverse complement
    ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
    strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
    #parenthesized so an empty strand cache never triggers a flip: the
    #original 'strands and A or B' parsed as '(strands and A) or B', which
    #reversed blocks with no overlapping components whenever force_strand
    #was set and absent from the (empty) cache
    if strands and ( ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ) ):
        block = block.reverse_complement()
    return block
---|
| 301 | |
---|
def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    """Yield oriented, chopped blocks for region, discarding index/offset info."""
    for result in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
        yield result[0]
---|
def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    """Yield ( oriented_block, index, offset ) for blocks overlapping region."""
    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        oriented = orient_block_by_region( block, src, region, force_strand )
        yield oriented, idx, offset
---|
| 308 | |
---|
| 309 | #split a block with multiple occurances of src into one block per src |
---|
def iter_blocks_split_by_src( block, src ):
    """Yield one new block per component whose src matches exactly.

    Each yielded block keeps every non-src component plus exactly one src
    component; components are deep-copied so the originals keep their
    reference to the source alignment.
    """
    for chosen in iter_components_by_src( block, src ):
        split_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) )
        split_block.text_size = block.text_size
        for component in block.components:
            #keep the chosen src component and everything that is not src
            if component == chosen or component.src != src:
                split_block.add_component( deepcopy( component ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components
        yield split_block
---|
| 318 | |
---|
| 319 | #split a block into multiple blocks with all combinations of a species appearing only once per block |
---|
def iter_blocks_split_by_species( block, species = None ):
    """Yield blocks covering every combination of one-component-per-species.

    When species is given, only those species are considered (matched by
    src prefix); otherwise every species present in the block is used.
    The original component order of the input block is restored in each
    yielded block.
    """
    def __split_components_by_species( components_by_species, new_block ):
        #recursive generator: peel off one species' component list and
        #branch once per component in it
        if components_by_species:
            #more species with components to add to this block
            components_by_species = deepcopy( components_by_species )
            spec_comps = components_by_species.pop( 0 )
            for c in spec_comps:
                newer_block = deepcopy( new_block )
                newer_block.add_component( deepcopy( c ) )
                for value in __split_components_by_species( components_by_species, newer_block ):
                    yield value
        else:
            #no more components to add, yield this block
            yield new_block

    #divide components by species
    spec_dict = {}
    if not species:
        #no species list supplied: discover species from the block itself
        species = []
        for c in block.components:
            spec, chrom = src_split( c.src )
            if spec not in spec_dict:
                spec_dict[ spec ] = []
                species.append( spec )
            spec_dict[ spec ].append( c )
    else:
        #restrict to the requested species, matching src by prefix
        for spec in species:
            spec_dict[ spec ] = []
            for c in iter_components_by_src_start( block, spec ):
                spec_dict[ spec ].append( c )

    empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
    empty_block.text_size = block.text_size
    #call recursive function to split into each combo of spec/blocks
    for value in __split_components_by_species( spec_dict.values(), empty_block ):
        sort_block_components_by_block( value, block ) #restore original component order
        yield value
---|
| 357 | |
---|
| 358 | |
---|
| 359 | #generator yielding only chopped and valid blocks for a specified region
|
---|
def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
    """Yield chopped blocks for region, discarding index/offset information."""
    for result in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        yield result[0]
|
---|
def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
    """Yield ( chopped_block, index, offset ) for region; empty chops are skipped."""
    for raw_block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
        chopped = chop_block_by_region( raw_block, src, region, species, mincols )
        if chopped is not None:
            yield chopped, idx, offset
|
---|
| 368 |
|
---|
| 369 | #returns a filled region alignment for specified regions
|
---|
def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    """Create a RegionAlignment for [start, end) and fill it from the index."""
    if species is None:
        #no explicit species list: track only the primary species
        alignment = RegionAlignment( end - start, primary_species )
    else:
        alignment = RegionAlignment( end - start, species )
    return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
|
---|
| 374 |
|
---|
| 375 | #reduces a block to only positions exisiting in the src provided
|
---|
def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
    """Return ( start_offset, {species: text} ) reduced to the primary genome.

    All alignment columns that are gaps in the primary species are removed
    from every species' text; start_offset is the primary component's start
    relative to region_start.
    """
    src = "%s.%s" % ( species, chromosome )
    ref = block.get_component_by_src( src )
    start_offset = ref.start - region_start
    #collect each component's text as a mutable list, keyed by species name
    species_texts = {}
    for component in block.components:
        species_texts[ component.src.split( '.' )[0] ] = list( component.text )
    #remove locations which are gaps in the primary species, starting from the downstream end
    primary_text = species_texts[ species ]
    for i in range( len( primary_text ) - 1, -1, -1 ):
        if primary_text[i] == '-':
            for text in species_texts.values():
                text.pop( i )
    for spec in species_texts:
        species_texts[spec] = ''.join( species_texts[spec] )
    return ( start_offset, species_texts )
|
---|
| 393 |
|
---|
| 394 | #fills a region alignment
|
---|
def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    """Layer all indexed blocks overlapping [start, end) into alignment.

    Blocks are applied in increasing score order, so higher-scoring blocks
    overwrite lower-scoring ones.  When overwrite_with_gaps is False, gap
    characters from a block never overwrite previously written bases.
    Returns the (mutated) alignment.
    """
    region = bx.intervals.Interval( start, end )
    region.chrom = chrom
    region.strand = strand
    primary_src = "%s.%s" % ( primary_species, chrom )

    #Order blocks overlaping this position by score, lowest first
    blocks = []
    for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
        score = float( block.score )
        #insertion sort on score; only ( score, idx, offset ) is kept so the
        #block can be re-read from the index when it is actually applied
        for i in range( 0, len( blocks ) ):
            if score < blocks[i][0]:
                blocks.insert( i, ( score, idx, offset ) )
                break
        else:
            blocks.append( ( score, idx, offset ) )

    #gap_chars_tuple = tuple( GAP_CHARS )
    gap_chars_str = ''.join( GAP_CHARS )
    #Loop through ordered blocks and layer by increasing score
    for block_dict in blocks:
        for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
            if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
                block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
                block = orient_block_by_region( block, primary_src, region ) #orient block
                start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
                for spec, text in species_texts.items():
                    #we should trim gaps from both sides, since these are not positions in this species genome (sequence)
                    text = text.rstrip( gap_chars_str )
                    gap_offset = 0
                    #count and strip leading gaps so the write starts at the
                    #first real base of this species
                    while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith()
                        #while text.startswith( gap_chars_tuple ):
                        gap_offset += 1
                        text = text[1:]
                        if not text:
                            break
                    if text:
                        if overwrite_with_gaps:
                            alignment.set_range( start_offset + gap_offset, spec, text )
                        else:
                            #write base-by-base, skipping gaps so existing
                            #bases are preserved
                            for i, char in enumerate( text ):
                                if char not in GAP_CHARS:
                                    alignment.set_position( start_offset + gap_offset + i, spec, char )
    return alignment
|
---|
| 438 |
|
---|
| 439 | #returns a filled spliced region alignment for specified region with start and end lists
|
---|
def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    """Create a SplicedAlignment over the given exons and fill each exon from the index."""
    #create spliced alignment object
    if species is None:
        alignment = SplicedAlignment( starts, ends, [primary_species] )
    else:
        alignment = SplicedAlignment( starts, ends, species )
    for exon in alignment.exons:
        fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
    return alignment
|
---|
| 447 |
|
---|
| 448 | #loop through string array, only return non-commented lines
|
---|
def line_enumerator( lines, comment_start = '#' ):
    """Yield ( count, line ) for each non-comment line.

    count is 1-based and advances only on yielded (non-comment) lines.
    """
    count = 0
    for line in lines:
        if line.startswith( comment_start ):
            continue
        count += 1
        yield ( count, line )
|
---|
| 455 |
|
---|
| 456 | #read a GeneBed file, return list of starts, ends, raw fields
|
---|
def get_starts_ends_fields_from_gene_bed( line ):
    """Parse one 12-column BED line into ( coding_exon_starts, coding_exon_ends, fields ).

    Exons are clipped to the coding region (thickStart/thickEnd); exons
    falling entirely outside it are dropped.  Raises Exception on lines
    with fewer than 12 columns.
    """
    fields = line.split()
    #Requires atleast 12 BED columns
    if len(fields) < 12:
        raise Exception( "Not a proper 12 column BED line (%s)." % line )
    #parse the fixed columns (chrom/tx_end/name retained for validation parity)
    chrom = fields[0]
    tx_start = int( fields[1] )
    tx_end = int( fields[2] )
    name = fields[3]
    strand = fields[5]
    if strand != '-':
        strand = '+' #Default strand is +
    cds_start = int( fields[6] )
    cds_end = int( fields[7] )

    #absolute exon starts: blockStarts (field 11) are relative to txStart
    exon_starts = [ int( x ) + tx_start for x in fields[11].rstrip( ',\n' ).split( ',' ) ]
    #absolute exon ends: start + blockSize (field 10)
    exon_sizes = [ int( x ) for x in fields[10].rstrip( ',' ).split( ',' ) ]
    exon_ends = [ s + size for s, size in zip( exon_starts, exon_sizes ) ]

    #clip each exon to the coding region, keeping only non-empty results
    starts = []
    ends = []
    for exon_start, exon_end in zip( exon_starts, exon_ends ):
        clipped_start = max( exon_start, cds_start )
        clipped_end = min( exon_end, cds_end )
        if clipped_start < clipped_end:
            starts.append( clipped_start )
            ends.append( clipped_end )
    return ( starts, ends, fields )
|
---|
| 488 |
|
---|
def iter_components_by_src( block, src ):
    """Yield every component of block whose src matches exactly."""
    for component in block.components:
        if component.src != src:
            continue
        yield component
|
---|
| 493 |
|
---|
def get_components_by_src( block, src ):
    """Return the list of components of block whose src matches exactly."""
    return list( iter_components_by_src( block, src ) )
|
---|
| 496 |
|
---|
def iter_components_by_src_start( block, src ):
    """Yield every component of block whose src begins with the given prefix."""
    for component in block.components:
        if not component.src.startswith( src ):
            continue
        yield component
|
---|
| 501 |
|
---|
def get_components_by_src_start( block, src ):
    """Return the list of components of block whose src begins with the given prefix."""
    return list( iter_components_by_src_start( block, src ) )
|
---|
| 504 | |
---|
def sort_block_components_by_block( block1, block2 ):
    """Reorder block1's components in place to match their order in block2.

    block1's components must be a subset of block2's.  Returns None, as
    list.sort does.
    """
    #sorting by each component's index in block2 reproduces block2's order
    #(equivalent to the original cmp-based index-difference comparison)
    return block1.components.sort( key = block2.components.index )
---|
| 510 |
|
---|
def get_species_in_maf( maf_filename ):
    """Return the unique species names found across all blocks of a MAF file."""
    species = []
    maf_file = open( maf_filename )
    try:
        for block in bx.align.maf.Reader( maf_file ):
            for spec in get_species_in_block( block ):
                if spec not in species:
                    species.append( spec )
    finally:
        maf_file.close() #the original leaked this handle
    return species
|
---|
| 518 |
|
---|
def parse_species_option( species ):
    """Parse a comma-separated species option string.

    Returns the list of species names, or None when the option is empty,
    None, or contains the literal token 'None'.
    """
    if not species:
        return None #provided species was '' or None
    names = species.split( ',' )
    if 'None' in names:
        return None #species selection explicitly disabled
    return names
|
---|
| 525 |
|
---|
def remove_temp_index_file( index_filename ):
    """Best-effort removal of a temporary index file; all errors are ignored."""
    try:
        os.unlink( index_filename )
    except:
        #file may already be gone (or the name may be invalid); ignore either way
        pass
|
---|
| 529 |
|
---|
| 530 | #Below are methods to deal with FASTA files
|
---|
| 531 |
|
---|
def get_fasta_header( component, attributes = None, suffix = None ):
    """Build a FASTA header line for an alignment component.

    Format: >src(strand):start-end|key=value|...|suffix
    When suffix is not given, the component's species name is used.
    """
    if attributes is None:
        attributes = {} #avoid a shared mutable default dict
    header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
    for key, value in attributes.items():
        header = "%s%s=%s|" % ( header, key, value )
    if suffix:
        header = "%s%s" % ( header, suffix )
    else:
        header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
    return header
|
---|
| 541 |
|
---|
def get_attributes_from_fasta_header( header ):
    """Parse a header produced by get_fasta_header back into a dict.

    Recognized keys: 'species', 'chrom', 'strand', 'start', 'end' (from the
    leading region token), any key=value pairs from the middle fields, and
    '__suffix__' for the trailing field.  Returns {} for an empty header.
    """
    if not header: return {}
    attributes = {}
    header = header.lstrip( '>' )
    header = header.strip()
    fields = header.split( '|' )
    try:
        #leading field looks like "species.chrom(strand):start-end"
        region = fields[0]
        region = region.split( '(', 1 )
        temp = region[0].split( '.', 1 )
        attributes['species'] = temp[0]
        if len( temp ) == 2:
            attributes['chrom'] = temp[1]
        else:
            #no '.' separator: the single token is both species and chrom
            attributes['chrom'] = temp[0]
        region = region[1].split( ')', 1 )
        attributes['strand'] = region[0]
        region = region[1].lstrip( ':' ).split( '-' )
        attributes['start'] = int( region[0] )
        attributes['end'] = int( region[1] )
    except:
        #fields 0 is not a region coordinate
        pass
    if len( fields ) > 2:
        #middle fields are key=value attribute pairs; the last field is the suffix
        for i in xrange( 1, len( fields ) - 1 ):
            prop = fields[i].split( '=', 1 )
            if len( prop ) == 2:
                attributes[ prop[0] ] = prop[1]
    if len( fields ) > 1:
        attributes['__suffix__'] = fields[-1]
    return attributes
|
---|
| 573 |
|
---|
def iter_fasta_alignment( filename ):
    """Yield lists of fastaComponent objects, one list per alignment.

    Alignments in the FASTA file are separated by blank lines; each '>'
    header starts a new component whose species is parsed from the header.
    """
    class fastaComponent:
        #minimal component: a species name plus accumulated sequence text
        def __init__( self, species, text = "" ):
            self.species = species
            self.text = text
        def extend( self, text ):
            self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
    #yields a list of fastaComponents for a FASTA file
    f = open( filename, 'rb' )
    components = []
    while True:
        line = f.readline()
        if not line:
            #EOF: emit any trailing alignment and release the handle
            #(the original never closed the file)
            if components:
                yield components
            f.close()
            return
        line = line.strip()
        if not line:
            #blank line terminates the current alignment
            if components:
                yield components
            components = []
        elif line.startswith( '>' ):
            attributes = get_attributes_from_fasta_header( line )
            components.append( fastaComponent( attributes['species'] ) )
        elif components:
            components[-1].extend( line )
|
---|
| 601 |
|
---|