| 1 | #!/usr/bin/python2.6 |
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | Reads a list of intervals and a maf. Produces a new maf containing the |
|---|
| 5 | blocks or parts of blocks in the original that overlapped the intervals. |
|---|
| 6 | |
|---|
| 7 | It is assumed that each file `maf_fname` has a corresponding `maf_fname`.index |
|---|
| 8 | file. |
|---|
| 9 | |
|---|
| 10 | NOTE: If two intervals overlap the same block it will be written twice. With |
|---|
| 11 | non-overlapping intervals and --chop this is never a problem. |
|---|
| 12 | |
|---|
| 13 | NOTE: Intervals are relative to the + strand, regardless of the strands in |
|---|
| 14 | the alignments. |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | WARNING: bz2/bz2t support and file cache support are new and not as well |
|---|
| 18 | tested. |
|---|
| 19 | |
|---|
| 20 | usage: %prog maf_fname1 maf_fname2 ... [options] < interval_file |
|---|
| 21 | -m, --mincols=0: Minimum length (columns) required for alignment to be output |
|---|
| 22 | -c, --chop: Should blocks be chopped to only portion overlapping (no by default) |
|---|
| 23 | -s, --src=s: Use this src for all intervals |
|---|
| 24 | -p, --prefix=p: Prepend this to each src before lookup |
|---|
| 25 | -d, --dir=d: Write each interval as a separate file in this directory |
|---|
| 26 | -S, --strand: Strand is included as an additional column, and the blocks are reverse complemented (if necessary) so that they are always on that strand w/r/t the src species. |
|---|
| 27 | -C, --usecache: Use a cache that keeps blocks of the MAF files in memory (requires ~20MB per MAF) |
|---|
| 28 | """ |
|---|
| 29 | |
|---|
| 30 | import psyco_full |
|---|
| 31 | |
|---|
| 32 | from bx.cookbook import doc_optparse |
|---|
| 33 | |
|---|
| 34 | import bx.align.maf |
|---|
| 35 | from bx import misc |
|---|
| 36 | import os |
|---|
| 37 | import sys |
|---|
| 38 | |
|---|
| 39 | def main(): |
|---|
| 40 | # Parse Command Line |
|---|
| 41 | options, args = doc_optparse.parse( __doc__ ) |
|---|
| 42 | try: |
|---|
| 43 | maf_files = args |
|---|
| 44 | if options.mincols: mincols = int( options.mincols ) |
|---|
| 45 | else: mincols = 0 |
|---|
| 46 | if options.src: fixed_src = options.src |
|---|
| 47 | else: fixed_src = None |
|---|
| 48 | if options.prefix: prefix = options.prefix |
|---|
| 49 | else: prefix = None |
|---|
| 50 | if options.dir: dir = options.dir |
|---|
| 51 | else: dir = None |
|---|
| 52 | chop = bool( options.chop ) |
|---|
| 53 | do_strand = bool( options.strand ) |
|---|
| 54 | use_cache = bool( options.usecache ) |
|---|
| 55 | except: |
|---|
| 56 | doc_optparse.exit() |
|---|
| 57 | # Open indexed access to mafs |
|---|
| 58 | index = bx.align.maf.MultiIndexed( maf_files, keep_open=True, |
|---|
| 59 | parse_e_rows=True, |
|---|
| 60 | use_cache=use_cache ) |
|---|
| 61 | # Start MAF on stdout |
|---|
| 62 | if dir is None: |
|---|
| 63 | out = bx.align.maf.Writer( sys.stdout ) |
|---|
| 64 | # Iterate over input ranges |
|---|
| 65 | for line in sys.stdin: |
|---|
| 66 | strand = None |
|---|
| 67 | fields = line.split() |
|---|
| 68 | if fixed_src: |
|---|
| 69 | src, start, end = fixed_src, int( fields[0] ), int( fields[1] ) |
|---|
| 70 | if do_strand: strand = fields[2] |
|---|
| 71 | else: |
|---|
| 72 | src, start, end = fields[0], int( fields[1] ), int( fields[2] ) |
|---|
| 73 | if do_strand: strand = fields[3] |
|---|
| 74 | if prefix: src = prefix + src |
|---|
| 75 | # Find overlap with reference component |
|---|
| 76 | blocks = index.get( src, start, end ) |
|---|
| 77 | # Open file if needed |
|---|
| 78 | if dir: |
|---|
| 79 | out = bx.align.maf.Writer( open( os.path.join( dir, "%s:%09d-%09d.maf" % ( src, start, end ) ), 'w' ) ) |
|---|
| 80 | # Write each intersecting block |
|---|
| 81 | if chop: |
|---|
| 82 | for block in blocks: |
|---|
| 83 | ref = block.get_component_by_src( src ) |
|---|
| 84 | slice_start = max( start, ref.get_forward_strand_start() ) |
|---|
| 85 | slice_end = min( end, ref.get_forward_strand_end() ) |
|---|
| 86 | sliced = block.slice_by_component( ref, slice_start, slice_end ) |
|---|
| 87 | # If the block is shorter than the minimum allowed size, stop |
|---|
| 88 | if mincols and ( sliced.text_size < mincols ): |
|---|
| 89 | continue |
|---|
| 90 | # If the reference component is empty, don't write the block |
|---|
| 91 | if sliced.get_component_by_src( src ).size < 1: |
|---|
| 92 | continue |
|---|
| 93 | # Keep only components that are not empty |
|---|
| 94 | sliced.components = [ c for c in sliced.components if c.size > 0 ] |
|---|
| 95 | # Reverse complement if needed |
|---|
| 96 | if ( strand != None ) and ( ref.strand != strand ): |
|---|
| 97 | sliced = sliced.reverse_complement() |
|---|
| 98 | # Write the block |
|---|
| 99 | out.write( sliced ) |
|---|
| 100 | else: |
|---|
| 101 | for block in blocks: |
|---|
| 102 | out.write( block ) |
|---|
| 103 | if dir: |
|---|
| 104 | out.close() |
|---|
| 105 | # Close output MAF |
|---|
| 106 | out.close() |
|---|
| 107 | index.close() |
|---|
| 108 | |
|---|
| 109 | if __name__ == "__main__": |
|---|
| 110 | main() |
|---|