1 | #!/usr/bin/python2.6 |
---|
2 | |
---|
3 | """ |
---|
4 | Reads a list of intervals and a maf. Produces a new maf containing the |
---|
5 | blocks or parts of blocks in the original that overlapped the intervals. |
---|
6 | |
---|
7 | It is assumed that each file `maf_fname` has a corresponding `maf_fname`.index |
---|
8 | file. |
---|
9 | |
---|
10 | NOTE: If two intervals overlap the same block it will be written twice. With |
---|
11 | non-overlapping intervals and --chop this is never a problem. |
---|
12 | |
---|
13 | NOTE: Intervals are relative to the + strand, regardless of the strands in |
---|
14 | the alignments. |
---|
15 | |
---|
16 | |
---|
17 | WARNING: bz2/bz2t support and file cache support are new and not as well |
---|
18 | tested. |
---|
19 | |
---|
20 | usage: %prog maf_fname1 maf_fname2 ... [options] < interval_file |
---|
21 | -m, --mincols=0: Minimum length (columns) required for alignment to be output |
---|
22 | -c, --chop: Should blocks be chopped to only portion overlapping (no by default) |
---|
23 | -s, --src=s: Use this src for all intervals |
---|
24 | -p, --prefix=p: Prepend this to each src before lookup |
---|
25 | -d, --dir=d: Write each interval as a separate file in this directory |
---|
26 | -S, --strand: Strand is included as an additional column, and the blocks are reverse complemented (if necessary) so that they are always on that strand w/r/t the src species. |
---|
27 | -C, --usecache: Use a cache that keeps blocks of the MAF files in memory (requires ~20MB per MAF) |
---|
28 | """ |
---|
29 | |
---|
30 | import psyco_full |
---|
31 | |
---|
32 | from bx.cookbook import doc_optparse |
---|
33 | |
---|
34 | import bx.align.maf |
---|
35 | from bx import misc |
---|
36 | import os |
---|
37 | import sys |
---|
38 | |
---|
39 | def main(): |
---|
40 | # Parse Command Line |
---|
41 | options, args = doc_optparse.parse( __doc__ ) |
---|
42 | try: |
---|
43 | maf_files = args |
---|
44 | if options.mincols: mincols = int( options.mincols ) |
---|
45 | else: mincols = 0 |
---|
46 | if options.src: fixed_src = options.src |
---|
47 | else: fixed_src = None |
---|
48 | if options.prefix: prefix = options.prefix |
---|
49 | else: prefix = None |
---|
50 | if options.dir: dir = options.dir |
---|
51 | else: dir = None |
---|
52 | chop = bool( options.chop ) |
---|
53 | do_strand = bool( options.strand ) |
---|
54 | use_cache = bool( options.usecache ) |
---|
55 | except: |
---|
56 | doc_optparse.exit() |
---|
57 | # Open indexed access to mafs |
---|
58 | index = bx.align.maf.MultiIndexed( maf_files, keep_open=True, |
---|
59 | parse_e_rows=True, |
---|
60 | use_cache=use_cache ) |
---|
61 | # Start MAF on stdout |
---|
62 | if dir is None: |
---|
63 | out = bx.align.maf.Writer( sys.stdout ) |
---|
64 | # Iterate over input ranges |
---|
65 | for line in sys.stdin: |
---|
66 | strand = None |
---|
67 | fields = line.split() |
---|
68 | if fixed_src: |
---|
69 | src, start, end = fixed_src, int( fields[0] ), int( fields[1] ) |
---|
70 | if do_strand: strand = fields[2] |
---|
71 | else: |
---|
72 | src, start, end = fields[0], int( fields[1] ), int( fields[2] ) |
---|
73 | if do_strand: strand = fields[3] |
---|
74 | if prefix: src = prefix + src |
---|
75 | # Find overlap with reference component |
---|
76 | blocks = index.get( src, start, end ) |
---|
77 | # Open file if needed |
---|
78 | if dir: |
---|
79 | out = bx.align.maf.Writer( open( os.path.join( dir, "%s:%09d-%09d.maf" % ( src, start, end ) ), 'w' ) ) |
---|
80 | # Write each intersecting block |
---|
81 | if chop: |
---|
82 | for block in blocks: |
---|
83 | ref = block.get_component_by_src( src ) |
---|
84 | slice_start = max( start, ref.get_forward_strand_start() ) |
---|
85 | slice_end = min( end, ref.get_forward_strand_end() ) |
---|
86 | sliced = block.slice_by_component( ref, slice_start, slice_end ) |
---|
87 | # If the block is shorter than the minimum allowed size, stop |
---|
88 | if mincols and ( sliced.text_size < mincols ): |
---|
89 | continue |
---|
90 | # If the reference component is empty, don't write the block |
---|
91 | if sliced.get_component_by_src( src ).size < 1: |
---|
92 | continue |
---|
93 | # Keep only components that are not empty |
---|
94 | sliced.components = [ c for c in sliced.components if c.size > 0 ] |
---|
95 | # Reverse complement if needed |
---|
96 | if ( strand != None ) and ( ref.strand != strand ): |
---|
97 | sliced = sliced.reverse_complement() |
---|
98 | # Write the block |
---|
99 | out.write( sliced ) |
---|
100 | else: |
---|
101 | for block in blocks: |
---|
102 | out.write( block ) |
---|
103 | if dir: |
---|
104 | out.close() |
---|
105 | # Close output MAF |
---|
106 | out.close() |
---|
107 | index.close() |
---|
108 | |
---|
109 | if __name__ == "__main__": |
---|
110 | main() |
---|