1 | #!/usr/bin/python2.6 |
---|
2 | |
---|
3 | """ |
---|
4 | WARNING: bz2/bz2t support and file cache support are new and not as well |
---|
5 | tested. |
---|
6 | |
---|
7 | usage: %prog maf_files [options] < interval_file |
---|
8 | -s, --species=SPECIES: Comma separated list of species to include |
---|
9 | -p, --prefix=PREFIX: Prefix to add to each interval chrom (usually reference species) |
---|
10 | -C, --usecache: Use a cache that keeps blocks of the MAF files in memory (requires ~20MB per MAF) |
---|
11 | """ |
---|
12 | |
---|
13 | from __future__ import division |
---|
14 | |
---|
15 | import psyco_full |
---|
16 | |
---|
17 | from bx.cookbook import doc_optparse |
---|
18 | |
---|
19 | import bx.align.maf |
---|
20 | from bx import misc |
---|
21 | import os |
---|
22 | import sys |
---|
23 | |
---|
24 | from numpy import * |
---|
25 | |
---|
26 | def main(): |
---|
27 | # Parse Command Line |
---|
28 | options, args = doc_optparse.parse( __doc__ ) |
---|
29 | try: |
---|
30 | maf_files = args |
---|
31 | species = options.species.split( "," ) |
---|
32 | prefix = options.prefix |
---|
33 | use_cache = bool( options.usecache ) |
---|
34 | if not prefix: |
---|
35 | prefix = "" |
---|
36 | except: |
---|
37 | doc_optparse.exit() |
---|
38 | # Open indexed access to mafs |
---|
39 | index = bx.align.maf.MultiIndexed( maf_files, |
---|
40 | parse_e_rows=True, |
---|
41 | use_cache=use_cache ) |
---|
42 | # Print header |
---|
43 | print "#chr", "start", "end", |
---|
44 | for s in species: |
---|
45 | print s, |
---|
46 | print |
---|
47 | # Iterate over input ranges |
---|
48 | for line in sys.stdin: |
---|
49 | fields = line.split() |
---|
50 | # Input is BED3+ |
---|
51 | chr, start, end = fields[0], int( fields[1] ), int( fields[2] ) |
---|
52 | length = end - start |
---|
53 | assert length > 0, "Interval has length less than one" |
---|
54 | # Prepend prefix if specified |
---|
55 | src = prefix + chr |
---|
56 | # Keep a bitset for each species noting covered pieces |
---|
57 | aligned_bits = [] |
---|
58 | missing_bits = [] |
---|
59 | for s in species: |
---|
60 | aligned_bits.append( zeros( length, dtype=bool ) ) |
---|
61 | missing_bits.append( zeros( length, dtype=bool ) ) |
---|
62 | # Find overlap with reference component |
---|
63 | blocks = index.get( src, start, end ) |
---|
64 | # Determine alignability for each position |
---|
65 | for block in blocks: |
---|
66 | # Determine the piece of the human interval this block covers, |
---|
67 | # relative to the start of the interval of interest |
---|
68 | ref = block.get_component_by_src( src ) |
---|
69 | assert ref.strand == "+", \ |
---|
70 | "Reference species blocks must be on '+' strand" |
---|
71 | rel_start = max( start, ref.start ) - start |
---|
72 | rel_end = min( end, ref.end ) - start |
---|
73 | # Check alignability for each species |
---|
74 | for i, s in enumerate( species ): |
---|
75 | other = block.get_component_by_src_start( s ) |
---|
76 | # Species does not appear at all indicates unaligned (best we |
---|
77 | # can do here?) |
---|
78 | if other is None: |
---|
79 | continue |
---|
80 | # An empty component might indicate missing data, all other |
---|
81 | # cases (even contiguous) we count as not aligned |
---|
82 | if other.empty: |
---|
83 | if other.synteny_empty == bx.align.maf.MAF_MISSING_STATUS: |
---|
84 | missing_bits[i][rel_start:rel_end] = True |
---|
85 | # Otherwise we have a local alignment with some text, call |
---|
86 | # it aligned |
---|
87 | else: |
---|
88 | aligned_bits[i][rel_start:rel_end] = True |
---|
89 | # Now determine the total alignment coverage of each interval |
---|
90 | print chr, start, end, |
---|
91 | for i, s in enumerate( species ): |
---|
92 | aligned = sum( aligned_bits[i] ) |
---|
93 | missing = sum( missing_bits[i] ) |
---|
94 | # An interval will be called missing if it is < 100bp and <50% |
---|
95 | # present, or more than 100bp and less that 50bp present (yes, |
---|
96 | # arbitrary) |
---|
97 | is_missing = False |
---|
98 | if length < 100 and missing > ( length / 2 ): |
---|
99 | print "NA", |
---|
100 | elif length >= 100 and missing > 50: |
---|
101 | print "NA", |
---|
102 | else: |
---|
103 | print aligned / ( length - missing ), |
---|
104 | |
---|
105 | print |
---|
106 | |
---|
107 | # Close MAF files |
---|
108 | index.close() |
---|
109 | |
---|
110 | if __name__ == "__main__": |
---|
111 | main() |
---|