| 1 | #!/usr/bin/python2.6 |
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | WARNING: bz2/bz2t support and file cache support are new and not as well |
|---|
| 5 | tested. |
|---|
| 6 | |
|---|
| 7 | usage: %prog maf_files [options] < interval_file |
|---|
| 8 | -s, --species=SPECIES: Comma separated list of species to include |
|---|
| 9 | -p, --prefix=PREFIX: Prefix to add to each interval chrom (usually reference species) |
|---|
| 10 | -C, --usecache: Use a cache that keeps blocks of the MAF files in memory (requires ~20MB per MAF) |
|---|
| 11 | """ |
|---|
| 12 | |
|---|
| 13 | from __future__ import division |
|---|
| 14 | |
|---|
| 15 | import psyco_full |
|---|
| 16 | |
|---|
| 17 | from bx.cookbook import doc_optparse |
|---|
| 18 | |
|---|
| 19 | import bx.align.maf |
|---|
| 20 | from bx import misc |
|---|
| 21 | import os |
|---|
| 22 | import sys |
|---|
| 23 | |
|---|
| 24 | from numpy import * |
|---|
| 25 | |
|---|
| 26 | def main(): |
|---|
| 27 | # Parse Command Line |
|---|
| 28 | options, args = doc_optparse.parse( __doc__ ) |
|---|
| 29 | try: |
|---|
| 30 | maf_files = args |
|---|
| 31 | species = options.species.split( "," ) |
|---|
| 32 | prefix = options.prefix |
|---|
| 33 | use_cache = bool( options.usecache ) |
|---|
| 34 | if not prefix: |
|---|
| 35 | prefix = "" |
|---|
| 36 | except: |
|---|
| 37 | doc_optparse.exit() |
|---|
| 38 | # Open indexed access to mafs |
|---|
| 39 | index = bx.align.maf.MultiIndexed( maf_files, |
|---|
| 40 | parse_e_rows=True, |
|---|
| 41 | use_cache=use_cache ) |
|---|
| 42 | # Print header |
|---|
| 43 | print "#chr", "start", "end", |
|---|
| 44 | for s in species: |
|---|
| 45 | print s, |
|---|
| 46 | print |
|---|
| 47 | # Iterate over input ranges |
|---|
| 48 | for line in sys.stdin: |
|---|
| 49 | fields = line.split() |
|---|
| 50 | # Input is BED3+ |
|---|
| 51 | chr, start, end = fields[0], int( fields[1] ), int( fields[2] ) |
|---|
| 52 | length = end - start |
|---|
| 53 | assert length > 0, "Interval has length less than one" |
|---|
| 54 | # Prepend prefix if specified |
|---|
| 55 | src = prefix + chr |
|---|
| 56 | # Keep a bitset for each species noting covered pieces |
|---|
| 57 | aligned_bits = [] |
|---|
| 58 | missing_bits = [] |
|---|
| 59 | for s in species: |
|---|
| 60 | aligned_bits.append( zeros( length, dtype=bool ) ) |
|---|
| 61 | missing_bits.append( zeros( length, dtype=bool ) ) |
|---|
| 62 | # Find overlap with reference component |
|---|
| 63 | blocks = index.get( src, start, end ) |
|---|
| 64 | # Determine alignability for each position |
|---|
| 65 | for block in blocks: |
|---|
| 66 | # Determine the piece of the human interval this block covers, |
|---|
| 67 | # relative to the start of the interval of interest |
|---|
| 68 | ref = block.get_component_by_src( src ) |
|---|
| 69 | assert ref.strand == "+", \ |
|---|
| 70 | "Reference species blocks must be on '+' strand" |
|---|
| 71 | rel_start = max( start, ref.start ) - start |
|---|
| 72 | rel_end = min( end, ref.end ) - start |
|---|
| 73 | # Check alignability for each species |
|---|
| 74 | for i, s in enumerate( species ): |
|---|
| 75 | other = block.get_component_by_src_start( s ) |
|---|
| 76 | # Species does not appear at all indicates unaligned (best we |
|---|
| 77 | # can do here?) |
|---|
| 78 | if other is None: |
|---|
| 79 | continue |
|---|
| 80 | # An empty component might indicate missing data, all other |
|---|
| 81 | # cases (even contiguous) we count as not aligned |
|---|
| 82 | if other.empty: |
|---|
| 83 | if other.synteny_empty == bx.align.maf.MAF_MISSING_STATUS: |
|---|
| 84 | missing_bits[i][rel_start:rel_end] = True |
|---|
| 85 | # Otherwise we have a local alignment with some text, call |
|---|
| 86 | # it aligned |
|---|
| 87 | else: |
|---|
| 88 | aligned_bits[i][rel_start:rel_end] = True |
|---|
| 89 | # Now determine the total alignment coverage of each interval |
|---|
| 90 | print chr, start, end, |
|---|
| 91 | for i, s in enumerate( species ): |
|---|
| 92 | aligned = sum( aligned_bits[i] ) |
|---|
| 93 | missing = sum( missing_bits[i] ) |
|---|
| 94 | # An interval will be called missing if it is < 100bp and <50% |
|---|
| 95 | # present, or more than 100bp and less that 50bp present (yes, |
|---|
| 96 | # arbitrary) |
|---|
| 97 | is_missing = False |
|---|
| 98 | if length < 100 and missing > ( length / 2 ): |
|---|
| 99 | print "NA", |
|---|
| 100 | elif length >= 100 and missing > 50: |
|---|
| 101 | print "NA", |
|---|
| 102 | else: |
|---|
| 103 | print aligned / ( length - missing ), |
|---|
| 104 | |
|---|
| 105 | print |
|---|
| 106 | |
|---|
| 107 | # Close MAF files |
|---|
| 108 | index.close() |
|---|
| 109 | |
|---|
| 110 | if __name__ == "__main__": |
|---|
| 111 | main() |
|---|