| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options] |
|---|
| 5 | -p, --perCol: standardize to lod per column |
|---|
| 6 | """ |
|---|
| 7 | |
|---|
| 8 | from __future__ import division |
|---|
| 9 | |
|---|
| 10 | import sys |
|---|
| 11 | from galaxy import eggs |
|---|
| 12 | from numpy import * |
|---|
| 13 | from tables import * |
|---|
| 14 | |
|---|
| 15 | import pkg_resources; pkg_resources.require( "bx-python" ) |
|---|
| 16 | from bx.cookbook import doc_optparse |
|---|
| 17 | |
|---|
| 18 | from bx import intervals |
|---|
| 19 | |
|---|
| 20 | # ignore wanrnings about NumArray flavor |
|---|
| 21 | from warnings import filterwarnings |
|---|
| 22 | from tables.exceptions import FlavorWarning |
|---|
| 23 | filterwarnings("ignore", category=FlavorWarning) |
|---|
| 24 | |
|---|
| 25 | assert sys.version_info[:2] >= ( 2, 4 ) |
|---|
| 26 | |
|---|
| 27 | def stop_err( msg ): |
|---|
| 28 | sys.stderr.write(msg) |
|---|
| 29 | sys.exit() |
|---|
| 30 | |
|---|
| 31 | def main(): |
|---|
| 32 | # Parse command line |
|---|
| 33 | options, args = doc_optparse.parse( __doc__ ) |
|---|
| 34 | try: |
|---|
| 35 | h5_fname = args[0] |
|---|
| 36 | mapping_fname = args[1] |
|---|
| 37 | in_fname = args[2] |
|---|
| 38 | out_fname = args[3] |
|---|
| 39 | chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] ) |
|---|
| 40 | per_col = bool( options.perCol ) |
|---|
| 41 | except Exception, e: |
|---|
| 42 | doc_optparse.exception() |
|---|
| 43 | |
|---|
| 44 | if h5_fname == 'None.h5': |
|---|
| 45 | stop_err( 'Invalid genome build, this tool currently only works with data from build hg17. Click the pencil icon in your history item to correct the build if appropriate.' ) |
|---|
| 46 | |
|---|
| 47 | # Open the h5 file |
|---|
| 48 | h5 = openFile( h5_fname, mode = "r" ) |
|---|
| 49 | # Load intervals and names for the subregions |
|---|
| 50 | intersecters = {} |
|---|
| 51 | for i, line in enumerate( file( mapping_fname ) ): |
|---|
| 52 | line = line.rstrip( '\r\n' ) |
|---|
| 53 | if line and not line.startswith( '#' ): |
|---|
| 54 | chr, start, end, name = line.split()[0:4] |
|---|
| 55 | if not intersecters.has_key( chr ): |
|---|
| 56 | intersecters[ chr ] = intervals.Intersecter() |
|---|
| 57 | intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) ) |
|---|
| 58 | |
|---|
| 59 | # Find the subregion containing each input interval |
|---|
| 60 | skipped_lines = 0 |
|---|
| 61 | first_invalid_line = 0 |
|---|
| 62 | invalid_line = '' |
|---|
| 63 | out_file = open( out_fname, "w" ) |
|---|
| 64 | warnings = [] |
|---|
| 65 | warning = '' |
|---|
| 66 | for i, line in enumerate( file( in_fname ) ): |
|---|
| 67 | line = line.rstrip( '\r\n' ) |
|---|
| 68 | if line.startswith( '#' ): |
|---|
| 69 | if i == 0: |
|---|
| 70 | out_file.write( "%s\tscore\n" % line ) |
|---|
| 71 | else: |
|---|
| 72 | out_file.write( "%s\n" % line ) |
|---|
| 73 | fields = line.split( "\t" ) |
|---|
| 74 | try: |
|---|
| 75 | chr = fields[ chrom_col ] |
|---|
| 76 | start = int( fields[ start_col ] ) |
|---|
| 77 | end = int( fields[ end_col ] ) |
|---|
| 78 | except: |
|---|
| 79 | warning = "Invalid value for chrom, start or end column." |
|---|
| 80 | warnings.append( warning ) |
|---|
| 81 | skipped_lines += 1 |
|---|
| 82 | if not invalid_line: |
|---|
| 83 | first_invalid_line = i + 1 |
|---|
| 84 | invalid_line = line |
|---|
| 85 | continue |
|---|
| 86 | # Find matching interval |
|---|
| 87 | try: |
|---|
| 88 | matches = intersecters[ chr ].find( start, end ) |
|---|
| 89 | except: |
|---|
| 90 | warning = "'%s' is not a valid chrom value for the region. " %chr |
|---|
| 91 | warnings.append( warning ) |
|---|
| 92 | skipped_lines += 1 |
|---|
| 93 | if not invalid_line: |
|---|
| 94 | first_invalid_line = i + 1 |
|---|
| 95 | invalid_line = line |
|---|
| 96 | continue |
|---|
| 97 | if not len( matches ) == 1: |
|---|
| 98 | warning = "Interval must match exactly one target region. " |
|---|
| 99 | warnings.append( warning ) |
|---|
| 100 | skipped_lines += 1 |
|---|
| 101 | if not invalid_line: |
|---|
| 102 | first_invalid_line = i + 1 |
|---|
| 103 | invalid_line = line |
|---|
| 104 | continue |
|---|
| 105 | region = matches[0] |
|---|
| 106 | if not ( start >= region.start and end <= region.end ): |
|---|
| 107 | warning = "Interval must fall entirely within region. " |
|---|
| 108 | warnings.append( warning ) |
|---|
| 109 | skipped_lines += 1 |
|---|
| 110 | if not invalid_line: |
|---|
| 111 | first_invalid_line = i + 1 |
|---|
| 112 | invalid_line = line |
|---|
| 113 | continue |
|---|
| 114 | region_name = region.value |
|---|
| 115 | rel_start = start - region.start |
|---|
| 116 | rel_end = end - region.start |
|---|
| 117 | if not rel_start < rel_end: |
|---|
| 118 | warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end ) |
|---|
| 119 | warnings.append( warning ) |
|---|
| 120 | skipped_lines += 1 |
|---|
| 121 | if not invalid_line: |
|---|
| 122 | first_invalid_line = i + 1 |
|---|
| 123 | invalid_line = line |
|---|
| 124 | continue |
|---|
| 125 | s = h5.getNode( h5.root, "scores_" + region_name ) |
|---|
| 126 | c = h5.getNode( h5.root, "counts_" + region_name ) |
|---|
| 127 | score = s[rel_end-1] |
|---|
| 128 | count = c[rel_end-1] |
|---|
| 129 | if rel_start > 0: |
|---|
| 130 | score -= s[rel_start-1] |
|---|
| 131 | count -= c[rel_start-1] |
|---|
| 132 | if per_col: |
|---|
| 133 | score /= count |
|---|
| 134 | fields.append( str( score ) ) |
|---|
| 135 | out_file.write( "%s\n" % "\t".join( fields ) ) |
|---|
| 136 | # Close the file handle |
|---|
| 137 | h5.close() |
|---|
| 138 | out_file.close() |
|---|
| 139 | |
|---|
| 140 | if warnings: |
|---|
| 141 | warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings ) |
|---|
| 142 | warn_msg += warnings[0] |
|---|
| 143 | print warn_msg |
|---|
| 144 | if skipped_lines: |
|---|
| 145 | print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) |
|---|
| 146 | |
|---|
| 147 | if __name__ == "__main__": main() |
|---|