[2] | 1 | #!/usr/bin/env python |
---|
| 2 | # Greg Von Kuster |
---|
| 3 | """ |
---|
| 4 | usage: %prog score_file interval_file chrom start stop [out_file] [options] |
---|
| 5 | -b, --binned: 'score_file' is actually a directory of binned array files |
---|
| 6 | -m, --mask=FILE: bed file containing regions not to consider valid |
---|
| 7 | -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file |
---|
| 8 | """ |
---|
| 9 | |
---|
| 10 | from __future__ import division |
---|
| 11 | from galaxy import eggs |
---|
| 12 | import pkg_resources |
---|
| 13 | pkg_resources.require( "bx-python" ) |
---|
| 14 | pkg_resources.require( "lrucache" ) |
---|
| 15 | try: |
---|
| 16 | pkg_resources.require( "python-lzo" ) |
---|
| 17 | except: |
---|
| 18 | pass |
---|
| 19 | |
---|
| 20 | import psyco_full |
---|
| 21 | import sys |
---|
| 22 | import os, os.path |
---|
| 23 | from UserDict import DictMixin |
---|
| 24 | import bx.wiggle |
---|
| 25 | from bx.binned_array import BinnedArray, FileBinnedArray |
---|
| 26 | from bx.bitset import * |
---|
| 27 | from bx.bitset_builders import * |
---|
| 28 | from fpconst import isNaN |
---|
| 29 | from bx.cookbook import doc_optparse |
---|
| 30 | from galaxy.tools.exception_handling import * |
---|
| 31 | |
---|
| 32 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 33 | |
---|
| 34 | import tempfile, struct |
---|
| 35 | class PositionalScoresOnDisk: |
---|
| 36 | fmt = 'f' |
---|
| 37 | fmt_size = struct.calcsize( fmt ) |
---|
| 38 | default_value = float( 'nan' ) |
---|
| 39 | |
---|
| 40 | def __init__( self ): |
---|
| 41 | self.file = tempfile.TemporaryFile( 'w+b' ) |
---|
| 42 | self.length = 0 |
---|
| 43 | def __getitem__( self, i ): |
---|
| 44 | if i < 0: i = self.length + i |
---|
| 45 | if i < 0 or i >= self.length: return self.default_value |
---|
| 46 | try: |
---|
| 47 | self.file.seek( i * self.fmt_size ) |
---|
| 48 | return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0] |
---|
| 49 | except Exception, e: |
---|
| 50 | raise IndexError, e |
---|
| 51 | def __setitem__( self, i, value ): |
---|
| 52 | if i < 0: i = self.length + i |
---|
| 53 | if i < 0: raise IndexError, 'Negative assignment index out of range' |
---|
| 54 | if i >= self.length: |
---|
| 55 | self.file.seek( self.length * self.fmt_size ) |
---|
| 56 | self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) ) |
---|
| 57 | self.length = i + 1 |
---|
| 58 | self.file.seek( i * self.fmt_size ) |
---|
| 59 | self.file.write( struct.pack( self.fmt, value ) ) |
---|
| 60 | def __len__( self ): |
---|
| 61 | return self.length |
---|
| 62 | def __repr__( self ): |
---|
| 63 | i = 0 |
---|
| 64 | repr = "[ " |
---|
| 65 | for i in xrange( self.length ): |
---|
| 66 | repr = "%s %s," % ( repr, self[i] ) |
---|
| 67 | return "%s ]" % ( repr ) |
---|
| 68 | |
---|
| 69 | class FileBinnedArrayDir( DictMixin ): |
---|
| 70 | """ |
---|
| 71 | Adapter that makes a directory of FileBinnedArray files look like |
---|
| 72 | a regular dict of BinnedArray objects. |
---|
| 73 | """ |
---|
| 74 | def __init__( self, dir ): |
---|
| 75 | self.dir = dir |
---|
| 76 | self.cache = dict() |
---|
| 77 | def __getitem__( self, key ): |
---|
| 78 | value = None |
---|
| 79 | if key in self.cache: |
---|
| 80 | value = self.cache[key] |
---|
| 81 | else: |
---|
| 82 | fname = os.path.join( self.dir, "%s.ba" % key ) |
---|
| 83 | if os.path.exists( fname ): |
---|
| 84 | value = FileBinnedArray( open( fname ) ) |
---|
| 85 | self.cache[key] = value |
---|
| 86 | if value is None: |
---|
| 87 | raise KeyError( "File does not exist: " + fname ) |
---|
| 88 | return value |
---|
| 89 | |
---|
| 90 | def stop_err(msg): |
---|
| 91 | sys.stderr.write(msg) |
---|
| 92 | sys.exit() |
---|
| 93 | |
---|
| 94 | def load_scores_wiggle( fname, chrom_buffer_size = 3 ): |
---|
| 95 | """ |
---|
| 96 | Read a wiggle file and return a dict of BinnedArray objects keyed |
---|
| 97 | by chromosome. |
---|
| 98 | """ |
---|
| 99 | scores_by_chrom = dict() |
---|
| 100 | try: |
---|
| 101 | for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ): |
---|
| 102 | if chrom not in scores_by_chrom: |
---|
| 103 | if chrom_buffer_size: |
---|
| 104 | scores_by_chrom[chrom] = BinnedArray() |
---|
| 105 | chrom_buffer_size -= 1 |
---|
| 106 | else: |
---|
| 107 | scores_by_chrom[chrom] = PositionalScoresOnDisk() |
---|
| 108 | scores_by_chrom[chrom][pos] = val |
---|
| 109 | except UCSCLimitException: |
---|
| 110 | # Wiggle data was truncated, at the very least need to warn the user. |
---|
| 111 | print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.' |
---|
| 112 | except IndexError: |
---|
| 113 | stop_err('Data error: one or more column data values is missing in "%s"' %fname) |
---|
| 114 | except ValueError: |
---|
| 115 | stop_err('Data error: invalid data type for one or more values in "%s".' %fname) |
---|
| 116 | return scores_by_chrom |
---|
| 117 | |
---|
| 118 | def load_scores_ba_dir( dir ): |
---|
| 119 | """ |
---|
| 120 | Return a dict-like object (keyed by chromosome) that returns |
---|
| 121 | FileBinnedArray objects created from "key.ba" files in `dir` |
---|
| 122 | """ |
---|
| 123 | return FileBinnedArrayDir( dir ) |
---|
| 124 | |
---|
| 125 | def main(): |
---|
| 126 | |
---|
| 127 | # Parse command line |
---|
| 128 | options, args = doc_optparse.parse( __doc__ ) |
---|
| 129 | |
---|
| 130 | try: |
---|
| 131 | score_fname = args[0] |
---|
| 132 | interval_fname = args[1] |
---|
| 133 | chrom_col = args[2] |
---|
| 134 | start_col = args[3] |
---|
| 135 | stop_col = args[4] |
---|
| 136 | if len( args ) > 5: |
---|
| 137 | out_file = open( args[5], 'w' ) |
---|
| 138 | else: |
---|
| 139 | out_file = sys.stdout |
---|
| 140 | binned = bool( options.binned ) |
---|
| 141 | mask_fname = options.mask |
---|
| 142 | except: |
---|
| 143 | doc_optparse.exit() |
---|
| 144 | |
---|
| 145 | if score_fname == 'None': |
---|
| 146 | stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' ) |
---|
| 147 | |
---|
| 148 | try: |
---|
| 149 | chrom_col = int(chrom_col) - 1 |
---|
| 150 | start_col = int(start_col) - 1 |
---|
| 151 | stop_col = int(stop_col) - 1 |
---|
| 152 | except: |
---|
| 153 | stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' ) |
---|
| 154 | |
---|
| 155 | if chrom_col < 0 or start_col < 0 or stop_col < 0: |
---|
| 156 | stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' ) |
---|
| 157 | |
---|
| 158 | if binned: |
---|
| 159 | scores_by_chrom = load_scores_ba_dir( score_fname ) |
---|
| 160 | else: |
---|
| 161 | try: |
---|
| 162 | chrom_buffer = int( options.chrom_buffer ) |
---|
| 163 | except: |
---|
| 164 | chrom_buffer = 3 |
---|
| 165 | scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer ) |
---|
| 166 | |
---|
| 167 | if mask_fname: |
---|
| 168 | masks = binned_bitsets_from_file( open( mask_fname ) ) |
---|
| 169 | else: |
---|
| 170 | masks = None |
---|
| 171 | |
---|
| 172 | skipped_lines = 0 |
---|
| 173 | first_invalid_line = 0 |
---|
| 174 | invalid_line = '' |
---|
| 175 | |
---|
| 176 | for i, line in enumerate( open( interval_fname )): |
---|
| 177 | valid = True |
---|
| 178 | line = line.rstrip('\r\n') |
---|
| 179 | if line and not line.startswith( '#' ): |
---|
| 180 | fields = line.split() |
---|
| 181 | |
---|
| 182 | try: |
---|
| 183 | chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] ) |
---|
| 184 | except: |
---|
| 185 | valid = False |
---|
| 186 | skipped_lines += 1 |
---|
| 187 | if not invalid_line: |
---|
| 188 | first_invalid_line = i + 1 |
---|
| 189 | invalid_line = line |
---|
| 190 | if valid: |
---|
| 191 | total = 0 |
---|
| 192 | count = 0 |
---|
| 193 | min_score = 100000000 |
---|
| 194 | max_score = -100000000 |
---|
| 195 | for j in range( start, stop ): |
---|
| 196 | if chrom in scores_by_chrom: |
---|
| 197 | try: |
---|
| 198 | # Skip if base is masked |
---|
| 199 | if masks and chrom in masks: |
---|
| 200 | if masks[chrom][j]: |
---|
| 201 | continue |
---|
| 202 | # Get the score, only count if not 'nan' |
---|
| 203 | score = scores_by_chrom[chrom][j] |
---|
| 204 | if not isNaN( score ): |
---|
| 205 | total += score |
---|
| 206 | count += 1 |
---|
| 207 | max_score = max( score, max_score ) |
---|
| 208 | min_score = min( score, min_score ) |
---|
| 209 | except: |
---|
| 210 | continue |
---|
| 211 | if count > 0: |
---|
| 212 | avg = total/count |
---|
| 213 | else: |
---|
| 214 | avg = "nan" |
---|
| 215 | min_score = "nan" |
---|
| 216 | max_score = "nan" |
---|
| 217 | |
---|
| 218 | # Build the resulting line of data |
---|
| 219 | out_line = [] |
---|
| 220 | for k in range(0, len(fields)): |
---|
| 221 | out_line.append(fields[k]) |
---|
| 222 | out_line.append(avg) |
---|
| 223 | out_line.append(min_score) |
---|
| 224 | out_line.append(max_score) |
---|
| 225 | |
---|
| 226 | print >> out_file, "\t".join( map( str, out_line ) ) |
---|
| 227 | else: |
---|
| 228 | skipped_lines += 1 |
---|
| 229 | if not invalid_line: |
---|
| 230 | first_invalid_line = i + 1 |
---|
| 231 | invalid_line = line |
---|
| 232 | elif line.startswith( '#' ): |
---|
| 233 | # We'll save the original comments |
---|
| 234 | print >> out_file, line |
---|
| 235 | |
---|
| 236 | out_file.close() |
---|
| 237 | |
---|
| 238 | if skipped_lines > 0: |
---|
| 239 | print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) |
---|
| 240 | if skipped_lines == i: |
---|
| 241 | print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.' |
---|
| 242 | |
---|
| 243 | if __name__ == "__main__": main() |
---|