#!/usr/bin/env python
# Greg Von Kuster
"""
usage: %prog score_file interval_file chrom_col start_col stop_col [out_file] [options]
    -b, --binned: 'score_file' is actually a directory of binned array files
    -m, --mask=FILE: bed file containing regions not to consider valid
    -c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user-supplied score file
"""
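
# Example invocation (script and file names are hypothetical): append average,
# minimum and maximum score columns to a 3-column interval file, reading scores
# from a directory of per-chromosome binned array (".ba") files:
#
#   python aggregate_scores_in_intervals.py -b /path/to/binned_scores intervals.bed 1 2 3 out.interval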

from __future__ import division
from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
pkg_resources.require( "lrucache" )
try:
    pkg_resources.require( "python-lzo" )
except:
    pass

import psyco_full
import sys
import os, os.path
from UserDict import DictMixin
import bx.wiggle
from bx.binned_array import BinnedArray, FileBinnedArray
from bx.bitset import *
from bx.bitset_builders import *
from fpconst import isNaN
from bx.cookbook import doc_optparse
from galaxy.tools.exception_handling import *

assert sys.version_info[:2] >= ( 2, 4 )

import tempfile, struct

class PositionalScoresOnDisk:
    """
    Sparse, disk-backed sequence of per-position float scores.  Values are
    stored as packed 32-bit floats in a temporary file; positions that have
    never been assigned read back as NaN.
    """
    fmt = 'f'
    fmt_size = struct.calcsize( fmt )
    default_value = float( 'nan' )

    def __init__( self ):
        self.file = tempfile.TemporaryFile( 'w+b' )
        self.length = 0
    def __getitem__( self, i ):
        if i < 0: i = self.length + i
        if i < 0 or i >= self.length: return self.default_value
        try:
            self.file.seek( i * self.fmt_size )
            return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
        except Exception, e:
            raise IndexError, e
    def __setitem__( self, i, value ):
        if i < 0: i = self.length + i
        if i < 0: raise IndexError, 'Negative assignment index out of range'
        if i >= self.length:
            # Pad the gap between the current end and position i with NaNs
            self.file.seek( self.length * self.fmt_size )
            self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
            self.length = i + 1
        self.file.seek( i * self.fmt_size )
        self.file.write( struct.pack( self.fmt, value ) )
    def __len__( self ):
        return self.length
    def __repr__( self ):
        rep = "[ "
        for i in xrange( self.length ):
            rep = "%s %s," % ( rep, self[i] )
        return "%s ]" % rep
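
# A minimal behavioral sketch (values are illustrative):
#
#   scores = PositionalScoresOnDisk()
#   scores[4] = 0.75     # positions 0-3 are padded with NaN on disk
#   scores[0]            # -> nan (default_value)
#   scores[4]            # -> 0.75
#   len( scores )        # -> 5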

class FileBinnedArrayDir( DictMixin ):
    """
    Adapter that makes a directory of FileBinnedArray files look like
    a regular dict of FileBinnedArray objects.
    """
    def __init__( self, dir ):
        self.dir = dir
        self.cache = dict()
    def __getitem__( self, key ):
        value = None
        if key in self.cache:
            value = self.cache[key]
        else:
            fname = os.path.join( self.dir, "%s.ba" % key )
            if os.path.exists( fname ):
                value = FileBinnedArray( open( fname ) )
                self.cache[key] = value
        if value is None:
            raise KeyError( "File does not exist: " + fname )
        return value
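
# A minimal usage sketch (the directory path and chromosome key are hypothetical);
# each "<chrom>.ba" file is opened lazily on first access and then cached:
#
#   scores = FileBinnedArrayDir( "/path/to/binned_scores" )
#   scores["chr1"][12345]    # score at base 12345 of chr1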

def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()

def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
    """
    Read a wiggle file and return a dict of score containers keyed by
    chromosome.  The first `chrom_buffer_size` chromosomes are held in
    memory as BinnedArrays; any further chromosomes are spilled to
    disk-backed PositionalScoresOnDisk objects.
    """
    scores_by_chrom = dict()
    try:
        for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
            if chrom not in scores_by_chrom:
                if chrom_buffer_size:
                    scores_by_chrom[chrom] = BinnedArray()
                    chrom_buffer_size -= 1
                else:
                    scores_by_chrom[chrom] = PositionalScoresOnDisk()
            scores_by_chrom[chrom][pos] = val
    except UCSCLimitException:
        # Wiggle data was truncated, at the very least need to warn the user.
        print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
    except IndexError:
        stop_err( 'Data error: one or more column data values is missing in "%s".' % fname )
    except ValueError:
        stop_err( 'Data error: invalid data type for one or more values in "%s".' % fname )
    return scores_by_chrom
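
# Example (file name is hypothetical): per-base scores are then read back as
#
#   scores_by_chrom = load_scores_wiggle( "scores.wig" )
#   scores_by_chrom["chr1"][1000]    # -> float score, or NaN if that base is unset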

def load_scores_ba_dir( dir ):
    """
    Return a dict-like object (keyed by chromosome) that returns
    FileBinnedArray objects created from "key.ba" files in `dir`.
    """
    return FileBinnedArrayDir( dir )

def main():

    # Parse command line
    options, args = doc_optparse.parse( __doc__ )

    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len( args ) > 5:
            out_file = open( args[5], 'w' )
        else:
            out_file = sys.stdout
        binned = bool( options.binned )
        mask_fname = options.mask
    except:
        doc_optparse.exit()

    if score_fname == 'None':
        stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )

    # Convert the 1-based column numbers from the command line to 0-based indexes
    try:
        chrom_col = int( chrom_col ) - 1
        start_col = int( start_col ) - 1
        stop_col = int( stop_col ) - 1
    except:
        stop_err( 'Chrom, start and end columns are not properly set, click the pencil icon in your history item to set these values.' )

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err( 'Chrom, start and end columns are not properly set, click the pencil icon in your history item to set these values.' )

    if binned:
        scores_by_chrom = load_scores_ba_dir( score_fname )
    else:
        try:
            chrom_buffer = int( options.chrom_buffer )
        except:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )

    if mask_fname:
        masks = binned_bitsets_from_file( open( mask_fname ) )
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''

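    # For each interval, aggregate the per-base scores over [start, stop):
    # e.g. (illustrative numbers) bases scoring 1.0, 2.0 and NaN give
    # average 1.5, minimum 1.0 and maximum 2.0; NaN and masked bases are
    # not counted.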
    for i, line in enumerate( open( interval_fname ) ):
        valid = True
        line = line.rstrip( '\r\n' )
        if line and not line.startswith( '#' ):
            fields = line.split()

            try:
                chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
            except:
                valid = False
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
            if valid:
                total = 0
                count = 0
                min_score = 100000000
                max_score = -100000000
                for j in range( start, stop ):
                    if chrom in scores_by_chrom:
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isNaN( score ):
                                total += score
                                count += 1
                                max_score = max( score, max_score )
                                min_score = min( score, min_score )
                        except:
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                # Build the resulting line of data: the original fields plus
                # the average, minimum and maximum score columns
                out_line = list( fields )
                out_line.append( avg )
                out_line.append( min_score )
                out_line.append( max_score )

                print >> out_file, "\t".join( map( str, out_line ) )
            else:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith( '#' ):
            # Pass the original comment lines through to the output
            print >> out_file, line

    if out_file is not sys.stdout:
        out_file.close()

    if skipped_lines > 0:
        print 'Data issue: skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
        if skipped_lines == i + 1:
            # Every line in the interval file was skipped
            print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'

if __name__ == "__main__": main()