1 | #!/usr/bin/env python
|
---|
2 | #Dan Blankenberg
|
---|
3 | #For a set of intervals, this tool returns the same set of intervals
|
---|
4 | #with 2 additional fields: the name of a Table/Feature and the number of
|
---|
5 | #bases covered. The original intervals are repeated for each Table/Feature.
|
---|
6 |
|
---|
7 | import sys, struct, optparse, os, random
|
---|
8 | from galaxy import eggs
|
---|
9 | import pkg_resources; pkg_resources.require( "bx-python" )
|
---|
10 | import bx.intervals.io
|
---|
11 | import bx.bitset
|
---|
12 | try:
|
---|
13 | import psyco
|
---|
14 | psyco.full()
|
---|
15 | except:
|
---|
16 | pass
|
---|
17 |
|
---|
18 | assert sys.version_info[:2] >= ( 2, 4 )
|
---|
19 |
|
---|
20 | class CachedRangesInFile:
|
---|
21 | DEFAULT_STRUCT_FORMAT = '<I'
|
---|
22 | def __init__( self, filename, profiler_info ):
|
---|
23 | self.file_size = os.stat( filename ).st_size
|
---|
24 | self.file = open( filename, 'rb' ) |
---|
25 | self.filename = filename
|
---|
26 | self.fmt = profiler_info.get( 'profiler_struct_format', self.DEFAULT_STRUCT_FORMAT )
|
---|
27 | self.fmt_size = int( profiler_info.get( 'profiler_struct_size', struct.calcsize( self.fmt ) ) )
|
---|
28 | self.length = int( self.file_size / self.fmt_size / 2 )
|
---|
29 | self._cached_ranges = [ None for i in xrange( self.length ) ]
|
---|
30 | def __getitem__( self, i ): |
---|
31 | if self._cached_ranges[i] is not None:
|
---|
32 | return self._cached_ranges[i]
|
---|
33 | if i < 0: i = self.length + i
|
---|
34 | offset = i * self.fmt_size * 2
|
---|
35 | self.file.seek( offset )
|
---|
36 | try:
|
---|
37 | start = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
|
---|
38 | end = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
|
---|
39 | except Exception, e: |
---|
40 | raise IndexError, e
|
---|
41 | self._cached_ranges[i] = ( start, end )
|
---|
42 | return start, end
|
---|
43 | def __len__( self ):
|
---|
44 | return self.length
|
---|
45 |
|
---|
46 | class RegionCoverage:
|
---|
47 | def __init__( self, filename_base, profiler_info ):
|
---|
48 | try:
|
---|
49 | self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info )
|
---|
50 | except Exception, e:
|
---|
51 | #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e )
|
---|
52 | self._coverage = []
|
---|
53 | try:
|
---|
54 | self._total_coverage = int( open( "%s.total_coverage" % filename_base ).read() )
|
---|
55 | except Exception, e:
|
---|
56 | #print "Error loading total coverage file %s: %s" % ( "%s.total_coverage" % filename_base, e )
|
---|
57 | self._total_coverage = 0
|
---|
58 | def get_start_index( self, start ):
|
---|
59 | #binary search: returns index of range closest to start
|
---|
60 | if start > self._coverage[-1][1]:
|
---|
61 | return len( self._coverage ) - 1
|
---|
62 | i = 0
|
---|
63 | j = len( self._coverage) - 1
|
---|
64 | while i < j:
|
---|
65 | k = ( i + j ) / 2
|
---|
66 | if start <= self._coverage[k][1]:
|
---|
67 | j = k
|
---|
68 | else:
|
---|
69 | i = k + 1
|
---|
70 | return i
|
---|
71 | def get_coverage( self, start, end ):
|
---|
72 | return self.get_coverage_regions_overlap( start, end )[0]
|
---|
73 | def get_coverage_regions_overlap( self, start, end ):
|
---|
74 | return self.get_coverage_regions_index_overlap( start, end )[0:2]
|
---|
75 | def get_coverage_regions_index_overlap( self, start, end ):
|
---|
76 | if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]:
|
---|
77 | return 0, 0, 0
|
---|
78 | if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]:
|
---|
79 | return self._total_coverage, len( self._coverage ), 0
|
---|
80 | coverage = 0
|
---|
81 | region_count = 0
|
---|
82 | start_index = self.get_start_index( start )
|
---|
83 | for i in xrange( start_index, len( self._coverage ) ):
|
---|
84 | c_start, c_end = self._coverage[i]
|
---|
85 | if c_start > end:
|
---|
86 | break
|
---|
87 | if c_start <= end and c_end >= start:
|
---|
88 | coverage += min( end, c_end ) - max( start, c_start )
|
---|
89 | region_count += 1
|
---|
90 | return coverage, region_count, start_index
|
---|
91 |
|
---|
92 | class CachedCoverageReader:
|
---|
93 | def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ):
|
---|
94 | self._base_file_path = base_file_path
|
---|
95 | self._buffer = buffer #number of chromosomes to keep in memory at a time
|
---|
96 | self._coverage = {}
|
---|
97 | if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ]
|
---|
98 | for tablename in table_names: self._coverage[tablename] = {}
|
---|
99 | if profiler_info is None: profiler_info = {}
|
---|
100 | self._profiler_info = profiler_info
|
---|
101 | def iter_table_coverage_by_region( self, chrom, start, end ):
|
---|
102 | for tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ):
|
---|
103 | yield tablename, coverage
|
---|
104 | def iter_table_coverage_regions_by_region( self, chrom, start, end ):
|
---|
105 | for tablename, coverage, regions, index in self.iter_table_coverage_regions_index_by_region( chrom, start, end ):
|
---|
106 | yield tablename, coverage, regions
|
---|
107 | def iter_table_coverage_regions_index_by_region( self, chrom, start, end ):
|
---|
108 | for tablename, chromosomes in self._coverage.iteritems():
|
---|
109 | if chrom not in chromosomes:
|
---|
110 | if len( chromosomes ) >= self._buffer:
|
---|
111 | #randomly remove one chromosome from this table
|
---|
112 | del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ]
|
---|
113 | chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ), self._profiler_info )
|
---|
114 | coverage, regions, index = chromosomes[chrom].get_coverage_regions_index_overlap( start, end )
|
---|
115 | yield tablename, coverage, regions, index
|
---|
116 |
|
---|
117 | class TableCoverageSummary:
|
---|
118 | def __init__( self, coverage_reader, chrom_lengths ):
|
---|
119 | self.coverage_reader = coverage_reader
|
---|
120 | self.chrom_lengths = chrom_lengths
|
---|
121 | self.chromosome_coverage = {} #dict of bitset by chromosome holding user's collapsed input intervals
|
---|
122 | self.total_interval_size = 0 #total size of user's input intervals
|
---|
123 | self.total_interval_count = 0 #total number of user's input intervals
|
---|
124 | self.table_coverage = {} #dict of total coverage by user's input intervals by table
|
---|
125 | self.table_chromosome_size = {} #dict of dict of table:chrom containing total coverage of table for a chrom
|
---|
126 | self.table_chromosome_count = {} #dict of dict of table:chrom containing total number of coverage ranges of table for a chrom
|
---|
127 | self.table_regions_overlaped_count = {} #total number of table regions overlaping user's input intervals (non unique)
|
---|
128 | self.interval_table_overlap_count = {} #total number of user input intervals which overlap table |
---|
129 | self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome
|
---|
130 | def add_region( self, chrom, start, end ):
|
---|
131 | chrom_length = self.chrom_lengths.get( chrom ) |
---|
132 | region_start = min( start, chrom_length ) |
---|
133 | region_end = min( end, chrom_length ) |
---|
134 | region_length = region_end - region_start |
---|
135 | |
---|
136 | if region_length < 1 or region_start != start or region_end != end: |
---|
137 | if chrom not in self.region_size_errors: |
---|
138 | self.region_size_errors[chrom] = [] |
---|
139 | self.region_size_errors[chrom].append( ( start, end ) ) |
---|
140 | if region_length < 1: return |
---|
141 | |
---|
142 | self.total_interval_size += region_length
|
---|
143 | self.total_interval_count += 1
|
---|
144 | if chrom not in self.chromosome_coverage:
|
---|
145 | self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length ) |
---|
146 | |
---|
147 | self.chromosome_coverage[chrom].set_range( region_start, region_length ) |
---|
148 | for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ):
|
---|
149 | if table_name not in self.table_coverage:
|
---|
150 | self.table_coverage[table_name] = 0
|
---|
151 | self.table_chromosome_size[table_name] = {}
|
---|
152 | self.table_regions_overlaped_count[table_name] = 0
|
---|
153 | self.interval_table_overlap_count[table_name] = 0
|
---|
154 | self.table_chromosome_count[table_name] = {}
|
---|
155 | if chrom not in self.table_chromosome_size[table_name]:
|
---|
156 | self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage
|
---|
157 | self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage )
|
---|
158 | self.table_coverage[table_name] += coverage
|
---|
159 | if coverage:
|
---|
160 | self.interval_table_overlap_count[table_name] += 1
|
---|
161 | self.table_regions_overlaped_count[table_name] += regions |
---|
162 | def iter_table_coverage( self ):
|
---|
163 | def get_nr_coverage():
|
---|
164 | #returns non-redundant coverage, where user's input intervals have been collapse to resolve overlaps
|
---|
165 | table_coverage = {} #dictionary of tables containing number of table bases overlaped by nr intervals
|
---|
166 | interval_table_overlap_count = {} #dictionary of tables containing number of nr intervals overlaping table
|
---|
167 | table_regions_overlap_count = {} #dictionary of tables containing number of regions overlaped (unique)
|
---|
168 | interval_count = 0 #total number of nr intervals
|
---|
169 | interval_size = 0 #holds total size of nr intervals
|
---|
170 | region_start_end = {} #holds absolute start,end for each user input chromosome
|
---|
171 | for chrom, chromosome_bitset in self.chromosome_coverage.iteritems():
|
---|
172 | #loop through user's collapsed input intervals
|
---|
173 | end = 0
|
---|
174 | last_end_index = {}
|
---|
175 | interval_size += chromosome_bitset.count_range()
|
---|
176 | while True:
|
---|
177 | if end >= chromosome_bitset.size: break
|
---|
178 | start = chromosome_bitset.next_set( end )
|
---|
179 | if start >= chromosome_bitset.size: break
|
---|
180 | end = chromosome_bitset.next_clear( start )
|
---|
181 | interval_count += 1
|
---|
182 | if chrom not in region_start_end:
|
---|
183 | region_start_end[chrom] = [start, end]
|
---|
184 | else:
|
---|
185 | region_start_end[chrom][1] = end
|
---|
186 | for table_name, coverage, region_count, start_index in self.coverage_reader.iter_table_coverage_regions_index_by_region( chrom, start, end ):
|
---|
187 | if table_name not in table_coverage:
|
---|
188 | table_coverage[table_name] = 0
|
---|
189 | interval_table_overlap_count[table_name] = 0
|
---|
190 | table_regions_overlap_count[table_name] = 0
|
---|
191 | table_coverage[table_name] += coverage
|
---|
192 | if coverage:
|
---|
193 | interval_table_overlap_count[table_name] += 1
|
---|
194 | table_regions_overlap_count[table_name] += region_count
|
---|
195 | if table_name in last_end_index and last_end_index[table_name] == start_index:
|
---|
196 | table_regions_overlap_count[table_name] -= 1
|
---|
197 | last_end_index[table_name] = start_index + region_count - 1
|
---|
198 | table_region_coverage = {} #total coverage for tables by bounding nr interval region
|
---|
199 | table_region_count = {} #total number for tables by bounding nr interval region
|
---|
200 | for chrom, start_end in region_start_end.items():
|
---|
201 | for table_name, coverage, region_count in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start_end[0], start_end[1] ):
|
---|
202 | if table_name not in table_region_coverage:
|
---|
203 | table_region_coverage[table_name] = 0
|
---|
204 | table_region_count[table_name] = 0
|
---|
205 | table_region_coverage[table_name] += coverage
|
---|
206 | table_region_count[table_name] += region_count
|
---|
207 | return table_region_coverage, table_region_count, interval_count, interval_size, table_coverage, table_regions_overlap_count, interval_table_overlap_count
|
---|
208 | table_region_coverage, table_region_count, nr_interval_count, nr_interval_size, nr_table_coverage, nr_table_regions_overlap_count, nr_interval_table_overlap_count = get_nr_coverage()
|
---|
209 | for table_name in self.table_coverage:
|
---|
210 | #TODO: determine a type of statistic, then calculate and report here
|
---|
211 | yield table_name, sum( self.table_chromosome_size.get( table_name, {} ).values() ), sum( self.table_chromosome_count.get( table_name, {} ).values() ), table_region_coverage.get( table_name, 0 ), table_region_count.get( table_name, 0 ), self.total_interval_count, self.total_interval_size, self.table_coverage[table_name], self.table_regions_overlaped_count.get( table_name, 0), self.interval_table_overlap_count.get( table_name, 0 ), nr_interval_count, nr_interval_size, nr_table_coverage[table_name], nr_table_regions_overlap_count.get( table_name, 0 ), nr_interval_table_overlap_count.get( table_name, 0 )
|
---|
212 |
|
---|
213 | def profile_per_interval( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader ):
|
---|
214 | out = open( out_filename, 'wb' )
|
---|
215 | for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
|
---|
216 | for table_name, coverage, region_count in coverage_reader.iter_table_coverage_regions_by_region( region.chrom, region.start, region.end ):
|
---|
217 | if keep_empty or coverage:
|
---|
218 | #only output regions that have atleast 1 base covered unless empty are requested
|
---|
219 | out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), table_name, coverage, region_count ) )
|
---|
220 | out.close()
|
---|
221 |
|
---|
222 | def profile_summary( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader, chrom_lengths ):
|
---|
223 | out = open( out_filename, 'wb' )
|
---|
224 | table_coverage_summary = TableCoverageSummary( coverage_reader, chrom_lengths )
|
---|
225 | for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
|
---|
226 | table_coverage_summary.add_region( region.chrom, region.start, region.end )
|
---|
227 |
|
---|
228 | out.write( "#tableName\ttableChromosomeCoverage\ttableChromosomeCount\ttableRegionCoverage\ttableRegionCount\tallIntervalCount\tallIntervalSize\tallCoverage\tallTableRegionsOverlaped\tallIntervalsOverlapingTable\tnrIntervalCount\tnrIntervalSize\tnrCoverage\tnrTableRegionsOverlaped\tnrIntervalsOverlapingTable\n" )
|
---|
229 | for table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count in table_coverage_summary.iter_table_coverage():
|
---|
230 | if keep_empty or total_coverage:
|
---|
231 | #only output tables that have atleast 1 base covered unless empty are requested
|
---|
232 | out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) )
|
---|
233 | out.close() |
---|
234 | |
---|
235 | #report chrom size errors as needed: |
---|
236 | if table_coverage_summary.region_size_errors: |
---|
237 | print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:" |
---|
238 | for chrom, regions in table_coverage_summary.region_size_errors.items(): |
---|
239 | if len( regions ) > 3: |
---|
240 | extra_region_info = ", ... " |
---|
241 | else: |
---|
242 | extra_region_info = "" |
---|
243 | print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info )
|
---|
244 |
|
---|
245 | class ChromosomeLengths:
|
---|
246 | def __init__( self, profiler_info ):
|
---|
247 | self.chroms = {}
|
---|
248 | self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) )
|
---|
249 | chroms = profiler_info.get( 'chromosomes', None )
|
---|
250 | if chroms:
|
---|
251 | for chrom in chroms.split( ',' ):
|
---|
252 | for fields in chrom.rsplit( '=', 1 ):
|
---|
253 | if len( fields ) == 2:
|
---|
254 | self.chroms[ fields[0] ] = int( fields[1] )
|
---|
255 | else:
|
---|
256 | self.chroms[ fields[0] ] = self.default_bitset_size
|
---|
257 | def get( self, name ):
|
---|
258 | return self.chroms.get( name, self.default_bitset_size )
|
---|
259 |
|
---|
260 | def parse_profiler_info( filename ):
|
---|
261 | profiler_info = {}
|
---|
262 | try:
|
---|
263 | for line in open( filename ):
|
---|
264 | fields = line.rstrip( '\n\r' ).split( '\t', 1 )
|
---|
265 | if len( fields ) == 2:
|
---|
266 | if fields[0] in profiler_info:
|
---|
267 | if not isinstance( profiler_info[ fields[0] ], list ):
|
---|
268 | profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ]
|
---|
269 | profiler_info[ fields[0] ].append( fields[1] )
|
---|
270 | else:
|
---|
271 | profiler_info[ fields[0] ] = fields[1]
|
---|
272 | except:
|
---|
273 | pass #likely missing file
|
---|
274 | return profiler_info
|
---|
275 |
|
---|
276 | def __main__():
|
---|
277 | parser = optparse.OptionParser()
|
---|
278 | parser.add_option(
|
---|
279 | '-k','--keep_empty',
|
---|
280 | action="store_true",
|
---|
281 | dest='keep_empty',
|
---|
282 | default=False,
|
---|
283 | help='Keep tables with 0 coverage'
|
---|
284 | )
|
---|
285 | parser.add_option(
|
---|
286 | '-b','--buffer',
|
---|
287 | dest='buffer',
|
---|
288 | type='int',default=10,
|
---|
289 | help='Number of Chromosomes to keep buffered'
|
---|
290 | )
|
---|
291 | parser.add_option(
|
---|
292 | '-c','--chrom_col',
|
---|
293 | dest='chrom_col',
|
---|
294 | type='int',default=1,
|
---|
295 | help='Chromosome column'
|
---|
296 | )
|
---|
297 | parser.add_option(
|
---|
298 | '-s','--start_col',
|
---|
299 | dest='start_col',
|
---|
300 | type='int',default=2,
|
---|
301 | help='Start Column'
|
---|
302 | )
|
---|
303 | parser.add_option(
|
---|
304 | '-e','--end_col',
|
---|
305 | dest='end_col',
|
---|
306 | type='int',default=3,
|
---|
307 | help='End Column'
|
---|
308 | )
|
---|
309 | parser.add_option(
|
---|
310 | '-p','--path',
|
---|
311 | dest='path',
|
---|
312 | type='str',default='/galaxy/data/annotation_profiler/hg18',
|
---|
313 | help='Path to profiled data for this organism'
|
---|
314 | )
|
---|
315 | parser.add_option(
|
---|
316 | '-t','--table_names',
|
---|
317 | dest='table_names',
|
---|
318 | type='str',default='None',
|
---|
319 | help='Table names requested'
|
---|
320 | )
|
---|
321 | parser.add_option(
|
---|
322 | '-i','--input',
|
---|
323 | dest='interval_filename',
|
---|
324 | type='str',
|
---|
325 | help='Input Interval File'
|
---|
326 | )
|
---|
327 | parser.add_option(
|
---|
328 | '-o','--output',
|
---|
329 | dest='out_filename',
|
---|
330 | type='str',
|
---|
331 | help='Input Interval File'
|
---|
332 | )
|
---|
333 | parser.add_option(
|
---|
334 | '-S','--summary',
|
---|
335 | action="store_true",
|
---|
336 | dest='summary',
|
---|
337 | default=False,
|
---|
338 | help='Display Summary Results'
|
---|
339 | )
|
---|
340 |
|
---|
341 | options, args = parser.parse_args()
|
---|
342 |
|
---|
343 | #get profiler_info
|
---|
344 | profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) )
|
---|
345 |
|
---|
346 | table_names = options.table_names.split( "," )
|
---|
347 | if table_names == ['None']: table_names = None
|
---|
348 | coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info )
|
---|
349 |
|
---|
350 | if options.summary:
|
---|
351 | profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) )
|
---|
352 | else:
|
---|
353 | profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader )
|
---|
354 |
|
---|
355 | #print out data version info
|
---|
356 | print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), profiler_info.get( 'dump_time', 'unknown' ) )
|
---|
357 |
|
---|
358 | if __name__ == "__main__": __main__()
|
---|