root/galaxy-central/tools/annotation_profiler/annotation_profiler_for_interval.py

リビジョン 2, 20.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2#Dan Blankenberg
3#For a set of intervals, this tool returns the same set of intervals
4#with 2 additional fields: the name of a Table/Feature and the number of
5#bases covered. The original intervals are repeated for each Table/Feature.
6
7import sys, struct, optparse, os, random
8from galaxy import eggs
9import pkg_resources; pkg_resources.require( "bx-python" )
10import bx.intervals.io
11import bx.bitset
12try:
13    import psyco
14    psyco.full()
15except:
16    pass
17
18assert sys.version_info[:2] >= ( 2, 4 )
19
20class CachedRangesInFile:
21    DEFAULT_STRUCT_FORMAT = '<I'
22    def __init__( self, filename, profiler_info ):
23        self.file_size = os.stat( filename ).st_size
24        self.file = open( filename, 'rb' )
25        self.filename = filename
26        self.fmt = profiler_info.get( 'profiler_struct_format', self.DEFAULT_STRUCT_FORMAT )
27        self.fmt_size = int( profiler_info.get( 'profiler_struct_size', struct.calcsize( self.fmt ) ) )
28        self.length = int( self.file_size / self.fmt_size / 2 )
29        self._cached_ranges = [ None for i in xrange( self.length ) ]
30    def __getitem__( self, i ):
31        if self._cached_ranges[i] is not None:
32            return self._cached_ranges[i]
33        if i < 0: i = self.length + i
34        offset = i * self.fmt_size * 2
35        self.file.seek( offset )
36        try:
37            start = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
38            end = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
39        except Exception, e:
40            raise IndexError, e
41        self._cached_ranges[i] = ( start, end )
42        return start, end
43    def __len__( self ):
44        return self.length
45
46class RegionCoverage:
47    def __init__( self, filename_base, profiler_info ):
48        try:
49            self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info )
50        except Exception, e:
51            #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e )
52            self._coverage = []
53        try:
54            self._total_coverage = int( open( "%s.total_coverage" % filename_base ).read() )
55        except Exception, e:
56            #print "Error loading total coverage file %s: %s" % ( "%s.total_coverage" % filename_base, e )
57            self._total_coverage = 0
58    def get_start_index( self, start ):
59        #binary search: returns index of range closest to start
60        if start > self._coverage[-1][1]:
61            return len( self._coverage ) - 1
62        i = 0
63        j = len( self._coverage) - 1
64        while i < j:
65            k = ( i + j ) / 2
66            if start <= self._coverage[k][1]:
67                j = k
68            else:
69                i = k + 1
70        return i
71    def get_coverage( self, start, end ):
72        return self.get_coverage_regions_overlap( start, end )[0]
73    def get_coverage_regions_overlap( self, start, end ):
74        return self.get_coverage_regions_index_overlap( start, end )[0:2]
75    def get_coverage_regions_index_overlap( self, start, end ):
76        if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]:
77            return 0, 0, 0
78        if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]:
79            return self._total_coverage, len( self._coverage ), 0
80        coverage = 0
81        region_count = 0
82        start_index = self.get_start_index( start )
83        for i in xrange( start_index, len( self._coverage ) ):
84            c_start, c_end = self._coverage[i]
85            if c_start > end:
86                break
87            if c_start <= end and c_end >= start:
88                coverage += min( end, c_end ) - max( start, c_start )
89                region_count += 1
90        return coverage, region_count, start_index
91
92class CachedCoverageReader:
93    def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ):
94        self._base_file_path = base_file_path
95        self._buffer = buffer #number of chromosomes to keep in memory at a time
96        self._coverage = {}
97        if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ]
98        for tablename in table_names: self._coverage[tablename] = {}
99        if profiler_info is None: profiler_info = {}
100        self._profiler_info = profiler_info
101    def iter_table_coverage_by_region( self, chrom, start, end ):
102        for tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ):
103            yield tablename, coverage
104    def iter_table_coverage_regions_by_region( self, chrom, start, end ):
105        for tablename, coverage, regions, index in self.iter_table_coverage_regions_index_by_region( chrom, start, end ):
106            yield tablename, coverage, regions
107    def iter_table_coverage_regions_index_by_region( self, chrom, start, end ):
108        for tablename, chromosomes in self._coverage.iteritems():
109            if chrom not in chromosomes:
110                if len( chromosomes ) >= self._buffer:
111                    #randomly remove one chromosome from this table
112                    del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ]
113                chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ), self._profiler_info )
114            coverage, regions, index = chromosomes[chrom].get_coverage_regions_index_overlap( start, end )
115            yield tablename, coverage, regions, index
116
117class TableCoverageSummary:
118    def __init__( self, coverage_reader, chrom_lengths ):
119        self.coverage_reader = coverage_reader
120        self.chrom_lengths = chrom_lengths
121        self.chromosome_coverage = {} #dict of bitset by chromosome holding user's collapsed input intervals
122        self.total_interval_size = 0 #total size of user's input intervals
123        self.total_interval_count = 0 #total number of user's input intervals
124        self.table_coverage = {} #dict of total coverage by user's input intervals by table
125        self.table_chromosome_size = {} #dict of dict of table:chrom containing total coverage of table for a chrom
126        self.table_chromosome_count = {} #dict of dict of table:chrom containing total number of coverage ranges of table for a chrom
127        self.table_regions_overlaped_count = {} #total number of table regions overlaping user's input intervals (non unique)
128        self.interval_table_overlap_count = {} #total number of user input intervals which overlap table
129        self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome
130    def add_region( self, chrom, start, end ):
131        chrom_length = self.chrom_lengths.get( chrom )
132        region_start = min( start, chrom_length )
133        region_end = min( end, chrom_length )
134        region_length = region_end - region_start
135       
136        if region_length < 1 or region_start != start or region_end != end:
137            if chrom not in self.region_size_errors:
138                self.region_size_errors[chrom] = []
139            self.region_size_errors[chrom].append( ( start, end ) )
140            if region_length < 1: return
141       
142        self.total_interval_size += region_length
143        self.total_interval_count += 1
144        if chrom not in self.chromosome_coverage:
145            self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length )
146       
147        self.chromosome_coverage[chrom].set_range( region_start, region_length )
148        for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ):
149            if table_name not in self.table_coverage:
150                self.table_coverage[table_name] = 0
151                self.table_chromosome_size[table_name] = {}
152                self.table_regions_overlaped_count[table_name] = 0
153                self.interval_table_overlap_count[table_name] = 0
154                self.table_chromosome_count[table_name] = {}
155            if chrom not in self.table_chromosome_size[table_name]:
156                self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage
157                self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage )
158            self.table_coverage[table_name] += coverage
159            if coverage:
160                self.interval_table_overlap_count[table_name] += 1
161            self.table_regions_overlaped_count[table_name] += regions
162    def iter_table_coverage( self ):
163        def get_nr_coverage():
164            #returns non-redundant coverage, where user's input intervals have been collapse to resolve overlaps
165            table_coverage = {} #dictionary of tables containing number of table bases overlaped by nr intervals
166            interval_table_overlap_count = {} #dictionary of tables containing number of nr intervals overlaping table
167            table_regions_overlap_count = {} #dictionary of tables containing number of regions overlaped (unique)
168            interval_count = 0 #total number of nr intervals
169            interval_size = 0 #holds total size of nr intervals
170            region_start_end = {} #holds absolute start,end for each user input chromosome
171            for chrom, chromosome_bitset in self.chromosome_coverage.iteritems():
172                #loop through user's collapsed input intervals
173                end = 0
174                last_end_index = {}
175                interval_size += chromosome_bitset.count_range()
176                while True:
177                    if end >= chromosome_bitset.size: break
178                    start = chromosome_bitset.next_set( end )
179                    if start >= chromosome_bitset.size: break
180                    end = chromosome_bitset.next_clear( start )
181                    interval_count += 1
182                    if chrom not in region_start_end:
183                        region_start_end[chrom] = [start, end]
184                    else:
185                        region_start_end[chrom][1] = end
186                    for table_name, coverage, region_count, start_index in self.coverage_reader.iter_table_coverage_regions_index_by_region( chrom, start, end ):
187                        if table_name not in table_coverage:
188                            table_coverage[table_name] = 0
189                            interval_table_overlap_count[table_name] = 0
190                            table_regions_overlap_count[table_name] = 0
191                        table_coverage[table_name] += coverage
192                        if coverage:
193                            interval_table_overlap_count[table_name] += 1
194                            table_regions_overlap_count[table_name] += region_count
195                            if table_name in last_end_index and last_end_index[table_name] == start_index:
196                                table_regions_overlap_count[table_name] -= 1
197                            last_end_index[table_name] = start_index + region_count - 1
198            table_region_coverage = {} #total coverage for tables by bounding nr interval region
199            table_region_count = {} #total number for tables by bounding nr interval region
200            for chrom, start_end in region_start_end.items():
201                for table_name, coverage, region_count in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start_end[0], start_end[1] ):
202                    if table_name not in table_region_coverage:
203                        table_region_coverage[table_name] = 0
204                        table_region_count[table_name] = 0
205                    table_region_coverage[table_name] += coverage
206                    table_region_count[table_name] += region_count
207            return table_region_coverage, table_region_count, interval_count, interval_size, table_coverage, table_regions_overlap_count, interval_table_overlap_count
208        table_region_coverage, table_region_count, nr_interval_count, nr_interval_size, nr_table_coverage, nr_table_regions_overlap_count, nr_interval_table_overlap_count = get_nr_coverage()
209        for table_name in self.table_coverage:
210            #TODO: determine a type of statistic, then calculate and report here
211            yield table_name, sum( self.table_chromosome_size.get( table_name, {} ).values() ), sum( self.table_chromosome_count.get( table_name, {} ).values() ), table_region_coverage.get( table_name, 0 ), table_region_count.get( table_name, 0 ), self.total_interval_count, self.total_interval_size,  self.table_coverage[table_name], self.table_regions_overlaped_count.get( table_name, 0), self.interval_table_overlap_count.get( table_name, 0 ), nr_interval_count, nr_interval_size, nr_table_coverage[table_name], nr_table_regions_overlap_count.get( table_name, 0 ), nr_interval_table_overlap_count.get( table_name, 0 )
212
213def profile_per_interval( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader ):
214    out = open( out_filename, 'wb' )
215    for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
216        for table_name, coverage, region_count in coverage_reader.iter_table_coverage_regions_by_region( region.chrom, region.start, region.end ):
217            if keep_empty or coverage:
218                #only output regions that have atleast 1 base covered unless empty are requested
219                out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), table_name, coverage, region_count ) )
220    out.close()
221
222def profile_summary( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader, chrom_lengths ):
223    out = open( out_filename, 'wb' )
224    table_coverage_summary = TableCoverageSummary( coverage_reader, chrom_lengths )
225    for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
226        table_coverage_summary.add_region( region.chrom, region.start, region.end )
227   
228    out.write( "#tableName\ttableChromosomeCoverage\ttableChromosomeCount\ttableRegionCoverage\ttableRegionCount\tallIntervalCount\tallIntervalSize\tallCoverage\tallTableRegionsOverlaped\tallIntervalsOverlapingTable\tnrIntervalCount\tnrIntervalSize\tnrCoverage\tnrTableRegionsOverlaped\tnrIntervalsOverlapingTable\n" )
229    for table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count in table_coverage_summary.iter_table_coverage():
230        if keep_empty or total_coverage:
231            #only output tables that have atleast 1 base covered unless empty are requested
232            out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) )
233    out.close()
234   
235    #report chrom size errors as needed:
236    if table_coverage_summary.region_size_errors:
237        print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:"
238        for chrom, regions in table_coverage_summary.region_size_errors.items():
239            if len( regions ) > 3:
240                extra_region_info = ", ... "
241            else:
242                extra_region_info = ""
243            print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info )
244
245class ChromosomeLengths:
246    def __init__( self, profiler_info ):
247        self.chroms = {}
248        self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) )
249        chroms = profiler_info.get( 'chromosomes', None )
250        if chroms:
251            for chrom in chroms.split( ',' ):
252                for fields in chrom.rsplit( '=', 1 ):
253                    if len( fields ) == 2:
254                        self.chroms[ fields[0] ] = int( fields[1] )
255                    else:
256                        self.chroms[ fields[0] ] = self.default_bitset_size
257    def get( self, name ):
258        return self.chroms.get( name, self.default_bitset_size )
259
260def parse_profiler_info( filename ):
261    profiler_info = {}
262    try:
263        for line in open( filename ):
264            fields = line.rstrip( '\n\r' ).split( '\t', 1 )
265            if len( fields ) == 2:
266                if fields[0] in profiler_info:
267                    if not isinstance( profiler_info[ fields[0] ], list ):
268                        profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ]
269                    profiler_info[ fields[0] ].append( fields[1] )
270                else:
271                    profiler_info[ fields[0] ] = fields[1]
272    except:
273        pass #likely missing file
274    return profiler_info
275
276def __main__():
277    parser = optparse.OptionParser()
278    parser.add_option(
279        '-k','--keep_empty',
280        action="store_true",
281        dest='keep_empty',
282        default=False,
283        help='Keep tables with 0 coverage'
284    )
285    parser.add_option(
286        '-b','--buffer',
287        dest='buffer',
288        type='int',default=10,
289        help='Number of Chromosomes to keep buffered'
290    )
291    parser.add_option(
292        '-c','--chrom_col',
293        dest='chrom_col',
294        type='int',default=1,
295        help='Chromosome column'
296    )
297    parser.add_option(
298        '-s','--start_col',
299        dest='start_col',
300        type='int',default=2,
301        help='Start Column'
302    )
303    parser.add_option(
304        '-e','--end_col',
305        dest='end_col',
306        type='int',default=3,
307        help='End Column'
308    )
309    parser.add_option(
310        '-p','--path',
311        dest='path',
312        type='str',default='/galaxy/data/annotation_profiler/hg18',
313        help='Path to profiled data for this organism'
314    )
315    parser.add_option(
316        '-t','--table_names',
317        dest='table_names',
318        type='str',default='None',
319        help='Table names requested'
320    )
321    parser.add_option(
322        '-i','--input',
323        dest='interval_filename',
324        type='str',
325        help='Input Interval File'
326    )
327    parser.add_option(
328        '-o','--output',
329        dest='out_filename',
330        type='str',
331        help='Input Interval File'
332    )
333    parser.add_option(
334        '-S','--summary',
335        action="store_true",
336        dest='summary',
337        default=False,
338        help='Display Summary Results'
339    )
340   
341    options, args = parser.parse_args()
342   
343    #get profiler_info
344    profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) )
345   
346    table_names = options.table_names.split( "," )
347    if table_names == ['None']: table_names = None
348    coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info )
349   
350    if options.summary:
351        profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) )
352    else:
353        profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader )
354   
355    #print out data version info
356    print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), profiler_info.get( 'dump_time', 'unknown' ) )
357
358if __name__ == "__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。