root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/EGG-INFO/scripts/aggregate_scores_in_intervals.py @ 3

リビジョン 3, 4.0 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1#!/usr/bin/python2.6
2
3"""
4Given a list of intervals in BED format (`interval_file`) and a set of scores
5(`score_file`) print each interval plus the average, minimum, and maximum of
6the scores that fall in that interval. Scores can either be wiggle format
7data or a directory containing binned array files (named according to the
8sequence source / chromosome of the intervals).
9
10usage: %prog score_file interval_file [out_file] [options]
11    -b, --binned: 'score_file' is actually a directory of binned array files
12    -m, --mask=FILE: bed file containing regions not to consider valid
13"""
14
15from __future__ import division
16
17import psyco_full
18import sys
19import os, os.path
20from UserDict import DictMixin
21import bx.wiggle
22from bx.binned_array import BinnedArray, FileBinnedArray
23from bx.bitset import *
24from bx.bitset_builders import *
25from bx_extras.fpconst import isNaN
26from bx.cookbook import doc_optparse
27from bx import misc
28
29class FileBinnedArrayDir( DictMixin ):
30    """
31    Adapter that makes a directory of FileBinnedArray files look like
32    a regular dict of BinnedArray objects.
33    """
34    def __init__( self, dir ):
35        self.dir = dir
36        self.cache = dict()
37    def __getitem__( self, key ):
38        value = None
39        if key in self.cache:
40            value = self.cache[key]
41        else:
42            fname = os.path.join( self.dir, "%s.ba" % key )
43            if os.path.exists( fname ):
44                value = FileBinnedArray( open( fname ) )
45                self.cache[key] = value
46        if value is None:
47            raise KeyError( "File does not exist: " + fname )
48        return value
49
50def load_scores_wiggle( fname ):
51    """
52    Read a wiggle file and return a dict of BinnedArray objects keyed
53    by chromosome.
54    """
55    scores_by_chrom = dict()
56    for chrom, pos, val in bx.wiggle.Reader( misc.open_compressed( fname ) ):
57        if chrom not in scores_by_chrom:
58            scores_by_chrom[chrom] = BinnedArray()
59        scores_by_chrom[chrom][pos] = val
60    return scores_by_chrom
61
62def load_scores_ba_dir( dir ):
63    """
64    Return a dict-like object (keyed by chromosome) that returns
65    FileBinnedArray objects created from "key.ba" files in `dir`
66    """
67    return FileBinnedArrayDir( dir )
68   
69def main():
70
71    # Parse command line
72    options, args = doc_optparse.parse( __doc__ )
73    try:
74        score_fname = args[0]
75        interval_fname = args[1]
76        if len( args ) > 2:
77            out_file = open( args[2], 'w' )
78        else:
79            out_file = sys.stdout
80        binned = bool( options.binned )
81        mask_fname = options.mask
82    except:
83        doc_optparse.exit()
84
85    if binned:
86        scores_by_chrom = load_scores_ba_dir( score_fname )
87    else:
88        scores_by_chrom = load_scores_wiggle( score_fname )
89
90    if mask_fname:
91        masks = binned_bitsets_from_file( open( mask_fname ) )
92    else:
93        masks = None
94
95    for line in open( interval_fname ):
96        fields = line.split()
97        chrom, start, stop = fields[0], int( fields[1] ), int( fields[2] )
98        total = 0
99        count = 0
100        min_score = 100000000
101        max_score = -100000000
102        for i in range( start, stop ):
103            if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
104                # Skip if base is masked
105                if masks and chrom in masks:
106                    if masks[chrom][i]:
107                        continue
108                # Get the score, only count if not 'nan'
109                score = scores_by_chrom[chrom][i]
110                if not isNaN( score ):
111                    total += score
112                    count += 1
113                    max_score = max( score, max_score )
114                    min_score = min( score, min_score )
115        if count > 0:
116            avg = total/count
117        else:
118            avg = "nan"
119            min_score = "nan"
120            max_score = "nan"
121           
122        print >> out_file, "\t".join( map( str, [ chrom, start, stop, avg, min_score, max_score ] ) )
123
124    out_file.close()
125
126if __name__ == "__main__": main()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。