root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/bitset_builders.py

リビジョン 3, 6.0 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Support for creating dictionaries of `Bitset`s / `BinnedBitset`s from text
3files containing sets of "covered" intervals in sequences (e.g. `BED`_ files).
4
5.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1
6"""
7
8from warnings import warn
9from bx.bitset import *
10import re
11
def binned_bitsets_from_file( f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={} ):
    """
    Read interval lines into a dictionary mapping chromosome name to bitset.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom_col', 'start_col', and 'end_col' must exist in each line.
    - 'strand_col' is accepted for interface compatibility; strand is not
      used by this function.
    - 'upstream_pad' is subtracted from every start (clamped at 0) and
      'downstream_pad' is added to every end (clamped at the size used for
      that chromosome).
    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
      chromosomes will be assumed to be the maximum size (MAX). 'lens' is
      only read, never modified.

    Lines starting with '#' and whitespace-only lines are skipped. Fields are
    split on whitespace. A warning is issued when an interval's start is
    greater than its end.

    Returns a dict of chromosome name -> BinnedBitSet.
    """
    last_chrom = None
    last_bitset = None
    bitsets = dict()
    for line in f:
        if line.startswith("#") or line.isspace():
            continue
        fields = line.split()
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            # Refresh the size on every chromosome switch, not only when the
            # bitset is first created; otherwise returning to an already-seen
            # chromosome would clamp downstream padding against the size of
            # the previous chromosome.
            if chrom in lens:
                size = lens[chrom]
            else:
                size = MAX
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet( size )
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start, end = int( fields[start_col] ), int( fields[end_col] )
        if upstream_pad:
            start = max( 0, start - upstream_pad )
        if downstream_pad:
            end = min( size, end + downstream_pad )
        if start > end:
            warn( "Interval start after end!" )
        # set_range is called with the start position and the interval length.
        last_bitset.set_range( start, end - start )
    return bitsets
48
def binned_bitsets_from_bed_file( f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream_pad=0, downstream_pad=0, lens={} ):
    """
    Read BED-style lines into a dictionary mapping chromosome name to bitset.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom_col', 'start_col', and 'end_col' must exist in each line.
    - 'strand_col' is accepted for interface compatibility; strand is not
      used by this function.
    - 'upstream_pad' is subtracted from every start (clamped at 0) and
      'downstream_pad' is added to every end (clamped at the size used for
      that chromosome).
    - if 'lens' is provided bitset sizes will be looked up from it, otherwise
      chromosomes will be assumed to be the maximum size (MAX). 'lens' is
      only read, never modified.

    Lines starting with '#', whitespace-only lines and 'browser' lines are
    skipped. A 'track' line is never treated as an interval, but if it
    contains 'offset=<digits>' that value is added to the start and end of
    every following interval until another track line changes it.

    Returns a dict of chromosome name -> BinnedBitSet.
    """
    last_chrom = None
    last_bitset = None
    bitsets = dict()
    offset = 0
    for line in f:
        if line.startswith("#") or line.isspace():
            continue
        # Ignore browser lines completely
        if line.startswith( "browser" ):
            continue
        # Need to check track lines due to the offset
        if line.startswith( "track" ):
            m = re.search( r"offset=(\d+)", line )
            if m:
                offset = int( m.group( 1 ) )
            continue
        fields = line.split()
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            # Refresh the size on every chromosome switch, not only when the
            # bitset is first created; otherwise returning to an already-seen
            # chromosome would clamp downstream padding against the size of
            # the previous chromosome.
            if chrom in lens:
                size = lens[chrom]
            else:
                size = MAX
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet( size )
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start = int( fields[start_col] ) + offset
        end = int( fields[end_col] ) + offset
        # NOTE: coordinates are used as given; no conversion is applied for
        # '-' strand intervals.
        if upstream_pad:
            start = max( 0, start - upstream_pad )
        if downstream_pad:
            end = min( size, end + downstream_pad )
        if start > end:
            warn( "Interval start after end!" )
        # set_range is called with the start position and the interval length.
        last_bitset.set_range( start, end - start )
    return bitsets
99
def binned_bitsets_proximity( f, chrom_col=0, start_col=1, end_col=2, strand_col=5, upstream=0, downstream=0 ):
    """
    Read a file into a dictionary of bitsets, extending each interval in a
    strand-aware way.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom_col', 'start_col', and 'end_col' must exist in each data line.
    - 'strand_col' is optional; a line is treated as '-' strand only when
      that column exists and equals '-', otherwise it is treated as '+'.
    - On '+' lines 'upstream' is subtracted from the start and 'downstream'
      is added to the end; on '-' lines 'upstream' is added to the end and
      'downstream' is subtracted from the start. Starts are clamped at 0
      and ends at MAX.

    Lines starting with '#' and lines with no fields (blank or
    whitespace-only) are skipped. Intervals whose extended length is not
    positive are ignored. Every bitset is created with size MAX.

    Returns a dict of chromosome name -> BinnedBitSet.
    """
    last_chrom = None
    last_bitset = None
    bitsets = dict()
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.split()
        # Blank lines produce no fields; skip them instead of failing on the
        # column lookups below.
        if not fields:
            continue
        strand = "+"
        if len(fields) >= strand_col + 1:
            if fields[strand_col] == "-":
                strand = "-"
        chrom = fields[chrom_col]
        if chrom != last_chrom:
            if chrom not in bitsets:
                bitsets[chrom] = BinnedBitSet( MAX )
            last_chrom = chrom
            last_bitset = bitsets[chrom]
        start, end = int( fields[start_col] ), int( fields[end_col] )
        if strand == "+":
            if upstream:
                start = max( 0, start - upstream )
            if downstream:
                end = min( MAX, end + downstream )
        else:
            # On the '-' strand, upstream lies past the end coordinate.
            if upstream:
                end = min( MAX, end + upstream )
            if downstream:
                start = max( 0, start - downstream )
        if end - start > 0:
            # set_range is called with the start position and the length.
            last_bitset.set_range( start, end - start )
    return bitsets
129
def binned_bitsets_from_list( list=[] ):
    """
    Build a dictionary of bitsets from a sequence of interval records.

    Each record is indexed positionally: item 0 is the chromosome name,
    items 1 and 2 are the start and end (converted with int()). Every
    bitset is created with size MAX.

    Returns a dict of chromosome name -> BinnedBitSet.
    """
    result = {}
    current_name = None
    current_bits = None
    for record in list:
        name = record[0]
        if name != current_name:
            # Only touch the dictionary when the chromosome changes.
            if name not in result:
                result[name] = BinnedBitSet( MAX )
            current_name = name
            current_bits = result[name]
        begin = int( record[1] )
        stop = int( record[2] )
        # set_range is called with the start position and the length.
        current_bits.set_range( begin, stop - begin )
    return result
145
def binned_bitsets_by_chrom( f, chrom, chrom_col=0, start_col=1, end_col=2):
    """
    Read the intervals of a single chromosome from a file into one bitset.

    - 'f' should be a file like object (or any iterable containing strings)
    - 'chrom' is the chromosome name to keep; lines whose 'chrom_col' field
      differs are ignored.
    - 'chrom_col' must exist in each data line; 'start_col' and 'end_col'
      must exist in each line that matches 'chrom'.

    Lines starting with '#' and lines with no fields (blank or
    whitespace-only) are skipped.

    Returns a single BinnedBitSet of size MAX (empty if nothing matched).
    """
    bitset = BinnedBitSet( MAX )
    for line in f:
        if line.startswith("#"):
            continue
        fields = line.split()
        # Blank lines produce no fields; skip them instead of failing on the
        # chromosome column lookup.
        if not fields:
            continue
        if fields[chrom_col] == chrom:
            start, end = int( fields[start_col] ), int( fields[end_col] )
            # set_range is called with the start position and the length.
            bitset.set_range( start, end - start )
    return bitset
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。