root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/intervals/io.py @ 3

リビジョン 3, 9.1 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Support for reading and writing genomic intervals from delimited text files.
3"""
4
5import sys
6from itertools import *
7from bx.tabular.io import *
8from bx.bitset import *
9
10class MissingFieldError( ParseError ):
11    pass
12
13class FieldFormatError( ParseError ):
14    def __init__( self, *args, **kwargs):
15        ParseError.__init__( self, *args, **kwargs )
16        self.expected = kwargs.get("expected",None)
17    def __str__( self ):
18        if self.expected:
19            return ParseError.__str__( self ) + ", " + self.expected + " expected"
20        else:
21            return ParseError.__str__( self )
22
23class StrandFormatError( ParseError ):
24    pass
25
26class GenomicInterval( TableRow ):
27    """
28    A genomic interval stored in a set of fields (a row of a table)
29    """
30    def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, fix_strand=False ):
31        TableRow.__init__( self, reader, fields )
32        self.chrom_col = chrom_col
33        self.start_col = start_col
34        self.end_col = end_col
35        self.strand_col = strand_col
36        self.nfields = nfields = len( fields )
37        # Parse chrom/source column
38        if chrom_col >= nfields:
39            raise MissingFieldError( "No field for chrom_col (%d)" % chrom_col )
40        self.chrom = fields[chrom_col]
41        # Parse start column and ensure it is an integer
42        if start_col >= nfields:
43            raise MissingFieldError( "No field for start_col (%d)" % start_col )
44        try:
45            self.start = int( fields[start_col] )
46        except ValueError, e:
47            raise FieldFormatError( "Could not parse start_col: " + str( e ), expected="integer" )
48        # Parse end column and ensure it is an integer
49        if end_col >= nfields:
50            raise MissingFieldError( "No field for end_col (%d)" % end_col )
51        try:
52            self.end = int( fields[end_col] )
53        except ValueError, e:
54            raise FieldFormatError( "Could not parse end_col: " + str( e ), expected="integer" )
55        # Ensure start <= end
56        if self.end < self.start:
57            raise ParseError( "Start is greater than End. Interval length is < 1." )
58        # Parse strand and ensure it is valid
59        if strand_col >= nfields or strand_col < 0:
60            # This should probable be immutable since the fields are
61            # not updated when it is set
62            self.strand = default_strand
63        else:
64            strand = fields[strand_col]
65            if strand not in ( "+", "-"):
66                if fix_strand:
67                    strand = "+"
68                else: raise StrandFormatError( "Strand must be either '+' or '-'" )
69            self.strand = strand
70    def __setattr__( self, name, value ):
71        if name == "chrom":
72            self.fields[self.chrom_col] = str( value )
73        elif name == "start":
74            self.fields[self.start_col] = str( value )
75        elif name == "end":
76            self.fields[self.end_col] = str( value )
77        elif name == "strand":
78            if self.strand_col < self.nfields and self.strand_col >= 0:
79                self.fields[self.strand_col] = str( value )
80        object.__setattr__( self, name, value )
81    def __str__( self ):
82        return "\t".join( self.fields )
83    def copy( self ):
84        return GenomicInterval(self.reader, list( self.fields ), self.chrom_col, self.start_col, self.end_col, self.strand_col, self.strand)
85
86class GenomicIntervalReader( TableReader ):
87    """
88    Reader for iterating a set of intervals in a tab separated file. Can
89    also parse header and comment lines if requested.
90   
91    >>> r = GenomicIntervalReader( [ "#chrom\\tname\\tstart\\tend\\textra",
92    ...               "chr1\\tfoo\\t1\\t100\\txxx",
93    ...               "chr2\\tbar\\t20\\t300\\txxx",
94    ...               "#I am a comment",
95    ...               "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
96    >>> elements = list( r )
97    >>> assert type( elements[0] ) is Header
98    >>> str( elements[0] )
99    '#chrom\\tname\\tstart\\tend\\textra'
100    >>> assert type( elements[1] ) is GenomicInterval
101    >>> print elements[1].start, elements[1].end
102    1 100
103    >>> str( elements[1] )
104    'chr1\\tfoo\\t1\\t100\\txxx'
105    >>> elements[1].start = 30
106    >>> print elements[1].start, elements[1].end
107    30 100
108    >>> str( elements[1] )
109    'chr1\\tfoo\\t30\\t100\\txxx'
110    >>> assert type( elements[2] ) is GenomicInterval
111    >>> assert type( elements[3] ) is Comment
112    >>> assert type( elements[4] ) is GenomicInterval
113    """
114    def __init__( self, input, chrom_col=0, start_col=1, end_col=2, strand_col=5,
115                  default_strand="+", return_header=True, return_comments=True, force_header=None, fix_strand=False, comment_lines_startswith = ["#", "track "] ):
116        TableReader.__init__( self, input, return_header, return_comments, force_header, comment_lines_startswith )
117        self.chrom_col = chrom_col
118        self.start_col = start_col
119        self.end_col = end_col
120        self.strand_col = strand_col
121        self.default_strand = default_strand
122        self.fix_strand = fix_strand
123    def parse_row( self, line ):
124        return GenomicInterval( self, line.split( "\t" ), self.chrom_col,
125                                self.start_col, self.end_col,
126                                self.strand_col, self.default_strand, fix_strand=self.fix_strand )
127
128    def binned_bitsets( self , upstream_pad=0, downstream_pad=0, lens={} ):
129        # The incoming lens dictionary is a dictionary of chromosome lengths
130        # which are used to initialize the bitsets.
131        last_chrom = None
132        last_bitset = None
133        bitsets = dict()
134        for interval in self:
135            if type( interval ) == GenomicInterval:
136                chrom = interval[self.chrom_col]
137                if chrom != last_chrom:
138                    if chrom not in bitsets:
139                        size = lens.get( chrom, MAX )
140                        try:
141                            bbs = BinnedBitSet( size )
142                        except ValueError, e:
143                            # We will only reach here when constructing this bitset from the lens dict
144                            # since the value of MAX is always safe.
145                            raise Exception( "Invalid chrom length %s in 'lens' dictionary. %s" % ( str( size ), str( e ) ) )
146                        bitsets[chrom] = bbs
147                    last_chrom = chrom
148                    last_bitset = bitsets[chrom]
149                start = max( int( interval[self.start_col] ), 0 )
150                end = min( int( interval[self.end_col] ), last_bitset.size)
151                last_bitset.set_range( start, end-start )
152        return bitsets
153
154class NiceReaderWrapper( GenomicIntervalReader ):
155    def __init__( self, reader, **kwargs ):
156        GenomicIntervalReader.__init__( self, reader, **kwargs )
157        self.outstream = kwargs.get("outstream", None)
158        self.print_delegate = kwargs.get("print_delegate", None)
159        self.input_wrapper = iter( self.input )
160        self.input_iter = self.iterwrapper()
161        self.skipped = 0
162        self.skipped_lines = []
163    def __iter__( self ):
164        return self
165    def next( self ):
166        while 1:
167            try:
168                nextitem = GenomicIntervalReader.next( self )
169                return nextitem
170            except ParseError, e:
171                if self.outstream:
172                    if self.print_delegate and hasattr(self.print_delegate,"__call__"):
173                        self.print_delegate( self.outstream, e, self )
174                self.skipped += 1
175                # no reason to stuff an entire bad file into memmory
176                if self.skipped < 10:
177                    self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
178    def iterwrapper( self ):
179        while 1:
180            self.current_line = self.input_wrapper.next()
181            yield self.current_line
182
183class BitsetSafeReaderWrapper( NiceReaderWrapper ):
184    def __init__( self, reader, lens={} ):
185        # This class handles any ValueError, IndexError and OverflowError exceptions that may be thrown when
186        # the bitsets are being created by skipping the problem lines.
187        # The incoming lens dictionary is a dictionary of chromosome lengths
188        # which are used to initialize the bitsets.
189        # It is assumed that the reader is an interval reader, i.e. it has chr_col, start_col, end_col and strand_col attributes.
190        NiceReaderWrapper.__init__( self, reader.input, chrom_col=reader.chrom_col, start_col=reader.start_col, end_col=reader.end_col, strand_col=reader.strand_col)
191        self.lens = lens
192    def next( self ):
193        while True:
194            rval = NiceReaderWrapper.next( self )
195            if type( rval ) == GenomicInterval and rval.end > self.lens.get( rval.chrom, MAX ): # MAX_INT is defined in bx.bitset
196                try:
197                    # This will only work if reader is a NiceReaderWrapper
198                    self.skipped += 1
199                    # no reason to stuff an entire bad file into memmory
200                    if self.skipped < 10:
201                        self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
202                except:
203                    pass
204            else:
205                return rval
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。