root/galaxy-central/eggs/GeneTrack-2.0.0_beta_1_dev_48da9e998f0caf01c5be731e926f4b0481f658f0-py2.6.egg/genetrack/hdflib.py @ 3

Revision 3, 10.4 KB (committed by kohda, 14 years ago)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

1"""
2Utilities for the hierarchical data format (HDF).
3
4"""
5from tables import openFile
6from tables import IsDescription, IntCol, FloatCol
7from genetrack import logger, util, conf
8from itertools import *
9import os, bisect, gc, csv
10
11# missing file
12missing = lambda f: not os.path.isfile(f)
13
14# prints messages after processing chunk  number of lines
15CHUNK = 10**5
16
17class PositionalSchema( IsDescription ):
18    """
19    Stores a triplet of float values for each index.
20    """
21    # the position arguments must be present
22    idx = IntCol  ( pos=1 )  # index
23    fwd = FloatCol( pos=2 )  # values on the forward strand
24    rev = FloatCol( pos=3 )  # value on the reverse strand
25    val = FloatCol( pos=4 )  # weighted value on the combined strands
26
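# A sketch of how tables based on this schema are filled (mirrors what
# PositionalData.build() does below): rows are plain tuples ordered by the
# pos arguments above, e.g.
#
#   table = db.createTable("/", "chr1", PositionalSchema, "label chr1")
#   table.append([(146, 0.0, 1.0, 1.0), (254, 0.0, 3.0, 3.0)])
#   table.flush()
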
class PositionalData(object):
    """
    An HDF representation of coordinates with one or more values associated with
    each of these coordinates. The class can store such data for various labels (chromosomes).
    The default parser built into the class can process files in the following
    format::

        chrom   index   forward reverse value
        chr1    146     0.0     1.0     1.0
        chr1    254     0.0     3.0     3.0
        chr1    319     0.0     1.0     1.0
        chr1    328     0.0     1.0     1.0
        chr1    330     0.0     1.0     1.0
        chr1    339     0.0     1.0     1.0
        chr1    341     1.0     0.0     1.0
        ...

    The default representation is to store a value for the forward and reverse strands,
    and to produce a composite value (stored as the `value` column). In the most common
    case the composite value is simply the sum of the values on the forward
    and reverse strands. The input file must be sorted by both coordinates
    and chromosome (increasing order). Processing is performed in the
    following manner:

    >>> from genetrack import conf
    >>>
    >>> fname = conf.testdata('test-hdflib-input.txt')
    >>> index = PositionalData(fname=fname, workdir=conf.TEMP_DATA_DIR, update=True)

    Upon the first instantiation the index will be created if it did
    not exist or if the `update=True` parameter was set.

    The `workdir` parameter is optional and if present must point
    to the directory into which the resulting index file will be placed.
    The contents of the PositionalData object may be accessed as a list
    but note that only the accessed slice is loaded into memory (lazy access).

    >>> index.labels
    ['chr1', 'chr2', 'chr3']
    >>>
    >>> # this will return the HDF table as implemented in pytables
    >>> table = index.table('chr1')
    >>>
    >>> list(table.cols.idx[:10])
    [146, 254, 319, 328, 330, 339, 341, 342, 345, 362]
    >>>
    >>> list(table.cols.fwd[:10])
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0]
    >>>
    >>> list(table.cols.rev[:10])
    [1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]

    We may also find the indices for real coordinates. For example the genomic
    coordinates 400 and 600 map to internal data indices of 20 to 31
    (it works as a binary search that returns the left index).

    >>>
    >>> start, end = index.indices('chr1', 400, 600)
    >>> (start, end)
    (20, 31)

    We may also query for slices of data that span an interval:

    >>> results = index.query( 'chr1', 400, 600)
    >>>
    >>> # the attributes are numeric arrays, here they are cast to lists
    >>>
    >>> list(results.idx)
    [402, 403, 411, 419, 427, 432, 434, 443, 587, 593, 596]
    >>>
    >>> list(results.fwd)
    [0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0]
    >>>
    >>> list(results.rev)
    [3.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]
    >>>
    >>> list(results.val)
    [3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    >>> index.close()
    >>>

    To provide the fastest parsing the internal parser
    is not overridable. There are transformers that can
    change bed and gff files to this input format. See the
    `genetrack.scripts` module.
    """
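
    # A hypothetical sketch (file name and rows invented for illustration) of
    # writing an input file in the tab delimited format documented above; real
    # inputs are normally produced by the bed/gff transformers in the
    # genetrack.scripts module.
    #
    #   import csv
    #   writer = csv.writer(open('input.txt', 'wt'), delimiter='\t')
    #   writer.writerow(('chrom', 'index', 'forward', 'reverse', 'value'))
    #   writer.writerows([('chr1', 146, 0.0, 1.0, 1.0), ('chr1', 254, 0.0, 3.0, 3.0)])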

    def __init__(self, fname, workdir=None, update=False, nobuild=False, index=None ):
        """
        Create the PositionalData
        """
        self.fname = fname
        self.db = None

        # split the incoming name to find the real name, and base directory
        basedir, basename = os.path.split(self.fname)

        # the index may be stored in the workdir if it was specified
        basedir = workdir or basedir

        # this is the HDF index name that the file operates on
        self.index = index or conf.path_join(basedir, '%s.hdf' % basename)

        # debug messages
        logger.debug('file path %s' % self.fname)
        logger.debug('index path %s' % self.index)

        # no building permitted
        if nobuild and missing(self.index):
            raise Exception('No autobuild allowed and no index found at %s' % self.index)

        # creating indices if these are missing or an update is forced
        if update or missing(self.index):
            self.build()

        # operates on the HDF file
        self.db = openFile(self.index, mode='r')
        self.root = self.db.root

        # shows the internal labels
        logger.debug('index labels -> %s' % self.labels)

    def build(self):
        "May be overridden to use different parsers and schemas"

        logger.info( "file='%s'" % self.fname )
        logger.info( "index='%s'" % self.index)

        # check file for existence
        if missing(self.fname):
            raise IOError('missing data %s' % self.fname)

        # provides timing information
        timer = util.Timer()

        # iterate over the file
        reader = csv.reader( open(self.fname, 'rt'), delimiter='\t' )

        # unwind the reader until it hits the header
        for row in reader:
            if row[0] == 'chrom':
                break

        # helper function that flushes a table
        def flush( table, collect, name ):
            # commit the changes
            if collect:
                table.append(collect)
                table.flush()
                # nicer formatting for the row count
                size = util.commify( len(table) )
                logger.info('table=%s, contains %s rows' % (name, size) )

        # print a message after every CHUNK lines
        last_chrom = table = None
        db = openFile( self.index, mode='w', title='HDF index database')

        # continue on with reading, optimized for throughput
        # with minimal function calls
        collect = []
        for linec, row in izip(count(1), reader):

            # print progress and flush the collected rows periodically
            if (linec % CHUNK) == 0:
                logger.info("... processed %s lines" % util.commify(linec))
                flush( table=table, collect=collect, name=last_chrom )
                collect = []

            # get the values from each row
            chrom, index, fwd, rev, value = row
            index, fwd, rev, value = int(index), float(fwd), float(rev), float(value)

            # flush when switching chromosomes
            if chrom != last_chrom:
                # table==None at the beginning
                if table is not None:
                    #logger.debug("... flushing at line %s" % row)
                    flush( table=table, collect=collect, name=last_chrom )
                    collect = []

                # creates the new HDF table here
                table = db.createTable( "/", chrom, PositionalSchema, 'label %s' % chrom )
                logger.info("creating table:%s" % chrom)
                last_chrom = chrom

            collect.append( (index, fwd, rev, value) )

        # flush for last chromosome, report some timing information
        flush(table, collect, chrom)
        lineno = util.commify(linec)
        elapsed = timer.report()
        logger.info("finished inserting %s lines in %s" % (lineno, elapsed) )

        # close database
        db.close()

    @property
    def labels(self):
        "Labels in the file"
        labs = [ x.name for x in self.root._f_listNodes() ]
        util.nice_sort( labs )
        return labs

    def indices( self, label, start, end, colattr='idx'):
        """
        Returns the array indices that correspond to the start and end values of the index column

        Note that for this to work the values for the column attribute 'colattr'
        in the table must be sorted in increasing order
        """
        table  = self.table( label )
        column = getattr(table.cols, colattr)
        istart = bisect.bisect_left( column, start )
        iend   = bisect.bisect_left( column, end  )
        return istart, iend
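
    # Illustration using the data from the class docstring: on chr1 the idx column
    # is sorted in increasing order, so indices('chr1', 400, 600) performs two
    # bisect_left lookups and returns (20, 31), i.e. the slice of rows whose
    # coordinates fall in the half open interval [400, 600).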

    def query(self, label, start, end, pad=0, aslist=False ):
        """
        Returns data that spans start to end as a class
        with attributes for idx, fwd, rev and val
        """

        step  = 1
        table = self.table( label )
        istart, iend = self.indices(label=label, start=start-pad, end=end+pad)

        idx = table.cols.idx[istart:iend:step]
        fwd = table.cols.fwd[istart:iend:step]
        rev = table.cols.rev[istart:iend:step]
        val = table.cols.val[istart:iend:step]

        # sometimes we need all return values to be lists
        if aslist:
            idx, fwd, rev, val = map(list, (idx, fwd, rev, val))

        params = util.Params( idx=idx, fwd=fwd, rev=rev, val=val )
        return params
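
    # Example (mirrors the class docstring): index.query('chr1', 400, 600) returns
    # a util.Params object whose idx, fwd, rev and val attributes are parallel
    # column slices; pad widens the interval on both sides before the lookup and
    # aslist=True converts the numeric arrays into plain Python lists.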

    def chunks(self, label, size=10**6, step=1 ):
        """
        Returns the data in chunks of the given size. All columns are
        simultaneously iterated over.
        """
        table = self.table( label )
        for start in xrange(0, 10**9, size):
            end = start + size
            idx = table.cols.idx[start:end:step].tolist()
            if not idx:
                break
            fwd = table.cols.fwd[start:end:step].tolist()
            rev = table.cols.rev[start:end:step].tolist()
            val = table.cols.val[start:end:step].tolist()
            yield idx, fwd, rev, val
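
    # Example use (process() is a hypothetical callback): iterate over a label in
    # fixed size pieces without loading the whole table into memory; size counts
    # table rows, not genomic coordinates.
    #
    #   for idx, fwd, rev, val in index.chunks('chr1', size=10**6):
    #       process(idx, fwd, rev, val)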

    def table(self, label):
        return getattr( self.root, label )

    def chromosome(self, label):
        """
        Attempts to get a chromosome when specified by either
        the label or chr1, chr01, chrom01, chrI
        """
        return getattr( self.root, label )


    def close(self):
        if self.db is not None:
            self.db.close()

    def __del__(self):
        self.close()

def test( verbose=0 ):
    """
    Test runner
    """
    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose)

if __name__ == "__main__":
    test()