root/galaxy-central/lib/galaxy/datatypes/sniff.py @ 2

Revision 2, 10.2 KB (committer: hatakeyama, 14 years ago)

import galaxy-central

1"""
2File format detector
3"""
4import logging, sys, os, csv, tempfile, shutil, re, zipfile
5import registry
6from galaxy import util
7
8log = logging.getLogger(__name__)
9       
10def get_test_fname(fname):
11    """Returns test data filename"""
12    path, name = os.path.split(__file__)
13    full_path = os.path.join(path, 'test', fname)
14    return full_path
15
def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ):
    """Writes a stream to a temporary file, returns the temporary file's name and a flag indicating whether its content looks multi-byte"""
    fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text )
    CHUNK_SIZE = 1048576
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    while 1:
        chunk = stream.read( CHUNK_SIZE )
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file
            if zipfile.is_zipfile( temp_name ):
                is_compressed = True
            else:
                try:
                    if unicode( chunk[:2] ) == unicode( util.gzip_magic ):
                        is_compressed = True
                except:
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = util.is_multi_byte( chars )
                if not is_multi_byte:
                    for char in chars:
                        if ord( char ) > 128:
                            is_binary = True
                            break
            data_checked = True
        if not is_compressed and not is_binary:
            os.write( fd, chunk.encode( "utf-8" ) )
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write( fd, chunk )
    os.close( fd )
    return temp_name, is_multi_byte

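# Illustrative usage sketch (an assumption, not part of the original module):
# writing an in-memory stream to a temporary file.  StringIO stands in for the
# uploaded file stream that callers would normally pass in, and the
# 'sniff_example' prefix is arbitrary.
#
#     from StringIO import StringIO
#     temp_name, is_multi_byte = stream_to_file( StringIO( "chr1\t100\t200\n" ), prefix='sniff_example' )
#     # temp_name now names a temporary file holding the stream contents;
#     # the caller is responsible for removing it when finished.
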
def check_newlines( fname, bytes_to_read=52428800 ):
    """
    Determines if there are any non-POSIX newlines in the first
    bytes_to_read (by default, 50MB) of the file.
    """
    CHUNK_SIZE = 2 ** 20
    f = open( fname, 'r' )
    # Read successive chunks until bytes_to_read has been scanned or the file ends.
    while f.tell() < bytes_to_read:
        chunk = f.read( CHUNK_SIZE )
        if not chunk:
            break
        if chunk.count( '\r' ):
            f.close()
            return True
    f.close()
    return False

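# Illustrative usage sketch (an assumption, not part of the original module):
# check_newlines() is typically paired with convert_newlines() below, so a file
# is only rewritten when carriage returns are actually present.
#
#     if check_newlines( fname ):
#         line_count, _ = convert_newlines( fname )
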
def convert_newlines( fname, in_place=True ):
    """
    Converts a file from universal line endings to POSIX line endings
    (in place by default).

    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("1 2\\r3 4")
    >>> convert_newlines(fname)
    (2, None)
    >>> file(fname).read()
    '1 2\\n3 4\\n'
    """
    fd, temp_name = tempfile.mkstemp()
    fp = os.fdopen( fd, "wt" )
    for i, line in enumerate( file( fname, "U" ) ):
        fp.write( "%s\n" % line.rstrip( "\r\n" ) )
    fp.close()
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( i + 1, None )
    else:
        return ( i + 1, temp_name )

def sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Transforms in place a 'sep' separated file to a tab separated one

    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("1 2\\n3 4\\n")
    >>> sep2tabs(fname)
    (2, None)
    >>> file(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    regexp = re.compile( patt )
    fd, temp_name = tempfile.mkstemp()
    fp = os.fdopen( fd, "wt" )
    for i, line in enumerate( file( fname ) ):
        line  = line.rstrip( '\r\n' )
        elems = regexp.split( line )
        fp.write( "%s\n" % '\t'.join( elems ) )
    fp.close()
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( i + 1, None )
    else:
        return ( i + 1, temp_name )

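# Illustrative usage sketch (an assumption, not part of the original module):
# patt is compiled with re, so any regular expression can drive the split.
# Converting a comma-separated file to tabs, for example:
#
#     line_count, _ = sep2tabs( fname, patt="," )
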
def convert_newlines_sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Combines the above methods, convert_newlines() and sep2tabs(),
    so that files do not need to be read twice

    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("1 2\\r3 4")
    >>> convert_newlines_sep2tabs(fname)
    (2, None)
    >>> file(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    regexp = re.compile( patt )
    fd, temp_name = tempfile.mkstemp()
    fp = os.fdopen( fd, "wt" )
    for i, line in enumerate( file( fname, "U" ) ):
        line  = line.rstrip( '\r\n' )
        elems = regexp.split( line )
        fp.write( "%s\n" % '\t'.join( elems ) )
    fp.close()
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( i + 1, None )
    else:
        return ( i + 1, temp_name )

def get_headers( fname, sep, count=60, is_multi_byte=False ):
    """
    Returns a list with the first 'count' lines split by 'sep'

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname,'\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    """
    headers = []
    for idx, line in enumerate(file(fname)):
        line = line.rstrip('\n\r')
        if is_multi_byte:
            # TODO: fix this - sep is never found in line
            line = unicode( line, 'utf-8' )
            sep = sep.encode( 'utf-8' )
        headers.append( line.split(sep) )
        if idx == count:
            break
    return headers

def is_column_based( fname, sep='\t', skip=0, is_multi_byte=False ):
    """
    Checks whether the file is column based with respect to a separator
    (defaults to tab separator).

    >>> fname = get_test_fname('test.gff')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab.bed')
    >>> is_column_based(fname)
    True
    >>> is_column_based(fname, sep=' ')
    False
    >>> fname = get_test_fname('test_space.txt')
    >>> is_column_based(fname)
    False
    >>> is_column_based(fname, sep=' ')
    True
    >>> fname = get_test_fname('test_ensembl.tab')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname, sep=' ', skip=0)
    False
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname)
    True
    """
    headers = get_headers( fname, sep, is_multi_byte=is_multi_byte )
    count = 0
    if not headers:
        return False
    for hdr in headers[skip:]:
        if hdr and hdr[0] and not hdr[0].startswith('#'):
            if len(hdr) > 1:
                count = len(hdr)
            break
    if count < 2:
        return False
    for hdr in headers[skip:]:
        if hdr and hdr[0] and not hdr[0].startswith('#'):
            if len(hdr) != count:
                return False
    return True

def guess_ext( fname, sniff_order=None, is_multi_byte=False ):
    """
    Returns an extension that can be used in the datatype factory to
    generate a datatype for the 'fname' file

    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> guess_ext(fname)
    'blastxml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname)
    'interval'
    >>> fname = get_test_fname('interval1.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('test_tab.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('sequence.maf')
    >>> guess_ext(fname)
    'maf'
    >>> fname = get_test_fname('sequence.fasta')
    >>> guess_ext(fname)
    'fasta'
    >>> fname = get_test_fname('file.html')
    >>> guess_ext(fname)
    'html'
    >>> fname = get_test_fname('test.gtf')
    >>> guess_ext(fname)
    'gtf'
    >>> fname = get_test_fname('test.gff')
    >>> guess_ext(fname)
    'gff'
    >>> fname = get_test_fname('gff_version_3.gff')
    >>> guess_ext(fname)
    'gff3'
    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("a\\t2\\nc\\t1\\nd\\t0")
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("a 1 2 x\\nb 3 4 y\\nc 5 6 z")
    >>> guess_ext(fname)
    'txt'
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('alignment.lav')
    >>> guess_ext(fname)
    'lav'
    >>> fname = get_test_fname('1.sff')
    >>> guess_ext(fname)
    'sff'
    >>> fname = get_test_fname('1.bam')
    >>> guess_ext(fname)
    'bam'
    >>> fname = get_test_fname('3.bam')
    >>> guess_ext(fname)
    'bam'
    """
    if sniff_order is None:
        datatypes_registry = registry.Registry()
        sniff_order = datatypes_registry.sniff_order
    for datatype in sniff_order:
        # Some classes may not have a sniff function, which is ok.  In fact, the
        # Tabular and Text classes are 2 examples of classes that should never have
        # a sniff function.  Since these classes are default classes, they contain
        # few rules to filter out data of other formats, so they should only be
        # tried here after all other datatypes in sniff_order have failed to match.
        try:
            if datatype.sniff( fname ):
                return datatype.file_ext
        except:
            pass
    headers = get_headers( fname, None )
    is_binary = False
    if is_multi_byte:
        is_binary = False
    else:
        for hdr in headers:
            for char in hdr:
                if len( char ) > 1:
                    for c in char:
                        if ord( c ) > 128:
                            is_binary = True
                            break
                elif ord( char ) > 128:
                    is_binary = True
                    break
                if is_binary:
                    break
            if is_binary:
                break
    if is_binary:
        return 'data'        # default binary data type file extension
    if is_column_based( fname, '\t', 1, is_multi_byte=is_multi_byte ):
        return 'tabular'     # default tabular data type file extension
    return 'txt'             # default text data type file extension

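# Illustrative usage sketch (an assumption, not part of the original module):
# guess_ext() accepts any sequence of datatype objects as sniff_order, provided
# each exposes a file_ext attribute and, optionally, a sniff( fname ) method.
# The FakeBedDatatype class below is hypothetical and only shows that shape.
#
#     class FakeBedDatatype( object ):
#         file_ext = 'bed'
#         def sniff( self, fname ):
#             headers = get_headers( fname, '\t' )
#             return bool( headers ) and len( headers[0] ) >= 3
#
#     ext = guess_ext( get_test_fname( 'test_tab.bed' ), sniff_order=[ FakeBedDatatype() ] )
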
if __name__ == '__main__':
    import doctest, sys
    doctest.testmod(sys.modules[__name__])