1 | """ |
---|
2 | File format detector |
---|
3 | """ |
---|
4 | import logging, sys, os, csv, tempfile, shutil, re, zipfile |
---|
5 | import registry |
---|
6 | from galaxy import util |
---|
7 | |
---|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
---|
9 | |
---|
def get_test_fname(fname):
    """Build the path of ``fname`` inside the ``test`` directory that sits next to this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, 'test', fname)
---|
15 | |
---|
def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ):
    """
    Writes a stream to a temporary file and returns a
    ( temp_name, is_multi_byte ) tuple.

    suffix, prefix, dir and text are handed to tempfile.mkstemp() unchanged.
    The first chunk read from the stream decides how every chunk is written:

    - compressed data (zip or gzip signature) and binary data are written
      untouched;
    - anything else is encoded as UTF-8 before being written.

    is_multi_byte is whatever util.is_multi_byte() reports for the first
    100 characters of the stream; it stays False for compressed data and
    for an empty stream.

    If reading or writing fails, the descriptor is closed, the partially
    written temporary file is removed and the error is re-raised.
    """
    CHUNK_SIZE = 1048576
    # Signatures a zip archive can start with: local file header, empty
    # archive and spanned archive.  The signature is checked on the chunk
    # itself because nothing has been written to the temporary file yet when
    # the first chunk is inspected, so zipfile.is_zipfile( temp_name ) could
    # never succeed.
    ZIP_MAGICS = ( b'PK\x03\x04', b'PK\x05\x06', b'PK\x07\x08' )
    fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text )
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    try:
        try:
            while True:
                chunk = stream.read( CHUNK_SIZE )
                if not chunk:
                    break
                if not data_checked:
                    # See if we're uploading a compressed file.  The gzip magic is
                    # compared directly: routing it through unicode() raises for
                    # its non-ASCII bytes, which used to disable the check.
                    is_compressed = chunk[:4] in ZIP_MAGICS or chunk[:2] == util.gzip_magic
                    if not is_compressed:
                        # See if we have a multi-byte character file
                        chars = chunk[:100]
                        is_multi_byte = util.is_multi_byte( chars )
                        if not is_multi_byte:
                            is_binary = any( ord( char ) > 128 for char in chars )
                    data_checked = True
                if not is_compressed and not is_binary:
                    os.write( fd, chunk.encode( "utf-8" ) )
                else:
                    # Compressed files must be encoded after they are uncompressed in the upload utility,
                    # while binary files should not be encoded at all.
                    os.write( fd, chunk )
        finally:
            os.close( fd )
    except Exception:
        # The caller never learns temp_name on failure, so clean up here.
        if os.path.exists( temp_name ):
            os.remove( temp_name )
        raise
    return temp_name, is_multi_byte
---|
56 | |
---|
def check_newlines( fname, bytes_to_read=52428800 ):
    """
    Determines if there are any non-POSIX newlines in the first
    bytes_to_read bytes (by default, 50MB) of the file.

    Returns True as soon as a carriage return is found, False otherwise.
    The file is read in 1MB chunks and opened in binary mode so that no
    newline translation can hide a carriage return.
    """
    CHUNK_SIZE = 2 ** 20
    remaining = bytes_to_read
    with open( fname, 'rb' ) as f:
        while remaining > 0:
            # Never read past the requested limit.
            chunk = f.read( min( CHUNK_SIZE, remaining ) )
            if not chunk:
                break
            if b'\r' in chunk:
                return True
            remaining -= len( chunk )
    return False
---|
72 | |
---|
def convert_newlines( fname, in_place=True ):
    """
    Converts in place a file from universal line endings
    to Posix line endings.

    Every line is rewritten with a single trailing newline, so the last
    line gains one if it had none.  Returns ( line_count, None ) when
    in_place is true; otherwise fname is left untouched and
    ( line_count, temp_name ) is returned, temp_name being the converted
    copy.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("1 2\\r3 4")
    >>> convert_newlines(fname)
    (2, None)
    >>> open(fname).read()
    '1 2\\n3 4\\n'
    """
    # Python 2 needs the explicit "U" flag for universal newlines; Python 3
    # text mode translates newlines by default and no longer accepts "U".
    mode = "r"
    if sys.version_info[0] < 3:
        mode = "U"
    line_count = 0
    with open( fname, mode ) as source:
        fd, temp_name = tempfile.mkstemp()
        with os.fdopen( fd, "wt" ) as target:
            for line in source:
                target.write( "%s\n" % line.rstrip( "\r\n" ) )
                line_count += 1
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
---|
96 | |
---|
def sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Transforms in place a 'sep' separated file to a tab separated one

    patt is the regular expression matching the separator; each line is
    split on it and the pieces are re-joined with tabs.  Returns
    ( line_count, None ) when in_place is true; otherwise fname is left
    untouched and ( line_count, temp_name ) is returned, temp_name being
    the converted copy.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("1 2\\n3 4\\n")
    >>> sep2tabs(fname)
    (2, None)
    >>> open(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    regexp = re.compile( patt )
    line_count = 0
    with open( fname ) as source:
        fd, temp_name = tempfile.mkstemp()
        with os.fdopen( fd, "wt" ) as target:
            for line in source:
                elems = regexp.split( line.rstrip( '\r\n' ) )
                target.write( "%s\n" % '\t'.join( elems ) )
                line_count += 1
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
---|
122 | |
---|
def convert_newlines_sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Combines above methods: convert_newlines() and sep2tabs()
    so that files do not need to be read twice

    Lines are read with universal newline handling, split on the regular
    expression patt and written back tab separated with Posix line
    endings.  Returns ( line_count, None ) when in_place is true;
    otherwise fname is left untouched and ( line_count, temp_name ) is
    returned, temp_name being the converted copy.  An empty file yields a
    line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("1 2\\r3 4")
    >>> convert_newlines_sep2tabs(fname)
    (2, None)
    >>> open(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    regexp = re.compile( patt )
    # Python 2 needs the explicit "U" flag for universal newlines; Python 3
    # text mode translates newlines by default and no longer accepts "U".
    mode = "r"
    if sys.version_info[0] < 3:
        mode = "U"
    line_count = 0
    with open( fname, mode ) as source:
        fd, temp_name = tempfile.mkstemp()
        with os.fdopen( fd, "wt" ) as target:
            for line in source:
                elems = regexp.split( line.rstrip( '\r\n' ) )
                target.write( "%s\n" % '\t'.join( elems ) )
                line_count += 1
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
---|
149 | |
---|
def get_headers( fname, sep, count=60, is_multi_byte=False ):
    """
    Returns a list with the first 'count' lines split by 'sep'

    Each entry is the list produced by splitting one line, stripped of its
    line ending, on sep; a sep of None splits on runs of whitespace.  When
    is_multi_byte is true, byte-string lines and a byte-string sep are
    decoded as UTF-8 before splitting, so both sides of the split have the
    same type.

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname,'\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    """
    # Decode the separator once, up front, instead of re-encoding it for
    # every line; a sep of None is left alone.
    if is_multi_byte and isinstance( sep, bytes ):
        sep = sep.decode( 'utf-8' )
    headers = []
    with open( fname ) as handle:
        for idx, line in enumerate( handle ):
            # Stop before appending so that exactly 'count' lines are returned.
            if idx >= count:
                break
            line = line.rstrip( '\n\r' )
            if is_multi_byte and isinstance( line, bytes ):
                line = line.decode( 'utf-8' )
            headers.append( line.split( sep ) )
    return headers
---|
169 | |
---|
def is_column_based( fname, sep='\t', skip=0, is_multi_byte=False ):
    """
    Tells whether the leading lines of the file form a table with respect
    to the separator 'sep' (a tab unless stated otherwise).

    The first 'skip' lines are ignored, as are blank lines and lines whose
    first field starts with '#'.  The file qualifies when at least one of
    the remaining lines has two or more fields and all of the remaining
    lines have that same number of fields.

    >>> fname = get_test_fname('test.gff')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab.bed')
    >>> is_column_based(fname)
    True
    >>> is_column_based(fname, sep=' ')
    False
    >>> fname = get_test_fname('test_space.txt')
    >>> is_column_based(fname)
    False
    >>> is_column_based(fname, sep=' ')
    True
    >>> fname = get_test_fname('test_ensembl.tab')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname, sep=' ', skip=0)
    False
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname)
    True
    """
    rows = get_headers( fname, sep, is_multi_byte=is_multi_byte )
    if not rows:
        return False
    # Keep only the data lines: non-empty, with a first field that is neither
    # blank nor a '#' comment marker.
    data_rows = [ row for row in rows[skip:]
                  if row and row[0] and not row[0].startswith('#') ]
    # The first multi-field data line fixes the expected width.
    width = 0
    for row in data_rows:
        if len(row) > 1:
            width = len(row)
            break
    if width < 2:
        return False
    for row in data_rows:
        if len(row) != width:
            return False
    return True
---|
214 | |
---|
def guess_ext( fname, sniff_order=None, is_multi_byte=False ):
    """
    Returns an extension that can be used in the datatype factory to
    generate a data for the 'fname' file

    Each datatype in sniff_order (by default the sniff order of a freshly
    built registry.Registry()) is asked to sniff the file, and the
    file_ext of the first one that recognises it is returned.  If none
    does, the result is 'data' when the leading lines contain a character
    above code point 128 (skipped when is_multi_byte is true), 'tabular'
    when the file is tab separated with a consistent column count, and
    'txt' otherwise.

    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> guess_ext(fname)
    'blastxml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname)
    'interval'
    >>> fname = get_test_fname('interval1.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('test_tab.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('sequence.maf')
    >>> guess_ext(fname)
    'maf'
    >>> fname = get_test_fname('sequence.fasta')
    >>> guess_ext(fname)
    'fasta'
    >>> fname = get_test_fname('file.html')
    >>> guess_ext(fname)
    'html'
    >>> fname = get_test_fname('test.gtf')
    >>> guess_ext(fname)
    'gtf'
    >>> fname = get_test_fname('test.gff')
    >>> guess_ext(fname)
    'gff'
    >>> fname = get_test_fname('gff_version_3.gff')
    >>> guess_ext(fname)
    'gff3'
    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("a\\t2\\nc\\t1\\nd\\t0")
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("a 1 2 x\\nb 3 4 y\\nc 5 6 z")
    >>> guess_ext(fname)
    'txt'
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('alignment.lav')
    >>> guess_ext(fname)
    'lav'
    >>> fname = get_test_fname('1.sff')
    >>> guess_ext(fname)
    'sff'
    >>> fname = get_test_fname('1.bam')
    >>> guess_ext(fname)
    'bam'
    >>> fname = get_test_fname('3.bam')
    >>> guess_ext(fname)
    'bam'
    """
    if sniff_order is None:
        datatypes_registry = registry.Registry()
        sniff_order = datatypes_registry.sniff_order
    for datatype in sniff_order:
        # Some classes may not have a sniff function, which is ok.  In fact, the
        # Tabular and Text classes are 2 examples of classes that should never have
        # a sniff function.  Since these classes are default classes, they contain
        # few rules to filter out data of other formats, so they should be called
        # from this function after all other datatypes in sniff_order have not been
        # successfully discovered.
        try:
            if datatype.sniff( fname ):
                return datatype.file_ext
        except Exception:
            # Best effort: a missing or failing sniffer must not stop the
            # remaining datatypes from being tried.  Only Exception is caught
            # so that KeyboardInterrupt and SystemExit still propagate.
            pass
    headers = get_headers( fname, None )
    is_binary = False
    if not is_multi_byte:
        # Any character above code point 128 in the leading lines marks the
        # file as binary.
        is_binary = any( ord( c ) > 128
                         for hdr in headers
                         for field in hdr
                         for c in field )
    if is_binary:
        return 'data' #default binary data type file extension
    if is_column_based( fname, '\t', 1, is_multi_byte=is_multi_byte ):
        return 'tabular' #default tabular data type file extension
    return 'txt' #default text data type file extension
---|
315 | |
---|
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module's docstrings.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
---|
319 | |
---|