[2] | 1 | """ |
---|
| 2 | File format detector |
---|
| 3 | """ |
---|
| 4 | import logging, sys, os, csv, tempfile, shutil, re, zipfile |
---|
| 5 | import registry |
---|
| 6 | from galaxy import util |
---|
| 7 | |
---|
| 8 | log = logging.getLogger(__name__) |
---|
| 9 | |
---|
def get_test_fname(fname):
    """
    Returns the path of the test data file 'fname', which is looked up
    in the 'test' directory located next to this module.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, 'test', fname)
---|
| 15 | |
---|
def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ):
    """
    Writes a stream to a temporary file and returns the tuple
    ( temporary file name, is_multi_byte ).

    The first chunk read from the stream is inspected once to decide how
    all of the data is written:

    - compressed data (zip or gzip signature at the start of the chunk) and
      binary data (a character code above 128 among the first 100
      characters) are written unchanged;
    - anything else is encoded as UTF-8 before being written.

    'suffix', 'prefix', 'dir' and 'text' are handed to tempfile.mkstemp().
    The temporary file is left on disk for the caller.
    """
    # Signatures that can start a zip archive: a local file header, or the
    # end-of-central-directory record of an empty archive.
    ZIP_MAGIC = ( b'PK\x03\x04', b'PK\x05\x06' )
    CHUNK_SIZE = 1048576
    fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text )
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    try:
        while 1:
            chunk = stream.read( CHUNK_SIZE )
            if not chunk:
                break
            if not data_checked:
                # See if we're uploading a compressed file.  Nothing has been
                # written to the temporary file yet, so the signature has to be
                # looked for in the data itself rather than in the file.
                if chunk[:4] in ZIP_MAGIC:
                    is_compressed = True
                else:
                    try:
                        if unicode( chunk[:2] ) == unicode( util.gzip_magic ):
                            is_compressed = True
                    except Exception:
                        # Best effort: data that cannot be converted for the
                        # comparison is simply not treated as gzip.
                        pass
                if not is_compressed:
                    # See if we have a multi-byte character file
                    chars = chunk[:100]
                    is_multi_byte = util.is_multi_byte( chars )
                    if not is_multi_byte:
                        for char in chars:
                            if ord( char ) > 128:
                                is_binary = True
                                break
                data_checked = True
            if not is_compressed and not is_binary:
                os.write( fd, chunk.encode( "utf-8" ) )
            else:
                # Compressed files must be encoded after they are uncompressed in the upload utility,
                # while binary files should not be encoded at all.
                os.write( fd, chunk )
    finally:
        # Release the descriptor even when reading or writing fails.
        os.close( fd )
    return temp_name, is_multi_byte
---|
| 56 | |
---|
def check_newlines( fname, bytes_to_read=52428800 ):
    """
    Determines if there are any non-POSIX newlines in the first
    bytes_to_read (by default, 50MB) bytes of the file.

    Returns True as soon as a carriage return is found, False otherwise.
    """
    CHUNK_SIZE = 2 ** 20
    remaining = bytes_to_read
    # Binary mode, so that no newline translation can hide a carriage return.
    with open( fname, 'rb' ) as handle:
        while remaining > 0:
            chunk = handle.read( min( CHUNK_SIZE, remaining ) )
            if not chunk:
                break
            if b'\r' in chunk:
                return True
            remaining -= len( chunk )
    return False
---|
| 72 | |
---|
def convert_newlines( fname, in_place=True ):
    """
    Converts a file from universal line endings ('\\n', '\\r' or '\\r\\n')
    to Posix line endings; the last line always ends with a newline.

    The result is written to a temporary file.  With in_place=True that file
    replaces 'fname' and ( number of lines, None ) is returned, otherwise
    'fname' is left alone and ( number of lines, temporary file name ) is
    returned.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> with open(fname, 'wt') as fh:
    ...     _ = fh.write("1 2\\r3 4")
    >>> convert_newlines(fname)
    (2, None)
    >>> open(fname).read()
    '1 2\\n3 4\\n'
    """
    CHUNK_SIZE = 2 ** 20
    fd, temp_name = tempfile.mkstemp()
    line_count = 0
    pending_cr = False  # the previous chunk ended with a '\r' that was held back
    open_line = False   # data was written after the last newline
    # Both files are handled as bytes: the content is never decoded, and no
    # platform newline translation can undo the conversion on output.
    with os.fdopen( fd, "wb" ) as out:
        with open( fname, "rb" ) as src:
            while True:
                chunk = src.read( CHUNK_SIZE )
                if not chunk:
                    break
                if pending_cr:
                    chunk = b"\r" + chunk
                # A trailing '\r' may be the first half of a '\r\n' pair whose
                # '\n' opens the next chunk, so it is decided on later.
                pending_cr = chunk.endswith( b"\r" )
                if pending_cr:
                    chunk = chunk[:-1]
                if not chunk:
                    continue
                chunk = chunk.replace( b"\r\n", b"\n" ).replace( b"\r", b"\n" )
                line_count += chunk.count( b"\n" )
                open_line = not chunk.endswith( b"\n" )
                out.write( chunk )
        if pending_cr or open_line:
            # Terminate the last line: either a held back '\r' ends it, or
            # the file stopped without a line ending.
            out.write( b"\n" )
            line_count += 1
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
---|
| 96 | |
---|
def sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Transforms a 'sep' separated file to a tab separated one, where 'patt'
    is the regular expression matching the separator.

    The result is written to a temporary file.  With in_place=True that file
    replaces 'fname' and ( number of lines, None ) is returned, otherwise
    'fname' is left alone and ( number of lines, temporary file name ) is
    returned.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> with open(fname, 'wt') as fh:
    ...     _ = fh.write("1 2\\n3 4\\n")
    >>> sep2tabs(fname)
    (2, None)
    >>> open(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    # Lines are handled as bytes so that the content is never decoded, which
    # means a text pattern has to be turned into a byte pattern as well.
    if isinstance( patt, str ) and not isinstance( patt, bytes ):
        patt = patt.encode( "utf-8" )
    regexp = re.compile( patt )
    fd, temp_name = tempfile.mkstemp()
    line_count = 0
    with os.fdopen( fd, "wb" ) as out:
        with open( fname, "rb" ) as src:
            for line in src:
                elems = regexp.split( line.rstrip( b"\r\n" ) )
                out.write( b"\t".join( elems ) + b"\n" )
                line_count += 1
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
---|
| 122 | |
---|
def _iter_universal_lines( fname, chunk_size=2 ** 20 ):
    """
    Yields the lines of 'fname' as byte strings without their terminator,
    where '\\n', '\\r' and '\\r\\n' all end a line.
    """
    tail = b""
    with open( fname, "rb" ) as src:
        while True:
            chunk = src.read( chunk_size )
            if not chunk:
                break
            data = tail + chunk
            # Hold back a trailing '\r': the '\n' completing a '\r\n' pair
            # may be the first byte of the next chunk.
            held = b""
            if data.endswith( b"\r" ):
                data, held = data[:-1], b"\r"
            pieces = data.replace( b"\r\n", b"\n" ).replace( b"\r", b"\n" ).split( b"\n" )
            # The last piece is an unfinished line, kept for the next round.
            tail = pieces.pop() + held
            for piece in pieces:
                yield piece
    if tail.endswith( b"\r" ):
        yield tail[:-1]
    elif tail:
        yield tail

def convert_newlines_sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Combines convert_newlines() and sep2tabs() so that files do not need
    to be read twice: line endings become Posix line endings and every
    match of the regular expression 'patt' becomes a tab.

    The result is written to a temporary file.  With in_place=True that file
    replaces 'fname' and ( number of lines, None ) is returned, otherwise
    'fname' is left alone and ( number of lines, temporary file name ) is
    returned.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> with open(fname, 'wt') as fh:
    ...     _ = fh.write("1 2\\r3 4")
    >>> convert_newlines_sep2tabs(fname)
    (2, None)
    >>> open(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    # Lines are handled as bytes so that the content is never decoded, which
    # means a text pattern has to be turned into a byte pattern as well.
    if isinstance( patt, str ) and not isinstance( patt, bytes ):
        patt = patt.encode( "utf-8" )
    regexp = re.compile( patt )
    fd, temp_name = tempfile.mkstemp()
    line_count = 0
    with os.fdopen( fd, "wb" ) as out:
        for line in _iter_universal_lines( fname ):
            out.write( b"\t".join( regexp.split( line ) ) + b"\n" )
            line_count += 1
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
---|
| 149 | |
---|
def get_headers( fname, sep, count=60, is_multi_byte=False ):
    """
    Returns a list with the first 'count' lines split by 'sep'

    Each line is stripped of its line ending before being split.  A 'sep'
    of None splits on runs of whitespace, and a 'count' of None returns
    every line of the file.  With is_multi_byte=True, lines that were read
    as byte strings are decoded as UTF-8 before being split.

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname,'\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    """
    headers = []
    with open( fname ) as handle:
        for idx, line in enumerate( handle ):
            # Stop before reading a line beyond the requested number.
            if count is not None and idx >= count:
                break
            line = line.rstrip( '\n\r' )
            if is_multi_byte and isinstance( line, bytes ):
                line = line.decode( 'utf-8' )
            headers.append( line.split( sep ) )
    return headers
---|
| 169 | |
---|
def is_column_based( fname, sep='\t', skip=0, is_multi_byte=False ):
    """
    Tells whether the file is organised in columns delimited by 'sep'
    (a tab unless stated otherwise).

    The first 'skip' header lines are ignored, as are lines whose first
    field is empty or starts with '#'.  The file is column based when the
    first remaining line has at least two fields and every other remaining
    line has that same number of fields.

    >>> fname = get_test_fname('test.gff')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab.bed')
    >>> is_column_based(fname)
    True
    >>> is_column_based(fname, sep=' ')
    False
    >>> fname = get_test_fname('test_space.txt')
    >>> is_column_based(fname)
    False
    >>> is_column_based(fname, sep=' ')
    True
    >>> fname = get_test_fname('test_ensembl.tab')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname, sep=' ', skip=0)
    False
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname)
    True
    """
    all_rows = get_headers( fname, sep, is_multi_byte=is_multi_byte )
    # Only lines carrying data take part in the decision.
    data_rows = [ row for row in all_rows[skip:]
                  if row and row[0] and not row[0].startswith('#') ]
    if not data_rows:
        return False
    width = len( data_rows[0] )
    if width < 2:
        return False
    return all( len( row ) == width for row in data_rows )
---|
| 214 | |
---|
def guess_ext( fname, sniff_order=None, is_multi_byte=False ):
    """
    Returns an extension that can be used in the datatype factory to
    generate a data for the 'fname' file

    Every datatype of 'sniff_order' (taken from a new datatypes registry
    when not given) is asked in turn to sniff the file, and the extension
    of the first one that recognises it is returned.  When none does, the
    result is 'data' if a character code above 128 is found in the first
    lines of a file that is not multi-byte, 'tabular' if the file is tab
    separated, and 'txt' otherwise.

    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> guess_ext(fname)
    'blastxml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname)
    'interval'
    >>> fname = get_test_fname('interval1.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('test_tab.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('sequence.maf')
    >>> guess_ext(fname)
    'maf'
    >>> fname = get_test_fname('sequence.fasta')
    >>> guess_ext(fname)
    'fasta'
    >>> fname = get_test_fname('file.html')
    >>> guess_ext(fname)
    'html'
    >>> fname = get_test_fname('test.gtf')
    >>> guess_ext(fname)
    'gtf'
    >>> fname = get_test_fname('test.gff')
    >>> guess_ext(fname)
    'gff'
    >>> fname = get_test_fname('gff_version_3.gff')
    >>> guess_ext(fname)
    'gff3'
    >>> fname = get_test_fname('temp.txt')
    >>> with open(fname, 'wt') as fh:
    ...     _ = fh.write("a\\t2\\nc\\t1\\nd\\t0")
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('temp.txt')
    >>> with open(fname, 'wt') as fh:
    ...     _ = fh.write("a 1 2 x\\nb 3 4 y\\nc 5 6 z")
    >>> guess_ext(fname)
    'txt'
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('alignment.lav')
    >>> guess_ext(fname)
    'lav'
    >>> fname = get_test_fname('1.sff')
    >>> guess_ext(fname)
    'sff'
    >>> fname = get_test_fname('1.bam')
    >>> guess_ext(fname)
    'bam'
    >>> fname = get_test_fname('3.bam')
    >>> guess_ext(fname)
    'bam'
    """
    if sniff_order is None:
        datatypes_registry = registry.Registry()
        sniff_order = datatypes_registry.sniff_order
    for datatype in sniff_order:
        # Some classes may not have a sniff function, which is ok.  In fact, the
        # Tabular and Text classes are 2 examples of classes that should never have
        # a sniff function.  Since these classes are default classes, they contain
        # few rules to filter out data of other formats, so they should be called
        # from this function after all other datatypes in sniff_order have not been
        # successfully discovered.
        try:
            if datatype.sniff( fname ):
                return datatype.file_ext
        except Exception:
            # A sniffer that is missing or fails only means "not this
            # datatype"; interrupts and interpreter exits still propagate.
            pass
    headers = get_headers( fname, None )
    if not is_multi_byte:
        for hdr in headers:
            for token in hdr:
                for c in token:
                    if ord( c ) > 128:
                        return 'data' #default binary data type file extension
    if is_column_based( fname, '\t', 1, is_multi_byte=is_multi_byte ):
        return 'tabular' #default tabular data type file extension
    return 'txt' #default text data type file extension
---|
| 315 | |
---|
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod( this_module )
---|
| 319 | |
---|