| 1 | """ |
|---|
| 2 | File format detector |
|---|
| 3 | """ |
|---|
| 4 | import logging, sys, os, csv, tempfile, shutil, re, zipfile |
|---|
| 5 | import registry |
|---|
| 6 | from galaxy import util |
|---|
| 7 | |
|---|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|---|
| 9 | |
|---|
def get_test_fname(fname):
    """Return the path of 'fname' inside the 'test' directory that sits next to this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, 'test', fname)
|---|
| 15 | |
|---|
def stream_to_file( stream, suffix='', prefix='', dir=None, text=False ):
    """
    Writes a stream to a temporary file and returns a tuple of
    ( temporary file name, is_multi_byte ).

    The first chunk read from the stream is inspected once to decide how
    every chunk is written:

    - data that starts with a zip or gzip signature is written unchanged
      (it must be encoded after it is uncompressed in the upload utility)
    - data for which util.is_multi_byte() is true is written UTF-8 encoded
    - data whose first 100 characters contain a non-ASCII value is treated
      as binary and written unchanged
    - anything else is written UTF-8 encoded

    suffix, prefix, dir and text are passed straight to tempfile.mkstemp().
    If reading or writing fails, the temporary file is removed and the
    exception is re-raised.
    """
    fd, temp_name = tempfile.mkstemp( suffix=suffix, prefix=prefix, dir=dir, text=text )
    CHUNK_SIZE = 1048576
    # Signatures a zip stream can start with: a local file header, or the
    # end-of-central-directory record of an empty archive.
    ZIP_SIGNATURES = ( b'PK\x03\x04', b'PK\x05\x06' )
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    try:
        try:
            while True:
                chunk = stream.read( CHUNK_SIZE )
                if not chunk:
                    break
                if not data_checked:
                    # See if we're uploading a compressed file.  The check has to
                    # look at the data itself: nothing has been written to the
                    # temporary file yet, so testing that file would always fail.
                    # Only byte strings are compared, which also avoids mixed
                    # str/unicode comparisons.
                    # NOTE(review): assumes util.gzip_magic is the two-byte gzip
                    # signature '\x1f\x8b' -- confirm against galaxy.util.
                    if isinstance( chunk, bytes ):
                        if chunk[:4] in ZIP_SIGNATURES or chunk[:2] == util.gzip_magic:
                            is_compressed = True
                    if not is_compressed:
                        # See if we have a multi-byte character file
                        chars = chunk[:100]
                        is_multi_byte = util.is_multi_byte( chars )
                        if not is_multi_byte:
                            # ASCII ends at 127, so 128 is already non-ASCII.
                            is_binary = any( ord( char ) > 127 for char in chars )
                    data_checked = True
                if not is_compressed and not is_binary:
                    os.write( fd, chunk.encode( "utf-8" ) )
                else:
                    # Compressed files must be encoded after they are uncompressed in the upload utility,
                    # while binary files should not be encoded at all.
                    os.write( fd, chunk )
        finally:
            # Always release the descriptor, even when the stream fails midway.
            os.close( fd )
    except Exception:
        # The caller never receives temp_name on failure, so clean up here.
        os.remove( temp_name )
        raise
    return temp_name, is_multi_byte
|---|
| 56 | |
|---|
def check_newlines( fname, bytes_to_read=52428800 ):
    """
    Determines if there are any non-POSIX newlines in the first
    bytes_to_read bytes (by default, 50MB) of the file.

    Returns True as soon as a carriage return is found, False if none is
    found within the first bytes_to_read bytes or before the end of the file.
    """
    CHUNK_SIZE = 2 ** 20
    remaining = bytes_to_read
    # Binary mode: the raw bytes are needed, text mode may translate '\r' away.
    with open( fname, 'rb' ) as f:
        while remaining > 0:
            # Never read past the requested limit.
            chunk = f.read( min( CHUNK_SIZE, remaining ) )
            if not chunk:
                break
            if b'\r' in chunk:
                return True
            remaining -= len( chunk )
    return False
|---|
| 72 | |
|---|
def convert_newlines( fname, in_place=True ):
    """
    Converts a file from universal line endings ('\\r', '\\n' or '\\r\\n')
    to Posix line endings.  Every output line, including the last one,
    ends with a single newline.

    Returns ( number of lines, None ) when in_place is true, in which case
    fname is replaced by the converted file.  Otherwise fname is left
    untouched and ( number of lines, name of the converted temporary file )
    is returned.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("1 2\\r3 4")
    >>> convert_newlines(fname)
    (2, None)
    >>> file(fname).read()
    '1 2\\n3 4\\n'
    """
    import io
    fd, temp_name = tempfile.mkstemp()
    line_count = 0
    try:
        with os.fdopen( fd, "wb" ) as out:
            # newline=None enables universal newline handling; latin-1 maps
            # every byte to one character and back, so the content passes
            # through unchanged whatever its real encoding is.
            with io.open( fname, "r", encoding="latin-1", newline=None ) as src:
                for line in src:
                    out.write( line.rstrip( u"\r\n" ).encode( "latin-1" ) + b"\n" )
                    line_count += 1
    except Exception:
        # Do not leave a half-written temporary file behind.
        os.remove( temp_name )
        raise
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
|---|
| 96 | |
|---|
def sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Transforms a 'sep' separated file to a tab separated one.  Each line is
    stripped of its line ending, split on the regular expression 'patt' and
    written back joined by tabs and terminated by a newline.

    Returns ( number of lines, None ) when in_place is true, in which case
    fname is replaced by the converted file.  Otherwise fname is left
    untouched and ( number of lines, name of the converted temporary file )
    is returned.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("1 2\\n3 4\\n")
    >>> sep2tabs(fname)
    (2, None)
    >>> file(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    # The file is processed as bytes, so the pattern has to be a byte pattern
    # as well.  This keeps '\\s' limited to ASCII whitespace and never splits
    # inside a multi-byte character.
    if not isinstance( patt, bytes ):
        patt = patt.encode( 'utf-8' )
    regexp = re.compile( patt )
    fd, temp_name = tempfile.mkstemp()
    line_count = 0
    try:
        with os.fdopen( fd, "wb" ) as out:
            with open( fname, "rb" ) as src:
                for line in src:
                    elems = regexp.split( line.rstrip( b'\r\n' ) )
                    out.write( b'\t'.join( elems ) + b'\n' )
                    line_count += 1
    except Exception:
        # Do not leave a half-written temporary file behind.
        os.remove( temp_name )
        raise
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
|---|
| 122 | |
|---|
def convert_newlines_sep2tabs( fname, in_place=True, patt="\\s+" ):
    """
    Combines convert_newlines() and sep2tabs() so that files do not need
    to be read twice: universal line endings are converted to Posix line
    endings and each line is split on the regular expression 'patt' and
    re-joined with tabs.

    Returns ( number of lines, None ) when in_place is true, in which case
    fname is replaced by the converted file.  Otherwise fname is left
    untouched and ( number of lines, name of the converted temporary file )
    is returned.  An empty file yields a line count of 0.

    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("1 2\\r3 4")
    >>> convert_newlines_sep2tabs(fname)
    (2, None)
    >>> file(fname).read()
    '1\\t2\\n3\\t4\\n'
    """
    import io
    # Lines are split as bytes, so the pattern has to be a byte pattern.
    # This keeps '\\s' limited to ASCII whitespace and never splits inside
    # a multi-byte character.
    if not isinstance( patt, bytes ):
        patt = patt.encode( 'utf-8' )
    regexp = re.compile( patt )
    fd, temp_name = tempfile.mkstemp()
    line_count = 0
    try:
        with os.fdopen( fd, "wb" ) as out:
            # newline=None enables universal newline handling; latin-1 maps
            # every byte to one character and back, so encoding the line
            # again restores the original bytes exactly.
            with io.open( fname, "r", encoding="latin-1", newline=None ) as src:
                for line in src:
                    raw = line.rstrip( u'\r\n' ).encode( 'latin-1' )
                    out.write( b'\t'.join( regexp.split( raw ) ) + b'\n' )
                    line_count += 1
    except Exception:
        # Do not leave a half-written temporary file behind.
        os.remove( temp_name )
        raise
    if in_place:
        shutil.move( temp_name, fname )
        # Return number of lines in file.
        return ( line_count, None )
    return ( line_count, temp_name )
|---|
| 149 | |
|---|
def get_headers( fname, sep, count=60, is_multi_byte=False ):
    """
    Returns a list with the first 'count' lines split by 'sep'

    Line endings are stripped before splitting.  A 'sep' of None splits on
    runs of whitespace.  When is_multi_byte is true each line is decoded as
    UTF-8 before it is split; otherwise the line is split as raw bytes and
    the fields are returned as native strings.  A negative 'count' disables
    the limit.

    >>> fname = get_test_fname('complete.bed')
    >>> get_headers(fname,'\\t')
    [['chr7', '127475281', '127491632', 'NM_000230', '0', '+', '127486022', '127488767', '0', '3', '29,172,3225,', '0,10713,13126,'], ['chr7', '127486011', '127488900', 'D49487', '0', '+', '127486022', '127488767', '0', '2', '155,490,', '0,2399']]
    """
    # Bring the separator to the type of the text it will split, once, and
    # leave None (split on whitespace) alone.
    if sep is not None:
        if is_multi_byte and isinstance( sep, bytes ):
            sep = sep.decode( 'utf-8' )
        elif not is_multi_byte and not isinstance( sep, bytes ):
            sep = sep.encode( 'utf-8' )
    # True where the native string type is the byte string (Python 2).
    native_bytes = str is bytes
    headers = []
    with open( fname, 'rb' ) as src:
        for raw in src:
            # Stop once 'count' lines were collected, not one line later.
            if len( headers ) == count:
                break
            line = raw.rstrip( b'\n\r' )
            if is_multi_byte:
                fields = line.decode( 'utf-8' ).split( sep )
            else:
                fields = line.split( sep )
                if not native_bytes:
                    # latin-1 keeps one character per byte, so ord() of a
                    # character still equals the value of the original byte.
                    fields = [ field.decode( 'latin-1' ) for field in fields ]
            headers.append( fields )
    return headers
|---|
| 169 | |
|---|
def is_column_based( fname, sep='\t', skip=0, is_multi_byte=False ):
    """
    Tells whether the file looks column based for the given separator
    (a tab unless stated otherwise).

    The first 'skip' header lines are ignored, as are lines whose first
    field is empty or starts with '#'.  The first remaining line must have
    at least two fields and every other remaining line must have exactly
    as many.

    >>> fname = get_test_fname('test.gff')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab.bed')
    >>> is_column_based(fname)
    True
    >>> is_column_based(fname, sep=' ')
    False
    >>> fname = get_test_fname('test_space.txt')
    >>> is_column_based(fname)
    False
    >>> is_column_based(fname, sep=' ')
    True
    >>> fname = get_test_fname('test_ensembl.tab')
    >>> is_column_based(fname)
    True
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname, sep=' ', skip=0)
    False
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> is_column_based(fname)
    True
    """
    rows = get_headers( fname, sep, is_multi_byte=is_multi_byte )
    if not rows:
        return False
    # Keep only the lines that carry data: non-empty first field, no comment.
    data_rows = [ row for row in rows[skip:]
                  if row and row[0] and not row[0].startswith( '#' ) ]
    if not data_rows:
        return False
    # The first data line fixes the number of columns all others must match.
    width = len( data_rows[0] )
    if width < 2:
        return False
    return all( len( row ) == width for row in data_rows )
|---|
| 214 | |
|---|
def guess_ext( fname, sniff_order=None, is_multi_byte=False ):
    """
    Returns an extension that can be used in the datatype factory to
    generate a data for the 'fname' file

    Every datatype in 'sniff_order' (by default the sniff order of a newly
    created registry) is asked to sniff the file and the extension of the
    first one that recognises it is returned.  If none does, the result is
    'data' when the file is not multi-byte and its first lines contain a
    non-ASCII character, 'tabular' when the file is tab separated and
    column based, and 'txt' otherwise.

    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> guess_ext(fname)
    'blastxml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname)
    'interval'
    >>> fname = get_test_fname('interval1.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('test_tab.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('sequence.maf')
    >>> guess_ext(fname)
    'maf'
    >>> fname = get_test_fname('sequence.fasta')
    >>> guess_ext(fname)
    'fasta'
    >>> fname = get_test_fname('file.html')
    >>> guess_ext(fname)
    'html'
    >>> fname = get_test_fname('test.gtf')
    >>> guess_ext(fname)
    'gtf'
    >>> fname = get_test_fname('test.gff')
    >>> guess_ext(fname)
    'gff'
    >>> fname = get_test_fname('gff_version_3.gff')
    >>> guess_ext(fname)
    'gff3'
    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("a\\t2\\nc\\t1\\nd\\t0")
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('temp.txt')
    >>> file(fname, 'wt').write("a 1 2 x\\nb 3 4 y\\nc 5 6 z")
    >>> guess_ext(fname)
    'txt'
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('alignment.lav')
    >>> guess_ext(fname)
    'lav'
    >>> fname = get_test_fname('1.sff')
    >>> guess_ext(fname)
    'sff'
    >>> fname = get_test_fname('1.bam')
    >>> guess_ext(fname)
    'bam'
    >>> fname = get_test_fname('3.bam')
    >>> guess_ext(fname)
    'bam'
    """
    if sniff_order is None:
        datatypes_registry = registry.Registry()
        sniff_order = datatypes_registry.sniff_order
    # Some classes may not have a sniff function, which is ok.  In fact, the
    # Tabular and Text classes are 2 examples of classes that should never have
    # a sniff function.  Since these classes are default classes, they contain
    # few rules to filter out data of other formats, so they should be called
    # from this function after all other datatypes in sniff_order have not been
    # successfully discovered.
    for datatype in sniff_order:
        try:
            if datatype.sniff( fname ):
                return datatype.file_ext
        except Exception:
            # A missing or failing sniffer only means "not this datatype".
            # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
            # propagate.
            log.debug( "Sniffing %s with %r failed", fname, datatype, exc_info=True )
    if not is_multi_byte:
        # Any character above the ASCII range (0-127) marks the file as binary.
        headers = get_headers( fname, None )
        if any( ord( c ) > 127 for hdr in headers for field in hdr for c in field ):
            return 'data' #default binary data type file extension
    if is_column_based( fname, '\t', 1, is_multi_byte=is_multi_byte ):
        return 'tabular' #default tabular data type file extension
    return 'txt' #default text data type file extension
|---|
| 315 | |
|---|
if __name__ == '__main__':
    # Run directly: execute the doctests embedded in this module's docstrings.
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
|---|
| 319 | |
|---|