#!/usr/bin/env python
# Processes uploads from the user.

# WARNING: Changes in this tool (particularly as related to parsing) may need
# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools

import urllib, sys, os, gzip, tempfile, shutil, re, zipfile, codecs, binascii
from galaxy import eggs
# need to import model before sniff to resolve a circular import dependency
import galaxy.model
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import *
from galaxy.datatypes.images import Pdf
from galaxy.datatypes.registry import Registry
from galaxy import util
from galaxy.util.json import *

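# bz2 support is optional: if the module is unavailable, bz2 uploads are
# simply not handled below.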
try:
    import bz2
except ImportError:
    bz2 = None

assert sys.version_info[:2] >= ( 2, 4 )

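# Write an error message to stderr and exit with a non-zero return code.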
def stop_err( msg, ret=1 ):
    sys.stderr.write( msg )
    sys.exit( ret )
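# Report a fatal per-dataset error on the JSON stream and remove the
# temporary upload file; server-side ('server_dir'/'path_paste') files are
# never deleted.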
def file_err( msg, dataset, json_file ):
    json_file.write( to_json_string( dict( type = 'dataset',
                                           ext = 'data',
                                           dataset_id = dataset.dataset_id,
                                           stderr = msg ) ) + "\n" )
    # never remove a server-side upload
    if dataset.type in ( 'server_dir', 'path_paste' ):
        return
    try:
        os.remove( dataset.path )
    except:
        pass
def safe_dict(d):
    """
    Recursively clone json structure with UTF-8 dictionary keys
    http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/
    """
    if isinstance(d, dict):
        return dict( [ ( k.encode('utf-8'), safe_dict(v) ) for k, v in d.iteritems() ] )
    elif isinstance(d, list):
        return [ safe_dict(x) for x in d ]
    else:
        return d
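# Scan a file (or an already-read chunk of one) for HTML content, checking up
# to the first 100 lines for anchor, iframe, frameset, meta or script tags.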
def check_html( temp_name, chunk=None ):
    if chunk is None:
        temp = open( temp_name, "U" )
    else:
        # iterating a raw string yields single characters, so split the
        # chunk into lines before scanning
        temp = chunk.splitlines()
    regexp1 = re.compile( r"<A\s+[^>]*HREF[^>]+>", re.I )
    regexp2 = re.compile( r"<IFRAME[^>]*>", re.I )
    regexp3 = re.compile( r"<FRAMESET[^>]*>", re.I )
    regexp4 = re.compile( r"<META[^>]*>", re.I )
    regexp5 = re.compile( r"<SCRIPT[^>]*>", re.I )
    lineno = 0
    for line in temp:
        lineno += 1
        matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) or regexp5.search( line )
        if matches:
            if chunk is None:
                temp.close()
            return True
        if lineno > 100:
            break
    if chunk is None:
        temp.close()
    return False
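# Crude binary sniff: treat the file as binary if any of the first ~100
# characters has an ordinal value above 128.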
def check_binary( temp_name ):
    is_binary = False
    temp = open( temp_name, "U" )
    chars_read = 0
    for chars in temp:
        for char in chars:
            chars_read += 1
            if ord( char ) > 128:
                is_binary = True
                break
            if chars_read > 100:
                break
        if chars_read > 100:
            break
    temp.close()
    return is_binary
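# Thin wrappers around the datatype sniffers for the binary formats we can
# detect directly.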
def check_bam( temp_name ):
    return Bam().sniff( temp_name )
def check_sff( temp_name ):
    return Sff().sniff( temp_name )
def check_pdf( temp_name ):
    return Pdf().sniff( temp_name )
def check_bigwig( temp_name ):
    return BigWig().sniff( temp_name )
def check_bigbed( temp_name ):
    return BigBed().sniff( temp_name )
def check_gzip( temp_name ):
    # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
    # Make sure we have a gzipped file
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 2 )
        temp.close()
        if magic_check != util.gzip_magic:
            return ( False, False )
    except:
        return ( False, False )
    # We support some binary data types, so check if the compressed binary file is valid
    # If the file is Bam, it should already have been detected as such, so we'll just check
    # for sff format.
    try:
        header = gzip.open( temp_name ).read( 4 )
        if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
            return ( True, True )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
    chunk = gzipped_file.read( CHUNK_SIZE )
    gzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )
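# Like check_gzip, this returns a tuple of booleans ( is_bzipped, is_valid ).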
def check_bz2( temp_name ):
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 3 )
        temp.close()
        if magic_check != util.bz2_magic:
            return ( False, False )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    bzipped_file = bz2.BZ2File( temp_name, mode='rb' )
    chunk = bzipped_file.read( CHUNK_SIZE )
    bzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )
def check_zip( temp_name ):
    if zipfile.is_zipfile( temp_name ):
        return True
    return False
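# Parse output specs of the form "<dataset_id>:<files_path>:<output_path>",
# e.g. (illustrative paths) "1:/tmp/1_files:/tmp/1.dat" becomes
# { 1: ( '/tmp/1.dat', '/tmp/1_files' ) }.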
def parse_outputs( args ):
    rval = {}
    for arg in args:
        id, files_path, path = arg.split( ':', 2 )
        rval[int( id )] = ( path, files_path )
    return rval
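# Handle a single (non-composite) upload: fetch URL pastes to a temp file,
# classify the content (multi-byte text, sniffable binary, gzip/bz2/zip,
# other binary, or plain text), decompress and convert newlines as needed,
# move the result to output_path, and report the outcome on the JSON stream.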
def add_file( dataset, registry, json_file, output_path ):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None

    if dataset.type == 'url':
        try:
            temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists( dataset.path ):
        file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
        return
    if not os.path.getsize( dataset.path ) > 0:
        file_err( 'The uploaded file is empty', dataset, json_file )
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
        except UnicodeDecodeError, e:
            dataset.is_multi_byte = False
    # Is dataset content multi-byte?
    if dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
    # Is dataset content supported sniffable binary?
    elif check_bam( dataset.path ):
        ext = 'bam'
        data_type = 'bam'
    elif check_sff( dataset.path ):
        ext = 'sff'
        data_type = 'sff'
    elif check_pdf( dataset.path ):
        ext = 'pdf'
        data_type = 'pdf'
    elif check_bigwig( dataset.path ):
        ext = 'bigwig'
        data_type = 'bigwig'
    elif check_bigbed( dataset.path ):
        ext = 'bigbed'
        data_type = 'bigbed'
    else:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip( dataset.path )
        if is_gzipped and not is_valid:
            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
            return
        elif is_gzipped and is_valid:
            # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
            CHUNK_SIZE = 2**20 # 1Mb
            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
            gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
            while 1:
                try:
                    chunk = gzipped_file.read( CHUNK_SIZE )
                except IOError:
                    os.close( fd )
                    os.remove( uncompressed )
                    file_err( 'Problem decompressing gzipped data', dataset, json_file )
                    return
                if not chunk:
                    break
                os.write( fd, chunk )
            os.close( fd )
            gzipped_file.close()
            # Replace the gzipped file with the decompressed file
            shutil.move( uncompressed, dataset.path )
            # strip the suffix; rstrip( '.gz' ) would strip a character set, not the suffix
            if dataset.name.endswith( '.gz' ):
                dataset.name = dataset.name[:-len( '.gz' )]
            data_type = 'gzip'
        if not data_type and bz2 is not None:
            # See if we have a bz2 file, much like gzip
            is_bzipped, is_valid = check_bz2( dataset.path )
            if is_bzipped and not is_valid:
                file_err( 'The bz2-compressed uploaded file contains inappropriate content', dataset, json_file )
                return
            elif is_bzipped and is_valid:
                # We need to uncompress the temp_name file
                CHUNK_SIZE = 2**20 # 1Mb
                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
                bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
                while 1:
                    try:
                        chunk = bzipped_file.read( CHUNK_SIZE )
                    except IOError:
                        os.close( fd )
                        os.remove( uncompressed )
                        file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
                        return
                    if not chunk:
                        break
                    os.write( fd, chunk )
                os.close( fd )
                bzipped_file.close()
                # Replace the bzipped file with the decompressed file
                shutil.move( uncompressed, dataset.path )
                if dataset.name.endswith( '.bz2' ):
                    dataset.name = dataset.name[:-len( '.bz2' )]
                data_type = 'bz2'
        if not data_type:
            # See if we have a zip archive
            is_zipped = check_zip( dataset.path )
            if is_zipped:
                CHUNK_SIZE = 2**20 # 1Mb
                uncompressed = None
                uncompressed_name = None
                unzipped = False
                z = zipfile.ZipFile( dataset.path )
                for name in z.namelist():
                    if name.endswith('/'):
                        continue
                    if unzipped:
                        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                        break
                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
                    zipped_file = z.open( name )
                    while 1:
                        try:
                            chunk = zipped_file.read( CHUNK_SIZE )
                        except IOError:
                            os.close( fd )
                            os.remove( uncompressed )
                            file_err( 'Problem decompressing zipped data', dataset, json_file )
                            return
                        if not chunk:
                            break
                        os.write( fd, chunk )
                    os.close( fd )
                    zipped_file.close()
                    uncompressed_name = name
                    unzipped = True
                z.close()
                # Replace the zipped file with the decompressed file
                if uncompressed is not None:
                    shutil.move( uncompressed, dataset.path )
                    dataset.name = uncompressed_name
                data_type = 'zip'
        if not data_type:
            if check_binary( dataset.path ):
                # We have a binary dataset, but it is not Bam, Sff or Pdf
                data_type = 'binary'
                #binary_ok = False
                parts = dataset.name.split( "." )
                if len( parts ) > 1:
                    ext = parts[1].strip().lower()
                    if ext not in unsniffable_binary_formats:
                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
                        return
                    elif ext in unsniffable_binary_formats and dataset.file_type != ext:
                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
                        file_err( err_msg, dataset, json_file )
                        return
        if not data_type:
            # We must have a text file
            if check_html( dataset.path ):
                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
                return
    if data_type != 'binary':
        # don't convert newlines on data we're only going to symlink
        if not dataset.get( 'link_data_only', False ):
            in_place = True
            if dataset.type in ( 'server_dir', 'path_paste' ):
                in_place = False
            if dataset.space_to_tab:
                line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
            else:
                line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext( dataset.path, registry.sniff_order )
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    # Move the dataset to its "real" path
    if dataset.get( 'link_data_only', False ):
        pass # data will remain in place
    elif dataset.type in ( 'server_dir', 'path_paste' ):
        if converted_path is not None:
            shutil.copy( converted_path, output_path )
            try:
                os.remove( converted_path )
            except:
                pass
        else:
            # this should not happen, but it's here just in case
            shutil.copy( dataset.path, output_path )
    else:
        shutil.move( dataset.path, output_path )
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 ext = ext,
                 stdout = stdout,
                 name = dataset.name,
                 line_count = line_count )
    json_file.write( to_json_string( info ) + "\n" )
    # Groom the dataset content if necessary
    datatype = registry.get_datatype_by_extension( ext )
    datatype.groom_dataset_content( output_path )

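# Handle an upload of a composite datatype: stage each declared component
# file into files_path (fetching URLs and converting newlines as needed),
# then move the primary file to output_path and report the outcome.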
def add_composite_file( dataset, registry, json_file, output_path, files_path ):
    if dataset.composite_files:
        os.mkdir( files_path )
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch( **value )
            if dataset.composite_file_paths[ value.name ] is None and not value.optional:
                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
                break
            elif dataset.composite_file_paths[ value.name ] is not None:
                dp = dataset.composite_file_paths[ value.name ][ 'path' ]
                isurl = dp.find( '://' ) != -1 # todo fixme: crude URL detection
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dp ), prefix='url_paste' )
                    except Exception, e:
                        file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
                        sniff.convert_newlines_sep2tabs( dp )
                    else:
                        sniff.convert_newlines( dp )
                shutil.move( dp, os.path.join( files_path, name ) )
    # Move the dataset to its "real" path
    shutil.move( dataset.primary_file, output_path )
    # Write the job info
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 stdout = 'uploaded %s file' % dataset.file_type )
    json_file.write( to_json_string( info ) + "\n" )

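# Example invocation (paths are illustrative only):
#   python upload.py GALAXY_ROOT datatypes_conf.xml params.json \
#       1:/outputs/dataset_1_files:/outputs/dataset_1.dat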
def __main__():

    if len( sys.argv ) < 4:
        print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...'
        sys.exit( 1 )

    output_paths = parse_outputs( sys.argv[4:] )
    json_file = open( 'galaxy.json', 'w' )

    registry = Registry( sys.argv[1], sys.argv[2] )

    for line in open( sys.argv[3], 'r' ):
        dataset = from_json_string( line )
        dataset = util.bunch.Bunch( **safe_dict( dataset ) )
        try:
            output_path = output_paths[int( dataset.dataset_id )][0]
        except:
            print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
            sys.exit( 1 )
        if dataset.type == 'composite':
            files_path = output_paths[int( dataset.dataset_id )][1]
            add_composite_file( dataset, registry, json_file, output_path, files_path )
        else:
            add_file( dataset, registry, json_file, output_path )
    # clean up the JSON paramfile (sys.argv[3] per the usage string, not sys.argv[1])
    try:
        os.remove( sys.argv[3] )
    except:
        pass

if __name__ == '__main__':
    __main__()