#!/usr/bin/env python
# Processes uploads from the user.

# WARNING: Changes in this tool (particularly as related to parsing) may need
# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools

import urllib, sys, os, gzip, tempfile, shutil, re, zipfile, codecs, binascii
from galaxy import eggs
# need to import model before sniff to resolve a circular import dependency
import galaxy.model
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import *
from galaxy.datatypes.images import Pdf
from galaxy.datatypes.registry import Registry
from galaxy import util
from galaxy.util.json import *

try:
    import bz2
except:
    bz2 = None

assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg, ret=1 ):
    sys.stderr.write( msg )
    sys.exit( ret )
def file_err( msg, dataset, json_file ):
    json_file.write( to_json_string( dict( type = 'dataset',
                                           ext = 'data',
                                           dataset_id = dataset.dataset_id,
                                           stderr = msg ) ) + "\n" )
    # never remove a server-side upload
    if dataset.type in ( 'server_dir', 'path_paste' ):
        return
    try:
        os.remove( dataset.path )
    except:
        pass
def safe_dict(d):
    """
    Recursively clone json structure with UTF-8 dictionary keys
    http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/
    """
    if isinstance(d, dict):
        return dict([(k.encode('utf-8'), safe_dict(v)) for k,v in d.iteritems()])
    elif isinstance(d, list):
        return [safe_dict(x) for x in d]
    else:
        return d
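# Illustration (not part of the upload flow): safe_dict( { u'name': u'reads.txt' } )
# returns { 'name': u'reads.txt' } -- keys become byte strings so the result can be
# splatted into util.bunch.Bunch( **d ) under Python 2, as done in __main__ below.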
def check_html( temp_name, chunk=None ):
    if chunk is None:
        temp = open(temp_name, "U")
    elif hasattr( chunk, 'splitlines' ):
        # a raw string chunk (e.g. read from a compressed file) would otherwise be
        # iterated character by character, so split it into lines first
        temp = chunk.splitlines()
    else:
        temp = chunk
    regexp1 = re.compile( "<A\s+[^>]*HREF[^>]+>", re.I )
    regexp2 = re.compile( "<IFRAME[^>]*>", re.I )
    regexp3 = re.compile( "<FRAMESET[^>]*>", re.I )
    regexp4 = re.compile( "<META[^>]*>", re.I )
    regexp5 = re.compile( "<SCRIPT[^>]*>", re.I )
    lineno = 0
    for line in temp:
        lineno += 1
        matches = regexp1.search( line ) or regexp2.search( line ) or regexp3.search( line ) or regexp4.search( line ) or regexp5.search( line )
        if matches:
            if chunk is None:
                temp.close()
            return True
        if lineno > 100:
            break
    if chunk is None:
        temp.close()
    return False
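# Note: check_html is a heuristic -- it only scans the first ~100 lines for anchor,
# iframe, frameset, meta and script tags, so it flags likely HTML rather than
# proving that a file is (or is not) HTML.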
def check_binary( temp_name ):
    is_binary = False
    temp = open( temp_name, "U" )
    chars_read = 0
    for chars in temp:
        for char in chars:
            chars_read += 1
            if ord( char ) > 128:
                is_binary = True
                break
            if chars_read > 100:
                break
        if chars_read > 100:
            break
    temp.close()
    return is_binary
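# Note: check_binary is likewise a heuristic; it inspects roughly the first 100
# characters and treats any byte value above 128 as evidence of binary content,
# so purely ASCII binary formats will not be caught here.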
def check_bam( temp_name ):
    return Bam().sniff( temp_name )
def check_sff( temp_name ):
    return Sff().sniff( temp_name )
def check_pdf( temp_name ):
    return Pdf().sniff( temp_name )
def check_bigwig( temp_name ):
    return BigWig().sniff( temp_name )
def check_bigbed( temp_name ):
    return BigBed().sniff( temp_name )
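# These thin wrappers delegate to the datatype classes' own sniff() methods
# (Bam, Sff, BigWig and BigBed come in via the galaxy.datatypes.binary star
# import above; Pdf is imported from galaxy.datatypes.images).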
def check_gzip( temp_name ):
    # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
    # Make sure we have a gzipped file
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 2 )
        temp.close()
        if magic_check != util.gzip_magic:
            return ( False, False )
    except:
        return ( False, False )
    # We support some binary data types, so check if the compressed binary file is valid
    # If the file is Bam, it should already have been detected as such, so we'll just check
    # for sff format.
    try:
        header = gzip.open( temp_name ).read(4)
        if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
            return ( True, True )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
    chunk = gzipped_file.read( CHUNK_SIZE )
    gzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )
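# For reference: gzip streams begin with the two magic bytes 0x1f 0x8b, which is
# what util.gzip_magic is expected to hold for the comparison above.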
def check_bz2( temp_name ):
    try:
        temp = open( temp_name, "U" )
        magic_check = temp.read( 3 )
        temp.close()
        if magic_check != util.bz2_magic:
            return ( False, False )
    except:
        return ( False, False )
    CHUNK_SIZE = 2**15 # 32Kb
    bzipped_file = bz2.BZ2File( temp_name, mode='rb' )
    chunk = bzipped_file.read( CHUNK_SIZE )
    bzipped_file.close()
    # See if we have a compressed HTML file
    if check_html( temp_name, chunk=chunk ):
        return ( True, False )
    return ( True, True )
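# For reference: bzip2 streams begin with the three ASCII bytes 'BZh', which is
# presumably what util.bz2_magic holds (hence the three-byte read above).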
def check_zip( temp_name ):
    if zipfile.is_zipfile( temp_name ):
        return True
    return False
def parse_outputs( args ):
    rval = {}
    for arg in args:
        id, files_path, path = arg.split( ':', 2 )
        rval[int( id )] = ( path, files_path )
    return rval
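# Example with hypothetical paths: the output spec '42:/job/extra_files:/job/dataset_42.dat'
# is parsed into { 42: ( '/job/dataset_42.dat', '/job/extra_files' ) }, i.e. ( path, files_path ).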
def add_file( dataset, registry, json_file, output_path ):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None

    if dataset.type == 'url':
        try:
            temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists( dataset.path ):
        file_err( 'Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file )
        return
    if not os.path.getsize( dataset.path ) > 0:
        file_err( 'The uploaded file is empty', dataset, json_file )
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
        except UnicodeDecodeError, e:
            dataset.is_multi_byte = False
    # Is dataset content multi-byte?
    if dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
    # Is dataset content supported sniffable binary?
    elif check_bam( dataset.path ):
        ext = 'bam'
        data_type = 'bam'
    elif check_sff( dataset.path ):
        ext = 'sff'
        data_type = 'sff'
    elif check_pdf( dataset.path ):
        ext = 'pdf'
        data_type = 'pdf'
    elif check_bigwig( dataset.path ):
        ext = 'bigwig'
        data_type = 'bigwig'
    elif check_bigbed( dataset.path ):
        ext = 'bigbed'
        data_type = 'bigbed'
    else:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip( dataset.path )
        if is_gzipped and not is_valid:
            file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file )
            return
        elif is_gzipped and is_valid:
            # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
            CHUNK_SIZE = 2**20 # 1Mb
            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
            gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
            while 1:
                try:
                    chunk = gzipped_file.read( CHUNK_SIZE )
                except IOError:
                    os.close( fd )
                    os.remove( uncompressed )
                    file_err( 'Problem decompressing gzipped data', dataset, json_file )
                    return
                if not chunk:
                    break
                os.write( fd, chunk )
            os.close( fd )
            gzipped_file.close()
            # Replace the gzipped file with the decompressed file
            shutil.move( uncompressed, dataset.path )
            # strip the '.gz' suffix (rstrip would strip any trailing '.', 'g' or 'z' characters)
            if dataset.name.endswith( '.gz' ):
                dataset.name = dataset.name[ :-len( '.gz' ) ]
            data_type = 'gzip'
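        # The decompression above streams 1 MB chunks through a temp file and then
        # replaces the upload in place, so large gzip uploads are never read fully
        # into memory; the bz2 branch below follows the same pattern.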
        if not data_type and bz2 is not None:
            # See if we have a bz2 file, much like gzip
            is_bzipped, is_valid = check_bz2( dataset.path )
            if is_bzipped and not is_valid:
                file_err( 'The bz2 compressed uploaded file contains inappropriate content', dataset, json_file )
                return
            elif is_bzipped and is_valid:
                # We need to uncompress the temp_name file
                CHUNK_SIZE = 2**20 # 1Mb
                fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
                bzipped_file = bz2.BZ2File( dataset.path, 'rb' )
                while 1:
                    try:
                        chunk = bzipped_file.read( CHUNK_SIZE )
                    except IOError:
                        os.close( fd )
                        os.remove( uncompressed )
                        file_err( 'Problem decompressing bz2 compressed data', dataset, json_file )
                        return
                    if not chunk:
                        break
                    os.write( fd, chunk )
                os.close( fd )
                bzipped_file.close()
                # Replace the bzipped file with the decompressed file
                shutil.move( uncompressed, dataset.path )
                # strip the '.bz2' suffix (rstrip would strip any trailing '.', 'b', 'z' or '2' characters)
                if dataset.name.endswith( '.bz2' ):
                    dataset.name = dataset.name[ :-len( '.bz2' ) ]
                data_type = 'bz2'
        if not data_type:
            # See if we have a zip archive
            is_zipped = check_zip( dataset.path )
            if is_zipped:
                CHUNK_SIZE = 2**20 # 1Mb
                uncompressed = None
                uncompressed_name = None
                unzipped = False
                z = zipfile.ZipFile( dataset.path )
                for name in z.namelist():
                    if name.endswith('/'):
                        continue
                    if unzipped:
                        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                        break
                    fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
                    zipped_file = z.open( name )
                    while 1:
                        try:
                            chunk = zipped_file.read( CHUNK_SIZE )
                        except IOError:
                            os.close( fd )
                            os.remove( uncompressed )
                            file_err( 'Problem decompressing zipped data', dataset, json_file )
                            return
                        if not chunk:
                            break
                        os.write( fd, chunk )
                    os.close( fd )
                    zipped_file.close()
                    uncompressed_name = name
                    unzipped = True
                z.close()
                # Replace the zipped file with the decompressed file
                if uncompressed is not None:
                    shutil.move( uncompressed, dataset.path )
                    dataset.name = uncompressed_name
                data_type = 'zip'
        if not data_type:
            if check_binary( dataset.path ):
                # We have a binary dataset, but it is not Bam, Sff or Pdf
                data_type = 'binary'
                #binary_ok = False
                parts = dataset.name.split( "." )
                if len( parts ) > 1:
                    ext = parts[1].strip().lower()
                    if ext not in unsniffable_binary_formats:
                        file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
                        return
                    elif ext in unsniffable_binary_formats and dataset.file_type != ext:
                        err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
                        file_err( err_msg, dataset, json_file )
                        return
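        # unsniffable_binary_formats is not defined in this file; it is assumed to come
        # from the galaxy.datatypes.binary star import at the top, listing binary
        # extensions that are allowed but cannot be detected by sniffing.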
        if not data_type:
            # We must have a text file
            if check_html( dataset.path ):
                file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
                return
        if data_type != 'binary':
            # don't convert newlines on data we're only going to symlink
            if not dataset.get( 'link_data_only', False ):
                in_place = True
                if dataset.type in ( 'server_dir', 'path_paste' ):
                    in_place = False
                if dataset.space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
                else:
                    line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
            if dataset.file_type == 'auto':
                ext = sniff.guess_ext( dataset.path, registry.sniff_order )
            else:
                ext = dataset.file_type
            data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    # Move the dataset to its "real" path
    if dataset.get( 'link_data_only', False ):
        pass # data will remain in place
    elif dataset.type in ( 'server_dir', 'path_paste' ):
        if converted_path is not None:
            shutil.copy( converted_path, output_path )
            try:
                os.remove( converted_path )
            except:
                pass
        else:
            # this should not happen, but it's here just in case
            shutil.copy( dataset.path, output_path )
    else:
        shutil.move( dataset.path, output_path )
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 ext = ext,
                 stdout = stdout,
                 name = dataset.name,
                 line_count = line_count )
    json_file.write( to_json_string( info ) + "\n" )
    # Groom the dataset content if necessary
    datatype = registry.get_datatype_by_extension( ext )
    datatype.groom_dataset_content( output_path )

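# Each processed dataset appends one JSON line to galaxy.json; an illustrative
# (hypothetical) record written by add_file would look like:
#   {"type": "dataset", "dataset_id": 42, "ext": "tabular", "stdout": "uploaded tabular file", "name": "reads.txt", "line_count": 100}
# add_composite_file below handles composite datatypes, which consist of a primary
# file plus additional component files that are written into files_path.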
def add_composite_file( dataset, registry, json_file, output_path, files_path ):
    if dataset.composite_files:
        os.mkdir( files_path )
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch( **value )
            if dataset.composite_file_paths[ value.name ] is None and not value.optional:
                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name][ 'path' ]
                isurl = dp.find('://') != -1 # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dp ), prefix='url_paste' )
                    except Exception, e:
                        file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
                        sniff.convert_newlines_sep2tabs( dp )
                    else:
                        sniff.convert_newlines( dp )
                shutil.move( dp, os.path.join( files_path, name ) )
    # Move the dataset to its "real" path
    shutil.move( dataset.primary_file, output_path )
    # Write the job info
    info = dict( type = 'dataset',
                 dataset_id = dataset.dataset_id,
                 stdout = 'uploaded %s file' % dataset.file_type )
    json_file.write( to_json_string( info ) + "\n" )

def __main__():

    if len( sys.argv ) < 4:
        print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...'
        sys.exit( 1 )
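    # Illustrative invocation (paths hypothetical), matching the usage string above:
    #   python upload.py /galaxy/root datatypes_conf.xml params.json 42:/job/extra_files:/job/dataset_42.dat
    # where params.json contains one JSON-encoded dataset description per line.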

    output_paths = parse_outputs( sys.argv[4:] )
    json_file = open( 'galaxy.json', 'w' )

    registry = Registry( sys.argv[1], sys.argv[2] )

    for line in open( sys.argv[3], 'r' ):
        dataset = from_json_string( line )
        dataset = util.bunch.Bunch( **safe_dict( dataset ) )
        try:
            output_path = output_paths[int( dataset.dataset_id )][0]
        except:
            print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id
            sys.exit( 1 )
        if dataset.type == 'composite':
            files_path = output_paths[int( dataset.dataset_id )][1]
            add_composite_file( dataset, registry, json_file, output_path, files_path )
        else:
            add_file( dataset, registry, json_file, output_path )
    # clean up paramfile
    try:
        os.remove( sys.argv[3] )
    except:
        pass

if __name__ == '__main__':
    __main__()