1 | """ |
---|
2 | Utility functions used systemwide. |
---|
3 | |
---|
4 | """ |
---|
5 | import logging |
---|
6 | import threading, random, string, re, binascii, pickle, time, datetime, math, re, os, sys, tempfile, stat, grp |
---|
7 | |
---|
8 | # Older py compatibility |
---|
9 | try: |
---|
10 | set() |
---|
11 | except: |
---|
12 | from sets import Set as set |
---|
13 | |
---|
14 | try: |
---|
15 | from hashlib import md5 |
---|
16 | except ImportError: |
---|
17 | from md5 import new as md5 |
---|
18 | |
---|
19 | import pkg_resources |
---|
20 | |
---|
21 | pkg_resources.require( 'docutils' ) |
---|
22 | import docutils.core |
---|
23 | from galaxy.util.docutils_ext.htmlfrag import Writer as HTMLFragWriter |
---|
24 | |
---|
25 | pkg_resources.require( 'elementtree' ) |
---|
26 | from elementtree import ElementTree, ElementInclude |
---|
27 | |
---|
28 | pkg_resources.require( "wchartype" ) |
---|
29 | import wchartype |
---|
30 | |
---|
log = logging.getLogger(__name__)
# Module-wide lock used by the @synchronized decorator below.
_lock = threading.RLock()

# Leading magic bytes used to recognize compressed files:
# gzip streams start with \x1f\x8b, bzip2 streams with "BZh".
gzip_magic = '\037\213'
bz2_magic = 'BZh'
---|
36 | |
---|
def is_multi_byte( chars ):
    """
    Return True if any character in `chars` is a wide/multi-byte character
    (Asian scripts, full-width forms, Hangul, etc. per the wchartype
    classifiers); return False for plain text or undecodable (binary) input.
    """
    classifiers = ( wchartype.is_asian,
                    wchartype.is_full_width,
                    wchartype.is_kanji,
                    wchartype.is_hiragana,
                    wchartype.is_katakana,
                    wchartype.is_half_katakana,
                    wchartype.is_hangul,
                    wchartype.is_full_digit,
                    wchartype.is_full_letter )
    for char in chars:
        try:
            char = unicode( char )
        except UnicodeDecodeError:
            # Probably binary data rather than text.
            return False
        for classify in classifiers:
            if classify( char ):
                return True
    return False
---|
55 | |
---|
def synchronized(func):
    """Decorator serializing every call to 'func' through the module-level _lock."""
    def caller(*args, **kwargs):
        # Block until the lock is available, and always release it afterwards.
        _lock.acquire(True)
        try:
            return func(*args, **kwargs)
        finally:
            _lock.release()
    return caller
---|
65 | |
---|
def file_iter(fname, sep=None):
    """
    This generator iterates over a file and yields its lines
    splitted via the C{sep} parameter. Skips empty lines and lines starting with
    the C{#} character.

    Note: a line holding only a newline is still truthy, so with the default
    whitespace split it yields an empty list rather than being skipped.

    >>> lines = [ line for line in file_iter(__file__) ]
    >>> len(lines) != 0
    True
    """
    # open() replaces the deprecated file() constructor (removed in Python 3).
    for line in open(fname):
        if line and line[0] != '#':
            yield line.split(sep)
---|
79 | |
---|
def file_reader(fp, chunk_size=65536):
    """Yield successive chunks (default 64k) from the open file object `fp`,
    then close it once the data is exhausted."""
    chunk = fp.read(chunk_size)
    while chunk:
        yield chunk
        chunk = fp.read(chunk_size)
    fp.close()
---|
88 | |
---|
def unique_id(KEY_SIZE=128):
    """
    Generates an unique id: the hex md5 digest of KEY_SIZE random bits.

    >>> ids = [ unique_id() for i in range(1000) ]
    >>> len(set(ids))
    1000
    """
    # Avoid shadowing the builtin id(); encode so md5 accepts the input on
    # both Python 2 (bytes str) and Python 3 (digits are always ASCII).
    random_bits = str( random.getrandbits( KEY_SIZE ) )
    return md5( random_bits.encode( 'ascii' ) ).hexdigest()
---|
99 | |
---|
def parse_xml(fname):
    """Parse the XML file `fname` and return the tree, after processing
    any XInclude directives in place."""
    tree = ElementTree.parse(fname)
    ElementInclude.include(tree.getroot())
    return tree
---|
106 | |
---|
def xml_to_string(elem):
    """Serialize the XML element `elem` to a string."""
    return ElementTree.tostring(elem)
---|
111 | |
---|
# characters that are valid
# NOTE(review): string.letters is Python 2 only and locale-dependent;
# confirm the process locale before relying on the exact set.
valid_chars = set(string.letters + string.digits + " -=_.()/+*^,:?!")

# characters that are allowed but need to be escaped
# (restore_text() maps these __xx__ tokens back to the originals)
mapped_chars = { '>' :'__gt__',
                 '<' :'__lt__',
                 "'" :'__sq__',
                 '"' :'__dq__',
                 '[' :'__ob__',
                 ']' :'__cb__',
                 '{' :'__oc__',
                 '}' :'__cc__',
                 '@' : '__at__',
                 '\n' : '__cn__',
                 '\r' : '__cr__',
                 '\t' : '__tc__'
                 }
---|
129 | |
---|
def restore_text(text):
    """Map each escape token produced by sanitize_text back to its original character."""
    for original, escaped in mapped_chars.items():
        text = text.replace(escaped, original)
    return text
---|
135 | |
---|
def sanitize_text(text):
    """Restrict `text` to the allowed character set: characters in
    valid_chars pass through, escapable ones become their __xx__ token,
    and everything else is replaced with 'X' (makes debugging easier)."""
    sanitized = []
    for ch in text:
        if ch in valid_chars:
            sanitized.append(ch)
        else:
            # escaped form when known, 'X' otherwise
            sanitized.append(mapped_chars.get(ch, 'X'))
    return ''.join(sanitized)
---|
147 | |
---|
148 | def sanitize_param(value): |
---|
149 | """Clean incoming parameters (strings or lists)""" |
---|
150 | if isinstance( value, basestring ): |
---|
151 | return sanitize_text(value) |
---|
152 | elif isinstance( value, list ): |
---|
153 | return map(sanitize_text, value) |
---|
154 | else: |
---|
155 | print value |
---|
156 | raise Exception, 'Unknown parameter type (%s)' % ( type( value ) ) |
---|
157 | |
---|
class Params:
    """
    Stores and 'sanitizes' parameters. Alphanumeric characters and the
    non-alphanumeric ones that are deemed safe are let to pass through (see L{valid_chars}).
    Some non-safe characters are escaped to safe forms for example C{>} becomes C{__gt__}
    (see L{mapped_chars}). All other characters are replaced with C{X}.

    Operates on string or list values only (HTTP parameters).

    >>> values = { 'status':'on', 'symbols':[ 'alpha', '<>', '$rm&#!' ] }
    >>> par = Params(values)
    >>> par.status
    'on'
    >>> par.value == None # missing attributes return None
    True
    >>> par.get('price', 0)
    0
    >>> par.symbols # replaces unknown symbols with X
    ['alpha', '__lt____gt__', 'XrmXX!']
    >>> par.flatten() # flattening to a list
    [('status', 'on'), ('symbols', 'alpha'), ('symbols', '__lt____gt__'), ('symbols', 'XrmXX!')]
    """

    # is NEVER_SANITIZE required now that sanitizing for tool parameters can be controlled on a per parameter basis and occurs via InputValueWrappers?
    NEVER_SANITIZE = ['file_data', 'url_paste', 'URL', 'filesystem_paths']

    def __init__( self, params, sanitize=True ):
        """Store `params` as instance attributes, sanitizing each value
        unless sanitize=False or the key matches NEVER_SANITIZE."""
        # Parameters live directly in __dict__ so they read back as attributes.
        if sanitize:
            for key, value in params.items():
                if key not in self.NEVER_SANITIZE and True not in [ key.endswith( "|%s" % nonsanitize_parameter ) for nonsanitize_parameter in self.NEVER_SANITIZE ]: #sanitize check both ungrouped and grouped parameters by name. Anything relying on NEVER_SANITIZE should be changed to not require this and NEVER_SANITIZE should be removed.
                    self.__dict__[ key ] = sanitize_param( value )
                else:
                    self.__dict__[ key ] = value
        else:
            self.__dict__.update(params)

    def flatten(self):
        """
        Creates a tuple list from a dict with a tuple/value pair for every value that is a list
        """
        flat = []
        for key, value in self.__dict__.items():
            if type(value) == type([]):
                for v in value:
                    flat.append( (key, v) )
            else:
                flat.append( (key, value) )
        return flat

    def __getattr__(self, name):
        """This is here to ensure that we get None for non existing parameters"""
        return None

    def get(self, key, default):
        """Like dict.get, except the default is required."""
        return self.__dict__.get(key, default)

    def __str__(self):
        """Render the stored parameters like a plain dict."""
        return '%s' % self.__dict__

    def __len__(self):
        """Number of stored parameters."""
        return len(self.__dict__)

    def __iter__(self):
        """Iterate over parameter names."""
        return iter(self.__dict__)

    def update(self, values):
        """Merge `values` into the stored parameters WITHOUT sanitizing them."""
        self.__dict__.update(values)
---|
225 | |
---|
def rst_to_html( s ):
    """Convert a blob of reStructuredText to an HTML fragment."""
    docutils_log = logging.getLogger( "docutils" )
    class FakeStream( object ):
        # Divert docutils warnings into our logger instead of a real stream.
        def write( self, text ):
            if len( text ) > 0 and not text.isspace():
                docutils_log.warn( text )
    return docutils.core.publish_string( s, writer=HTMLFragWriter(), settings_overrides=dict( warning_stream=FakeStream() ) )
---|
234 | |
---|
def xml_text(root, name=None):
    """Return the text of element `root` — or, when `name` is given, of its
    attribute or child element `name` (attribute wins) — with newlines
    removed and surrounding whitespace stripped; '' when nothing is found."""
    elem = root
    if name is not None:
        # An attribute called `name` takes precedence over a child element.
        attr_val = root.get(name)
        if attr_val:
            return attr_val
        elem = root.find(name)
    if elem is None or not elem.text:
        return ''
    # Collapse multi-line text by deleting the line breaks themselves.
    return ''.join(elem.text.splitlines()).strip()
---|
251 | |
---|
def string_as_bool( string ):
    """True when `string` (stringified, case-insensitive) is 'true', 'yes' or 'on'."""
    return str( string ).lower() in ( 'true', 'yes', 'on' )
---|
257 | |
---|
def listify( item ):
    """
    Wrap `item` in a list: None/empty gives [], a list is returned as-is,
    a string containing commas is split on them, anything else gives [item].
    """
    if not item:
        return []
    if isinstance( item, list ):
        return item
    if isinstance( item, basestring ) and ',' in item:
        return item.split( ',' )
    return [ item ]
---|
271 | |
---|
def commaify(amount):
    """Insert thousands separators into the numeric string `amount`."""
    # One comma is added per call; recurse until the string stops changing.
    new = re.sub(r"^(-?\d+)(\d{3})", r'\g<1>,\g<2>', amount)
    if new == amount:
        return new
    return commaify(new)
---|
279 | |
---|
def object_to_string( obj ):
    """Pickle `obj` (protocol 2) and return the result hex-encoded."""
    pickled = pickle.dumps( obj, 2 )
    return binascii.hexlify( pickled )
---|
282 | |
---|
def string_to_object( s ):
    """Inverse of object_to_string: hex-decode `s` and unpickle it.
    NOTE(review): never feed untrusted input here — pickle.loads executes code."""
    raw = binascii.unhexlify( s )
    return pickle.loads( raw )
---|
285 | |
---|
def get_ucsc_by_build(build):
    """Return (name, url) pairs for the UCSC sites that support `build`."""
    return [ ( site['name'], site['url'] )
             for site in ucsc_build_sites
             if build in site['builds'] ]
---|
def get_gbrowse_sites_by_build(build):
    """Return (name, url) pairs for the GBrowse sites that support `build`."""
    return [ ( site['name'], site['url'] )
             for site in gbrowse_build_sites
             if build in site['builds'] ]
---|
def get_genetrack_sites():
    """Return (name, url) pairs for all configured GeneTrack sites."""
    return [ ( site['name'], site['url'] ) for site in genetrack_sites ]
---|
303 | |
---|
def read_dbnames(filename):
    """ Read build names from file

    Returns a DBNames list of (db_key, display_name) tuples: the "?" entry
    first (when present), then UCSC builds grouped per species and sorted by
    descending revision number, then manual (integer-keyed) builds.  Any
    read/parse failure is logged to stdout and yields the fallback list.
    """
    class DBNames( list ):
        # Fallback entry used when the builds file is missing or empty.
        default_value = "?"
        default_name = "unspecified (?)"
    db_names = DBNames()
    try:
        ucsc_builds = {}
        man_builds = [] #assume these are integers
        name_to_db_base = {}
        for line in open(filename):
            try:
                if line[0:1] == "#": continue
                fields = line.replace("\r","").replace("\n","").split("\t")
                #Special case of unspecified build is at top of list
                if fields[0] == "?":
                    db_names.insert(0,(fields[0],fields[1]))
                    continue
                try: #manual build (i.e. microbes)
                    int(fields[0])
                    man_builds.append((fields[1], fields[0]))
                except: #UCSC build
                    # db_base is the key with the trailing revision digits removed,
                    # e.g. "hg18" -> "hg"; one bucket per species.
                    db_base = fields[0].rstrip('0123456789')
                    if db_base not in ucsc_builds:
                        ucsc_builds[db_base] = []
                        name_to_db_base[fields[1]] = db_base
                    #we want to sort within a species numerically by revision number
                    build_rev = re.compile(r'\d+$')
                    try: build_rev = int(build_rev.findall(fields[0])[0])
                    except: build_rev = 0
                    ucsc_builds[db_base].append((build_rev, fields[0],fields[1]))
            except: continue
        # Species are emitted alphabetically by the first name seen for each base.
        sort_names = name_to_db_base.keys()
        sort_names.sort()
        for name in sort_names:
            db_base = name_to_db_base[name]
            # Highest revision first within a species.
            ucsc_builds[db_base].sort()
            ucsc_builds[db_base].reverse()
            ucsc_builds[db_base] = [(build, name) for build_rev, build, name in ucsc_builds[db_base]]
            db_names = DBNames( db_names + ucsc_builds[db_base] )
        if len( db_names ) > 1 and len( man_builds ) > 0: db_names.append( ( db_names.default_value, '----- Additional Species Are Below -----' ) )
        man_builds.sort()
        man_builds = [(build, name) for name, build in man_builds]
        db_names = DBNames( db_names + man_builds )
    except Exception, e:
        print "ERROR: Unable to read builds file:", e
    if len(db_names)<1:
        db_names = DBNames( [( db_names.default_value, db_names.default_name )] )
    return db_names
---|
353 | |
---|
def read_build_sites( filename, check_builds=True ):
    """ read db names to ucsc mappings from file, this file should probably be merged with the one above

    Each non-comment line is tab-separated: name, url and (when check_builds
    is True) a comma-separated list of supported builds.  Malformed lines are
    skipped silently; an unreadable file is reported and yields [].
    """
    build_sites = []
    try:
        for line in open(filename):
            try:
                if line[0:1] == "#": continue
                fields = line.replace("\r","").replace("\n","").split("\t")
                site_name = fields[0]
                site = fields[1]
                if check_builds:
                    site_builds = fields[2].split(",")
                    site_dict = {'name':site_name, 'url':site, 'builds':site_builds}
                else:
                    site_dict = {'name':site_name, 'url':site}
                build_sites.append( site_dict )
            except: continue
    except:
        print "ERROR: Unable to read builds for site file %s" %filename
    return build_sites
---|
374 | |
---|
def relpath( path, start = None ):
    """Return a relative version of a path"""
    #modified from python 2.6.1 source code

    #version 2.6+ has it built in, we'll use the 'official' copy
    if sys.version_info[:2] >= ( 2, 6 ):
        if start is not None:
            return os.path.relpath( path, start )
        return os.path.relpath( path )

    #we need to initialize some local parameters
    curdir = os.curdir
    pardir = os.pardir
    sep = os.sep
    commonprefix = os.path.commonprefix
    join = os.path.join
    if start is None:
        start = curdir

    #below is the unedited (but formated) relpath() from posixpath.py of 2.6.1
    #this will likely not function properly on non-posix systems, i.e. windows
    if not path:
        raise ValueError( "no path specified" )

    start_list = os.path.abspath( start ).split( sep )
    path_list = os.path.abspath( path ).split( sep )

    # Work out how much of the filepath is shared by start and path.
    # (commonprefix on lists compares element-wise, so this is safe for
    # whole path components.)
    i = len( commonprefix( [ start_list, path_list ] ) )

    # Climb out of start's unshared components, then descend into path's.
    rel_list = [ pardir ] * ( len( start_list )- i ) + path_list[ i: ]
    if not rel_list:
        return curdir
    return join( *rel_list )
---|
409 | |
---|
def stringify_dictionary_keys( in_dict ):
    """Return a new dict whose top-level keys are coerced to str (does not
    recurse); unicode keys are not valid for **kwargs expansion in py2."""
    return dict( ( str( key ), value ) for key, value in in_dict.iteritems() )
---|
418 | |
---|
def recursively_stringify_dictionary_keys( d ):
    """Recursively utf-8 encode dictionary keys, descending into nested
    dicts and lists; non-container values are returned unchanged."""
    if isinstance(d, dict):
        return dict( ( key.encode('utf-8'), recursively_stringify_dictionary_keys(value) )
                     for key, value in d.iteritems() )
    if isinstance(d, list):
        return [ recursively_stringify_dictionary_keys(elem) for elem in d ]
    return d
---|
426 | |
---|
427 | def mkstemp_ln( src, prefix='mkstemp_ln_' ): |
---|
428 | """ |
---|
429 | From tempfile._mkstemp_inner, generate a hard link in the same dir with a |
---|
430 | random name. Created so we can persist the underlying file of a |
---|
431 | NamedTemporaryFile upon its closure. |
---|
432 | """ |
---|
433 | dir = os.path.dirname(src) |
---|
434 | names = tempfile._get_candidate_names() |
---|
435 | for seq in xrange(tempfile.TMP_MAX): |
---|
436 | name = names.next() |
---|
437 | file = os.path.join(dir, prefix + name) |
---|
438 | try: |
---|
439 | linked_path = os.link( src, file ) |
---|
440 | return (os.path.abspath(file)) |
---|
441 | except OSError, e: |
---|
442 | if e.errno == errno.EEXIST: |
---|
443 | continue # try again |
---|
444 | raise |
---|
445 | raise IOError, (errno.EEXIST, "No usable temporary file name found") |
---|
446 | |
---|
def umask_fix_perms( path, umask, unmasked_perms, gid=None ):
    """
    umask-friendly permissions fixing

    Sets the mode of `path` to ``unmasked_perms & ~umask`` and, when `gid`
    is given, tries to change its group.  All failures are logged rather
    than raised.
    """
    perms = unmasked_perms & ~umask
    try:
        st = os.stat( path )
    except OSError, e:
        # Can't even stat the path; nothing more we can do.
        log.exception( 'Unable to set permissions or group on %s' % path )
        return
    # fix modes
    if stat.S_IMODE( st.st_mode ) != perms:
        try:
            os.chmod( path, perms )
        except Exception, e:
            log.warning( 'Unable to honor umask (%s) for %s, tried to set: %s but mode remains %s, error was: %s' % ( oct( umask ), \
                                                                                                                      path,
                                                                                                                      oct( perms ),
                                                                                                                      oct( stat.S_IMODE( st.st_mode ) ),
                                                                                                                      e ) )
    # fix group
    if gid is not None and st.st_gid != gid:
        try:
            # -1 leaves the owner unchanged; only the group is modified.
            os.chown( path, -1, gid )
        except Exception, e:
            try:
                # Resolve gids to names for a friendlier log message.
                desired_group = grp.getgrgid( gid )
                current_group = grp.getgrgid( st.st_gid )
            except:
                desired_group = gid
                current_group = st.st_gid
            log.warning( 'Unable to honor primary group (%s) for %s, group remains %s, error was: %s' % ( desired_group, \
                                                                                                          path,
                                                                                                          current_group,
                                                                                                          e ) )
---|
482 | |
---|
# NOTE(review): __path__ only exists inside a package __init__ module —
# presumably this file is lib/galaxy/util/__init__.py; confirm before moving.
galaxy_root_path = os.path.join(__path__[0], "..","..","..")
# The dbnames list is used in edit attributes and the upload tool
dbnames = read_dbnames( os.path.join( galaxy_root_path, "tool-data", "shared", "ucsc", "builds.txt" ) )
ucsc_build_sites = read_build_sites( os.path.join( galaxy_root_path, "tool-data", "shared", "ucsc", "ucsc_build_sites.txt" ) )
gbrowse_build_sites = read_build_sites( os.path.join( galaxy_root_path, "tool-data", "shared", "gbrowse", "gbrowse_build_sites.txt" ) )
genetrack_sites = read_build_sites( os.path.join( galaxy_root_path, "tool-data", "shared", "genetrack", "genetrack_sites.txt" ), check_builds=False )

if __name__ == '__main__':
    # Run the module's doctests (unique_id, file_iter, Params, ...).
    import doctest, sys
    doctest.testmod(sys.modules[__name__], verbose=False)
---|