import logging, os, sys, time, tempfile
from galaxy import util
from galaxy.util.odict import odict
from galaxy.util.bunch import Bunch
from cgi import escape
import metadata
import zipfile
from metadata import MetadataElement # import directly to maintain ease of use in Datatype class definitions

log = logging.getLogger(__name__)

# Valid first column and strand column values for bed, other formats
col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
valid_strand = ['+', '-', '.']
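# For example, a line such as 'chr22\t1000\t5000\tname\t0\t+' passes both checks:
# its first column starts with 'chr' and its strand column is '+'.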

class DataMeta( type ):
    """
    Metaclass for Data class. Sets up metadata spec.
    """
    def __init__( cls, name, bases, dict_ ):
        cls.metadata_spec = metadata.MetadataSpecCollection()
        for base in bases: # loop through bases (class/types) of cls
            if hasattr( base, "metadata_spec" ): # base of class Data (object) has no metadata
                cls.metadata_spec.update( base.metadata_spec ) # add contents of metadata spec of base class to cls
        metadata.Statement.process( cls )
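        # Statement.process replays the MetadataElement( ... ) calls that were
        # recorded while the class body executed, adding each declared element
        # to cls.metadata_spec (the DataTest doctest below shows the effect).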

class Data( object ):
    """
    Base class for all datatypes. Implements basic interfaces as well
    as class methods for metadata.

    >>> class DataTest( Data ):
    ...     MetadataElement( name="test" )
    ...
    >>> DataTest.metadata_spec.test.name
    'test'
    >>> DataTest.metadata_spec.test.desc
    'test'
    >>> type( DataTest.metadata_spec.test.param )
    <class 'galaxy.datatypes.metadata.MetadataParameter'>

    """
    __metaclass__ = DataMeta
    # Add metadata elements
    MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
    # Stores the set of display applications, and viewing methods, supported by this datatype
    supported_display_apps = {}
    # If False, the peek is regenerated whenever a dataset of this type is copied
    copy_safe_peek = True
    # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
    # Allow binary file uploads of this type when True.
    is_binary = True
    # Allow user to change between this datatype and others. If False, this datatype
    # cannot be changed from or into.
    allow_datatype_change = True
    # Composite datatypes
    composite_type = None
    composite_files = odict()
    primary_file_name = 'index'
    # A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
    _max_optional_metadata_filesize = None

    def __init__(self, **kwd):
        """Initialize the datatype"""
        object.__init__(self, **kwd)
        self.supported_display_apps = self.supported_display_apps.copy()
        self.composite_files = self.composite_files.copy()
        self.display_applications = odict()
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream, in 1 MB chunks to bound memory use"""
        fd = open(dataset.file_name, 'wb')
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            fd.write(chunk)
        fd.close()
    def set_raw_data(self, dataset, data):
        """Saves the data on the disk"""
        fd = open(dataset.file_name, 'wb')
        fd.write(data)
        fd.close()
    def get_raw_data( self, dataset ):
        """Returns the full data. To stream it open the file_name and read/write as needed"""
        try:
            return file(dataset.file_name, 'rb').read(-1)
        except ( IOError, OSError ):
            log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
            return ''
    def groom_dataset_content( self, file_name ):
        """This function is called on an output dataset file after the content is initially generated."""
        pass
    def init_meta( self, dataset, copy_from=None ):
        # Metadata should be left mostly uninitialized. Dataset will
        # handle returning default values when metadata is not set.
        # copy_from allows metadata to be passed in that will be
        # copied. (Although this seems ambiguous, see
        # Dataset.set_metadata. It always copies the rhs in order to
        # flag the object as modified for SQLAlchemy.)
        if copy_from:
            dataset.metadata = copy_from.metadata
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Unimplemented method, allows guessing of metadata from contents of file"""
        return True
    def missing_meta( self, dataset, check = [], skip = [] ):
        """
        Checks for empty metadata values. Returns True if non-optional metadata is missing.
        Specifying a list of 'check' values will only check those names provided; when used, optionality is ignored.
        Specifying a list of 'skip' items excludes those names from the check, so a missing value there is ignored.
        """
        if check:
            to_check = [ ( to_check, dataset.metadata.get( to_check ) ) for to_check in check ]
        else:
            to_check = dataset.metadata.items()
        for key, value in to_check:
            if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
                continue # skip optional and non-requested values here
            if not value:
                return True
        return False
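    # Hypothetical usage sketch: flag a dataset whose 'dbkey' was never set,
    # ignoring optionality because an explicit 'check' list is given:
    #   if data.missing_meta( dataset, check=[ 'dbkey' ] ):
    #       ... # prompt for a genome build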
    def set_max_optional_metadata_filesize( self, max_value ):
        try:
            max_value = int( max_value )
        except ( TypeError, ValueError ):
            return
        self.__class__._max_optional_metadata_filesize = max_value
    def get_max_optional_metadata_filesize( self ):
        rval = self.__class__._max_optional_metadata_filesize
        if rval is None:
            return -1
        return rval
    max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
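    # Usage sketch: the property setter stores the value on the class, so the
    # cap below applies to every instance of this datatype (10 MB, in bytes):
    #   data.max_optional_metadata_filesize = 10 * 1024 * 1024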
    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = ''
            dataset.blurb = 'data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if type( line ) is unicode:
                    out.append( '<tr><td>%s</td></tr>' % escape( line ) )
                else:
                    out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % str( exc )
        return out
    def display_name(self, dataset):
        """Returns formatted html of dataset name"""
        try:
            if type( dataset.name ) is unicode:
                return escape( dataset.name )
            else:
                return escape( unicode( dataset.name, 'utf-8' ) )
        except:
            return "name unavailable"
    def display_info(self, dataset):
        """Returns formatted html of dataset info"""
        try:
            # Change new line chars to html
            info = escape( dataset.info )
            if info.find( '\r\n' ) >= 0:
                info = info.replace( '\r\n', '<br/>' )
            if info.find( '\r' ) >= 0:
                info = info.replace( '\r', '<br/>' )
            if info.find( '\n' ) >= 0:
                info = info.replace( '\n', '<br/>' )

            # Convert to unicode to display non-ascii characters.
            if type( info ) is not unicode:
                info = unicode( info, 'utf-8' )

            return info
        except:
            return "info unavailable"
    def validate(self, dataset):
        """Unimplemented validate, return no exceptions"""
        return list()
    def repair_methods(self, dataset):
        """Unimplemented method, returns dict with method/option for repairing errors"""
        return None
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/octet-stream'
    def add_display_app( self, app_id, label, file_function, links_function ):
        """
        Adds a display app to the datatype.
        app_id is a unique id
        label is the primary display label, e.g., display at 'UCSC'
        file_function is a string containing the name of the function that returns a properly formatted display
        links_function is a string containing the name of the function that returns a list of (link_name, link)
        """
        self.supported_display_apps = self.supported_display_apps.copy()
        self.supported_display_apps[app_id] = {'label': label, 'file_function': file_function, 'links_function': links_function}
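    # Registration sketch (assumes the named methods exist on the datatype, as
    # they do for the interval-style classes that display at UCSC):
    #   self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )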
    def remove_display_app( self, app_id ):
        """Removes a display app from the datatype"""
        self.supported_display_apps = self.supported_display_apps.copy()
        try:
            del self.supported_display_apps[app_id]
        except:
            log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( app_id, self.__class__.__name__ ) )
    def clear_display_apps( self ):
        self.supported_display_apps = {}
    def add_display_application( self, display_application ):
        """New style display applications"""
        assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
        self.display_applications[ display_application.id ] = display_application
    def get_display_application( self, key, default = None ):
        return self.display_applications.get( key, default )
    def get_display_applications_by_dataset( self, dataset, trans ):
        rval = odict()
        for key, value in self.display_applications.iteritems():
            value = value.filter_by_dataset( dataset, trans )
            if value.links:
                rval[key] = value
        return rval
    def get_display_types(self):
        """Returns display types available"""
        return self.supported_display_apps.keys()
    def get_display_label(self, type):
        """Returns primary label for display app"""
        try:
            return self.supported_display_apps[type]['label']
        except:
            return 'unknown'
    def as_display_type(self, dataset, type, **kwd):
        """Returns modified file contents for a particular display type"""
        try:
            if type in self.get_display_types():
                return getattr( self, self.supported_display_apps[type]['file_function'] )( dataset, **kwd )
        except:
            log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
        return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext )
    def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
        """
        Returns a list of tuples of (name, link) for a particular display type. No check on
        'access' permissions is done here - if you can view the dataset, you can also save it
        or send it to a destination outside of Galaxy, so Galaxy security restrictions do not
        apply anyway.
        """
        try:
            if type in self.get_display_types():
                return target_frame, getattr( self, self.supported_display_apps[type]['links_function'] )( dataset, type, app, base_url, **kwd )
        except:
            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
                           % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
        return []
    def get_converter_types(self, original_dataset, datatypes_registry):
        """Returns available converters by type for this dataset"""
        return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
    def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
        """Returns ( target_ext, existing converted dataset )"""
        return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
    def convert_dataset(self, trans, original_dataset, target_type, return_output = False, visible = True, deps=None):
        """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
        converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )

        if converter is None:
            raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
        # Generate parameter dictionary
        params = {}
        # Determine input parameter name and add to params
        input_name = 'input1'
        for key, value in converter.inputs.items():
            if deps and value.name in deps:
                params[value.name] = deps[value.name]
            elif value.type == 'data':
                input_name = key

        params[input_name] = original_dataset
        # Run converter; the job is dispatched through the queue
        converted_dataset = converter.execute( trans, incoming = params, set_output_hid = visible )[1]
        if len(params) > 0:
            trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
        if not visible:
            for name, value in converted_dataset.iteritems():
                value.visible = False
        if return_output:
            return converted_dataset
        return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
    # We need to clear associated files before we set metadata, so that, e.g.,
    # implicitly converted datasets are deleted and no longer available 'while'
    # metadata is being set, not just after.
    # We'll also clear after setting metadata, for backwards compatibility.
    def after_setting_metadata( self, dataset ):
        """This function is called on the dataset after metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def before_setting_metadata( self, dataset ):
        """This function is called on the dataset before metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, space_to_tab = False, **kwds ):
        kwds[ 'name' ] = name
        kwds[ 'optional' ] = optional
        kwds[ 'mimetype' ] = mimetype
        kwds[ 'description' ] = description
        kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
        kwds[ 'is_binary' ] = is_binary
        kwds[ 'space_to_tab' ] = space_to_tab
        return Bunch( **kwds )
    def add_composite_file( self, name, **kwds ):
        self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
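    # Hypothetical declaration of a composite member whose on-disk name is
    # derived from metadata; the '%s' in the key is later filled in by
    # get_composite_files() via key % metadata value:
    #   self.add_composite_file( '%s.ped', description='Pedigree file',
    #                            substitute_name_with_metadata='base_name' )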
    def __substitute_composite_key( self, key, composite_file, dataset = None ):
        if composite_file.substitute_name_with_metadata:
            if dataset:
                meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
            else:
                meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
            return key % meta_value
        return key
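    # Note: writable_files below is a property, so callers can never actually
    # supply the 'dataset' argument; it is always None when accessed as
    # 'datatype.writable_files', and name substitution falls back to the
    # metadata spec defaults via get_composite_files( dataset=None ).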
    @property
    def writable_files( self, dataset = None ):
        files = odict()
        if self.composite_type != 'auto_primary_file':
            files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
        for key, value in self.get_composite_files( dataset = dataset ).iteritems():
            files[ key ] = value
        return files
    def get_composite_files( self, dataset = None ):
        def substitute_composite_key( key, composite_file ):
            if composite_file.substitute_name_with_metadata:
                if dataset:
                    meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
                else:
                    meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
                return key % meta_value
            return key
        files = odict()
        for key, value in self.composite_files.iteritems():
            files[ substitute_composite_key( key, value ) ] = value
        return files
    def generate_auto_primary_file( self, dataset = None ):
        raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
    @property
    def has_resolution(self):
        return False

class Text( Data ):
    file_ext = 'txt'

    # Add metadata elements
    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # write it twice for now
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def set_raw_data(self, dataset, data):
        """Saves the data on the disk"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'
    def set_meta( self, dataset, **kwd ):
        """
        Set the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and not line.startswith( '#' ):
                data_lines += 1
        dataset.metadata.data_lines = data_lines
    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s lines" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? lines"
            else:
                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

class Newick( Text ):
    pass

# ------------- Utility methods --------------

def get_test_fname( fname ):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join( path, 'test', fname )
    return full_path
def nice_size(size):
    """
    Returns a readably formatted string with the size

    >>> nice_size(100)
    '100.0 bytes'
    >>> nice_size(10000)
    '9.8 Kb'
    >>> nice_size(1000000)
    '976.6 Kb'
    >>> nice_size(100000000)
    '95.4 Mb'
    """
    words = [ 'bytes', 'Kb', 'Mb', 'Gb' ]
    try:
        size = float( size )
    except ( TypeError, ValueError ):
        return '??? bytes'
    for ind, word in enumerate(words):
        step = 1024 ** (ind + 1)
        if step > size:
            size = size / float(1024 ** ind)
            out = "%.1f %s" % (size, word)
            return out
    return '??? bytes'
def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5 ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22 30128507 31828507 uc003bnx.1_cds_2_0_chr22_29227_f 0 +\n'
    """
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open( file_name, "U" )
    while count < LINE_COUNT:
        line = temp.readline( WIDTH )
        if not line:
            # end of file
            break
        if not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
                data_checked = True
        if file_type in [ 'gzipped', 'binary' ]:
            break
        lines.append( line )
        count += 1
    temp.close()
    if file_type in [ 'gzipped', 'binary' ]:
        text = "%s file" % file_type
    else:
        try:
            text = unicode( '\n'.join( lines ), 'utf-8' )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text