[2] | 1 | """ |
---|
| 2 | Tabular datatype |
---|
| 3 | |
---|
| 4 | """ |
---|
| 5 | import pkg_resources |
---|
| 6 | pkg_resources.require( "bx-python" ) |
---|
| 7 | |
---|
| 8 | import logging |
---|
| 9 | import data |
---|
| 10 | from galaxy import util |
---|
| 11 | from cgi import escape |
---|
| 12 | from galaxy.datatypes import metadata |
---|
| 13 | from galaxy.datatypes.metadata import MetadataElement |
---|
| 14 | import galaxy_utils.sequence.vcf |
---|
| 15 | from sniff import * |
---|
| 16 | |
---|
| 17 | log = logging.getLogger(__name__) |
---|
| 18 | |
---|
class Tabular( data.Text ):
    """Generic tab delimited data; subclasses refine column names and types."""

    # Metadata elements describing the tabular structure
    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        """
        Tries to determine the number of columns as well as those columns
        that contain numerical values in the dataset.  A skip parameter is
        used because various tabular data types reuse this function, and
        their data type classes are responsible to determine how many invalid
        comment lines should be skipped. Using None for skip will cause skip
        to be zero, but the first line will be processed as a header. A
        max_data_lines parameter is used because various tabular data types
        reuse this function, and their data type classes are responsible to
        determine how many data lines should be processed to ensure that the
        non-optional metadata parameters are properly set; if used, optional
        metadata parameters will be set to None, unless the entire file has
        already been read. Using None (default) for max_data_lines will
        process all data lines.

        Items of interest:
        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
        2. If a tabular file has no data, it will have one column of type 'str'.
        3. We used to check only the first 100 lines when setting metadata and this class's
           set_peek() method read the entire file to determine the number of lines in the file.
           Since metadata can now be processed on cluster nodes, we've merged the line count portion
           of the set_peek() processing here, and we now check the entire contents of the file.
        """
        # Store original skip value to check with later
        requested_skip = skip
        if skip is None:
            skip = 0
        column_type_set_order = [ 'int', 'float', 'list', 'str' ]  # Order to set column types in
        default_column_type = column_type_set_order[-1]  # Default column type is lowest in list
        column_type_compare_order = list( column_type_set_order )  # Order to compare column types
        column_type_compare_order.reverse()
        def type_overrules_type( column_type1, column_type2 ):
            # True when column_type1 is a more general type than column_type2,
            # per column_type_compare_order (str > list > float > int).
            if column_type1 is None or column_type1 == column_type2:
                return False
            if column_type2 is None:
                return True
            for column_type in column_type_compare_order:
                if column_type1 == column_type:
                    return True
                if column_type2 == column_type:
                    return False
            # Neither column type was found in our ordered list, this cannot happen
            raise ValueError( "Tried to compare unknown column types" )
        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except ValueError:
                return False
        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except ValueError:
                if column_text.strip().lower() == 'na':
                    return True  # na is special cased to be a float
                return False
        def is_list( column_text ):
            return "," in column_text
        def is_str( column_text ):
            # Anything, except an empty string, is True
            if column_text == "":
                return False
            return True
        # Map each column type name to its checking function; an explicit dict
        # is used rather than a fragile locals()[ "is_%s" % type ] lookup.
        is_column_type = {
            'int': is_int,
            'float': is_float,
            'list': is_list,
            'str': is_str,
        }
        def guess_column_type( column_text ):
            # Return the most specific type (per column_type_set_order) that
            # accepts column_text, or None when even 'str' rejects it (empty).
            for column_type in column_type_set_order:
                if is_column_type[column_type]( column_text ):
                    return column_type
            return None
        data_lines = 0
        comment_lines = 0
        column_types = []
        first_line_column_types = [default_column_type]  # default value is one column of type str
        if dataset.has_data():
            # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
            dataset_fh = open( dataset.file_name )
            i = 0
            while True:
                line = dataset_fh.readline()
                if not line:
                    break
                line = line.rstrip( '\r\n' )
                if i < skip or not line or line.startswith( '#' ):
                    # We'll call blank lines comments
                    comment_lines += 1
                else:
                    data_lines += 1
                    fields = line.split( '\t' )
                    for field_count, field in enumerate( fields ):
                        if field_count >= len( column_types ):  # found a previously unknown column, we append None
                            column_types.append( None )
                        column_type = guess_column_type( field )
                        if type_overrules_type( column_type, column_types[field_count] ):
                            column_types[field_count] = column_type
                    if i == 0 and requested_skip is None:
                        # This is our first line, people seem to like to upload files that have a header line, but do not
                        # start with '#' (i.e. all column types would then most likely be detected as str).  We will assume
                        # that the first line is always a header (this was previous behavior - it was always skipped).  When
                        # the requested skip is None, we only use the data from the first line if we have no other data for
                        # a column.  This is far from perfect, as
                        # 1,2,3	1.1	2.2	qwerty
                        # 0	0	1,2,3
                        # will be detected as
                        # "column_types": ["int", "int", "float", "list"]
                        # instead of
                        # "column_types": ["list", "float", "float", "str"]  *** would seem to be the 'Truth' by manual
                        # observation that the first line should be included as data.  The old method would have detected as
                        # "column_types": ["int", "int", "str", "list"]
                        first_line_column_types = column_types
                        column_types = [ None for col in first_line_column_types ]
                    if max_data_lines is not None and data_lines >= max_data_lines:
                        if dataset_fh.tell() != dataset.get_size():
                            data_lines = None  # Clear optional data_lines metadata value
                            comment_lines = None  # Clear optional comment_lines metadata value; additional comment lines could appear below this point
                        break
                i += 1
            dataset_fh.close()

        # We error on the larger number of columns.
        # First we pad our column_types by using data from the first line.
        if len( first_line_column_types ) > len( column_types ):
            for column_type in first_line_column_types[len( column_types ):]:
                column_types.append( column_type )
        # Now we fill any unknown (None) column_types with data from the first line.
        for i in range( len( column_types ) ):
            if column_types[i] is None:
                if len( first_line_column_types ) <= i or first_line_column_types[i] is None:
                    column_types[i] = default_column_type
                else:
                    column_types[i] = first_line_column_types[i]
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.comment_lines = comment_lines
        dataset.metadata.column_types = column_types
        dataset.metadata.columns = len( column_types )

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( '<tr>' )
            # Generate column header
            for i in range( 1, dataset.metadata.columns + 1 ):
                out.append( '<th>%s</th>' % str( i ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def _flush_comments( self, comments, out ):
        """Append each queued comment line as a full-width, escaped table row."""
        while len( comments ) > 0:
            out.append( '<tr><td colspan="100%">' )
            out.append( '%s</td></tr>' % escape( comments.pop( 0 ) ) )

    def make_html_peek_rows( self, dataset, skipchars=[] ):
        out = [""]
        comments = []
        if not dataset.peek:
            dataset.set_peek()
        data = dataset.peek
        lines = data.splitlines()
        for line in lines:
            line = line.rstrip( '\r\n' )
            if not line:
                continue
            comment = False
            for skipchar in skipchars:
                if line.startswith( skipchar ):
                    comments.append( line )
                    comment = True
                    break
            if comment:
                continue
            elems = line.split( '\t' )
            if len( elems ) != dataset.metadata.columns:
                # We may have an invalid comment line or invalid data
                comments.append( line )
                comment = True
                continue
            self._flush_comments( comments, out )  # Keep comments
            out.append( '<tr>' )
            for elem in elems:  # valid data
                elem = escape( elem )
                out.append( '<td>%s</td>' % elem )
            out.append( '</tr>' )
        # Peek may consist only of comments
        self._flush_comments( comments, out )
        return "".join( out )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        data.Text.set_peek( self, dataset, line_count=line_count, is_multi_byte=is_multi_byte )
        if dataset.metadata.comment_lines:
            dataset.blurb = "%s, %s comments" % ( dataset.blurb, util.commaify( str( dataset.metadata.comment_lines ) ) )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def displayable( self, dataset ):
        # Any metadata access failure means the dataset is not displayable.
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines > 0
        except Exception:
            return False

    def as_gbrowse_display_file( self, dataset, **kwd ):
        return open( dataset.file_name )

    def as_ucsc_display_file( self, dataset, **kwd ):
        return open( dataset.file_name )
---|
| 245 | |
---|
class Taxonomy( Tabular ):
    """Tabular data whose columns follow the standard taxonomy ranks."""

    def __init__( self, **kwd ):
        """Initialize taxonomy datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'TaxId', 'Root', 'Superkingdom', 'Kingdom', 'Subkingdom',
                             'Superphylum', 'Phylum', 'Subphylum', 'Superclass', 'Class', 'Subclass',
                             'Superorder', 'Order', 'Suborder', 'Superfamily', 'Family', 'Subfamily',
                             'Tribe', 'Subtribe', 'Genus', 'Subgenus', 'Species', 'Subspecies'
                             ]

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # This data type requires at least 24 columns in the data;
            # number any extra columns beyond the named ones.
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i + 1 ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out
---|
| 275 | |
---|
class Sam( Tabular ):
    """Sequence Alignment/Map format alignments."""
    file_ext = 'sam'

    def __init__( self, **kwd ):
        """Initialize sam datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR',
                             'MRNM', 'MPOS', 'ISIZE', 'SEQ', 'QUAL', 'OPT'
                             ]

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # This data type requires at least 11 columns in the data;
            # number any extra columns beyond the named ones.
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i + 1 ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in SAM format

        A file in SAM format consists of lines of tab-separated data.
        The following header line may be the first line:
        @QNAME	FLAG	RNAME	POS	MAPQ	CIGAR	MRNM	MPOS	ISIZE	SEQ	QUAL
        or
        @QNAME	FLAG	RNAME	POS	MAPQ	CIGAR	MRNM	MPOS	ISIZE	SEQ	QUAL	OPT
        Data in the OPT column is optional and can consist of tab-separated data

        For complete details see http://samtools.sourceforge.net/SAM1.pdf

        Rules for sniffing as True:
            There must be 11 or more columns of data on each line
            Columns 2 (FLAG), 4(POS), 5 (MAPQ), 8 (MPOS), and 9 (ISIZE) must be numbers (9 can be negative)
            We will only check that up to the first 5 alignments are correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Sam().sniff( fname )
        False
        >>> fname = get_test_fname( '1.sam' )
        >>> Sam().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            try:
                count = 0
                while True:
                    line = fh.readline()
                    if not line:
                        break  # EOF (test before stripping so a blank line does not end the scan)
                    line = line.strip()
                    if line and line[0] != '@':
                        line_pieces = line.split( '\t' )
                        if len( line_pieces ) < 11:
                            return False
                        try:
                            # FLAG, POS, MAPQ, MPOS and ISIZE must all be integers
                            int( line_pieces[1] )
                            int( line_pieces[3] )
                            int( line_pieces[4] )
                            int( line_pieces[7] )
                            int( line_pieces[8] )
                        except ValueError:
                            return False
                        count += 1
                        if count == 5:
                            return True
                # Fewer than 5 alignments seen, but all of them were valid
                if 0 < count < 5:
                    return True
            finally:
                # Close the handle on every exit path (the original leaked it on early returns)
                fh.close()
        except Exception:
            pass
        return False
---|
| 358 | |
---|
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"

    # Metadata elements locating the genomic coordinate columns
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def init_meta( self, dataset, copy_from=None ):
        Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s genomic coordinates" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? genomic coordinates"
            else:
                dataset.blurb = "%s genomic coordinates" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header, labelling the coordinate columns
            out.append( '<tr>' )
            for i in range( 1, dataset.metadata.columns + 1 ):
                if i == dataset.metadata.chromCol:
                    out.append( '<th>%s.Chrom</th>' % i )
                elif i == dataset.metadata.startCol:
                    out.append( '<th>%s.Start</th>' % i )
                elif i == dataset.metadata.baseCol:
                    out.append( '<th>%s.Base</th>' % i )
                else:
                    out.append( '<th>%s</th>' % i )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ("lines", "Remove erroneous lines") ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    # Chrom start is in column 1 (with 0-based columns)
                    # and the reference base is in column 2
                    try:
                        int( hdr[1] )
                    except ValueError:
                        return False
                    if hdr[2] not in [ 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' ]:
                        return False
            return True
        except Exception:
            return False
---|
| 451 | |
---|
class Eland( Tabular ):
    """Tabular data in Eland export format."""
    file_ext = 'eland'

    def sniff( self, filename ):
        # Auto-detection is not implemented for this format.
        return False
---|
| 457 | |
---|
class ElandMulti( Tabular ):
    """Tabular data in Eland multi format."""
    file_ext = 'elandmulti'

    def sniff( self, filename ):
        # Auto-detection is not implemented for this format.
        return False
---|
| 463 | |
---|
class Vcf( Tabular ):
    """ Variant Call Format for describing SNPs and other simple genome variations. """

    file_ext = 'vcf'
    column_names = [ 'Chrom', 'Pos', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info', 'Format', 'data' ]

    MetadataElement( name="columns", default=10, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str', 'int', 'str', 'str', 'str', 'int', 'str', 'list', 'str', 'str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", default=[5], param=metadata.ColumnParameter, multiple=True )

    def sniff( self, filename ):
        try:
            fh = open( filename )
            try:
                # If the reader can read and parse every line, it's VCF.
                # Iterate the reader directly rather than materializing the
                # whole file into a list first.
                for line in galaxy_utils.sequence.vcf.Reader( fh ):
                    pass
                return True
            finally:
                # Close the handle on every exit path (the original leaked it)
                fh.close()
        except Exception:
            return False

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # Close the header row (the original omitted this, producing malformed HTML)
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def get_track_type( self ):
        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
---|