"""
Sequence classes
"""

import data
import logging
import re
import string
from cgi import escape
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes import metadata
import galaxy.model
from galaxy import util
from sniff import *

log = logging.getLogger(__name__)

class Sequence( data.Text ):
    """Class describing a sequence"""

    # Add metadata elements
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ):
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '>' ):
                sequences += 1
                data_lines += 1
            else:
                data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

class Alignment( data.Text ):
    """Class describing an alignment"""

    # Add metadata elements
    MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None )

class Fasta( Sequence ):
    """Class representing a FASTA sequence"""
    file_ext = "fasta"

    def sniff( self, filename ):
        """
        Determines whether the file is in FASTA format

        A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
        The first character of the description line is a greater-than (">") symbol in the first column.
        All lines should be shorter than 80 characters.

        For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml

        Rules for sniffing as True:
            We don't care about line length (other than empty lines).
            The first non-empty line must start with '>', and the very next line, stripped, must contain sequence data and not be another header.
            'Sequence data' here is loosely defined as a non-empty line that does not start with '>'.
            This will cause Color Space FASTA (csfasta) to be detected as True (they are, after all, still FASTA files - they have a header line followed by sequence data).
            Previously this method checked whether the sequence data contained integers (presumably to differentiate between fasta and csfasta).
            That is better handled through sniff order, where csfasta (whose sniff enforces a stricter definition) is checked first, followed later by fasta.
            We only check that the first purported sequence is correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Fasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Fasta().sniff( fname )
        True
        """

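        # Illustration (an assumed example, not taken from the original tests): these
        # rules accept a file whose first non-empty line is a '>' header immediately
        # followed by sequence data, e.g.
        #   >sequence_1 optional description
        #   ATGCATGC
        # and reject one whose header is followed by a blank line or another header.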
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break # EOF
                line = line.strip()
                if line: # first non-empty line
                    if line.startswith( '>' ):
                        # The next line, stripped, must not be empty and must not start with '>'
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        return True
                    else:
                        break # we found a non-empty line, but it's not a FASTA header
            fh.close()
        except:
            pass
        return False

class csFasta( Sequence ):
    """ Class representing the SOLiD Color-Space sequence ( csfasta ) """
    file_ext = "csfasta"

    def sniff( self, filename ):
        """
        Color-space sequence:
            >2_15_85_F3
            T213021013012303002332212012112221222112212222

        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> csFasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.csfasta' )
        >>> csFasta().sniff( fname )
        True
        """
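        # Explanatory note (not part of the original comments): the check below accepts
        # a '>' header followed by a color-space read.  The read starts with an
        # uppercase letter (typically the final primer base, e.g. 'T') and every
        # remaining character must be a digit or '.', matching reads such as the
        # example in the docstring above.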
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break # EOF
                line = line.strip()
                if line and not line.startswith( '#' ): # first non-empty non-comment line
                    if line.startswith( '>' ):
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        elif line[0] not in string.ascii_uppercase:
                            return False
                        elif len( line ) > 1 and not re.search( r'^[\d.]+$', line[1:] ):
                            return False
                        return True
                    else:
                        break # we found a non-empty line, but it's not a header
            fh.close()
        except:
            pass
        return False

    def set_meta( self, dataset, **kwd ):
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        return Sequence.set_meta( self, dataset, **kwd )

class Fastq( Sequence ):
    """Class representing a generic FASTQ sequence"""
    file_ext = "fastq"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        data_lines = 0
        sequences = 0
        seq_counter = 0 # blocks should be 4 lines long
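        # A well-formed FASTQ record spans four lines, e.g. (illustrative values):
        #   @read_id
        #   GATTTGGGGTTCAAAGCAGT
        #   +
        #   !''*((((***+))%%%++)
        # Because a quality line may itself begin with '@', a new sequence is only
        # counted once at least four lines have accumulated since the previous header.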
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ) and not sequences:
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '@' ):
                if seq_counter >= 4:
                    # count previous block
                    # blocks should be 4 lines long
                    sequences += 1
                    seq_counter = 1
                else:
                    # in case quality line starts with @
                    seq_counter += 1
                data_lines += 1
            else:
                data_lines += 1
                seq_counter += 1
        if seq_counter >= 4:
            # count final block
            sequences += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def sniff( self, filename ):
        """
        Determines whether the file is in generic fastq format.
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina.
        These differ in the representation of the quality scores.

        >>> fname = get_test_fname( '1.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        >>> fname = get_test_fname( '2.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        """
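        # Note on the helper below: get_headers() comes in via the wildcard import of
        # the sniff module and is assumed to return the leading lines of the file,
        # each split into whitespace-delimited fields, so headers[0][0] is the first
        # token of line one.  The test requires an '@' header on line 1, sequence
        # data on line 2 and a '+' separator on line 3.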
        headers = get_headers( filename, None )
        bases_regexp = re.compile( "^[NGTAC]*" )
        # check that first block looks like a fastq block
        try:
            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]:
                # Check the sequence line, make sure it contains only G/C/A/T/N
                if not bases_regexp.match( headers[1][0] ):
                    return False
                return True
            return False
        except:
            return False

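# Background note (not required by the code above): the FASTQ variants below differ
# only in how quality scores are encoded -- Sanger uses Phred scores with an ASCII
# offset of 33, Solexa uses Solexa scores with an offset of 64, and Illumina 1.3+
# uses Phred scores with an offset of 64.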
class FastqSanger( Fastq ):
    """Class representing a FASTQ sequence ( the Sanger variant )"""
    file_ext = "fastqsanger"

class FastqSolexa( Fastq ):
    """Class representing a FASTQ sequence ( the Solexa variant )"""
    file_ext = "fastqsolexa"

class FastqIllumina( Fastq ):
    """Class representing a FASTQ sequence ( the Illumina 1.3+ variant )"""
    file_ext = "fastqillumina"

class FastqCSSanger( Fastq ):
    """Class representing a Color Space FASTQ sequence ( e.g. a SOLiD variant )"""
    file_ext = "fastqcssanger"

try:
    from galaxy import eggs
    import pkg_resources; pkg_resources.require( "bx-python" )
    import bx.align.maf
    import bx.interval_index_file
except:
    pass

#trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools:
#from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
#Traceback (most recent call last):
#  File "./scripts/paster.py", line 27, in <module>
#    command.run()
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 78, in run
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 117, in invoke
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 212, in run
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 227, in command
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 250, in loadapp
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 193, in loadapp
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 213, in loadobj
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 237, in loadcontext
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 267, in _loadconfig
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 397, in get_context
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 439, in _context_from_explicit
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 18, in import_string
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/pkg_resources.py", line 1912, in load
#    entry = __import__(self.module_name, globals(),globals(), ['__name__'])
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/web/buildapp.py", line 18, in <module>
#    from galaxy import config, jobs, util, tools
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/jobs/__init__.py", line 3, in <module>
#    from galaxy import util, model
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/model/__init__.py", line 13, in <module>
#    import galaxy.datatypes.registry
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/registry.py", line 6, in <module>
#    import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/sequence.py", line 344, in <module>
#    from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/tools/__init__.py", line 15, in <module>
#    from galaxy import util, jobs, model
#ImportError: cannot import name jobs
#so we'll copy and paste for now...terribly icky
#*** ANY CHANGE TO THIS METHOD HERE OR IN maf_utilities MUST BE PROPAGATED ***
def COPIED_build_maf_index_species_chromosomes( filename, index_species = None ):
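    # Returns a tuple ( indexes, species, species_chromosomes, blocks ); if the file
    # cannot be read as MAF, the fallback ( None, [], {}, 0 ) is returned instead.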
    species = []
    species_chromosomes = {}
    indexes = bx.interval_index_file.Indexes()
    blocks = 0
    try:
        maf_reader = bx.align.maf.Reader( open( filename ) )
        while True:
            pos = maf_reader.file.tell()
            block = maf_reader.next()
            if block is None:
                break
            blocks += 1
            for c in block.components:
                spec = c.src
                chrom = None
                if "." in spec:
                    spec, chrom = spec.split( ".", 1 )
                if spec not in species:
                    species.append( spec )
                    species_chromosomes[spec] = []
                if chrom and chrom not in species_chromosomes[spec]:
                    species_chromosomes[spec].append( chrom )
                if index_species is None or spec in index_species:
                    forward_strand_start = c.forward_strand_start
                    forward_strand_end = c.forward_strand_end
                    try:
                        forward_strand_start = int( forward_strand_start )
                        forward_strand_end = int( forward_strand_end )
                    except ValueError:
                        continue # start and end are not integers, can't add component to index, go to next component
                        # this likely only occurs when parse_e_rows is True?
                        # could a species exist as only e rows? should the
                    if forward_strand_end > forward_strand_start:
                        # require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
                        indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
    except Exception, e:
        # most likely a bad MAF
        log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
        return ( None, [], {}, 0 )
    return ( indexes, species, species_chromosomes, blocks )

class Maf( Alignment ):
    """Class describing a Maf alignment"""
    file_ext = "maf"

    # Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
    MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        Alignment.init_meta( self, dataset, copy_from=copy_from )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        # these metadata values are not accessible by users, always overwrite
        indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
        if indexes is None:
            return # this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks

        # write species chromosomes to a file
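        # The file is laid out one species per line: the species name followed by a
        # tab-separated list of its chromosomes, e.g. "hg18<TAB>chr1<TAB>chr2"
        # (the species name here is purely illustrative).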
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file( dataset = dataset )
        chrom_out = open( chrom_file.file_name, 'wb' )
        for spec, chroms in species_chromosomes.items():
            chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
        chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file

        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
        indexes.write( open( index_file.file_name, 'wb' ) )
        dataset.metadata.maf_index = index_file
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        """Returns formatted HTML of peek"""
        return self.make_html_table( dataset )
    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species: ')
            for species in dataset.metadata.species:
                out.append( '%s ' % species )
            out.append( '</th></tr>' )
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % exc
        return out
    def sniff( self, filename ):
        """
        Determines whether the file is in MAF format

        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
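        # Only the first line is examined; a conforming file starts with something like
        #   ##maf version=1 scoring=tba.v8
        # (the variable=value pairs after the '##maf' token are not inspected here).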
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            else:
                return False
        except:
            return False

class MafCustomTrack( data.Text ):
    file_ext = "mafcustomtrack"

    MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets viewport metadata from MAF file.
        """
        max_block_check = 10
        chrom = None
        forward_strand_start = float( 'inf' )
        forward_strand_end = 0
        try:
            maf_file = open( dataset.file_name )
            maf_file.readline() # move past track line
            for i, block in enumerate( bx.align.maf.Reader( maf_file ) ):
                ref_comp = block.get_component_by_src_start( dataset.metadata.dbkey )
                if ref_comp:
                    ref_chrom = bx.align.maf.src_split( ref_comp.src )[-1]
                    if chrom is None:
                        chrom = ref_chrom
                    if chrom == ref_chrom:
                        forward_strand_start = min( forward_strand_start, ref_comp.forward_strand_start )
                        forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
                if i > max_block_check:
                    break

            if forward_strand_end > forward_strand_start:
                dataset.metadata.vp_chromosome = chrom
                dataset.metadata.vp_start = forward_strand_start
                dataset.metadata.vp_end = forward_strand_end
        except:
            pass

class Axt( data.Text ):
    """Class describing an axt alignment"""

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    file_ext = "axt"

    def sniff( self, filename ):
        """
        Determines whether the file is in axt format

        axt alignment files are produced by Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.

        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.

        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.

        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts. Repeats are indicated by lower-case letters.

        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
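        # A summary line has nine whitespace-separated fields, e.g.
        #   0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
        # (alignment number, primary chrom/start/end, aligning chrom/start/end, strand,
        # blastz score).  The loop below validates the integer fields and the strand of
        # the first such line it finds.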
        headers = get_headers( filename, None )
        if len(headers) < 4:
            return False
        for hdr in headers:
            if len(hdr) > 0 and hdr[0].startswith("##matrix=axt"):
                return True
            if len(hdr) > 0 and not hdr[0].startswith("#"):
                if len(hdr) != 9:
                    return False
                try:
                    map( int, [hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]] )
                except:
                    return False
                if hdr[7] not in data.valid_strand:
                    return False
                else:
                    return True

class Lav( data.Text ):
    """Class describing a LAV alignment"""

    file_ext = "lav"

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    def sniff( self, filename ):
        """
        Determines whether the file is in lav format

        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.

        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0].startswith('#:lav'):
                return True
            else:
                return False
        except:
            return False
|---|