""" velvet datatypes James E Johnson - University of Minnesota for velvet assembler tool in galaxy """ import data from galaxy.datatypes import sequence import logging, os, sys, time, tempfile, shutil, string, glob, re import galaxy.model from galaxy.datatypes import metadata from galaxy.datatypes.metadata import MetadataElement from galaxy import util from galaxy.datatypes.images import Html from sniff import * log = logging.getLogger(__name__) class Amos( data.Text ): """Class describing the AMOS assembly file """ file_ext = 'afg' def sniff( self, filename ): # FIXME: this method will read the entire file. # It should call get_headers() like other sniff methods. """ Determines whether the file is an amos assembly file format Example: {CTG iid:1 eid:1 seq: CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA . qlt: DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD . {TLE src:1027 off:0 clr:618,0 gap: 250 612 . } } """ isAmos = False try: fh = open( filename ) while not isAmos: line = fh.readline() if not line: break #EOF line = line.strip() if line: #first non-empty line if line.startswith( '{' ): if re.match(r'{(RED|CTG|TLE)$',line): isAmos = True fh.close() except: pass return isAmos class Sequences( sequence.Fasta ): """Class describing the Sequences file generated by velveth """ def sniff( self, filename ): """ Determines whether the file is a velveth produced fasta format The id line has 3 fields separated by tabs: sequence_name sequence_index cataegory >SEQUENCE_0_length_35 1 1 GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT >SEQUENCE_1_length_35 2 1 CGACGAATGACAGGTCACGAATTTGGCGGGGATTA """ try: fh = open( filename ) while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: #first non-empty line if line.startswith( '>' ): if not re.match(r'>[^\t]+\t\d+\t\d+$',line): break #The next line.strip() must not be '', nor startwith '>' line = fh.readline().strip() if line == '' or line.startswith( '>' ): break return True else: break #we found a non-empty line, but its not a fasta header fh.close() except: pass return False class Roadmaps( data.Text ): """Class describing the Sequences file generated by velveth """ def sniff( self, filename ): """ Determines whether the file is a velveth produced RoadMap 142858 21 1 ROADMAP 1 ROADMAP 2 ... """ try: fh = open( filename ) while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: #first non-empty line if not re.match(r'\d+\t\d+\t\d+$',line): break #The next line.strip() should be 'ROADMAP 1' line = fh.readline().strip() if not re.match(r'ROADMAP \d+$',line): break return True else: break #we found a non-empty line, but its not a fasta header fh.close() except: pass return False class Velvet( Html ): MetadataElement( name="base_name", desc="base name for velveth dataset", default="velvet", readonly=True, set_in_upload=True) MetadataElement( name="paired_end_reads", desc="has paired-end reads", default="False", readonly=False, set_in_upload=True) MetadataElement( name="long_reads", desc="has long reads", default="False", readonly=False, set_in_upload=True) MetadataElement( name="short2_reads", desc="has 2nd short reads", default="False", readonly=False, set_in_upload=True) composite_type = 'auto_primary_file' allow_datatype_change = False file_ext = 'html' def __init__( self, **kwd ): Html.__init__( self, **kwd ) log.debug( "Velvet log info %s" % 'JJ __init__') self.add_composite_file( 'Sequences', mimetype = 'text/html', description = 'Sequences', substitute_name_with_metadata = None, is_binary = False ) self.add_composite_file( 'Roadmaps', mimetype = 'text/html', description = 'Roadmaps', substitute_name_with_metadata = None, is_binary = False ) self.add_composite_file( 'Log', mimetype = 'text/html', description = 'Log', optional = 'True', substitute_name_with_metadata = None, is_binary = False ) def generate_primary_file( self, dataset = None ): log.debug( "Velvet log info %s %s" % ('JJ generate_primary_file',dataset)) rval = ['Velvet Galaxy Composite Dataset

'] rval.append('

This composite dataset is composed of the following files:

' ) return "\n".join( rval ) def regenerate_primary_file(self,dataset): """ cannot do this until we are setting metadata """ log.debug( "Velvet log info %s" % 'JJ regenerate_primary_file') gen_msg = '' try: efp = dataset.extra_files_path flist = os.listdir(efp) log_path = os.path.join(efp,'Log') f = open(log_path,'r') log_content = f.read(1000) f.close() log_msg = re.sub('/\S*/','',log_content) log.debug( "Velveth log info %s" % log_msg) paired_end_reads = re.search('-(short|long)Paired', log_msg) != None dataset.metadata.paired_end_reads = paired_end_reads long_reads = re.search('-long', log_msg) != None dataset.metadata.long_reads = long_reads short2_reads = re.search('-short(Paired)?2', log_msg) != None dataset.metadata.short2_reads = short2_reads dataset.info = re.sub('.*velveth \S+','hash_length',re.sub('\n',' ',log_msg)) if paired_end_reads: gen_msg = gen_msg + ' Paired-End Reads' if long_reads: gen_msg = gen_msg + ' Long Reads' if len(gen_msg) > 0: gen_msg = 'Uses: ' + gen_msg except: log.debug( "Velveth could not read Log file in %s" % efp) log.debug( "Velveth log info %s" % gen_msg) rval = ['Velvet Galaxy Composite Dataset

'] # rval.append('

Generated:

%s

' %(re.sub('\n','
',log_msg))) rval.append('
Generated:

%s

' %(gen_msg)) rval.append('
Velveth dataset:

' ) f = file(dataset.file_name,'w') f.write("\n".join( rval )) f.write('\n') f.close() def set_meta( self, dataset, **kwd ): Html.set_meta( self, dataset, **kwd ) self.regenerate_primary_file(dataset) if __name__ == '__main__': import doctest, sys doctest.testmod(sys.modules[__name__])