import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip from galaxy import eggs import pkg_resources pkg_resources.require( "simplejson" ) import simplejson CHUNK_SIZE = 1024 ERROR_LINES = [ 'ERROR', 'command not found' ] #'Fatal' errors that should be passed back to stderr WARNING_LINE_START = 'WARNING @' def gunzip_cat_glob_path( glob_path, target_filename, delete = False ): out = open( target_filename, 'wb' ) for filename in glob.glob( glob_path ): fh = gzip.open( filename, 'rb' ) while True: data = fh.read( CHUNK_SIZE ) if data: out.write( data ) else: break fh.close() if delete: os.unlink( filename ) out.close() def xls_to_interval( xls_file, interval_file, header = None ): out = open( interval_file, 'wb' ) if header: out.write( '#%s\n' % header ) wrote_header = False #From macs readme: Coordinates in XLS is 1-based which is different with BED format. for line in open( xls_file ): #keep all existing comment lines if line.startswith( '#' ): out.write( line ) elif not wrote_header: out.write( '#%s' % line ) wrote_header = True else: fields = line.split( '\t' ) if len( fields ) > 1: fields[1] = str( int( fields[1] ) - 1 ) out.write( '\t'.join( fields ) ) out.close() def main(): options = simplejson.load( open( sys.argv[1] ) ) output_bed = sys.argv[2] output_extra_html = sys.argv[3] output_extra_path = sys.argv[4] experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for filenames (gzip of wig files will fail with spaces - macs doesn't properly escape them)..need to replace all whitespace, split makes this easier cmdline = "macs -t %s" % ",".join( options['input_chipseq'] ) if options['input_control']: cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) ) cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s --lambdaset='%s' %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['lambdaset'], options['futurefdr'] ) if 'wig' in options: wigextend = int( options['wig']['wigextend'] ) if wigextend >= 0: wigextend = "--wigextend='%s'" % wigextend else: wigextend = '' cmdline = "%s --wig %s --space='%s'" % ( cmdline, wigextend, options['wig']['space'] ) if 'nomodel' in options: cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] ) if 'diag' in options: cmdline = "%s --diag --fe-min='%s' --fe-max='%s' --fe-step='%s'" % ( cmdline, options['diag']['fe-min'], options['diag']['fe-max'], options['diag']['fe-step'] ) tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) ) proc.wait() #Need to lightly parse stderr file to see if there is a fatal error (e.g. macs is not installed on system) #We don't want to set tool run to error state if only warnings or info, e.g. mfold needs to be decreased, rather create empty outputs, but let user view macs log for line in open( stderr_name ): for err_text in ERROR_LINES: if err_text in line: #print error, but don't quit, allow cleanup to occur at end print >> sys.stderr, line.rstrip( '\n\r' ) if line.startswith( WARNING_LINE_START ): #print warnings so they are viewable from info print line.split( ':' )[-1].strip() #run R to create pdf from model script if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ): cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name ) proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir ) proc.wait() #move bed out to proper output file created_bed_name = os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name ) if os.path.exists( created_bed_name ): shutil.move( created_bed_name, output_bed ) #parse xls files to interval files as needed if options['xls_to_interval']: create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name ) if os.path.exists( create_peak_xls_file ): xls_to_interval( create_peak_xls_file, options['xls_to_interval']['peaks_file'], header = 'peaks file' ) create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name ) if os.path.exists( create_peak_xls_file ): xls_to_interval( create_peak_xls_file, options['xls_to_interval']['negative_peaks_file'], header = 'negative peaks file' ) #merge and move wig files as needed, delete gz'd files and remove emptied dirs if 'wig' in options: wig_base_dir = os.path.join( tmp_dir, "%s_MACS_wiggle" % experiment_name ) if os.path.exists( wig_base_dir ): #treatment treatment_dir = os.path.join( wig_base_dir, "treat" ) if os.path.exists( treatment_dir ): gunzip_cat_glob_path( os.path.join( treatment_dir, "*.wig.gz" ), options['wig']['output_treatment_file'], delete = True ) os.rmdir( treatment_dir ) #control if options['input_control']: control_dir = os.path.join( wig_base_dir, "control" ) if os.path.exists( control_dir ): gunzip_cat_glob_path( os.path.join( control_dir, "*.wig.gz" ), options['wig']['output_control_file'], delete = True ) os.rmdir( control_dir ) os.rmdir( wig_base_dir ) #move all remaining files to extra files path of html file output to allow user download out_html = open( output_extra_html, 'wb' ) out_html.write( 'Additional output created by MACS (%s)

Additional Files:

\n' ) out_html.write( '

Messages from MACS:

\n

%s

\n' % open( stderr_name, 'rb' ).read() ) out_html.write( '\n' ) out_html.close() os.unlink( stderr_name ) os.rmdir( tmp_dir ) if __name__ == "__main__": main()