[2] | 1 | #Dan Blankenberg |
---|
| 2 | from optparse import OptionParser |
---|
| 3 | from galaxy_utils.sequence.fastq import fastqReader, fastqWriter |
---|
| 4 | |
---|
| 5 | def mean( score_list ): |
---|
| 6 | return float( sum( score_list ) ) / float( len( score_list ) ) |
---|
| 7 | |
---|
| 8 | ACTION_METHODS = { 'min':min, 'max':max, 'sum':sum, 'mean':mean } |
---|
| 9 | |
---|
| 10 | def compare( aggregated_value, operator, threshold_value ): |
---|
| 11 | if operator == '>': |
---|
| 12 | return aggregated_value > threshold_value |
---|
| 13 | elif operator == '>=': |
---|
| 14 | return aggregated_value >= threshold_value |
---|
| 15 | elif operator == '==': |
---|
| 16 | return aggregated_value == threshold_value |
---|
| 17 | elif operator == '<': |
---|
| 18 | return aggregated_value < threshold_value |
---|
| 19 | elif operator == '<=': |
---|
| 20 | return aggregated_value <= threshold_value |
---|
| 21 | elif operator == '!=': |
---|
| 22 | return aggregated_value != threshold_value |
---|
| 23 | |
---|
| 24 | def exclude( value_list, exclude_indexes ): |
---|
| 25 | rval = [] |
---|
| 26 | for i, val in enumerate( value_list ): |
---|
| 27 | if i not in exclude_indexes: |
---|
| 28 | rval.append( val ) |
---|
| 29 | return rval |
---|
| 30 | |
---|
| 31 | def exclude_and_compare( aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes = None ): |
---|
| 32 | if not aggregate_list or compare( aggregate_action( aggregate_list ), operator, threshold_value ): |
---|
| 33 | return True |
---|
| 34 | if exclude_indexes: |
---|
| 35 | for exclude_index in exclude_indexes: |
---|
| 36 | excluded_list = exclude( aggregate_list, exclude_index ) |
---|
| 37 | if not excluded_list or compare( aggregate_action( excluded_list ), operator, threshold_value ): |
---|
| 38 | return True |
---|
| 39 | return False |
---|
| 40 | |
---|
| 41 | def main(): |
---|
| 42 | usage = "usage: %prog [options] input_file output_file" |
---|
| 43 | parser = OptionParser( usage=usage ) |
---|
| 44 | parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' ) |
---|
| 45 | parser.add_option( '-s', '--window_size', type="int", dest='window_size', default='1', help='Window size' ) |
---|
| 46 | parser.add_option( '-t', '--window_step', type="int", dest='window_step', default='1', help='Window step' ) |
---|
| 47 | parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' ) |
---|
| 48 | parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' ) |
---|
| 49 | parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation' ) |
---|
| 50 | parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' ) |
---|
| 51 | parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' ) |
---|
| 52 | parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length") |
---|
| 53 | ( options, args ) = parser.parse_args() |
---|
| 54 | |
---|
| 55 | if len ( args ) != 2: |
---|
| 56 | parser.error( "Need to specify an input file and an output file" ) |
---|
| 57 | |
---|
| 58 | if options.window_size < 1: |
---|
| 59 | parser.error( 'You must specify a strictly positive window size' ) |
---|
| 60 | |
---|
| 61 | if options.window_step < 1: |
---|
| 62 | parser.error( 'You must specify a strictly positive step size' ) |
---|
| 63 | |
---|
| 64 | #determine an exhaustive list of window indexes that can be excluded from aggregation |
---|
| 65 | exclude_window_indexes = [] |
---|
| 66 | last_exclude_indexes = [] |
---|
| 67 | for exclude_count in range( min( options.exclude_count, options.window_size ) ): |
---|
| 68 | if last_exclude_indexes: |
---|
| 69 | new_exclude_indexes = [] |
---|
| 70 | for exclude_list in last_exclude_indexes: |
---|
| 71 | for window_index in range( options.window_size ): |
---|
| 72 | if window_index not in exclude_list: |
---|
| 73 | new_exclude = sorted( exclude_list + [ window_index ] ) |
---|
| 74 | if new_exclude not in exclude_window_indexes + new_exclude_indexes: |
---|
| 75 | new_exclude_indexes.append( new_exclude ) |
---|
| 76 | exclude_window_indexes += new_exclude_indexes |
---|
| 77 | last_exclude_indexes = new_exclude_indexes |
---|
| 78 | else: |
---|
| 79 | for window_index in range( options.window_size ): |
---|
| 80 | last_exclude_indexes.append( [ window_index ] ) |
---|
| 81 | exclude_window_indexes = list( last_exclude_indexes ) |
---|
| 82 | |
---|
| 83 | out = fastqWriter( open( args[1], 'wb' ), format = options.format ) |
---|
| 84 | action = ACTION_METHODS[ options.aggregation_action ] |
---|
| 85 | |
---|
| 86 | num_reads = None |
---|
| 87 | num_reads_excluded = 0 |
---|
| 88 | for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ): |
---|
| 89 | for trim_end in options.trim_ends: |
---|
| 90 | quality_list = fastq_read.get_decimal_quality_scores() |
---|
| 91 | if trim_end == '5': |
---|
| 92 | lwindow_position = 0 #left position of window |
---|
| 93 | while True: |
---|
| 94 | if lwindow_position >= len( quality_list ): |
---|
| 95 | fastq_read.sequence = '' |
---|
| 96 | fastq_read.quality = '' |
---|
| 97 | break |
---|
| 98 | if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ): |
---|
| 99 | fastq_read = fastq_read.slice( lwindow_position, None ) |
---|
| 100 | break |
---|
| 101 | lwindow_position += options.window_step |
---|
| 102 | else: |
---|
| 103 | rwindow_position = len( quality_list ) #right position of window |
---|
| 104 | while True: |
---|
| 105 | lwindow_position = rwindow_position - options.window_size #left position of window |
---|
| 106 | if rwindow_position <= 0 or lwindow_position < 0: |
---|
| 107 | fastq_read.sequence = '' |
---|
| 108 | fastq_read.quality = '' |
---|
| 109 | break |
---|
| 110 | if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ): |
---|
| 111 | fastq_read = fastq_read.slice( None, rwindow_position ) |
---|
| 112 | break |
---|
| 113 | rwindow_position -= options.window_step |
---|
| 114 | if options.keep_zero_length or len( fastq_read ): |
---|
| 115 | out.write( fastq_read ) |
---|
| 116 | else: |
---|
| 117 | num_reads_excluded += 1 |
---|
| 118 | out.close() |
---|
| 119 | if num_reads is None: |
---|
| 120 | print "No valid FASTQ reads could be processed." |
---|
| 121 | else: |
---|
| 122 | print "%i FASTQ reads were processed." % ( num_reads + 1 ) |
---|
| 123 | if num_reads_excluded: |
---|
| 124 | print "%i reads of zero length were excluded from the output." % num_reads_excluded |
---|
| 125 | |
---|
| 126 | if __name__ == "__main__": main() |
---|