1 | #Dan Blankenberg |
---|
2 | import string |
---|
3 | from optparse import OptionParser |
---|
4 | from galaxy_utils.sequence.fastq import fastqReader, fastqWriter |
---|
5 | |
---|
6 | |
---|
def get_score_comparer(operator):
    """Return the comparison function selected by a two-letter operator code.

    Args:
        operator: One of 'gt', 'ge', 'eq', 'lt', 'le' or 'ne'.

    Returns:
        The matching module-level ``compare_*`` function. It is called as
        ``comparer(quality_score, threshold_value)`` and returns the result
        of the corresponding comparison.

    Raises:
        ValueError: If ``operator`` is not one of the codes listed above.
    """
    if operator == 'gt':
        return compare_gt
    elif operator == 'ge':
        return compare_ge
    elif operator == 'eq':
        return compare_eq
    elif operator == 'lt':
        return compare_lt
    elif operator == 'le':
        return compare_le
    elif operator == 'ne':
        return compare_ne
    # Raising a bare string is rejected by Python 2.6+ (the raise itself fails
    # with a TypeError), so use a real exception type to carry the message.
    raise ValueError('Invalid operator provided: %s' % operator)
21 | |
---|
def compare_gt(quality_score, threshold_value):
    """Tell whether quality_score is strictly greater than threshold_value."""
    is_greater = quality_score > threshold_value
    return is_greater
24 | |
---|
def compare_ge(quality_score, threshold_value):
    """Tell whether quality_score is greater than or equal to threshold_value."""
    is_at_least = quality_score >= threshold_value
    return is_at_least
27 | |
---|
def compare_eq(quality_score, threshold_value):
    """Tell whether quality_score is equal to threshold_value."""
    is_equal = quality_score == threshold_value
    return is_equal
30 | |
---|
def compare_ne(quality_score, threshold_value):
    """Tell whether quality_score differs from threshold_value."""
    is_different = quality_score != threshold_value
    return is_different
33 | |
---|
def compare_lt(quality_score, threshold_value):
    """Tell whether quality_score is strictly less than threshold_value."""
    is_less = quality_score < threshold_value
    return is_less
36 | |
---|
def compare_le(quality_score, threshold_value):
    """Tell whether quality_score is less than or equal to threshold_value."""
    is_at_most = quality_score <= threshold_value
    return is_at_most
39 | |
---|
class BaseReplacer(object):
    """Callable that answers every base with one fixed replacement character."""

    def __init__(self, replace_character):
        """Store the character that each call will hand back."""
        self.replace_character = replace_character

    def __call__(self, base_character):
        """Return the stored replacement; base_character is not consulted."""
        replacement = self.replace_character
        return replacement
45 | |
---|
def _lowercase_base(base_character):
    """Return base_character converted to lowercase."""
    return base_character.lower()


def _mask_sequence(sequence, quality_scores, score_comparer, threshold_value, base_masker):
    """Return sequence with the selected bases passed through base_masker.

    A base at position i is masked when
    ``score_comparer(quality_scores[i], threshold_value)`` is true; all other
    bases are left as they are.
    """
    bases = list(sequence)
    for position, quality_score in enumerate(quality_scores):
        if score_comparer(quality_score, threshold_value):
            bases[position] = base_masker(bases[position])
    return "".join(bases)


def main():
    """Mask bases in a FASTQ file according to their quality scores.

    Command line: ``[options] input_file output_file``. Every read of
    input_file is copied to output_file; each base whose decimal quality
    score satisfies the ``--score_comparison`` test against
    ``--quality_score`` is replaced by ``--mask_character``, or by its own
    lowercase form when ``--lowercase`` is given. A one-line summary is
    printed when done.

    Exits through ``parser.error`` unless exactly two positional arguments
    are supplied.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'solexa', 'illumina'), help='FASTQ variant type')
    parser.add_option('-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt', 'ge', 'eq', 'lt', 'le', 'ne'), help='Mask base when score is')
    parser.add_option('-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    score_comparer = get_score_comparer(options.score_comparison)

    if options.lowercase:
        # Call the method on the base itself rather than relying on the
        # Python 2-only string.lower() function.
        base_masker = _lowercase_base
    else:
        base_masker = BaseReplacer(options.mask_character)

    # Stays None when the input yields no reads at all.
    num_reads = None

    # The output is opened before the input, as before. Both handles are now
    # closed explicitly so the output is flushed even if a read fails midway.
    # NOTE(review): the handle is closed directly because only the writer's
    # write() method is visible here; if fastqWriter offers its own close(),
    # prefer that. The 'wb' mode is kept unchanged; confirm it suits the
    # writer if this script is ever run under Python 3.
    output_handle = open(args[1], 'wb')
    try:
        out = fastqWriter(output_handle, format=options.format)
        input_handle = open(args[0])
        try:
            for num_reads, fastq_read in enumerate(fastqReader(input_handle, format=options.format)):
                fastq_read.sequence = _mask_sequence(
                    fastq_read.sequence,
                    fastq_read.get_decimal_quality_scores(),
                    score_comparer,
                    options.quality_score,
                    base_masker
                )
                out.write(fastq_read)
        finally:
            input_handle.close()
    finally:
        output_handle.close()

    # A single parenthesised argument prints identically whether print is a
    # statement or a function.
    if num_reads is not None:
        print("Processed %i %s reads." % (num_reads + 1, options.format))
    else:
        print("No valid FASTQ reads were provided.")
82 | |
---|
# Run the command-line tool only when this file is executed as a script.
if __name__ == "__main__":
    main()