[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | import sys |
---|
| 4 | import string |
---|
| 5 | import optparse |
---|
| 6 | import tempfile |
---|
| 7 | import sqlite3 |
---|
| 8 | |
---|
| 9 | def stop_err( msg ): |
---|
| 10 | sys.stderr.write( msg ) |
---|
| 11 | sys.exit() |
---|
| 12 | |
---|
| 13 | def solid2sanger( quality_string, min_qual = 0 ): |
---|
| 14 | sanger = "" |
---|
| 15 | quality_string = quality_string.rstrip( " " ) |
---|
| 16 | for qv in quality_string.split(" "): |
---|
| 17 | try: |
---|
| 18 | if int( qv ) < 0: |
---|
| 19 | qv = '0' |
---|
| 20 | if int( qv ) < min_qual: |
---|
| 21 | return False |
---|
| 22 | break |
---|
| 23 | sanger += chr( int( qv ) + 33 ) |
---|
| 24 | except: |
---|
| 25 | pass |
---|
| 26 | return sanger |
---|
| 27 | |
---|
| 28 | def Translator(frm='', to='', delete='', keep=None): |
---|
| 29 | allchars = string.maketrans('','') |
---|
| 30 | if len(to) == 1: |
---|
| 31 | to = to * len(frm) |
---|
| 32 | trans = string.maketrans(frm, to) |
---|
| 33 | if keep is not None: |
---|
| 34 | delete = allchars.translate(allchars, keep.translate(allchars, delete)) |
---|
| 35 | def callable(s): |
---|
| 36 | return s.translate(trans, delete) |
---|
| 37 | return callable |
---|
| 38 | |
---|
| 39 | def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ): |
---|
| 40 | |
---|
| 41 | # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter, |
---|
| 42 | # which can have three values: fastq, txt, and db |
---|
| 43 | # fastq = fastq format |
---|
| 44 | # txt = space delimited format with defline, reads, and qvs |
---|
| 45 | # dp = dump data into sqlite3 db. |
---|
| 46 | # IMPORTNAT! If out = db two optins must be provided: |
---|
| 47 | # 1. f_out must be a db connection object initialized with sqlite3.connect() |
---|
| 48 | # 2. table_name must be provided |
---|
| 49 | |
---|
| 50 | if out == 'db': |
---|
| 51 | cursor = f_out.cursor() |
---|
| 52 | sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name |
---|
| 53 | cursor.execute(sql) |
---|
| 54 | |
---|
| 55 | lines = [] |
---|
| 56 | line = " " |
---|
| 57 | while line: |
---|
| 58 | for f in [ f_reads, f_qual ]: |
---|
| 59 | line = f.readline().rstrip( '\n\r' ) |
---|
| 60 | while line.startswith( '#' ): |
---|
| 61 | line = f.readline().rstrip( '\n\r' ) |
---|
| 62 | lines.append( line ) |
---|
| 63 | |
---|
| 64 | if lines[0].startswith( '>' ): |
---|
| 65 | defline = lines[0][1:] |
---|
| 66 | if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ): |
---|
| 67 | defline = defline[ : len( defline )-3 ] |
---|
| 68 | |
---|
| 69 | else: |
---|
| 70 | |
---|
| 71 | if trim_first_base: |
---|
| 72 | lines[0] = lines[0][1:] |
---|
| 73 | if double_encode: |
---|
| 74 | de = Translator(frm="0123.", to="ACGTN") |
---|
| 75 | lines[0] = de(lines[0]) |
---|
| 76 | qual = solid2sanger( lines[1], int( min_qual ) ) |
---|
| 77 | if qual: |
---|
| 78 | if out == 'fastq': |
---|
| 79 | f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) |
---|
| 80 | if out == 'txt': |
---|
| 81 | f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) ) |
---|
| 82 | if out == 'db': |
---|
| 83 | cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) ) |
---|
| 84 | lines = [] |
---|
| 85 | |
---|
| 86 | def main(): |
---|
| 87 | |
---|
| 88 | usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]" |
---|
| 89 | parser = optparse.OptionParser(usage=usage) |
---|
| 90 | |
---|
| 91 | |
---|
| 92 | parser.add_option( |
---|
| 93 | '--fr','--f_reads', |
---|
| 94 | metavar="F3_CSFASTA_FILE", |
---|
| 95 | dest='fr', |
---|
| 96 | help='Name of F3 file with color space reads') |
---|
| 97 | |
---|
| 98 | parser.add_option( |
---|
| 99 | '--fq','--f_qual', |
---|
| 100 | metavar="F3_QUAL_FILE", |
---|
| 101 | dest='fq', |
---|
| 102 | help='Name of F3 file with color quality values') |
---|
| 103 | |
---|
| 104 | parser.add_option( |
---|
| 105 | '--fout','--f3_fastq_output', |
---|
| 106 | metavar="F3_OUTPUT", |
---|
| 107 | dest='fout', |
---|
| 108 | help='Name for F3 output file') |
---|
| 109 | |
---|
| 110 | parser.add_option( |
---|
| 111 | '--rr','--r_reads', |
---|
| 112 | metavar="R3_CSFASTA_FILE", |
---|
| 113 | dest='rr', |
---|
| 114 | default = False, |
---|
| 115 | help='Name of R3 file with color space reads') |
---|
| 116 | |
---|
| 117 | parser.add_option( |
---|
| 118 | '--rq','--r_qual', |
---|
| 119 | metavar="R3_QUAL_FILE", |
---|
| 120 | dest='rq', |
---|
| 121 | default = False, |
---|
| 122 | help='Name of R3 file with color quality values') |
---|
| 123 | |
---|
| 124 | parser.add_option( |
---|
| 125 | '--rout', |
---|
| 126 | metavar="R3_OUTPUT", |
---|
| 127 | dest='rout', |
---|
| 128 | help='Name for F3 output file') |
---|
| 129 | |
---|
| 130 | parser.add_option( |
---|
| 131 | '-q','--min_qual', |
---|
| 132 | dest='min_qual', |
---|
| 133 | default = '-1000', |
---|
| 134 | help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000') |
---|
| 135 | |
---|
| 136 | parser.add_option( |
---|
| 137 | '-t','--trim_name', |
---|
| 138 | dest='trim_name', |
---|
| 139 | action='store_true', |
---|
| 140 | default = False, |
---|
| 141 | help='Trim _R3 and _F3 off read names. Default is False') |
---|
| 142 | |
---|
| 143 | parser.add_option( |
---|
| 144 | '-f','--trim_first_base', |
---|
| 145 | dest='trim_first_base', |
---|
| 146 | action='store_true', |
---|
| 147 | default = False, |
---|
| 148 | help='Remove the first base of reads in color-space. Default is False') |
---|
| 149 | |
---|
| 150 | parser.add_option( |
---|
| 151 | '-d','--double_encode', |
---|
| 152 | dest='de', |
---|
| 153 | action='store_true', |
---|
| 154 | default = False, |
---|
| 155 | help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False') |
---|
| 156 | |
---|
| 157 | options, args = parser.parse_args() |
---|
| 158 | |
---|
| 159 | if not ( options.fout and options.fr and options.fq ): |
---|
| 160 | parser.error(""" |
---|
| 161 | One or more of the three required paremetrs is missing: |
---|
| 162 | (1) --fr F3.csfasta file |
---|
| 163 | (2) --fq F3.qual file |
---|
| 164 | (3) --fout name of output file |
---|
| 165 | Use --help for more info |
---|
| 166 | """) |
---|
| 167 | |
---|
| 168 | fr = open ( options.fr , 'r' ) |
---|
| 169 | fq = open ( options.fq , 'r' ) |
---|
| 170 | f_out = open ( options.fout , 'w' ) |
---|
| 171 | |
---|
| 172 | if options.rr and options.rq: |
---|
| 173 | rr = open ( options.rr , 'r' ) |
---|
| 174 | rq = open ( options.rq , 'r' ) |
---|
| 175 | if not options.rout: |
---|
| 176 | parser.error("Provide the name for f3 output using --rout option. Use --help for more info") |
---|
| 177 | r_out = open ( options.rout, 'w' ) |
---|
| 178 | |
---|
| 179 | db = tempfile.NamedTemporaryFile() |
---|
| 180 | |
---|
| 181 | try: |
---|
| 182 | con = sqlite3.connect(db.name) |
---|
| 183 | cur = con.cursor() |
---|
| 184 | except: |
---|
| 185 | stop_err('Cannot connect to %s\n') % db.name |
---|
| 186 | |
---|
| 187 | |
---|
| 188 | merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" ) |
---|
| 189 | merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" ) |
---|
| 190 | cur.execute('create index f3_name on f3( name )') |
---|
| 191 | cur.execute('create index r3_name on r3( name )') |
---|
| 192 | |
---|
| 193 | cur.execute('select * from r3,f3 where f3.name = r3.name') |
---|
| 194 | for item in cur: |
---|
| 195 | f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) ) |
---|
| 196 | r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) ) |
---|
| 197 | |
---|
| 198 | |
---|
| 199 | else: |
---|
| 200 | merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual ) |
---|
| 201 | |
---|
| 202 | |
---|
| 203 | |
---|
| 204 | f_out.close() |
---|
| 205 | |
---|
| 206 | if __name__ == "__main__": |
---|
| 207 | main() |
---|
| 208 | |
---|
| 209 | |
---|