| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | import sys |
|---|
| 4 | import string |
|---|
| 5 | import optparse |
|---|
| 6 | import tempfile |
|---|
| 7 | import sqlite3 |
|---|
| 8 | |
|---|
| 9 | def stop_err( msg ): |
|---|
| 10 | sys.stderr.write( msg ) |
|---|
| 11 | sys.exit() |
|---|
| 12 | |
|---|
| 13 | def solid2sanger( quality_string, min_qual = 0 ): |
|---|
| 14 | sanger = "" |
|---|
| 15 | quality_string = quality_string.rstrip( " " ) |
|---|
| 16 | for qv in quality_string.split(" "): |
|---|
| 17 | try: |
|---|
| 18 | if int( qv ) < 0: |
|---|
| 19 | qv = '0' |
|---|
| 20 | if int( qv ) < min_qual: |
|---|
| 21 | return False |
|---|
| 22 | break |
|---|
| 23 | sanger += chr( int( qv ) + 33 ) |
|---|
| 24 | except: |
|---|
| 25 | pass |
|---|
| 26 | return sanger |
|---|
| 27 | |
|---|
| 28 | def Translator(frm='', to='', delete='', keep=None): |
|---|
| 29 | allchars = string.maketrans('','') |
|---|
| 30 | if len(to) == 1: |
|---|
| 31 | to = to * len(frm) |
|---|
| 32 | trans = string.maketrans(frm, to) |
|---|
| 33 | if keep is not None: |
|---|
| 34 | delete = allchars.translate(allchars, keep.translate(allchars, delete)) |
|---|
| 35 | def callable(s): |
|---|
| 36 | return s.translate(trans, delete) |
|---|
| 37 | return callable |
|---|
| 38 | |
|---|
| 39 | def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ): |
|---|
| 40 | |
|---|
| 41 | # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter, |
|---|
| 42 | # which can have three values: fastq, txt, and db |
|---|
| 43 | # fastq = fastq format |
|---|
| 44 | # txt = space delimited format with defline, reads, and qvs |
|---|
| 45 | # dp = dump data into sqlite3 db. |
|---|
| 46 | # IMPORTNAT! If out = db two optins must be provided: |
|---|
| 47 | # 1. f_out must be a db connection object initialized with sqlite3.connect() |
|---|
| 48 | # 2. table_name must be provided |
|---|
| 49 | |
|---|
| 50 | if out == 'db': |
|---|
| 51 | cursor = f_out.cursor() |
|---|
| 52 | sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name |
|---|
| 53 | cursor.execute(sql) |
|---|
| 54 | |
|---|
| 55 | lines = [] |
|---|
| 56 | line = " " |
|---|
| 57 | while line: |
|---|
| 58 | for f in [ f_reads, f_qual ]: |
|---|
| 59 | line = f.readline().rstrip( '\n\r' ) |
|---|
| 60 | while line.startswith( '#' ): |
|---|
| 61 | line = f.readline().rstrip( '\n\r' ) |
|---|
| 62 | lines.append( line ) |
|---|
| 63 | |
|---|
| 64 | if lines[0].startswith( '>' ): |
|---|
| 65 | defline = lines[0][1:] |
|---|
| 66 | if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ): |
|---|
| 67 | defline = defline[ : len( defline )-3 ] |
|---|
| 68 | |
|---|
| 69 | else: |
|---|
| 70 | |
|---|
| 71 | if trim_first_base: |
|---|
| 72 | lines[0] = lines[0][1:] |
|---|
| 73 | if double_encode: |
|---|
| 74 | de = Translator(frm="0123.", to="ACGTN") |
|---|
| 75 | lines[0] = de(lines[0]) |
|---|
| 76 | qual = solid2sanger( lines[1], int( min_qual ) ) |
|---|
| 77 | if qual: |
|---|
| 78 | if out == 'fastq': |
|---|
| 79 | f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) |
|---|
| 80 | if out == 'txt': |
|---|
| 81 | f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) ) |
|---|
| 82 | if out == 'db': |
|---|
| 83 | cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) ) |
|---|
| 84 | lines = [] |
|---|
| 85 | |
|---|
| 86 | def main(): |
|---|
| 87 | |
|---|
| 88 | usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]" |
|---|
| 89 | parser = optparse.OptionParser(usage=usage) |
|---|
| 90 | |
|---|
| 91 | |
|---|
| 92 | parser.add_option( |
|---|
| 93 | '--fr','--f_reads', |
|---|
| 94 | metavar="F3_CSFASTA_FILE", |
|---|
| 95 | dest='fr', |
|---|
| 96 | help='Name of F3 file with color space reads') |
|---|
| 97 | |
|---|
| 98 | parser.add_option( |
|---|
| 99 | '--fq','--f_qual', |
|---|
| 100 | metavar="F3_QUAL_FILE", |
|---|
| 101 | dest='fq', |
|---|
| 102 | help='Name of F3 file with color quality values') |
|---|
| 103 | |
|---|
| 104 | parser.add_option( |
|---|
| 105 | '--fout','--f3_fastq_output', |
|---|
| 106 | metavar="F3_OUTPUT", |
|---|
| 107 | dest='fout', |
|---|
| 108 | help='Name for F3 output file') |
|---|
| 109 | |
|---|
| 110 | parser.add_option( |
|---|
| 111 | '--rr','--r_reads', |
|---|
| 112 | metavar="R3_CSFASTA_FILE", |
|---|
| 113 | dest='rr', |
|---|
| 114 | default = False, |
|---|
| 115 | help='Name of R3 file with color space reads') |
|---|
| 116 | |
|---|
| 117 | parser.add_option( |
|---|
| 118 | '--rq','--r_qual', |
|---|
| 119 | metavar="R3_QUAL_FILE", |
|---|
| 120 | dest='rq', |
|---|
| 121 | default = False, |
|---|
| 122 | help='Name of R3 file with color quality values') |
|---|
| 123 | |
|---|
| 124 | parser.add_option( |
|---|
| 125 | '--rout', |
|---|
| 126 | metavar="R3_OUTPUT", |
|---|
| 127 | dest='rout', |
|---|
| 128 | help='Name for F3 output file') |
|---|
| 129 | |
|---|
| 130 | parser.add_option( |
|---|
| 131 | '-q','--min_qual', |
|---|
| 132 | dest='min_qual', |
|---|
| 133 | default = '-1000', |
|---|
| 134 | help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000') |
|---|
| 135 | |
|---|
| 136 | parser.add_option( |
|---|
| 137 | '-t','--trim_name', |
|---|
| 138 | dest='trim_name', |
|---|
| 139 | action='store_true', |
|---|
| 140 | default = False, |
|---|
| 141 | help='Trim _R3 and _F3 off read names. Default is False') |
|---|
| 142 | |
|---|
| 143 | parser.add_option( |
|---|
| 144 | '-f','--trim_first_base', |
|---|
| 145 | dest='trim_first_base', |
|---|
| 146 | action='store_true', |
|---|
| 147 | default = False, |
|---|
| 148 | help='Remove the first base of reads in color-space. Default is False') |
|---|
| 149 | |
|---|
| 150 | parser.add_option( |
|---|
| 151 | '-d','--double_encode', |
|---|
| 152 | dest='de', |
|---|
| 153 | action='store_true', |
|---|
| 154 | default = False, |
|---|
| 155 | help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False') |
|---|
| 156 | |
|---|
| 157 | options, args = parser.parse_args() |
|---|
| 158 | |
|---|
| 159 | if not ( options.fout and options.fr and options.fq ): |
|---|
| 160 | parser.error(""" |
|---|
| 161 | One or more of the three required paremetrs is missing: |
|---|
| 162 | (1) --fr F3.csfasta file |
|---|
| 163 | (2) --fq F3.qual file |
|---|
| 164 | (3) --fout name of output file |
|---|
| 165 | Use --help for more info |
|---|
| 166 | """) |
|---|
| 167 | |
|---|
| 168 | fr = open ( options.fr , 'r' ) |
|---|
| 169 | fq = open ( options.fq , 'r' ) |
|---|
| 170 | f_out = open ( options.fout , 'w' ) |
|---|
| 171 | |
|---|
| 172 | if options.rr and options.rq: |
|---|
| 173 | rr = open ( options.rr , 'r' ) |
|---|
| 174 | rq = open ( options.rq , 'r' ) |
|---|
| 175 | if not options.rout: |
|---|
| 176 | parser.error("Provide the name for f3 output using --rout option. Use --help for more info") |
|---|
| 177 | r_out = open ( options.rout, 'w' ) |
|---|
| 178 | |
|---|
| 179 | db = tempfile.NamedTemporaryFile() |
|---|
| 180 | |
|---|
| 181 | try: |
|---|
| 182 | con = sqlite3.connect(db.name) |
|---|
| 183 | cur = con.cursor() |
|---|
| 184 | except: |
|---|
| 185 | stop_err('Cannot connect to %s\n') % db.name |
|---|
| 186 | |
|---|
| 187 | |
|---|
| 188 | merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" ) |
|---|
| 189 | merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" ) |
|---|
| 190 | cur.execute('create index f3_name on f3( name )') |
|---|
| 191 | cur.execute('create index r3_name on r3( name )') |
|---|
| 192 | |
|---|
| 193 | cur.execute('select * from r3,f3 where f3.name = r3.name') |
|---|
| 194 | for item in cur: |
|---|
| 195 | f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) ) |
|---|
| 196 | r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) ) |
|---|
| 197 | |
|---|
| 198 | |
|---|
| 199 | else: |
|---|
| 200 | merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual ) |
|---|
| 201 | |
|---|
| 202 | |
|---|
| 203 | |
|---|
| 204 | f_out.close() |
|---|
| 205 | |
|---|
| 206 | if __name__ == "__main__": |
|---|
| 207 | main() |
|---|
| 208 | |
|---|
| 209 | |
|---|