root/galaxy-central/tools/next_gen_conversion/solid2fastq.py @ 3

リビジョン 2, 7.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3import sys
4import string
5import optparse
6import tempfile
7import sqlite3
8
9def stop_err( msg ):
10    sys.stderr.write( msg )
11    sys.exit()
12   
13def solid2sanger( quality_string, min_qual = 0 ):
14    sanger = ""
15    quality_string = quality_string.rstrip( " " )
16    for qv in quality_string.split(" "):
17        try:
18            if int( qv ) < 0:
19                qv = '0'
20            if int( qv ) < min_qual:
21                return False
22                break
23            sanger += chr( int( qv ) + 33 )
24        except:
25            pass   
26    return sanger
27
28def Translator(frm='', to='', delete='', keep=None):
29    allchars = string.maketrans('','')
30    if len(to) == 1:
31        to = to * len(frm)
32    trans = string.maketrans(frm, to)
33    if keep is not None:
34        delete = allchars.translate(allchars, keep.translate(allchars, delete))
35    def callable(s):
36        return s.translate(trans, delete)
37    return callable
38   
39def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
40 
41    # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter,
42    # which can have three values: fastq, txt, and db
43    # fastq = fastq format
44    # txt = space delimited format with defline, reads, and qvs
45    # dp = dump data into sqlite3 db.
46    # IMPORTNAT! If out = db two optins must be provided:
47    #   1. f_out must be a db connection object initialized with sqlite3.connect()
48    #   2. table_name must be provided
49   
50    if out == 'db':
51        cursor = f_out.cursor()
52        sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
53        cursor.execute(sql)
54   
55    lines = []
56    line = " "
57    while line:
58        for f in [ f_reads, f_qual ]:
59            line = f.readline().rstrip( '\n\r' )
60            while line.startswith( '#' ):
61                line = f.readline().rstrip( '\n\r' )
62            lines.append( line )
63           
64        if lines[0].startswith( '>' ):
65            defline = lines[0][1:]
66            if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
67                defline = defline[ : len( defline )-3 ]
68               
69        else:
70
71            if trim_first_base:
72                lines[0] = lines[0][1:]
73            if double_encode:
74                de = Translator(frm="0123.", to="ACGTN")
75                lines[0] = de(lines[0])
76            qual = solid2sanger( lines[1], int( min_qual ) )
77            if qual:
78                if out == 'fastq':
79                    f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) )
80                if out == 'txt':
81                    f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
82                if out == 'db':
83                    cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )     
84        lines = []
85
86def main():
87
88    usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
89    parser = optparse.OptionParser(usage=usage)
90   
91       
92    parser.add_option(
93        '--fr','--f_reads',
94        metavar="F3_CSFASTA_FILE",
95        dest='fr',
96        help='Name of F3 file with color space reads')
97       
98    parser.add_option(
99        '--fq','--f_qual',
100        metavar="F3_QUAL_FILE",
101        dest='fq',
102        help='Name of F3 file with color quality values')
103       
104    parser.add_option(
105        '--fout','--f3_fastq_output',
106        metavar="F3_OUTPUT",
107        dest='fout',
108        help='Name for F3 output file')
109               
110    parser.add_option(
111        '--rr','--r_reads',
112        metavar="R3_CSFASTA_FILE",
113        dest='rr',
114        default = False,
115        help='Name of R3 file with color space reads')
116       
117    parser.add_option(
118        '--rq','--r_qual',
119        metavar="R3_QUAL_FILE",
120        dest='rq',
121        default = False,
122        help='Name of R3 file with color quality values')
123       
124    parser.add_option(
125        '--rout',
126        metavar="R3_OUTPUT",
127        dest='rout',
128        help='Name for F3 output file')
129   
130    parser.add_option(
131        '-q','--min_qual',
132        dest='min_qual',
133        default = '-1000',
134        help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
135       
136    parser.add_option(
137        '-t','--trim_name',
138        dest='trim_name',
139        action='store_true',
140        default = False,
141        help='Trim _R3 and _F3 off read names. Default is False')
142   
143    parser.add_option(
144        '-f','--trim_first_base',
145        dest='trim_first_base',
146        action='store_true',
147        default = False,
148        help='Remove the first base of reads in color-space. Default is False')
149       
150    parser.add_option(
151        '-d','--double_encode',
152        dest='de',
153        action='store_true',
154        default = False,
155        help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
156   
157    options, args = parser.parse_args()
158   
159    if not ( options.fout and options.fr and options.fq ):
160        parser.error("""
161        One or more of the three required paremetrs is missing:
162        (1) --fr F3.csfasta file
163        (2) --fq F3.qual file
164        (3) --fout name of output file
165        Use --help for more info
166        """)
167
168    fr =  open ( options.fr , 'r' )
169    fq =  open ( options.fq , 'r' )
170    f_out = open ( options.fout , 'w' )
171   
172    if options.rr and options.rq:
173        rr =  open ( options.rr , 'r' )
174        rq =  open ( options.rq , 'r' )
175        if not options.rout:
176            parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
177        r_out = open ( options.rout, 'w' )
178   
179        db = tempfile.NamedTemporaryFile()
180       
181        try:
182            con = sqlite3.connect(db.name)
183            cur = con.cursor()
184        except:
185            stop_err('Cannot connect to %s\n') % db.name
186   
187         
188        merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
189        merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
190        cur.execute('create index f3_name on f3( name )')
191        cur.execute('create index r3_name on r3( name )')
192         
193        cur.execute('select * from r3,f3 where f3.name = r3.name')
194        for item in cur:
195            f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
196            r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
197       
198           
199    else:
200        merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
201       
202   
203     
204    f_out.close()
205
206if __name__ == "__main__":
207    main()
208   
209 
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。