1 | #!/usr/bin/perl -w |
---|
2 | |
---|
3 | # Author: lh3 |
---|
4 | # Note: Ideally, this script should be written in C. It is a bit slow at present. |
---|
5 | |
---|
6 | use strict; |
---|
7 | use warnings; |
---|
8 | use Getopt::Std; |
---|
9 | |
---|
10 | my %opts; |
---|
11 | my $version = '0.1.3'; |
---|
12 | my $usage = qq{ |
---|
13 | Usage: solid2fastq.pl <paired> <outfile1> <outfile2> <F3.csfasta> <F3.qual> <R3.csfasta> <R3.qual> |
---|
14 | |
---|
15 | Note: <in.title> is the string showed in the `# Title:' line of a |
---|
16 | ".csfasta" read file. Then <in.title>F3.csfasta is read sequence |
---|
17 | file and <in.title>F3_QV.qual is the quality file. If |
---|
18 | <in.title>R3.csfasta is present, this script assumes reads are |
---|
19 | paired; otherwise reads will be regarded as single-end. |
---|
20 | |
---|
21 | The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3 |
---|
22 | tag and `2' for F3. Usually you may want to use short <out.prefix> |
---|
23 | to save diskspace. Long <out.prefix> also causes troubles to maq. |
---|
24 | |
---|
25 | }; |
---|
26 | |
---|
27 | getopts('', \%opts); |
---|
28 | die($usage) if (@ARGV != 7); |
---|
29 | my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV; |
---|
30 | my (@fhr, @fhw); |
---|
31 | my $fn = ''; |
---|
32 | my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual); |
---|
33 | if ($is_paired eq "yes") { # paired end |
---|
34 | for (0 .. 3) { |
---|
35 | $fn = $fn_suff[$_]; |
---|
36 | $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); |
---|
37 | open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); |
---|
38 | } |
---|
39 | open($fhw[0], "|gzip >$outfile2") || die; |
---|
40 | open($fhw[1], "|gzip >$outfile1") || die; |
---|
41 | my (@df, @dr); |
---|
42 | @df = &read1(1); @dr = &read1(2); |
---|
43 | while (@df && @dr) { |
---|
44 | if ($df[0] eq $dr[0]) { # mate pair |
---|
45 | print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; |
---|
46 | @df = &read1(1); @dr = &read1(2); |
---|
47 | } |
---|
48 | } |
---|
49 | close($fhr[$_]) for (0 .. $#fhr); |
---|
50 | close($fhw[$_]) for (0 .. $#fhw); |
---|
51 | } else { # single end |
---|
52 | for (0 .. 1) { |
---|
53 | my $fn = "$fn_suff[$_]"; |
---|
54 | $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); |
---|
55 | open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); |
---|
56 | } |
---|
57 | open($fhw[2], "|gzip >$outfile1") || die; |
---|
58 | my @df; |
---|
59 | while (@df = &read1(1, $fhr[0], $fhr[1])) { |
---|
60 | print {$fhw[2]} $df[1]; |
---|
61 | } |
---|
62 | close($fhr[$_]) for (0 .. $#fhr); |
---|
63 | close($fhw[2]); |
---|
64 | } |
---|
65 | |
---|
66 | sub read1 { |
---|
67 | my $i = shift(@_); |
---|
68 | my $j = ($i-1)<<1; |
---|
69 | my ($key, $seq); |
---|
70 | my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); |
---|
71 | while (<$fhs>) { |
---|
72 | my $t = <$fhq>; |
---|
73 | if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { |
---|
74 | $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines |
---|
75 | #print $key; |
---|
76 | die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t); |
---|
77 | my $name = "$1_$2_$3/$i"; |
---|
78 | $_ = substr(<$fhs>, 2); |
---|
79 | tr/0123./ACGTN/; |
---|
80 | my $s = $_; |
---|
81 | $_ = <$fhq>; |
---|
82 | s/^(\d+)\s*//; |
---|
83 | s/(\d+)\s*/chr($1+33)/eg; |
---|
84 | $seq = qq/\@$name\n$s+\n$_\n/; |
---|
85 | last; |
---|
86 | } |
---|
87 | } |
---|
88 | return defined($seq)? ($key, $seq) : (); |
---|
89 | } |
---|