[2] | 1 | #!/usr/bin/perl -w |
---|
| 2 | |
---|
| 3 | # Author: lh3 |
---|
| 4 | # Note: Ideally, this script should be written in C. It is a bit slow at present. |
---|
| 5 | |
---|
| 6 | use strict; |
---|
| 7 | use warnings; |
---|
| 8 | use Getopt::Std; |
---|
| 9 | |
---|
| 10 | my %opts; |
---|
| 11 | my $version = '0.1.3'; |
---|
| 12 | my $usage = qq{ |
---|
| 13 | Usage: solid2fastq.pl <paired> <outfile1> <outfile2> <F3.csfasta> <F3.qual> <R3.csfasta> <R3.qual> |
---|
| 14 | |
---|
| 15 | Note: <in.title> is the string showed in the `# Title:' line of a |
---|
| 16 | ".csfasta" read file. Then <in.title>F3.csfasta is read sequence |
---|
| 17 | file and <in.title>F3_QV.qual is the quality file. If |
---|
| 18 | <in.title>R3.csfasta is present, this script assumes reads are |
---|
| 19 | paired; otherwise reads will be regarded as single-end. |
---|
| 20 | |
---|
| 21 | The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3 |
---|
| 22 | tag and `2' for F3. Usually you may want to use short <out.prefix> |
---|
| 23 | to save diskspace. Long <out.prefix> also causes troubles to maq. |
---|
| 24 | |
---|
| 25 | }; |
---|
| 26 | |
---|
| 27 | getopts('', \%opts); |
---|
| 28 | die($usage) if (@ARGV != 7); |
---|
| 29 | my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV; |
---|
| 30 | my (@fhr, @fhw); |
---|
| 31 | my $fn = ''; |
---|
| 32 | my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual); |
---|
| 33 | if ($is_paired eq "yes") { # paired end |
---|
| 34 | for (0 .. 3) { |
---|
| 35 | $fn = $fn_suff[$_]; |
---|
| 36 | $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); |
---|
| 37 | open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); |
---|
| 38 | } |
---|
| 39 | open($fhw[0], "|gzip >$outfile2") || die; |
---|
| 40 | open($fhw[1], "|gzip >$outfile1") || die; |
---|
| 41 | my (@df, @dr); |
---|
| 42 | @df = &read1(1); @dr = &read1(2); |
---|
| 43 | while (@df && @dr) { |
---|
| 44 | if ($df[0] eq $dr[0]) { # mate pair |
---|
| 45 | print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; |
---|
| 46 | @df = &read1(1); @dr = &read1(2); |
---|
| 47 | } |
---|
| 48 | } |
---|
| 49 | close($fhr[$_]) for (0 .. $#fhr); |
---|
| 50 | close($fhw[$_]) for (0 .. $#fhw); |
---|
| 51 | } else { # single end |
---|
| 52 | for (0 .. 1) { |
---|
| 53 | my $fn = "$fn_suff[$_]"; |
---|
| 54 | $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); |
---|
| 55 | open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); |
---|
| 56 | } |
---|
| 57 | open($fhw[2], "|gzip >$outfile1") || die; |
---|
| 58 | my @df; |
---|
| 59 | while (@df = &read1(1, $fhr[0], $fhr[1])) { |
---|
| 60 | print {$fhw[2]} $df[1]; |
---|
| 61 | } |
---|
| 62 | close($fhr[$_]) for (0 .. $#fhr); |
---|
| 63 | close($fhw[2]); |
---|
| 64 | } |
---|
| 65 | |
---|
| 66 | sub read1 { |
---|
| 67 | my $i = shift(@_); |
---|
| 68 | my $j = ($i-1)<<1; |
---|
| 69 | my ($key, $seq); |
---|
| 70 | my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); |
---|
| 71 | while (<$fhs>) { |
---|
| 72 | my $t = <$fhq>; |
---|
| 73 | if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { |
---|
| 74 | $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines |
---|
| 75 | #print $key; |
---|
| 76 | die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t); |
---|
| 77 | my $name = "$1_$2_$3/$i"; |
---|
| 78 | $_ = substr(<$fhs>, 2); |
---|
| 79 | tr/0123./ACGTN/; |
---|
| 80 | my $s = $_; |
---|
| 81 | $_ = <$fhq>; |
---|
| 82 | s/^(\d+)\s*//; |
---|
| 83 | s/(\d+)\s*/chr($1+33)/eg; |
---|
| 84 | $seq = qq/\@$name\n$s+\n$_\n/; |
---|
| 85 | last; |
---|
| 86 | } |
---|
| 87 | } |
---|
| 88 | return defined($seq)? ($key, $seq) : (); |
---|
| 89 | } |
---|