root/galaxy-central/tools/samtools/pileup_parser.pl

リビジョン 2, 3.9 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

  • 属性 svn:executable の設定値 *
行番号 
1#! /usr/bin/perl -w
2
3use strict;
4use POSIX;
5
6
7die "Usage: pileup_parser.pl <in_file> <ref_base_column> <read_bases_column> <base_quality_column> <coverage column> <qv cutoff> <coverage cutoff> <SNPs only?> <output bed?> <coord_column> <out_file> <total_diff> <print_qual_bases>\n" unless @ARGV == 13;
8
9my $in_file = $ARGV[0];
10my $ref_base_column = $ARGV[1]-1; # 1 based
11my $read_bases_column = $ARGV[2]-1; # 1 based
12my $base_quality_column = $ARGV[3]-1; # 1 based
13my $cvrg_column = $ARGV[4]-1; # 1 based
14my $quality_cutoff = $ARGV[5]; # phred scale integer
15my $cvrg_cutoff = $ARGV[6]; # unsigned integer
16my $SNPs_only = $ARGV[7]; # set to "Yes" to print only positions with SNPs; set to "No" to pring everything
17my $bed = $ARGV[8]; #set to "Yes" to convert coordinates to bed format (0-based start, 1-based end); set to "No" to leave as is
18my $coord_column = $ARGV[9]-1; #1 based
19my $out_file = $ARGV[10];
20my $total_diff = $ARGV[11]; # set to "Yes" to print total number of deviant based
21my $print_qual_bases = $ARGV[12]; #set to "Yes" to print quality and read base columns
22
23my $invalid_line_counter = 0;
24my $first_skipped_line = "";
25my %SNPs = ('A',0,'T',0,'C',0,'G',0);
26my $above_qv_bases = 0;
27my $SNPs_exist = 0;
28my $out_string = "";
29my $diff_count = 0;
30
31open (IN, "<$in_file") or die "Cannot open $in_file $!\n";
32open (OUT, ">$out_file") or die "Cannot open $out_file $!\n";
33
34while (<IN>) {
35        chop;
36        next if m/^\#/;
37        my @fields = split /\t/;
38        next if $fields[ $ref_base_column ] eq "*"; # skip indel lines
39        my $read_bases   = $fields[ $read_bases_column ];
40        die "Coverage column" . ($cvrg_column+1) . " contains non-numeric values. Check your input parameters as well as format of input dataset." if ( not isdigit $fields[ $cvrg_column ] );
41    next if $fields[ $cvrg_column ] < $cvrg_cutoff;
42        my $base_quality = $fields[ $base_quality_column ];
43        if ($read_bases =~ m/[\$\^\+-]/) {
44                $read_bases =~ s/\^.//g; #removing the start of the read segement mark
45                $read_bases =~ s/\$//g; #removing end of the read segment mark
46                while ($read_bases =~ m/[\+-]{1}(\d+)/g) {
47                        my $indel_len = $1;
48                        $read_bases =~ s/[\+-]{1}$indel_len.{$indel_len}//; # remove indel info from read base field
49                }
50        }
51        if ( length($read_bases) != length($base_quality) ) {
52        $first_skipped_line = $. if $first_skipped_line eq "";
53        ++$invalid_line_counter;
54        next;
55        }
56        # after removing read block and indel data the length of read_base
57        # field should identical to the length of base_quality field
58       
59        my @bases = split //, $read_bases;
60        my @qv    = split //, $base_quality;
61       
62        for my $base ( 0 .. @bases - 1 ) {
63                if ( ord( $qv[ $base ] ) - 33 >= $quality_cutoff and $bases[ $base ] ne '*')
64                {
65                        ++$above_qv_bases;
66                       
67                        if ( $bases[ $base ] =~ m/[ATGC]/i )
68                        {
69                                $SNPs_exist = 1;       
70                                $SNPs{ uc( $bases[ $base ] ) } += 1;
71                                $diff_count += 1;
72                        } elsif ( $bases[ $base ] =~ m/[\.,]/ ) {
73                            $SNPs{ uc( $fields[ $ref_base_column ] ) } += 1;
74                    }                   
75                }
76        }
77       
78        if ($bed eq "Yes") {
79               my $start = $fields[ $coord_column ] - 1;
80               my $end   = $fields[ $coord_column ];
81               $fields[ $coord_column ] = "$start\t$end";
82        }
83       
84        if ($print_qual_bases ne "Yes") {
85               $fields[ $base_quality_column ] = "";
86               $fields[ $read_bases_column ] = "";
87        }
88               
89       
90        $out_string = join("\t", @fields); # \t$read_bases\t$base_quality";
91        foreach my $SNP (sort keys %SNPs) {
92                        $out_string .= "\t$SNPs{$SNP}";
93        }
94       
95        if ($total_diff eq "Yes") {
96           $out_string .= "\t$above_qv_bases\t$diff_count\n";
97        } else {
98           $out_string .= "\t$above_qv_bases\n";
99        }       
100       
101        $out_string =~ s/\t+/\t/g;
102       
103        if ( $SNPs_only eq "Yes" ) {
104                print OUT $out_string if $SNPs_exist == 1;
105        } else {
106                print OUT $out_string;
107        }
108
109               
110        %SNPs = ();
111        %SNPs = ('A',0,'T',0,'C',0,'G',0);
112        $above_qv_bases = 0;
113        $SNPs_exist = 0;
114        $diff_count = 0;
115       
116
117}
118
119print "Skipped $invalid_line_counter invalid line(s) beginning with line $first_skipped_line\n" if $invalid_line_counter > 0;
120close IN;
121close OUT;
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。