1 | #! /usr/bin/perl -w |
---|
2 | |
---|
3 | use strict; |
---|
4 | use POSIX; |
---|
5 | |
---|
6 | |
---|
7 | die "Usage: pileup_parser.pl <in_file> <ref_base_column> <read_bases_column> <base_quality_column> <coverage column> <qv cutoff> <coverage cutoff> <SNPs only?> <output bed?> <coord_column> <out_file> <total_diff> <print_qual_bases>\n" unless @ARGV == 13; |
---|
8 | |
---|
9 | my $in_file = $ARGV[0]; |
---|
10 | my $ref_base_column = $ARGV[1]-1; # 1 based |
---|
11 | my $read_bases_column = $ARGV[2]-1; # 1 based |
---|
12 | my $base_quality_column = $ARGV[3]-1; # 1 based |
---|
13 | my $cvrg_column = $ARGV[4]-1; # 1 based |
---|
14 | my $quality_cutoff = $ARGV[5]; # phred scale integer |
---|
15 | my $cvrg_cutoff = $ARGV[6]; # unsigned integer |
---|
16 | my $SNPs_only = $ARGV[7]; # set to "Yes" to print only positions with SNPs; set to "No" to pring everything |
---|
17 | my $bed = $ARGV[8]; #set to "Yes" to convert coordinates to bed format (0-based start, 1-based end); set to "No" to leave as is |
---|
18 | my $coord_column = $ARGV[9]-1; #1 based |
---|
19 | my $out_file = $ARGV[10]; |
---|
20 | my $total_diff = $ARGV[11]; # set to "Yes" to print total number of deviant based |
---|
21 | my $print_qual_bases = $ARGV[12]; #set to "Yes" to print quality and read base columns |
---|
22 | |
---|
23 | my $invalid_line_counter = 0; |
---|
24 | my $first_skipped_line = ""; |
---|
25 | my %SNPs = ('A',0,'T',0,'C',0,'G',0); |
---|
26 | my $above_qv_bases = 0; |
---|
27 | my $SNPs_exist = 0; |
---|
28 | my $out_string = ""; |
---|
29 | my $diff_count = 0; |
---|
30 | |
---|
31 | open (IN, "<$in_file") or die "Cannot open $in_file $!\n"; |
---|
32 | open (OUT, ">$out_file") or die "Cannot open $out_file $!\n"; |
---|
33 | |
---|
34 | while (<IN>) { |
---|
35 | chop; |
---|
36 | next if m/^\#/; |
---|
37 | my @fields = split /\t/; |
---|
38 | next if $fields[ $ref_base_column ] eq "*"; # skip indel lines |
---|
39 | my $read_bases = $fields[ $read_bases_column ]; |
---|
40 | die "Coverage column" . ($cvrg_column+1) . " contains non-numeric values. Check your input parameters as well as format of input dataset." if ( not isdigit $fields[ $cvrg_column ] ); |
---|
41 | next if $fields[ $cvrg_column ] < $cvrg_cutoff; |
---|
42 | my $base_quality = $fields[ $base_quality_column ]; |
---|
43 | if ($read_bases =~ m/[\$\^\+-]/) { |
---|
44 | $read_bases =~ s/\^.//g; #removing the start of the read segement mark |
---|
45 | $read_bases =~ s/\$//g; #removing end of the read segment mark |
---|
46 | while ($read_bases =~ m/[\+-]{1}(\d+)/g) { |
---|
47 | my $indel_len = $1; |
---|
48 | $read_bases =~ s/[\+-]{1}$indel_len.{$indel_len}//; # remove indel info from read base field |
---|
49 | } |
---|
50 | } |
---|
51 | if ( length($read_bases) != length($base_quality) ) { |
---|
52 | $first_skipped_line = $. if $first_skipped_line eq ""; |
---|
53 | ++$invalid_line_counter; |
---|
54 | next; |
---|
55 | } |
---|
56 | # after removing read block and indel data the length of read_base |
---|
57 | # field should identical to the length of base_quality field |
---|
58 | |
---|
59 | my @bases = split //, $read_bases; |
---|
60 | my @qv = split //, $base_quality; |
---|
61 | |
---|
62 | for my $base ( 0 .. @bases - 1 ) { |
---|
63 | if ( ord( $qv[ $base ] ) - 33 >= $quality_cutoff and $bases[ $base ] ne '*') |
---|
64 | { |
---|
65 | ++$above_qv_bases; |
---|
66 | |
---|
67 | if ( $bases[ $base ] =~ m/[ATGC]/i ) |
---|
68 | { |
---|
69 | $SNPs_exist = 1; |
---|
70 | $SNPs{ uc( $bases[ $base ] ) } += 1; |
---|
71 | $diff_count += 1; |
---|
72 | } elsif ( $bases[ $base ] =~ m/[\.,]/ ) { |
---|
73 | $SNPs{ uc( $fields[ $ref_base_column ] ) } += 1; |
---|
74 | } |
---|
75 | } |
---|
76 | } |
---|
77 | |
---|
78 | if ($bed eq "Yes") { |
---|
79 | my $start = $fields[ $coord_column ] - 1; |
---|
80 | my $end = $fields[ $coord_column ]; |
---|
81 | $fields[ $coord_column ] = "$start\t$end"; |
---|
82 | } |
---|
83 | |
---|
84 | if ($print_qual_bases ne "Yes") { |
---|
85 | $fields[ $base_quality_column ] = ""; |
---|
86 | $fields[ $read_bases_column ] = ""; |
---|
87 | } |
---|
88 | |
---|
89 | |
---|
90 | $out_string = join("\t", @fields); # \t$read_bases\t$base_quality"; |
---|
91 | foreach my $SNP (sort keys %SNPs) { |
---|
92 | $out_string .= "\t$SNPs{$SNP}"; |
---|
93 | } |
---|
94 | |
---|
95 | if ($total_diff eq "Yes") { |
---|
96 | $out_string .= "\t$above_qv_bases\t$diff_count\n"; |
---|
97 | } else { |
---|
98 | $out_string .= "\t$above_qv_bases\n"; |
---|
99 | } |
---|
100 | |
---|
101 | $out_string =~ s/\t+/\t/g; |
---|
102 | |
---|
103 | if ( $SNPs_only eq "Yes" ) { |
---|
104 | print OUT $out_string if $SNPs_exist == 1; |
---|
105 | } else { |
---|
106 | print OUT $out_string; |
---|
107 | } |
---|
108 | |
---|
109 | |
---|
110 | %SNPs = (); |
---|
111 | %SNPs = ('A',0,'T',0,'C',0,'G',0); |
---|
112 | $above_qv_bases = 0; |
---|
113 | $SNPs_exist = 0; |
---|
114 | $diff_count = 0; |
---|
115 | |
---|
116 | |
---|
117 | } |
---|
118 | |
---|
119 | print "Skipped $invalid_line_counter invalid line(s) beginning with line $first_skipped_line\n" if $invalid_line_counter > 0; |
---|
120 | close IN; |
---|
121 | close OUT; |
---|