#!/usr/bin/env perl use strict; use warnings; ######################################################################### # codingSnps.pl # This takes a bed file with the names being / separated nts # and a gene bed file with cds start and stop. # It then checks for changes in coding regions, reporting # those that cause a frameshift or substitution in the amino acid. ######################################################################### if (!@ARGV or scalar @ARGV < 3) { print "Usage: codingSnps.pl snps.bed genes.bed (/dir/nib/|Galaxy build= loc=) [chr=# start=# end=# snp=#] > codingSnps.txt\n"; exit; } my $uniq = 0; #flag for whether want uniq positions my $syn = 0; #flag for if want synonomous changes rather than non-syn my $snpFile = shift @ARGV; my $geneFile = shift @ARGV; my $nibDir = shift @ARGV; if ($nibDir eq 'Galaxy') { getGalaxyInfo(); } my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib my $col0 = 0; #bed like columns in default positions my $col1 = 1; my $col2 = 2; my $col3 = 3; #column positions 1 based coming in (for Galaxy) foreach (@ARGV) { if (/chr=(\d+)/) { $col0 = $1 -1; } elsif (/start=(\d+)/) { $col1 = $1 -1; } elsif (/end=(\d+)/) { $col2 = $1 -1; } elsif (/snp=(\d+)/) { $col3 = $1 -1; } } if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) { print STDERR "ERROR column numbers are given with origin 1\n"; exit 1; } my @genes; #bed lines for genes, sorted by chrom and start my %chrSt; #index in array where each chrom starts my %codon; #hash of codon amino acid conversions my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom my $ignoreN = 1; #skip N my %amb = ( "R" => "A/G", "Y" => "C/T", "S" => "C/G", "W" => "A/T", "K" => "G/T", "M" => "A/C", "B" => "C/G/T", "D" => "A/G/T", "H" => "A/C/T", "V" => "A/C/G", "N" => "A/C/G/T" ); fill_codon(); open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") or die "Couldn't open and sort $geneFile, $!\n"; my $i = 0; while() { chomp; if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; } push(@genes, "$_"); my @f = split(/\t/); if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; } $i++; } close FH or die "Couldn't close $geneFile, $!\n"; if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; } #open snps sorted as well my $s1 = $col0 + 1; #sort order is origin 1 my $s2 = $col1 + 1; open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |") or die "Couldn't open and sort $snpFile, $!\n"; $i = 0; my @g; #one genes fields, should be used repeatedly my %done; while() { chomp; if (/^\s*#/) { next; } #comment my @s = split(/\t/); #SNP fields if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; } my $size = $#s; if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) { print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n"; exit 1; } if ($s[$col1] =~ /\D/) { print STDERR "ERROR the start point must be an integer not $s[$col1]\n"; exit 1; } if ($s[$col2] =~ /\D/) { print STDERR "ERROR the start point must be an integer not $s[$col2]\n"; exit 1; } if ($s[$col3] eq 'N' && $ignoreN) { next; } if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; } if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row $i = $chrSt{$s[$col0]}; @g = split(/\t/, $genes[$i]); }elsif (!@g) { next; #no gene for this chrom }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom $i = $chrSt{$s[$col0]}; @g = split(/\t/, $genes[$i]); }elsif ($s[$col0] ne $g[0]) { next; #no gene for this chrom }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { next; #before any genes }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { next; #after all genes on chr }else { while ($s[$col1] > $g[2] && $i < $#genes) { $i++; @g = split(/\t/, $genes[$i]); if ($s[$col0] ne $g[0]) { last; } #end of gene } if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { next; #no overlap with genes } } processSnp(\@s, \@g); if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } my $k = $i + 1; #check for more genes without losing data of first if ($k <= $#genes) { my @g2 = split(/\t/, $genes[$k]); while (@g2 && $k <= $#genes) { @g2 = split(/\t/, $genes[$k]); if ($s[$col0] ne $g2[0]) { undef @g2; last; #not same chrom }else { while ($s[$col1] > $g2[2] && $k <= $#genes) { $k++; @g2 = split(/\t/, $genes[$k]); if ($s[$col0] ne $g2[0]) { last; } #end of chrom } if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { undef @g2; last; #no overlap with more genes } processSnp(\@s, \@g2); if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } } $k++; } } } close FH or die "Couldn't close $snpFile, $!\n"; exit; ######################################################################## sub processSnp { my $sref = shift; my $gref = shift; #overlaps gene, but maybe not coding seq #inside cds if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { return; #outside of coding } #now check exon my $i = 0; my @st = split(/,/, $gref->[11]); my @size = split(/,/, $gref->[10]); if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } my @pos; my $in = 0; for($i = 0; $i < $gref->[9]; $i++) { my $sta = $gref->[1] + $st[$i] + 1; #1 based position my $end = $sta + $size[$i] - 1; # if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes if ($end < $gref->[6]) { next; } #utr only if ($sta > $gref->[7]) { next; } #utr only #shorten to coding only if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } if ($end > $gref->[7]) { $end = $gref->[7]; } if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } push(@pos, ($sta .. $end)); #add exon worth of positions } #@pos has coding positions for whole gene (chr coors), #and $in has whether we need to continue if (!$in) { return; } #not in coding exon if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or $sref->[$col3] =~ /^-+$/) { #indel or del my $copy = $sref->[$col3]; my $c = ($copy =~ tr/-//); if ($c % 3 == 0) { return; } #not frameshift #handle bed4 to bed4 + 4 (pgSnp) print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; #if ($sref->[4]) { print "\t$sref->[4]"; } #if ($sref->[5]) { print "\t$sref->[5]"; } #if ($sref->[6]) { print "\t$sref->[6]"; } #if ($sref->[7]) { print "\t$sref->[7]"; } print "\t$gref->[3]\tframeshift\n"; $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; return; }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion my $copy = $sref->[$col3]; my $c = ($copy =~ tr/\[ACTG]+//); if ($c % 3 == 0) { return; } #not frameshift #handle bed4 to bed4 + 4 (pgSnp) print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; #if ($sref->[4]) { print "\t$sref->[4]"; } #if ($sref->[5]) { print "\t$sref->[5]"; } #if ($sref->[6]) { print "\t$sref->[6]"; } #if ($sref->[7]) { print "\t$sref->[7]"; } print "\t$gref->[3]\tframeshift\n"; $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; return; } #check for amino acid substitutions my $s = $sref->[$col1] + 1; my $e = $sref->[$col2]; my $len = $sref->[$col2] - $sref->[$col1]; if ($gref->[5] eq '-') { @pos = reverse(@pos); my $t = $s; $s = $e; $e = $t; } $i = 0; my $found = 0; foreach (@pos) { if ($s == $_) { $found = 1; last; } $i++; } if ($found) { my $fs = $i; #keep original start index #have index where substitution starts my $cp = $i % 3; $i -= $cp; #i is now first position in codon my $cdNum = int($i / 3) + 1; my $ls = $i; if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } if (!@pos) { die "ERROR not defined array pos\n"; } if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } while ($ls <= $#pos && $pos[$ls] ne $e) { $ls++; } my $i2 = $ls + (2 - ($ls % 3)); if ($i2 > $#pos) { return; } #not a full codon, partial gene? if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; } my @vars = split(/\//, $sref->[$col3]); if ($gref->[5] eq '-') { #complement oldnts and revcomp vars $oldnts = compl($oldnts); if (!$oldnts) { return; } #skip this one $oldnts = join('', (reverse(split(/ */, $oldnts)))); foreach (@vars) { $_ = reverse(split(/ */)); $_ = compl($_); } } my $r = $fs - $i; #difference in old indexes gives new index my @newnts; my $changed = ''; foreach my $v (@vars) { if (!$v or length($v) != 1) { return; } #only simple changes my @new = split(/ */, $oldnts); $changed = splice(@new, $r, $len, split(/ */, $v)); #should only change single nt push(@newnts, join("", @new)); } #now compute amino acids my $oldaa = getaa($oldnts); my @newaa; my $change = 0; #flag for if there is a change foreach my $v (@newnts) { my $t = getaa($v); if ($t ne $oldaa) { $change = 1; } push(@newaa, $t); } if (!$change && $syn) { print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; return; }elsif ($syn) { return; } #only want synonymous changes if (!$change) { return; } #no change in amino acids #if (abs($pos[$i] - $pos[$i2]) > 200) { #print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n"; #print STDERR "old $oldaa, new ", join(', ', @newaa), "\n"; #print STDERR "oldnt $oldnts, strand $gref->[5]\n"; #exit; #} print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; #if (defined $sref->[4]) { print "\t$sref->[4]"; } #if (defined $sref->[5]) { print "\t$sref->[5]"; } #if (defined $sref->[6]) { print "\t$sref->[6]"; } #if (defined $sref->[7]) { print "\t$sref->[7]"; } if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref if (!$changed) { return; } #skip this one print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; } } sub getnts { my $chr = shift; my @pos = @_; #list of positions not necessarily in order #list may be reversed or have gaps(introns), at least 3 bps my $seq = ''; if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } if ($pos[0] < $pos[1]) { #not reversed my $s = $pos[0]; for(my $i = 1; $i <= $#pos; $i++) { if ($pos[$i] == $pos[$i-1] + 1) { next; } if ($seqFlag eq '2bit') { $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); }else { $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); } $s = $pos[$i]; } if (length $seq != scalar @pos) { #still need to fetch seq #if (abs($pos[$#pos]-$pos[0]) > 200) { #print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n"; #exit; #} if ($seqFlag eq '2bit') { $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); }else { $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); } } }else { #reversed my $s = $pos[$#pos]; for(my $i = $#pos -1; $i >= 0; $i--) { if ($pos[$i] == $pos[$i+1] + 1) { next; } if ($seqFlag eq '2bit') { $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); }else { $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); } $s = $pos[$i]; } if (length $seq != scalar @pos) { #still need to fetch seq #if (abs($pos[$#pos]-$pos[0]) > 200) { #print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n"; #} if ($seqFlag eq '2bit') { $seq .= fetchSeq2bit($chr, $s, $pos[0]); }else { $seq .= fetchSeqNib($chr, $s, $pos[0]); } } } } sub fetchSeq2bit { my $chr = shift; my $st = shift; my $end = shift; my $strand = '+'; $st--; #change to UCSC numbering open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or die "Couldn't run twoBitToFa, $!\n"; my $seq = ''; while () { chomp; if (/^>/) { next; } #header $seq .= $_; } close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; return $seq; } sub fetchSeqNib { my $chr = shift; my $st = shift; my $end = shift; my $strand = '+'; $st--; #change to UCSC numbering open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; my $seq = ''; while () { chomp; if (/^>/) { next; } #header $seq .= $_; } close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; return $seq; } sub compl { my $nts = shift; my $comp = ''; foreach my $n (split(/ */, $nts)) { if ($n eq 'A') { $comp .= 'T'; } elsif ($n eq 'T') { $comp .= 'A'; } elsif ($n eq 'C') { $comp .= 'G'; } elsif ($n eq 'G') { $comp .= 'C'; } elsif ($n eq 'N') { $comp .= 'N'; } elsif ($n eq '-') { $comp .= '-'; } #deletion else { $comp = undef; } } return $comp; } sub getaa { my $nts = shift; #in multiples of 3 my $aa = ''; my @n = split(/ */, $nts); while (@n) { my @t = splice(@n, 0, 3); my $n = uc(join("", @t)); if (!exists $codon{$n}) { $aa .= 'N'; next; } $aa .= $codon{$n}; } return $aa; } sub fill_codon { $codon{GCA} = 'Ala'; $codon{GCC} = 'Ala'; $codon{GCG} = 'Ala'; $codon{GCT} = 'Ala'; $codon{CGG} = 'Arg'; $codon{CGT} = 'Arg'; $codon{CGC} = 'Arg'; $codon{AGA} = 'Arg'; $codon{AGG} = 'Arg'; $codon{CGA} = 'Arg'; $codon{AAC} = 'Asn'; $codon{AAT} = 'Asn'; $codon{GAC} = 'Asp'; $codon{GAT} = 'Asp'; $codon{TGC} = 'Cys'; $codon{TGT} = 'Cys'; $codon{CAG} = 'Gln'; $codon{CAA} = 'Gln'; $codon{GAA} = 'Glu'; $codon{GAG} = 'Glu'; $codon{GGG} = 'Gly'; $codon{GGA} = 'Gly'; $codon{GGC} = 'Gly'; $codon{GGT} = 'Gly'; $codon{CAC} = 'His'; $codon{CAT} = 'His'; $codon{ATA} = 'Ile'; $codon{ATT} = 'Ile'; $codon{ATC} = 'Ile'; $codon{CTA} = 'Leu'; $codon{CTC} = 'Leu'; $codon{CTG} = 'Leu'; $codon{CTT} = 'Leu'; $codon{TTG} = 'Leu'; $codon{TTA} = 'Leu'; $codon{AAA} = 'Lys'; $codon{AAG} = 'Lys'; $codon{ATG} = 'Met'; $codon{TTC} = 'Phe'; $codon{TTT} = 'Phe'; $codon{CCT} = 'Pro'; $codon{CCA} = 'Pro'; $codon{CCC} = 'Pro'; $codon{CCG} = 'Pro'; $codon{TCA} = 'Ser'; $codon{AGC} = 'Ser'; $codon{AGT} = 'Ser'; $codon{TCC} = 'Ser'; $codon{TCT} = 'Ser'; $codon{TCG} = 'Ser'; $codon{TGA} = 'Stop'; $codon{TAG} = 'Stop'; $codon{TAA} = 'Stop'; $codon{ACT} = 'Thr'; $codon{ACA} = 'Thr'; $codon{ACC} = 'Thr'; $codon{ACG} = 'Thr'; $codon{TGG} = 'Trp'; $codon{TAT} = 'Tyr'; $codon{TAC} = 'Tyr'; $codon{GTC} = 'Val'; $codon{GTA} = 'Val'; $codon{GTG} = 'Val'; $codon{GTT} = 'Val'; } sub getGalaxyInfo { my $build; my $locFile; foreach (@ARGV) { if (/build=(.*)/) { $build = $1; } elsif (/loc=(.*)/) { $locFile = $1; } } if (!$build or !$locFile) { print STDERR "ERROR missing build or locfile for Galaxy input\n"; exit 1; } # read $locFile to get $nibDir (ignoring commets) open(LF, "< $locFile") || die "open($locFile): $!\n"; while() { s/#.*$//; s/(?:^\s+|\s+$)//g; next if (/^$/); my @t = split(/\t/); if ($t[0] eq $build) { $nibDir = $t[1]; } } close(LF); if ($nibDir eq 'Galaxy') { print STDERR "Failed to find sequence directory in locfile $locFile\n"; } $nibDir .= "/$build.2bit"; #we want full path and filename }