| 1 | #!/usr/bin/env perl |
|---|
| 2 | |
|---|
| 3 | use strict; |
|---|
| 4 | use warnings; |
|---|
| 5 | |
|---|
| 6 | ######################################################################### |
|---|
| 7 | # codingSnps.pl |
|---|
| 8 | # This takes a bed file with the names being / separated nts |
|---|
| 9 | # and a gene bed file with cds start and stop. |
|---|
| 10 | # It then checks for changes in coding regions, reporting |
|---|
| 11 | # those that cause a frameshift or substitution in the amino acid. |
|---|
| 12 | ######################################################################### |
|---|
| 13 | |
|---|
| 14 | if (!@ARGV or scalar @ARGV < 3) { |
|---|
| 15 | print "Usage: codingSnps.pl snps.bed genes.bed (/dir/nib/|Galaxy build= loc=) [chr=# start=# end=# snp=#] > codingSnps.txt\n"; |
|---|
| 16 | exit; |
|---|
| 17 | } |
|---|
| 18 | my $uniq = 0; #flag for whether want uniq positions |
|---|
| 19 | my $syn = 0; #flag for if want synonomous changes rather than non-syn |
|---|
| 20 | my $snpFile = shift @ARGV; |
|---|
| 21 | my $geneFile = shift @ARGV; |
|---|
| 22 | my $nibDir = shift @ARGV; |
|---|
| 23 | if ($nibDir eq 'Galaxy') { getGalaxyInfo(); } |
|---|
| 24 | my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib |
|---|
| 25 | my $col0 = 0; #bed like columns in default positions |
|---|
| 26 | my $col1 = 1; |
|---|
| 27 | my $col2 = 2; |
|---|
| 28 | my $col3 = 3; |
|---|
| 29 | #column positions 1 based coming in (for Galaxy) |
|---|
| 30 | foreach (@ARGV) { |
|---|
| 31 | if (/chr=(\d+)/) { $col0 = $1 -1; } |
|---|
| 32 | elsif (/start=(\d+)/) { $col1 = $1 -1; } |
|---|
| 33 | elsif (/end=(\d+)/) { $col2 = $1 -1; } |
|---|
| 34 | elsif (/snp=(\d+)/) { $col3 = $1 -1; } |
|---|
| 35 | } |
|---|
| 36 | if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) { |
|---|
| 37 | print STDERR "ERROR column numbers are given with origin 1\n"; |
|---|
| 38 | exit 1; |
|---|
| 39 | } |
|---|
| 40 | my @genes; #bed lines for genes, sorted by chrom and start |
|---|
| 41 | my %chrSt; #index in array where each chrom starts |
|---|
| 42 | my %codon; #hash of codon amino acid conversions |
|---|
| 43 | my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom |
|---|
| 44 | my $ignoreN = 1; #skip N |
|---|
| 45 | |
|---|
| 46 | my %amb = ( |
|---|
| 47 | "R" => "A/G", |
|---|
| 48 | "Y" => "C/T", |
|---|
| 49 | "S" => "C/G", |
|---|
| 50 | "W" => "A/T", |
|---|
| 51 | "K" => "G/T", |
|---|
| 52 | "M" => "A/C", |
|---|
| 53 | "B" => "C/G/T", |
|---|
| 54 | "D" => "A/G/T", |
|---|
| 55 | "H" => "A/C/T", |
|---|
| 56 | "V" => "A/C/G", |
|---|
| 57 | "N" => "A/C/G/T" |
|---|
| 58 | ); |
|---|
| 59 | fill_codon(); |
|---|
| 60 | open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") |
|---|
| 61 | or die "Couldn't open and sort $geneFile, $!\n"; |
|---|
| 62 | my $i = 0; |
|---|
| 63 | while(<FH>) { |
|---|
| 64 | chomp; |
|---|
| 65 | if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; } |
|---|
| 66 | push(@genes, "$_"); |
|---|
| 67 | my @f = split(/\t/); |
|---|
| 68 | if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; } |
|---|
| 69 | $i++; |
|---|
| 70 | } |
|---|
| 71 | close FH or die "Couldn't close $geneFile, $!\n"; |
|---|
| 72 | |
|---|
| 73 | if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; } |
|---|
| 74 | |
|---|
| 75 | #open snps sorted as well |
|---|
| 76 | my $s1 = $col0 + 1; #sort order is origin 1 |
|---|
| 77 | my $s2 = $col1 + 1; |
|---|
| 78 | open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |") |
|---|
| 79 | or die "Couldn't open and sort $snpFile, $!\n"; |
|---|
| 80 | $i = 0; |
|---|
| 81 | my @g; #one genes fields, should be used repeatedly |
|---|
| 82 | my %done; |
|---|
| 83 | while(<FH>) { |
|---|
| 84 | chomp; |
|---|
| 85 | if (/^\s*#/) { next; } #comment |
|---|
| 86 | my @s = split(/\t/); #SNP fields |
|---|
| 87 | if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; } |
|---|
| 88 | my $size = $#s; |
|---|
| 89 | if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) { |
|---|
| 90 | print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n"; |
|---|
| 91 | exit 1; |
|---|
| 92 | } |
|---|
| 93 | if ($s[$col1] =~ /\D/) { |
|---|
| 94 | print STDERR "ERROR the start point must be an integer not $s[$col1]\n"; |
|---|
| 95 | exit 1; |
|---|
| 96 | } |
|---|
| 97 | if ($s[$col2] =~ /\D/) { |
|---|
| 98 | print STDERR "ERROR the start point must be an integer not $s[$col2]\n"; |
|---|
| 99 | exit 1; |
|---|
| 100 | } |
|---|
| 101 | if ($s[$col3] eq 'N' && $ignoreN) { next; } |
|---|
| 102 | if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; } |
|---|
| 103 | if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row |
|---|
| 104 | $i = $chrSt{$s[$col0]}; |
|---|
| 105 | @g = split(/\t/, $genes[$i]); |
|---|
| 106 | }elsif (!@g) { |
|---|
| 107 | next; #no gene for this chrom |
|---|
| 108 | }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom |
|---|
| 109 | $i = $chrSt{$s[$col0]}; |
|---|
| 110 | @g = split(/\t/, $genes[$i]); |
|---|
| 111 | }elsif ($s[$col0] ne $g[0]) { |
|---|
| 112 | next; #no gene for this chrom |
|---|
| 113 | }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { |
|---|
| 114 | next; #before any genes |
|---|
| 115 | }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { |
|---|
| 116 | next; #after all genes on chr |
|---|
| 117 | }else { |
|---|
| 118 | while ($s[$col1] > $g[2] && $i < $#genes) { |
|---|
| 119 | $i++; |
|---|
| 120 | @g = split(/\t/, $genes[$i]); |
|---|
| 121 | if ($s[$col0] ne $g[0]) { last; } #end of gene |
|---|
| 122 | } |
|---|
| 123 | if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { |
|---|
| 124 | next; #no overlap with genes |
|---|
| 125 | } |
|---|
| 126 | } |
|---|
| 127 | |
|---|
| 128 | processSnp(\@s, \@g); |
|---|
| 129 | if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } |
|---|
| 130 | |
|---|
| 131 | my $k = $i + 1; #check for more genes without losing data of first |
|---|
| 132 | if ($k <= $#genes) { |
|---|
| 133 | my @g2 = split(/\t/, $genes[$k]); |
|---|
| 134 | while (@g2 && $k <= $#genes) { |
|---|
| 135 | @g2 = split(/\t/, $genes[$k]); |
|---|
| 136 | if ($s[$col0] ne $g2[0]) { |
|---|
| 137 | undef @g2; |
|---|
| 138 | last; #not same chrom |
|---|
| 139 | }else { |
|---|
| 140 | while ($s[$col1] > $g2[2] && $k <= $#genes) { |
|---|
| 141 | $k++; |
|---|
| 142 | @g2 = split(/\t/, $genes[$k]); |
|---|
| 143 | if ($s[$col0] ne $g2[0]) { last; } #end of chrom |
|---|
| 144 | } |
|---|
| 145 | if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { |
|---|
| 146 | undef @g2; |
|---|
| 147 | last; #no overlap with more genes |
|---|
| 148 | } |
|---|
| 149 | processSnp(\@s, \@g2); |
|---|
| 150 | if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } |
|---|
| 151 | } |
|---|
| 152 | $k++; |
|---|
| 153 | } |
|---|
| 154 | } |
|---|
| 155 | } |
|---|
| 156 | close FH or die "Couldn't close $snpFile, $!\n"; |
|---|
| 157 | |
|---|
| 158 | exit; |
|---|
| 159 | |
|---|
| 160 | ######################################################################## |
|---|
| 161 | sub processSnp { |
|---|
| 162 | my $sref = shift; |
|---|
| 163 | my $gref = shift; |
|---|
| 164 | #overlaps gene, but maybe not coding seq |
|---|
| 165 | #inside cds |
|---|
| 166 | if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { |
|---|
| 167 | return; #outside of coding |
|---|
| 168 | } |
|---|
| 169 | #now check exon |
|---|
| 170 | my $i = 0; |
|---|
| 171 | my @st = split(/,/, $gref->[11]); |
|---|
| 172 | my @size = split(/,/, $gref->[10]); |
|---|
| 173 | if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } |
|---|
| 174 | my @pos; |
|---|
| 175 | my $in = 0; |
|---|
| 176 | for($i = 0; $i < $gref->[9]; $i++) { |
|---|
| 177 | my $sta = $gref->[1] + $st[$i] + 1; #1 based position |
|---|
| 178 | my $end = $sta + $size[$i] - 1; # |
|---|
| 179 | if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes |
|---|
| 180 | if ($end < $gref->[6]) { next; } #utr only |
|---|
| 181 | if ($sta > $gref->[7]) { next; } #utr only |
|---|
| 182 | #shorten to coding only |
|---|
| 183 | if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } |
|---|
| 184 | if ($end > $gref->[7]) { $end = $gref->[7]; } |
|---|
| 185 | if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } |
|---|
| 186 | elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } |
|---|
| 187 | push(@pos, ($sta .. $end)); #add exon worth of positions |
|---|
| 188 | } |
|---|
| 189 | #@pos has coding positions for whole gene (chr coors), |
|---|
| 190 | #and $in has whether we need to continue |
|---|
| 191 | if (!$in) { return; } #not in coding exon |
|---|
| 192 | if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons |
|---|
| 193 | if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or |
|---|
| 194 | $sref->[$col3] =~ /^-+$/) { #indel or del |
|---|
| 195 | my $copy = $sref->[$col3]; |
|---|
| 196 | my $c = ($copy =~ tr/-//); |
|---|
| 197 | if ($c % 3 == 0) { return; } #not frameshift |
|---|
| 198 | #handle bed4 to bed4 + 4 (pgSnp) |
|---|
| 199 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
|---|
| 200 | #if ($sref->[4]) { print "\t$sref->[4]"; } |
|---|
| 201 | #if ($sref->[5]) { print "\t$sref->[5]"; } |
|---|
| 202 | #if ($sref->[6]) { print "\t$sref->[6]"; } |
|---|
| 203 | #if ($sref->[7]) { print "\t$sref->[7]"; } |
|---|
| 204 | print "\t$gref->[3]\tframeshift\n"; |
|---|
| 205 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
|---|
| 206 | return; |
|---|
| 207 | }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion |
|---|
| 208 | my $copy = $sref->[$col3]; |
|---|
| 209 | my $c = ($copy =~ tr/\[ACTG]+//); |
|---|
| 210 | if ($c % 3 == 0) { return; } #not frameshift |
|---|
| 211 | #handle bed4 to bed4 + 4 (pgSnp) |
|---|
| 212 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
|---|
| 213 | #if ($sref->[4]) { print "\t$sref->[4]"; } |
|---|
| 214 | #if ($sref->[5]) { print "\t$sref->[5]"; } |
|---|
| 215 | #if ($sref->[6]) { print "\t$sref->[6]"; } |
|---|
| 216 | #if ($sref->[7]) { print "\t$sref->[7]"; } |
|---|
| 217 | print "\t$gref->[3]\tframeshift\n"; |
|---|
| 218 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
|---|
| 219 | return; |
|---|
| 220 | } |
|---|
| 221 | #check for amino acid substitutions |
|---|
| 222 | my $s = $sref->[$col1] + 1; |
|---|
| 223 | my $e = $sref->[$col2]; |
|---|
| 224 | my $len = $sref->[$col2] - $sref->[$col1]; |
|---|
| 225 | if ($gref->[5] eq '-') { |
|---|
| 226 | @pos = reverse(@pos); |
|---|
| 227 | my $t = $s; |
|---|
| 228 | $s = $e; |
|---|
| 229 | $e = $t; |
|---|
| 230 | } |
|---|
| 231 | $i = 0; |
|---|
| 232 | my $found = 0; |
|---|
| 233 | foreach (@pos) { |
|---|
| 234 | if ($s == $_) { |
|---|
| 235 | $found = 1; |
|---|
| 236 | last; |
|---|
| 237 | } |
|---|
| 238 | $i++; |
|---|
| 239 | } |
|---|
| 240 | if ($found) { |
|---|
| 241 | my $fs = $i; #keep original start index |
|---|
| 242 | #have index where substitution starts |
|---|
| 243 | my $cp = $i % 3; |
|---|
| 244 | $i -= $cp; #i is now first position in codon |
|---|
| 245 | my $cdNum = int($i / 3) + 1; |
|---|
| 246 | my $ls = $i; |
|---|
| 247 | if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } |
|---|
| 248 | if (!@pos) { die "ERROR not defined array pos\n"; } |
|---|
| 249 | if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } |
|---|
| 250 | if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } |
|---|
| 251 | while ($ls <= $#pos && $pos[$ls] ne $e) { |
|---|
| 252 | $ls++; |
|---|
| 253 | } |
|---|
| 254 | my $i2 = $ls + (2 - ($ls % 3)); |
|---|
| 255 | if ($i2 > $#pos) { return; } #not a full codon, partial gene? |
|---|
| 256 | |
|---|
| 257 | if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } |
|---|
| 258 | my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); |
|---|
| 259 | if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; } |
|---|
| 260 | my @vars = split(/\//, $sref->[$col3]); |
|---|
| 261 | if ($gref->[5] eq '-') { #complement oldnts and revcomp vars |
|---|
| 262 | $oldnts = compl($oldnts); |
|---|
| 263 | if (!$oldnts) { return; } #skip this one |
|---|
| 264 | $oldnts = join('', (reverse(split(/ */, $oldnts)))); |
|---|
| 265 | foreach (@vars) { |
|---|
| 266 | $_ = reverse(split(/ */)); |
|---|
| 267 | $_ = compl($_); |
|---|
| 268 | } |
|---|
| 269 | } |
|---|
| 270 | my $r = $fs - $i; #difference in old indexes gives new index |
|---|
| 271 | my @newnts; |
|---|
| 272 | my $changed = ''; |
|---|
| 273 | foreach my $v (@vars) { |
|---|
| 274 | if (!$v or length($v) != 1) { return; } #only simple changes |
|---|
| 275 | my @new = split(/ */, $oldnts); |
|---|
| 276 | $changed = splice(@new, $r, $len, split(/ */, $v)); |
|---|
| 277 | #should only change single nt |
|---|
| 278 | push(@newnts, join("", @new)); |
|---|
| 279 | } |
|---|
| 280 | #now compute amino acids |
|---|
| 281 | my $oldaa = getaa($oldnts); |
|---|
| 282 | my @newaa; |
|---|
| 283 | my $change = 0; #flag for if there is a change |
|---|
| 284 | foreach my $v (@newnts) { |
|---|
| 285 | my $t = getaa($v); |
|---|
| 286 | if ($t ne $oldaa) { $change = 1; } |
|---|
| 287 | push(@newaa, $t); |
|---|
| 288 | } |
|---|
| 289 | if (!$change && $syn) { |
|---|
| 290 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
|---|
| 291 | print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; |
|---|
| 292 | return; |
|---|
| 293 | }elsif ($syn) { return; } #only want synonymous changes |
|---|
| 294 | if (!$change) { return; } #no change in amino acids |
|---|
| 295 | #if (abs($pos[$i] - $pos[$i2]) > 200) { |
|---|
| 296 | #print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n"; |
|---|
| 297 | #print STDERR "old $oldaa, new ", join(', ', @newaa), "\n"; |
|---|
| 298 | #print STDERR "oldnt $oldnts, strand $gref->[5]\n"; |
|---|
| 299 | #exit; |
|---|
| 300 | #} |
|---|
| 301 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
|---|
| 302 | #if (defined $sref->[4]) { print "\t$sref->[4]"; } |
|---|
| 303 | #if (defined $sref->[5]) { print "\t$sref->[5]"; } |
|---|
| 304 | #if (defined $sref->[6]) { print "\t$sref->[6]"; } |
|---|
| 305 | #if (defined $sref->[7]) { print "\t$sref->[7]"; } |
|---|
| 306 | if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref |
|---|
| 307 | if (!$changed) { return; } #skip this one |
|---|
| 308 | print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; |
|---|
| 309 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
|---|
| 310 | } |
|---|
| 311 | } |
|---|
| 312 | |
|---|
| 313 | sub getnts { |
|---|
| 314 | my $chr = shift; |
|---|
| 315 | my @pos = @_; #list of positions not necessarily in order |
|---|
| 316 | #list may be reversed or have gaps(introns), at least 3 bps |
|---|
| 317 | my $seq = ''; |
|---|
| 318 | if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } |
|---|
| 319 | if ($pos[0] < $pos[1]) { #not reversed |
|---|
| 320 | my $s = $pos[0]; |
|---|
| 321 | for(my $i = 1; $i <= $#pos; $i++) { |
|---|
| 322 | if ($pos[$i] == $pos[$i-1] + 1) { next; } |
|---|
| 323 | if ($seqFlag eq '2bit') { |
|---|
| 324 | $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); |
|---|
| 325 | }else { |
|---|
| 326 | $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); |
|---|
| 327 | } |
|---|
| 328 | $s = $pos[$i]; |
|---|
| 329 | } |
|---|
| 330 | if (length $seq != scalar @pos) { #still need to fetch seq |
|---|
| 331 | #if (abs($pos[$#pos]-$pos[0]) > 200) { |
|---|
| 332 | #print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n"; |
|---|
| 333 | #exit; |
|---|
| 334 | #} |
|---|
| 335 | if ($seqFlag eq '2bit') { |
|---|
| 336 | $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); |
|---|
| 337 | }else { |
|---|
| 338 | $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); |
|---|
| 339 | } |
|---|
| 340 | } |
|---|
| 341 | }else { #reversed |
|---|
| 342 | my $s = $pos[$#pos]; |
|---|
| 343 | for(my $i = $#pos -1; $i >= 0; $i--) { |
|---|
| 344 | if ($pos[$i] == $pos[$i+1] + 1) { next; } |
|---|
| 345 | if ($seqFlag eq '2bit') { |
|---|
| 346 | $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); |
|---|
| 347 | }else { |
|---|
| 348 | $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); |
|---|
| 349 | } |
|---|
| 350 | $s = $pos[$i]; |
|---|
| 351 | } |
|---|
| 352 | if (length $seq != scalar @pos) { #still need to fetch seq |
|---|
| 353 | #if (abs($pos[$#pos]-$pos[0]) > 200) { |
|---|
| 354 | #print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n"; |
|---|
| 355 | #} |
|---|
| 356 | if ($seqFlag eq '2bit') { |
|---|
| 357 | $seq .= fetchSeq2bit($chr, $s, $pos[0]); |
|---|
| 358 | }else { |
|---|
| 359 | $seq .= fetchSeqNib($chr, $s, $pos[0]); |
|---|
| 360 | } |
|---|
| 361 | } |
|---|
| 362 | } |
|---|
| 363 | } |
|---|
| 364 | |
|---|
| 365 | sub fetchSeq2bit { |
|---|
| 366 | my $chr = shift; |
|---|
| 367 | my $st = shift; |
|---|
| 368 | my $end = shift; |
|---|
| 369 | my $strand = '+'; |
|---|
| 370 | $st--; #change to UCSC numbering |
|---|
| 371 | open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or |
|---|
| 372 | die "Couldn't run twoBitToFa, $!\n"; |
|---|
| 373 | my $seq = ''; |
|---|
| 374 | while (<BIT>) { |
|---|
| 375 | chomp; |
|---|
| 376 | if (/^>/) { next; } #header |
|---|
| 377 | $seq .= $_; |
|---|
| 378 | } |
|---|
| 379 | close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; |
|---|
| 380 | return $seq; |
|---|
| 381 | } |
|---|
| 382 | |
|---|
| 383 | sub fetchSeqNib { |
|---|
| 384 | my $chr = shift; |
|---|
| 385 | my $st = shift; |
|---|
| 386 | my $end = shift; |
|---|
| 387 | my $strand = '+'; |
|---|
| 388 | $st--; #change to UCSC numbering |
|---|
| 389 | open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; |
|---|
| 390 | my $seq = ''; |
|---|
| 391 | while (<NIB>) { |
|---|
| 392 | chomp; |
|---|
| 393 | if (/^>/) { next; } #header |
|---|
| 394 | $seq .= $_; |
|---|
| 395 | } |
|---|
| 396 | close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; |
|---|
| 397 | return $seq; |
|---|
| 398 | } |
|---|
| 399 | |
|---|
| 400 | sub compl { |
|---|
| 401 | my $nts = shift; |
|---|
| 402 | my $comp = ''; |
|---|
| 403 | foreach my $n (split(/ */, $nts)) { |
|---|
| 404 | if ($n eq 'A') { $comp .= 'T'; } |
|---|
| 405 | elsif ($n eq 'T') { $comp .= 'A'; } |
|---|
| 406 | elsif ($n eq 'C') { $comp .= 'G'; } |
|---|
| 407 | elsif ($n eq 'G') { $comp .= 'C'; } |
|---|
| 408 | elsif ($n eq 'N') { $comp .= 'N'; } |
|---|
| 409 | elsif ($n eq '-') { $comp .= '-'; } #deletion |
|---|
| 410 | else { $comp = undef; } |
|---|
| 411 | } |
|---|
| 412 | return $comp; |
|---|
| 413 | } |
|---|
| 414 | |
|---|
| 415 | sub getaa { |
|---|
| 416 | my $nts = shift; #in multiples of 3 |
|---|
| 417 | my $aa = ''; |
|---|
| 418 | my @n = split(/ */, $nts); |
|---|
| 419 | while (@n) { |
|---|
| 420 | my @t = splice(@n, 0, 3); |
|---|
| 421 | my $n = uc(join("", @t)); |
|---|
| 422 | if (!exists $codon{$n}) { $aa .= 'N'; next; } |
|---|
| 423 | $aa .= $codon{$n}; |
|---|
| 424 | } |
|---|
| 425 | return $aa; |
|---|
| 426 | } |
|---|
| 427 | |
|---|
| 428 | sub fill_codon { |
|---|
| 429 | $codon{GCA} = 'Ala'; |
|---|
| 430 | $codon{GCC} = 'Ala'; |
|---|
| 431 | $codon{GCG} = 'Ala'; |
|---|
| 432 | $codon{GCT} = 'Ala'; |
|---|
| 433 | $codon{CGG} = 'Arg'; |
|---|
| 434 | $codon{CGT} = 'Arg'; |
|---|
| 435 | $codon{CGC} = 'Arg'; |
|---|
| 436 | $codon{AGA} = 'Arg'; |
|---|
| 437 | $codon{AGG} = 'Arg'; |
|---|
| 438 | $codon{CGA} = 'Arg'; |
|---|
| 439 | $codon{AAC} = 'Asn'; |
|---|
| 440 | $codon{AAT} = 'Asn'; |
|---|
| 441 | $codon{GAC} = 'Asp'; |
|---|
| 442 | $codon{GAT} = 'Asp'; |
|---|
| 443 | $codon{TGC} = 'Cys'; |
|---|
| 444 | $codon{TGT} = 'Cys'; |
|---|
| 445 | $codon{CAG} = 'Gln'; |
|---|
| 446 | $codon{CAA} = 'Gln'; |
|---|
| 447 | $codon{GAA} = 'Glu'; |
|---|
| 448 | $codon{GAG} = 'Glu'; |
|---|
| 449 | $codon{GGG} = 'Gly'; |
|---|
| 450 | $codon{GGA} = 'Gly'; |
|---|
| 451 | $codon{GGC} = 'Gly'; |
|---|
| 452 | $codon{GGT} = 'Gly'; |
|---|
| 453 | $codon{CAC} = 'His'; |
|---|
| 454 | $codon{CAT} = 'His'; |
|---|
| 455 | $codon{ATA} = 'Ile'; |
|---|
| 456 | $codon{ATT} = 'Ile'; |
|---|
| 457 | $codon{ATC} = 'Ile'; |
|---|
| 458 | $codon{CTA} = 'Leu'; |
|---|
| 459 | $codon{CTC} = 'Leu'; |
|---|
| 460 | $codon{CTG} = 'Leu'; |
|---|
| 461 | $codon{CTT} = 'Leu'; |
|---|
| 462 | $codon{TTG} = 'Leu'; |
|---|
| 463 | $codon{TTA} = 'Leu'; |
|---|
| 464 | $codon{AAA} = 'Lys'; |
|---|
| 465 | $codon{AAG} = 'Lys'; |
|---|
| 466 | $codon{ATG} = 'Met'; |
|---|
| 467 | $codon{TTC} = 'Phe'; |
|---|
| 468 | $codon{TTT} = 'Phe'; |
|---|
| 469 | $codon{CCT} = 'Pro'; |
|---|
| 470 | $codon{CCA} = 'Pro'; |
|---|
| 471 | $codon{CCC} = 'Pro'; |
|---|
| 472 | $codon{CCG} = 'Pro'; |
|---|
| 473 | $codon{TCA} = 'Ser'; |
|---|
| 474 | $codon{AGC} = 'Ser'; |
|---|
| 475 | $codon{AGT} = 'Ser'; |
|---|
| 476 | $codon{TCC} = 'Ser'; |
|---|
| 477 | $codon{TCT} = 'Ser'; |
|---|
| 478 | $codon{TCG} = 'Ser'; |
|---|
| 479 | $codon{TGA} = 'Stop'; |
|---|
| 480 | $codon{TAG} = 'Stop'; |
|---|
| 481 | $codon{TAA} = 'Stop'; |
|---|
| 482 | $codon{ACT} = 'Thr'; |
|---|
| 483 | $codon{ACA} = 'Thr'; |
|---|
| 484 | $codon{ACC} = 'Thr'; |
|---|
| 485 | $codon{ACG} = 'Thr'; |
|---|
| 486 | $codon{TGG} = 'Trp'; |
|---|
| 487 | $codon{TAT} = 'Tyr'; |
|---|
| 488 | $codon{TAC} = 'Tyr'; |
|---|
| 489 | $codon{GTC} = 'Val'; |
|---|
| 490 | $codon{GTA} = 'Val'; |
|---|
| 491 | $codon{GTG} = 'Val'; |
|---|
| 492 | $codon{GTT} = 'Val'; |
|---|
| 493 | } |
|---|
| 494 | |
|---|
| 495 | sub getGalaxyInfo { |
|---|
| 496 | my $build; |
|---|
| 497 | my $locFile; |
|---|
| 498 | foreach (@ARGV) { |
|---|
| 499 | if (/build=(.*)/) { $build = $1; } |
|---|
| 500 | elsif (/loc=(.*)/) { $locFile = $1; } |
|---|
| 501 | } |
|---|
| 502 | if (!$build or !$locFile) { |
|---|
| 503 | print STDERR "ERROR missing build or locfile for Galaxy input\n"; |
|---|
| 504 | exit 1; |
|---|
| 505 | } |
|---|
| 506 | # read $locFile to get $nibDir (ignoring commets) |
|---|
| 507 | open(LF, "< $locFile") || die "open($locFile): $!\n"; |
|---|
| 508 | while(<LF>) { |
|---|
| 509 | s/#.*$//; |
|---|
| 510 | s/(?:^\s+|\s+$)//g; |
|---|
| 511 | next if (/^$/); |
|---|
| 512 | |
|---|
| 513 | my @t = split(/\t/); |
|---|
| 514 | if ($t[0] eq $build) { $nibDir = $t[1]; } |
|---|
| 515 | } |
|---|
| 516 | close(LF); |
|---|
| 517 | if ($nibDir eq 'Galaxy') { |
|---|
| 518 | print STDERR "Failed to find sequence directory in locfile $locFile\n"; |
|---|
| 519 | } |
|---|
| 520 | $nibDir .= "/$build.2bit"; #we want full path and filename |
|---|
| 521 | } |
|---|