[2] | 1 | #!/usr/bin/env perl |
---|
| 2 | |
---|
| 3 | use strict; |
---|
| 4 | use warnings; |
---|
| 5 | |
---|
| 6 | ######################################################################### |
---|
| 7 | # codingSnps.pl |
---|
| 8 | # This takes a bed file with the names being / separated nts |
---|
| 9 | # and a gene bed file with cds start and stop. |
---|
| 10 | # It then checks for changes in coding regions, reporting |
---|
| 11 | # those that cause a frameshift or substitution in the amino acid. |
---|
| 12 | ######################################################################### |
---|
| 13 | |
---|
| 14 | if (!@ARGV or scalar @ARGV < 3) { |
---|
| 15 | print "Usage: codingSnps.pl snps.bed genes.bed (/dir/nib/|Galaxy build= loc=) [chr=# start=# end=# snp=#] > codingSnps.txt\n"; |
---|
| 16 | exit; |
---|
| 17 | } |
---|
| 18 | my $uniq = 0; #flag for whether want uniq positions |
---|
| 19 | my $syn = 0; #flag for if want synonomous changes rather than non-syn |
---|
| 20 | my $snpFile = shift @ARGV; |
---|
| 21 | my $geneFile = shift @ARGV; |
---|
| 22 | my $nibDir = shift @ARGV; |
---|
| 23 | if ($nibDir eq 'Galaxy') { getGalaxyInfo(); } |
---|
| 24 | my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib |
---|
| 25 | my $col0 = 0; #bed like columns in default positions |
---|
| 26 | my $col1 = 1; |
---|
| 27 | my $col2 = 2; |
---|
| 28 | my $col3 = 3; |
---|
| 29 | #column positions 1 based coming in (for Galaxy) |
---|
| 30 | foreach (@ARGV) { |
---|
| 31 | if (/chr=(\d+)/) { $col0 = $1 -1; } |
---|
| 32 | elsif (/start=(\d+)/) { $col1 = $1 -1; } |
---|
| 33 | elsif (/end=(\d+)/) { $col2 = $1 -1; } |
---|
| 34 | elsif (/snp=(\d+)/) { $col3 = $1 -1; } |
---|
| 35 | } |
---|
| 36 | if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) { |
---|
| 37 | print STDERR "ERROR column numbers are given with origin 1\n"; |
---|
| 38 | exit 1; |
---|
| 39 | } |
---|
| 40 | my @genes; #bed lines for genes, sorted by chrom and start |
---|
| 41 | my %chrSt; #index in array where each chrom starts |
---|
| 42 | my %codon; #hash of codon amino acid conversions |
---|
| 43 | my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom |
---|
| 44 | my $ignoreN = 1; #skip N |
---|
| 45 | |
---|
| 46 | my %amb = ( |
---|
| 47 | "R" => "A/G", |
---|
| 48 | "Y" => "C/T", |
---|
| 49 | "S" => "C/G", |
---|
| 50 | "W" => "A/T", |
---|
| 51 | "K" => "G/T", |
---|
| 52 | "M" => "A/C", |
---|
| 53 | "B" => "C/G/T", |
---|
| 54 | "D" => "A/G/T", |
---|
| 55 | "H" => "A/C/T", |
---|
| 56 | "V" => "A/C/G", |
---|
| 57 | "N" => "A/C/G/T" |
---|
| 58 | ); |
---|
| 59 | fill_codon(); |
---|
| 60 | open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") |
---|
| 61 | or die "Couldn't open and sort $geneFile, $!\n"; |
---|
| 62 | my $i = 0; |
---|
| 63 | while(<FH>) { |
---|
| 64 | chomp; |
---|
| 65 | if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; } |
---|
| 66 | push(@genes, "$_"); |
---|
| 67 | my @f = split(/\t/); |
---|
| 68 | if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; } |
---|
| 69 | $i++; |
---|
| 70 | } |
---|
| 71 | close FH or die "Couldn't close $geneFile, $!\n"; |
---|
| 72 | |
---|
| 73 | if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; } |
---|
| 74 | |
---|
| 75 | #open snps sorted as well |
---|
| 76 | my $s1 = $col0 + 1; #sort order is origin 1 |
---|
| 77 | my $s2 = $col1 + 1; |
---|
| 78 | open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |") |
---|
| 79 | or die "Couldn't open and sort $snpFile, $!\n"; |
---|
| 80 | $i = 0; |
---|
| 81 | my @g; #one genes fields, should be used repeatedly |
---|
| 82 | my %done; |
---|
| 83 | while(<FH>) { |
---|
| 84 | chomp; |
---|
| 85 | if (/^\s*#/) { next; } #comment |
---|
| 86 | my @s = split(/\t/); #SNP fields |
---|
| 87 | if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; } |
---|
| 88 | my $size = $#s; |
---|
| 89 | if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) { |
---|
| 90 | print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n"; |
---|
| 91 | exit 1; |
---|
| 92 | } |
---|
| 93 | if ($s[$col1] =~ /\D/) { |
---|
| 94 | print STDERR "ERROR the start point must be an integer not $s[$col1]\n"; |
---|
| 95 | exit 1; |
---|
| 96 | } |
---|
| 97 | if ($s[$col2] =~ /\D/) { |
---|
| 98 | print STDERR "ERROR the start point must be an integer not $s[$col2]\n"; |
---|
| 99 | exit 1; |
---|
| 100 | } |
---|
| 101 | if ($s[$col3] eq 'N' && $ignoreN) { next; } |
---|
| 102 | if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; } |
---|
| 103 | if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row |
---|
| 104 | $i = $chrSt{$s[$col0]}; |
---|
| 105 | @g = split(/\t/, $genes[$i]); |
---|
| 106 | }elsif (!@g) { |
---|
| 107 | next; #no gene for this chrom |
---|
| 108 | }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom |
---|
| 109 | $i = $chrSt{$s[$col0]}; |
---|
| 110 | @g = split(/\t/, $genes[$i]); |
---|
| 111 | }elsif ($s[$col0] ne $g[0]) { |
---|
| 112 | next; #no gene for this chrom |
---|
| 113 | }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { |
---|
| 114 | next; #before any genes |
---|
| 115 | }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { |
---|
| 116 | next; #after all genes on chr |
---|
| 117 | }else { |
---|
| 118 | while ($s[$col1] > $g[2] && $i < $#genes) { |
---|
| 119 | $i++; |
---|
| 120 | @g = split(/\t/, $genes[$i]); |
---|
| 121 | if ($s[$col0] ne $g[0]) { last; } #end of gene |
---|
| 122 | } |
---|
| 123 | if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { |
---|
| 124 | next; #no overlap with genes |
---|
| 125 | } |
---|
| 126 | } |
---|
| 127 | |
---|
| 128 | processSnp(\@s, \@g); |
---|
| 129 | if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } |
---|
| 130 | |
---|
| 131 | my $k = $i + 1; #check for more genes without losing data of first |
---|
| 132 | if ($k <= $#genes) { |
---|
| 133 | my @g2 = split(/\t/, $genes[$k]); |
---|
| 134 | while (@g2 && $k <= $#genes) { |
---|
| 135 | @g2 = split(/\t/, $genes[$k]); |
---|
| 136 | if ($s[$col0] ne $g2[0]) { |
---|
| 137 | undef @g2; |
---|
| 138 | last; #not same chrom |
---|
| 139 | }else { |
---|
| 140 | while ($s[$col1] > $g2[2] && $k <= $#genes) { |
---|
| 141 | $k++; |
---|
| 142 | @g2 = split(/\t/, $genes[$k]); |
---|
| 143 | if ($s[$col0] ne $g2[0]) { last; } #end of chrom |
---|
| 144 | } |
---|
| 145 | if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { |
---|
| 146 | undef @g2; |
---|
| 147 | last; #no overlap with more genes |
---|
| 148 | } |
---|
| 149 | processSnp(\@s, \@g2); |
---|
| 150 | if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } |
---|
| 151 | } |
---|
| 152 | $k++; |
---|
| 153 | } |
---|
| 154 | } |
---|
| 155 | } |
---|
| 156 | close FH or die "Couldn't close $snpFile, $!\n"; |
---|
| 157 | |
---|
| 158 | exit; |
---|
| 159 | |
---|
| 160 | ######################################################################## |
---|
| 161 | sub processSnp { |
---|
| 162 | my $sref = shift; |
---|
| 163 | my $gref = shift; |
---|
| 164 | #overlaps gene, but maybe not coding seq |
---|
| 165 | #inside cds |
---|
| 166 | if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { |
---|
| 167 | return; #outside of coding |
---|
| 168 | } |
---|
| 169 | #now check exon |
---|
| 170 | my $i = 0; |
---|
| 171 | my @st = split(/,/, $gref->[11]); |
---|
| 172 | my @size = split(/,/, $gref->[10]); |
---|
| 173 | if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } |
---|
| 174 | my @pos; |
---|
| 175 | my $in = 0; |
---|
| 176 | for($i = 0; $i < $gref->[9]; $i++) { |
---|
| 177 | my $sta = $gref->[1] + $st[$i] + 1; #1 based position |
---|
| 178 | my $end = $sta + $size[$i] - 1; # |
---|
| 179 | if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes |
---|
| 180 | if ($end < $gref->[6]) { next; } #utr only |
---|
| 181 | if ($sta > $gref->[7]) { next; } #utr only |
---|
| 182 | #shorten to coding only |
---|
| 183 | if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } |
---|
| 184 | if ($end > $gref->[7]) { $end = $gref->[7]; } |
---|
| 185 | if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } |
---|
| 186 | elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } |
---|
| 187 | push(@pos, ($sta .. $end)); #add exon worth of positions |
---|
| 188 | } |
---|
| 189 | #@pos has coding positions for whole gene (chr coors), |
---|
| 190 | #and $in has whether we need to continue |
---|
| 191 | if (!$in) { return; } #not in coding exon |
---|
| 192 | if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons |
---|
| 193 | if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or |
---|
| 194 | $sref->[$col3] =~ /^-+$/) { #indel or del |
---|
| 195 | my $copy = $sref->[$col3]; |
---|
| 196 | my $c = ($copy =~ tr/-//); |
---|
| 197 | if ($c % 3 == 0) { return; } #not frameshift |
---|
| 198 | #handle bed4 to bed4 + 4 (pgSnp) |
---|
| 199 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
| 200 | #if ($sref->[4]) { print "\t$sref->[4]"; } |
---|
| 201 | #if ($sref->[5]) { print "\t$sref->[5]"; } |
---|
| 202 | #if ($sref->[6]) { print "\t$sref->[6]"; } |
---|
| 203 | #if ($sref->[7]) { print "\t$sref->[7]"; } |
---|
| 204 | print "\t$gref->[3]\tframeshift\n"; |
---|
| 205 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
---|
| 206 | return; |
---|
| 207 | }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion |
---|
| 208 | my $copy = $sref->[$col3]; |
---|
| 209 | my $c = ($copy =~ tr/\[ACTG]+//); |
---|
| 210 | if ($c % 3 == 0) { return; } #not frameshift |
---|
| 211 | #handle bed4 to bed4 + 4 (pgSnp) |
---|
| 212 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
| 213 | #if ($sref->[4]) { print "\t$sref->[4]"; } |
---|
| 214 | #if ($sref->[5]) { print "\t$sref->[5]"; } |
---|
| 215 | #if ($sref->[6]) { print "\t$sref->[6]"; } |
---|
| 216 | #if ($sref->[7]) { print "\t$sref->[7]"; } |
---|
| 217 | print "\t$gref->[3]\tframeshift\n"; |
---|
| 218 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
---|
| 219 | return; |
---|
| 220 | } |
---|
| 221 | #check for amino acid substitutions |
---|
| 222 | my $s = $sref->[$col1] + 1; |
---|
| 223 | my $e = $sref->[$col2]; |
---|
| 224 | my $len = $sref->[$col2] - $sref->[$col1]; |
---|
| 225 | if ($gref->[5] eq '-') { |
---|
| 226 | @pos = reverse(@pos); |
---|
| 227 | my $t = $s; |
---|
| 228 | $s = $e; |
---|
| 229 | $e = $t; |
---|
| 230 | } |
---|
| 231 | $i = 0; |
---|
| 232 | my $found = 0; |
---|
| 233 | foreach (@pos) { |
---|
| 234 | if ($s == $_) { |
---|
| 235 | $found = 1; |
---|
| 236 | last; |
---|
| 237 | } |
---|
| 238 | $i++; |
---|
| 239 | } |
---|
| 240 | if ($found) { |
---|
| 241 | my $fs = $i; #keep original start index |
---|
| 242 | #have index where substitution starts |
---|
| 243 | my $cp = $i % 3; |
---|
| 244 | $i -= $cp; #i is now first position in codon |
---|
| 245 | my $cdNum = int($i / 3) + 1; |
---|
| 246 | my $ls = $i; |
---|
| 247 | if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } |
---|
| 248 | if (!@pos) { die "ERROR not defined array pos\n"; } |
---|
| 249 | if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } |
---|
| 250 | if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } |
---|
| 251 | while ($ls <= $#pos && $pos[$ls] ne $e) { |
---|
| 252 | $ls++; |
---|
| 253 | } |
---|
| 254 | my $i2 = $ls + (2 - ($ls % 3)); |
---|
| 255 | if ($i2 > $#pos) { return; } #not a full codon, partial gene? |
---|
| 256 | |
---|
| 257 | if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } |
---|
| 258 | my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); |
---|
| 259 | if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; } |
---|
| 260 | my @vars = split(/\//, $sref->[$col3]); |
---|
| 261 | if ($gref->[5] eq '-') { #complement oldnts and revcomp vars |
---|
| 262 | $oldnts = compl($oldnts); |
---|
| 263 | if (!$oldnts) { return; } #skip this one |
---|
| 264 | $oldnts = join('', (reverse(split(/ */, $oldnts)))); |
---|
| 265 | foreach (@vars) { |
---|
| 266 | $_ = reverse(split(/ */)); |
---|
| 267 | $_ = compl($_); |
---|
| 268 | } |
---|
| 269 | } |
---|
| 270 | my $r = $fs - $i; #difference in old indexes gives new index |
---|
| 271 | my @newnts; |
---|
| 272 | my $changed = ''; |
---|
| 273 | foreach my $v (@vars) { |
---|
| 274 | if (!$v or length($v) != 1) { return; } #only simple changes |
---|
| 275 | my @new = split(/ */, $oldnts); |
---|
| 276 | $changed = splice(@new, $r, $len, split(/ */, $v)); |
---|
| 277 | #should only change single nt |
---|
| 278 | push(@newnts, join("", @new)); |
---|
| 279 | } |
---|
| 280 | #now compute amino acids |
---|
| 281 | my $oldaa = getaa($oldnts); |
---|
| 282 | my @newaa; |
---|
| 283 | my $change = 0; #flag for if there is a change |
---|
| 284 | foreach my $v (@newnts) { |
---|
| 285 | my $t = getaa($v); |
---|
| 286 | if ($t ne $oldaa) { $change = 1; } |
---|
| 287 | push(@newaa, $t); |
---|
| 288 | } |
---|
| 289 | if (!$change && $syn) { |
---|
| 290 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
| 291 | print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; |
---|
| 292 | return; |
---|
| 293 | }elsif ($syn) { return; } #only want synonymous changes |
---|
| 294 | if (!$change) { return; } #no change in amino acids |
---|
| 295 | #if (abs($pos[$i] - $pos[$i2]) > 200) { |
---|
| 296 | #print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n"; |
---|
| 297 | #print STDERR "old $oldaa, new ", join(', ', @newaa), "\n"; |
---|
| 298 | #print STDERR "oldnt $oldnts, strand $gref->[5]\n"; |
---|
| 299 | #exit; |
---|
| 300 | #} |
---|
| 301 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
| 302 | #if (defined $sref->[4]) { print "\t$sref->[4]"; } |
---|
| 303 | #if (defined $sref->[5]) { print "\t$sref->[5]"; } |
---|
| 304 | #if (defined $sref->[6]) { print "\t$sref->[6]"; } |
---|
| 305 | #if (defined $sref->[7]) { print "\t$sref->[7]"; } |
---|
| 306 | if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref |
---|
| 307 | if (!$changed) { return; } #skip this one |
---|
| 308 | print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; |
---|
| 309 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
---|
| 310 | } |
---|
| 311 | } |
---|
| 312 | |
---|
| 313 | sub getnts { |
---|
| 314 | my $chr = shift; |
---|
| 315 | my @pos = @_; #list of positions not necessarily in order |
---|
| 316 | #list may be reversed or have gaps(introns), at least 3 bps |
---|
| 317 | my $seq = ''; |
---|
| 318 | if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } |
---|
| 319 | if ($pos[0] < $pos[1]) { #not reversed |
---|
| 320 | my $s = $pos[0]; |
---|
| 321 | for(my $i = 1; $i <= $#pos; $i++) { |
---|
| 322 | if ($pos[$i] == $pos[$i-1] + 1) { next; } |
---|
| 323 | if ($seqFlag eq '2bit') { |
---|
| 324 | $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); |
---|
| 325 | }else { |
---|
| 326 | $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); |
---|
| 327 | } |
---|
| 328 | $s = $pos[$i]; |
---|
| 329 | } |
---|
| 330 | if (length $seq != scalar @pos) { #still need to fetch seq |
---|
| 331 | #if (abs($pos[$#pos]-$pos[0]) > 200) { |
---|
| 332 | #print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n"; |
---|
| 333 | #exit; |
---|
| 334 | #} |
---|
| 335 | if ($seqFlag eq '2bit') { |
---|
| 336 | $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); |
---|
| 337 | }else { |
---|
| 338 | $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); |
---|
| 339 | } |
---|
| 340 | } |
---|
| 341 | }else { #reversed |
---|
| 342 | my $s = $pos[$#pos]; |
---|
| 343 | for(my $i = $#pos -1; $i >= 0; $i--) { |
---|
| 344 | if ($pos[$i] == $pos[$i+1] + 1) { next; } |
---|
| 345 | if ($seqFlag eq '2bit') { |
---|
| 346 | $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); |
---|
| 347 | }else { |
---|
| 348 | $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); |
---|
| 349 | } |
---|
| 350 | $s = $pos[$i]; |
---|
| 351 | } |
---|
| 352 | if (length $seq != scalar @pos) { #still need to fetch seq |
---|
| 353 | #if (abs($pos[$#pos]-$pos[0]) > 200) { |
---|
| 354 | #print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n"; |
---|
| 355 | #} |
---|
| 356 | if ($seqFlag eq '2bit') { |
---|
| 357 | $seq .= fetchSeq2bit($chr, $s, $pos[0]); |
---|
| 358 | }else { |
---|
| 359 | $seq .= fetchSeqNib($chr, $s, $pos[0]); |
---|
| 360 | } |
---|
| 361 | } |
---|
| 362 | } |
---|
| 363 | } |
---|
| 364 | |
---|
| 365 | sub fetchSeq2bit { |
---|
| 366 | my $chr = shift; |
---|
| 367 | my $st = shift; |
---|
| 368 | my $end = shift; |
---|
| 369 | my $strand = '+'; |
---|
| 370 | $st--; #change to UCSC numbering |
---|
| 371 | open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or |
---|
| 372 | die "Couldn't run twoBitToFa, $!\n"; |
---|
| 373 | my $seq = ''; |
---|
| 374 | while (<BIT>) { |
---|
| 375 | chomp; |
---|
| 376 | if (/^>/) { next; } #header |
---|
| 377 | $seq .= $_; |
---|
| 378 | } |
---|
| 379 | close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; |
---|
| 380 | return $seq; |
---|
| 381 | } |
---|
| 382 | |
---|
| 383 | sub fetchSeqNib { |
---|
| 384 | my $chr = shift; |
---|
| 385 | my $st = shift; |
---|
| 386 | my $end = shift; |
---|
| 387 | my $strand = '+'; |
---|
| 388 | $st--; #change to UCSC numbering |
---|
| 389 | open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; |
---|
| 390 | my $seq = ''; |
---|
| 391 | while (<NIB>) { |
---|
| 392 | chomp; |
---|
| 393 | if (/^>/) { next; } #header |
---|
| 394 | $seq .= $_; |
---|
| 395 | } |
---|
| 396 | close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; |
---|
| 397 | return $seq; |
---|
| 398 | } |
---|
| 399 | |
---|
| 400 | sub compl { |
---|
| 401 | my $nts = shift; |
---|
| 402 | my $comp = ''; |
---|
| 403 | foreach my $n (split(/ */, $nts)) { |
---|
| 404 | if ($n eq 'A') { $comp .= 'T'; } |
---|
| 405 | elsif ($n eq 'T') { $comp .= 'A'; } |
---|
| 406 | elsif ($n eq 'C') { $comp .= 'G'; } |
---|
| 407 | elsif ($n eq 'G') { $comp .= 'C'; } |
---|
| 408 | elsif ($n eq 'N') { $comp .= 'N'; } |
---|
| 409 | elsif ($n eq '-') { $comp .= '-'; } #deletion |
---|
| 410 | else { $comp = undef; } |
---|
| 411 | } |
---|
| 412 | return $comp; |
---|
| 413 | } |
---|
| 414 | |
---|
| 415 | sub getaa { |
---|
| 416 | my $nts = shift; #in multiples of 3 |
---|
| 417 | my $aa = ''; |
---|
| 418 | my @n = split(/ */, $nts); |
---|
| 419 | while (@n) { |
---|
| 420 | my @t = splice(@n, 0, 3); |
---|
| 421 | my $n = uc(join("", @t)); |
---|
| 422 | if (!exists $codon{$n}) { $aa .= 'N'; next; } |
---|
| 423 | $aa .= $codon{$n}; |
---|
| 424 | } |
---|
| 425 | return $aa; |
---|
| 426 | } |
---|
| 427 | |
---|
| 428 | sub fill_codon { |
---|
| 429 | $codon{GCA} = 'Ala'; |
---|
| 430 | $codon{GCC} = 'Ala'; |
---|
| 431 | $codon{GCG} = 'Ala'; |
---|
| 432 | $codon{GCT} = 'Ala'; |
---|
| 433 | $codon{CGG} = 'Arg'; |
---|
| 434 | $codon{CGT} = 'Arg'; |
---|
| 435 | $codon{CGC} = 'Arg'; |
---|
| 436 | $codon{AGA} = 'Arg'; |
---|
| 437 | $codon{AGG} = 'Arg'; |
---|
| 438 | $codon{CGA} = 'Arg'; |
---|
| 439 | $codon{AAC} = 'Asn'; |
---|
| 440 | $codon{AAT} = 'Asn'; |
---|
| 441 | $codon{GAC} = 'Asp'; |
---|
| 442 | $codon{GAT} = 'Asp'; |
---|
| 443 | $codon{TGC} = 'Cys'; |
---|
| 444 | $codon{TGT} = 'Cys'; |
---|
| 445 | $codon{CAG} = 'Gln'; |
---|
| 446 | $codon{CAA} = 'Gln'; |
---|
| 447 | $codon{GAA} = 'Glu'; |
---|
| 448 | $codon{GAG} = 'Glu'; |
---|
| 449 | $codon{GGG} = 'Gly'; |
---|
| 450 | $codon{GGA} = 'Gly'; |
---|
| 451 | $codon{GGC} = 'Gly'; |
---|
| 452 | $codon{GGT} = 'Gly'; |
---|
| 453 | $codon{CAC} = 'His'; |
---|
| 454 | $codon{CAT} = 'His'; |
---|
| 455 | $codon{ATA} = 'Ile'; |
---|
| 456 | $codon{ATT} = 'Ile'; |
---|
| 457 | $codon{ATC} = 'Ile'; |
---|
| 458 | $codon{CTA} = 'Leu'; |
---|
| 459 | $codon{CTC} = 'Leu'; |
---|
| 460 | $codon{CTG} = 'Leu'; |
---|
| 461 | $codon{CTT} = 'Leu'; |
---|
| 462 | $codon{TTG} = 'Leu'; |
---|
| 463 | $codon{TTA} = 'Leu'; |
---|
| 464 | $codon{AAA} = 'Lys'; |
---|
| 465 | $codon{AAG} = 'Lys'; |
---|
| 466 | $codon{ATG} = 'Met'; |
---|
| 467 | $codon{TTC} = 'Phe'; |
---|
| 468 | $codon{TTT} = 'Phe'; |
---|
| 469 | $codon{CCT} = 'Pro'; |
---|
| 470 | $codon{CCA} = 'Pro'; |
---|
| 471 | $codon{CCC} = 'Pro'; |
---|
| 472 | $codon{CCG} = 'Pro'; |
---|
| 473 | $codon{TCA} = 'Ser'; |
---|
| 474 | $codon{AGC} = 'Ser'; |
---|
| 475 | $codon{AGT} = 'Ser'; |
---|
| 476 | $codon{TCC} = 'Ser'; |
---|
| 477 | $codon{TCT} = 'Ser'; |
---|
| 478 | $codon{TCG} = 'Ser'; |
---|
| 479 | $codon{TGA} = 'Stop'; |
---|
| 480 | $codon{TAG} = 'Stop'; |
---|
| 481 | $codon{TAA} = 'Stop'; |
---|
| 482 | $codon{ACT} = 'Thr'; |
---|
| 483 | $codon{ACA} = 'Thr'; |
---|
| 484 | $codon{ACC} = 'Thr'; |
---|
| 485 | $codon{ACG} = 'Thr'; |
---|
| 486 | $codon{TGG} = 'Trp'; |
---|
| 487 | $codon{TAT} = 'Tyr'; |
---|
| 488 | $codon{TAC} = 'Tyr'; |
---|
| 489 | $codon{GTC} = 'Val'; |
---|
| 490 | $codon{GTA} = 'Val'; |
---|
| 491 | $codon{GTG} = 'Val'; |
---|
| 492 | $codon{GTT} = 'Val'; |
---|
| 493 | } |
---|
| 494 | |
---|
| 495 | sub getGalaxyInfo { |
---|
| 496 | my $build; |
---|
| 497 | my $locFile; |
---|
| 498 | foreach (@ARGV) { |
---|
| 499 | if (/build=(.*)/) { $build = $1; } |
---|
| 500 | elsif (/loc=(.*)/) { $locFile = $1; } |
---|
| 501 | } |
---|
| 502 | if (!$build or !$locFile) { |
---|
| 503 | print STDERR "ERROR missing build or locfile for Galaxy input\n"; |
---|
| 504 | exit 1; |
---|
| 505 | } |
---|
| 506 | # read $locFile to get $nibDir (ignoring commets) |
---|
| 507 | open(LF, "< $locFile") || die "open($locFile): $!\n"; |
---|
| 508 | while(<LF>) { |
---|
| 509 | s/#.*$//; |
---|
| 510 | s/(?:^\s+|\s+$)//g; |
---|
| 511 | next if (/^$/); |
---|
| 512 | |
---|
| 513 | my @t = split(/\t/); |
---|
| 514 | if ($t[0] eq $build) { $nibDir = $t[1]; } |
---|
| 515 | } |
---|
| 516 | close(LF); |
---|
| 517 | if ($nibDir eq 'Galaxy') { |
---|
| 518 | print STDERR "Failed to find sequence directory in locfile $locFile\n"; |
---|
| 519 | } |
---|
| 520 | $nibDir .= "/$build.2bit"; #we want full path and filename |
---|
| 521 | } |
---|