| 1 | #!/usr/bin/env perl |
|---|
| 2 | |
|---|
| 3 | use strict; |
|---|
| 4 | use warnings; |
|---|
| 5 | |
|---|
| 6 | #convert from a MAP and PED file to a genotype file |
|---|
| 7 | #assumes not many SNPs but lots of individuals |
|---|
| 8 | # transposed formats are used when lots of SNPs (TPED, TFAM) |
|---|
| 9 | |
|---|
| 10 | if (!@ARGV or scalar @ARGV ne 2) { |
|---|
| 11 | print "usage: lped_to_geno.pl infile.map infile.ped > outfile\n"; |
|---|
| 12 | exit; |
|---|
| 13 | } |
|---|
| 14 | |
|---|
| 15 | my $map = shift @ARGV; |
|---|
| 16 | my $ped = shift @ARGV; |
|---|
| 17 | |
|---|
| 18 | my @snp; #array to hold SNPs from map file |
|---|
| 19 | open(FH, $map) or die "Couldn't open $map, $!\n"; |
|---|
| 20 | while (<FH>) { |
|---|
| 21 | chomp; |
|---|
| 22 | my @f = split(/\s+/); #3 or 4 columns |
|---|
| 23 | #chrom ID [distance|morgans] position |
|---|
| 24 | if (!exists $f[3]) { $f[3] = $f[2]; } #only 3 columns |
|---|
| 25 | #have to leave in so know which to skip later |
|---|
| 26 | #if ($f[3] < 0) { next; } #way of excluding SNPs |
|---|
| 27 | #if ($f[0] eq '0') { next; } #unplaced SNP |
|---|
| 28 | $f[0] = "chr$f[0]"; |
|---|
| 29 | push(@snp, "$f[0]:$f[3]:$f[1]"); |
|---|
| 30 | } |
|---|
| 31 | close FH or die "Couldn't finish $map, $!\n"; |
|---|
| 32 | |
|---|
| 33 | #rows are individuals, columns are SNPs (7 & up) |
|---|
| 34 | #need to print row per SNP |
|---|
| 35 | my @allele; #alleles to go with @snp |
|---|
| 36 | my @pheno; #marker for phenotype |
|---|
| 37 | open(FH, $ped) or die "Couldn't open $ped, $!\n"; |
|---|
| 38 | while (<FH>) { |
|---|
| 39 | chomp; |
|---|
| 40 | my @f = split(/\s+/); |
|---|
| 41 | if (!defined $f[5]) { die "ERROR undefined phenotype $f[0] $f[1] $f[2] $f[3] $f[4]\n"; } |
|---|
| 42 | push(@pheno, $f[5]); |
|---|
| 43 | my $j = 0; |
|---|
| 44 | for(my $i = 6; $i< $#f; $i+=2) { |
|---|
| 45 | if (!$allele[$j]) { $allele[$j] = ''; } |
|---|
| 46 | #can be ACTG or 1234 (for haploview etc) or 0 for missing |
|---|
| 47 | if ($f[$i] eq '1') { $f[$i] = 'A'; } |
|---|
| 48 | elsif ($f[$i] eq '2') { $f[$i] = 'C'; } |
|---|
| 49 | elsif ($f[$i] eq '3') { $f[$i] = 'G'; } |
|---|
| 50 | elsif ($f[$i] eq '4') { $f[$i] = 'T'; } |
|---|
| 51 | if ($f[$i+1] eq '1') { $f[$i+1] = 'A'; } |
|---|
| 52 | elsif ($f[$i+1] eq '2') { $f[$i+1] = 'C'; } |
|---|
| 53 | elsif ($f[$i+1] eq '3') { $f[$i+1] = 'G'; } |
|---|
| 54 | elsif ($f[$i+1] eq '4') { $f[$i+1] = 'T'; } |
|---|
| 55 | $f[$i] = uc($f[$i]); |
|---|
| 56 | $f[$i+1] = uc($f[$i+1]); |
|---|
| 57 | $allele[$j] .= " $f[$i]$f[$i+1]"; |
|---|
| 58 | $j++; |
|---|
| 59 | } |
|---|
| 60 | } |
|---|
| 61 | close FH or die "Couldn't close $ped, $!\n"; |
|---|
| 62 | |
|---|
| 63 | print "ID Chr Pos"; |
|---|
| 64 | foreach (@pheno) { if ($_ > 0) { print " ", $_ - 1; }} #go from 1/2 to 0/1 |
|---|
| 65 | print "\n"; |
|---|
| 66 | for(my $i =0; $i <= $#snp; $i++) { #foreach snp |
|---|
| 67 | $allele[$i] =~ /(\w)/; |
|---|
| 68 | my $nt = $1; |
|---|
| 69 | my $j = 0; |
|---|
| 70 | my @t = split(/:/, $snp[$i]); |
|---|
| 71 | if ($t[0] eq 'chr0' or $t[1] < 0) { next; } #skip this SNP |
|---|
| 72 | if ($t[0] eq 'chrX') { $t[0] = 'chr23'; } |
|---|
| 73 | elsif ($t[0] eq 'chrY') { $t[0] = 'chr24'; } |
|---|
| 74 | elsif ($t[0] eq 'chrXY') { $t[0] = 'chr23'; } |
|---|
| 75 | elsif ($t[0] eq 'chrMT') { $t[0] = 'chr25'; } |
|---|
| 76 | print "$t[2] $t[0] $t[1]"; |
|---|
| 77 | $allele[$i] =~ s/^\s+//; |
|---|
| 78 | foreach my $p (split(/ +/, $allele[$i])) { |
|---|
| 79 | if ($pheno[$j] > 0) { #pheno 0 or -9 skip |
|---|
| 80 | #change AA BB AB to 2 0 1 |
|---|
| 81 | if ($p eq "$nt$nt") { print " 2"; } |
|---|
| 82 | elsif ($p =~ /$nt/) { print " 1"; } |
|---|
| 83 | else { print " 0"; } |
|---|
| 84 | } |
|---|
| 85 | $j++; |
|---|
| 86 | } |
|---|
| 87 | print "\n"; |
|---|
| 88 | } |
|---|
| 89 | |
|---|
| 90 | exit; |
|---|