| 1 | #!/usr/bin/perl | 
|---|
| 2 |  | 
|---|
| 3 | ########################################################################### | 
|---|
| 4 | # Purpose: To calculate the correlation of two sets of scores in one file. | 
|---|
| 5 | # Usage: correlation.pl infile.bed output.txt column1 column2 | 
|---|
| 6 | #        (column start from 1) | 
|---|
| 7 | # Written by: Yi Zhang  (June, 2005) | 
|---|
| 8 | ########################################################################### | 
|---|
| 9 | if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) { | 
|---|
| 10 |    print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n"; | 
|---|
| 11 |    print STDERR "       (column start from 1)\n";  | 
|---|
| 12 |    exit; | 
|---|
| 13 | } | 
|---|
| 14 | my $file = $ARGV[0]; | 
|---|
| 15 | my $out = $ARGV[1]; | 
|---|
| 16 |  | 
|---|
| 17 | die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/); | 
|---|
| 18 |  | 
|---|
| 19 | my $col1 = $ARGV[2] - 1; | 
|---|
| 20 | my $col2 = $ARGV[3] - 1; | 
|---|
| 21 |  | 
|---|
| 22 | my ($f, $o); | 
|---|
| 23 | my (@a, @b); | 
|---|
| 24 |  | 
|---|
| 25 | my $n_t = 0; | 
|---|
| 26 | open($f, $file) or die "Could't open $file, $!\n"; | 
|---|
| 27 | while(<$f>) { | 
|---|
| 28 |   chomp; | 
|---|
| 29 |   my @t = split(/\t/); | 
|---|
| 30 |   if ($n_t == 0) {  | 
|---|
| 31 |      $n_t = scalar(@t) - 1;  | 
|---|
| 32 |      die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t ); | 
|---|
| 33 |   } | 
|---|
| 34 |   die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/);   | 
|---|
| 35 |   push(@a, $t[$col1]); | 
|---|
| 36 |   push(@b, $t[$col2]); | 
|---|
| 37 | } | 
|---|
| 38 | close($f); | 
|---|
| 39 |  | 
|---|
| 40 | my $result = correlation(\@a, \@b); | 
|---|
| 41 |  | 
|---|
| 42 | open($o, ">$out") or die "Couldn't open $out, $!\n"; | 
|---|
| 43 | $col1 = $col1 + 1; | 
|---|
| 44 | $col2 = $col2 + 1; | 
|---|
| 45 | print $o "The correlation of column $col1 and $col2 is $result\n"; | 
|---|
| 46 | close($o); | 
|---|
| 47 | print "The correlation of column $col1 and $col2 is $result\n"; | 
|---|
| 48 |  | 
|---|
| 49 | sub correlation { | 
|---|
| 50 |    my ($array1ref, $array2ref) = @_; | 
|---|
| 51 |    my ($sum1, $sum2); | 
|---|
| 52 |    my ($sum1_squared, $sum2_squared);  | 
|---|
| 53 |    foreach (@$array1ref) { $sum1 += $_;  $sum1_squared += $_**2; } | 
|---|
| 54 |    foreach (@$array2ref) { $sum2 += $_;  $sum2_squared += $_**2; } | 
|---|
| 55 |    my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref); | 
|---|
| 56 |    my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) * | 
|---|
| 57 |                           ((@$array1ref * $sum2_squared) - ($sum2**2))); | 
|---|
| 58 |    my $r; | 
|---|
| 59 |    if ($denominator == 0) { | 
|---|
| 60 |      print STDERR "The denominator is 0.\n"; | 
|---|
| 61 |          exit 0;  | 
|---|
| 62 |    } else { | 
|---|
| 63 |       $r = $numerator / $denominator; | 
|---|
| 64 |    } | 
|---|
| 65 |     return $r; | 
|---|
| 66 | } | 
|---|
| 67 |  | 
|---|
| 68 | sub covariance { | 
|---|
| 69 |    my ($array1ref, $array2ref) = @_; | 
|---|
| 70 |    my ($i, $result); | 
|---|
| 71 |    for ($i = 0; $i < @$array1ref; $i++) { | 
|---|
| 72 |        $result += $array1ref->[$i] * $array2ref->[$i]; | 
|---|
| 73 |    } | 
|---|
| 74 |    $result /= @$array1ref; | 
|---|
| 75 |    $result -= mean($array1ref) * mean($array2ref); | 
|---|
| 76 | } | 
|---|
| 77 |  | 
|---|
| 78 | sub mean { | 
|---|
| 79 |   my ($arrayref) = @_; | 
|---|
| 80 |   my $result; | 
|---|
| 81 |   foreach (@$arrayref) { $result += $_; } | 
|---|
| 82 |   return $result/@$arrayref; | 
|---|
| 83 | } | 
|---|
| 84 |  | 
|---|