[2] | 1 | #!/usr/bin/perl -w |
---|
| 2 | |
---|
| 3 | # This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, |
---|
| 4 | # the first encountered one is kept and all others are removed. It requires three inputs: |
---|
| 5 | # The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. |
---|
| 6 | # The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. |
---|
| 7 | # The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. |
---|
| 8 | # The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. |
---|
| 9 | # Note: The number of the first column is 1. |
---|
| 10 | |
---|
| 11 | use strict; |
---|
| 12 | use warnings; |
---|
| 13 | |
---|
| 14 | #varaibles to handle information related to indels |
---|
| 15 | my $indel1 = ""; |
---|
| 16 | my $indel2 = ""; |
---|
| 17 | my @indelArray1 = (); |
---|
| 18 | my @indelArray2 = (); |
---|
| 19 | my $lineCounter1 = 0; |
---|
| 20 | my $lineCounter2 = 0; |
---|
| 21 | my $totalNumberofNonOverlappingIndels = 0; |
---|
| 22 | |
---|
| 23 | # check to make sure having correct files |
---|
| 24 | my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n"; |
---|
| 25 | die $usage unless @ARGV == 4; |
---|
| 26 | |
---|
| 27 | my $inputFile = $ARGV[0]; |
---|
| 28 | my $indelStartColumn = $ARGV[1] - 1; |
---|
| 29 | my $indelEndColumn = $ARGV[2] - 1; |
---|
| 30 | my $outputFile = $ARGV[3]; |
---|
| 31 | |
---|
| 32 | #verifie column numbers |
---|
| 33 | if ($indelStartColumn < 0 ){ |
---|
| 34 | die ("The indel start column number is invalid \n"); |
---|
| 35 | } |
---|
| 36 | if ($indelEndColumn < 0 ){ |
---|
| 37 | die ("The indel end column number is invalid \n"); |
---|
| 38 | } |
---|
| 39 | |
---|
| 40 | #open the input and output files |
---|
| 41 | open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); |
---|
| 42 | open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); |
---|
| 43 | |
---|
| 44 | #store the input file in the array @rawData |
---|
| 45 | my @indelsRawData = <INPUT>; |
---|
| 46 | |
---|
| 47 | #iterated through the indels of the input file |
---|
| 48 | INDEL1: |
---|
| 49 | foreach $indel1 (@indelsRawData){ |
---|
| 50 | chomp ($indel1); |
---|
| 51 | $lineCounter1++; |
---|
| 52 | |
---|
| 53 | #get the first indel |
---|
| 54 | @indelArray1 = split(/\t/, $indel1); |
---|
| 55 | |
---|
| 56 | #our purpose is to detect overlapping indels and to store one copy of them only in the output file |
---|
| 57 | #all other non-overlapping indels will stored in the output file also |
---|
| 58 | |
---|
| 59 | $lineCounter2 = 0; |
---|
| 60 | |
---|
| 61 | #iterated through the indels of the input file |
---|
| 62 | INDEL2: |
---|
| 63 | foreach $indel2 (@indelsRawData){ |
---|
| 64 | chomp ($indel2); |
---|
| 65 | $lineCounter2++; |
---|
| 66 | |
---|
| 67 | if ($lineCounter2 > $lineCounter1){ |
---|
| 68 | #get the second indel |
---|
| 69 | @indelArray2 = split(/\t/, $indel2); |
---|
| 70 | |
---|
| 71 | #check if the two indels are overlapping |
---|
| 72 | if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){ |
---|
| 73 | #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n"); |
---|
| 74 | #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n"); |
---|
| 75 | |
---|
| 76 | #break out of the loop and go back to the outerloop |
---|
| 77 | next INDEL1; |
---|
| 78 | } |
---|
| 79 | else{ |
---|
| 80 | #print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n"); |
---|
| 81 | } |
---|
| 82 | } |
---|
| 83 | } |
---|
| 84 | |
---|
| 85 | print OUTPUT $indel1 . "\n"; |
---|
| 86 | $totalNumberofNonOverlappingIndels++; |
---|
| 87 | } |
---|
| 88 | |
---|
| 89 | #print("The total number of indels is: " . $lineCounter1 . "\n"); |
---|
| 90 | #print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n"); |
---|
| 91 | |
---|
| 92 | #close the input and output files |
---|
| 93 | close(OUTPUT); |
---|
| 94 | close(INPUT); |
---|