| 1 | #!/usr/bin/perl -w |
|---|
| 2 | |
|---|
| 3 | # This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, |
|---|
| 4 | # the first encountered one is kept and all others are removed. It requires three inputs: |
|---|
| 5 | # The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. |
|---|
| 6 | # The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. |
|---|
| 7 | # The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. |
|---|
| 8 | # The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. |
|---|
| 9 | # Note: The number of the first column is 1. |
|---|
| 10 | |
|---|
| 11 | use strict; |
|---|
| 12 | use warnings; |
|---|
| 13 | |
|---|
| 14 | #varaibles to handle information related to indels |
|---|
| 15 | my $indel1 = ""; |
|---|
| 16 | my $indel2 = ""; |
|---|
| 17 | my @indelArray1 = (); |
|---|
| 18 | my @indelArray2 = (); |
|---|
| 19 | my $lineCounter1 = 0; |
|---|
| 20 | my $lineCounter2 = 0; |
|---|
| 21 | my $totalNumberofNonOverlappingIndels = 0; |
|---|
| 22 | |
|---|
| 23 | # check to make sure having correct files |
|---|
| 24 | my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n"; |
|---|
| 25 | die $usage unless @ARGV == 4; |
|---|
| 26 | |
|---|
| 27 | my $inputFile = $ARGV[0]; |
|---|
| 28 | my $indelStartColumn = $ARGV[1] - 1; |
|---|
| 29 | my $indelEndColumn = $ARGV[2] - 1; |
|---|
| 30 | my $outputFile = $ARGV[3]; |
|---|
| 31 | |
|---|
| 32 | #verifie column numbers |
|---|
| 33 | if ($indelStartColumn < 0 ){ |
|---|
| 34 | die ("The indel start column number is invalid \n"); |
|---|
| 35 | } |
|---|
| 36 | if ($indelEndColumn < 0 ){ |
|---|
| 37 | die ("The indel end column number is invalid \n"); |
|---|
| 38 | } |
|---|
| 39 | |
|---|
| 40 | #open the input and output files |
|---|
| 41 | open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); |
|---|
| 42 | open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); |
|---|
| 43 | |
|---|
| 44 | #store the input file in the array @rawData |
|---|
| 45 | my @indelsRawData = <INPUT>; |
|---|
| 46 | |
|---|
| 47 | #iterated through the indels of the input file |
|---|
| 48 | INDEL1: |
|---|
| 49 | foreach $indel1 (@indelsRawData){ |
|---|
| 50 | chomp ($indel1); |
|---|
| 51 | $lineCounter1++; |
|---|
| 52 | |
|---|
| 53 | #get the first indel |
|---|
| 54 | @indelArray1 = split(/\t/, $indel1); |
|---|
| 55 | |
|---|
| 56 | #our purpose is to detect overlapping indels and to store one copy of them only in the output file |
|---|
| 57 | #all other non-overlapping indels will stored in the output file also |
|---|
| 58 | |
|---|
| 59 | $lineCounter2 = 0; |
|---|
| 60 | |
|---|
| 61 | #iterated through the indels of the input file |
|---|
| 62 | INDEL2: |
|---|
| 63 | foreach $indel2 (@indelsRawData){ |
|---|
| 64 | chomp ($indel2); |
|---|
| 65 | $lineCounter2++; |
|---|
| 66 | |
|---|
| 67 | if ($lineCounter2 > $lineCounter1){ |
|---|
| 68 | #get the second indel |
|---|
| 69 | @indelArray2 = split(/\t/, $indel2); |
|---|
| 70 | |
|---|
| 71 | #check if the two indels are overlapping |
|---|
| 72 | if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){ |
|---|
| 73 | #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n"); |
|---|
| 74 | #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n"); |
|---|
| 75 | |
|---|
| 76 | #break out of the loop and go back to the outerloop |
|---|
| 77 | next INDEL1; |
|---|
| 78 | } |
|---|
| 79 | else{ |
|---|
| 80 | #print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n"); |
|---|
| 81 | } |
|---|
| 82 | } |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | print OUTPUT $indel1 . "\n"; |
|---|
| 86 | $totalNumberofNonOverlappingIndels++; |
|---|
| 87 | } |
|---|
| 88 | |
|---|
| 89 | #print("The total number of indels is: " . $lineCounter1 . "\n"); |
|---|
| 90 | #print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n"); |
|---|
| 91 | |
|---|
| 92 | #close the input and output files |
|---|
| 93 | close(OUTPUT); |
|---|
| 94 | close(INPUT); |
|---|