1 | #!/usr/bin/perl -w |
---|
2 | |
---|
3 | # This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, |
---|
4 | # the first encountered one is kept and all others are removed. It requires three inputs: |
---|
5 | # The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. |
---|
6 | # The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. |
---|
7 | # The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. |
---|
8 | # The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. |
---|
9 | # Note: The number of the first column is 1. |
---|
10 | |
---|
11 | use strict; |
---|
12 | use warnings; |
---|
13 | |
---|
14 | #varaibles to handle information related to indels |
---|
15 | my $indel1 = ""; |
---|
16 | my $indel2 = ""; |
---|
17 | my @indelArray1 = (); |
---|
18 | my @indelArray2 = (); |
---|
19 | my $lineCounter1 = 0; |
---|
20 | my $lineCounter2 = 0; |
---|
21 | my $totalNumberofNonOverlappingIndels = 0; |
---|
22 | |
---|
23 | # check to make sure having correct files |
---|
24 | my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n"; |
---|
25 | die $usage unless @ARGV == 4; |
---|
26 | |
---|
27 | my $inputFile = $ARGV[0]; |
---|
28 | my $indelStartColumn = $ARGV[1] - 1; |
---|
29 | my $indelEndColumn = $ARGV[2] - 1; |
---|
30 | my $outputFile = $ARGV[3]; |
---|
31 | |
---|
32 | #verifie column numbers |
---|
33 | if ($indelStartColumn < 0 ){ |
---|
34 | die ("The indel start column number is invalid \n"); |
---|
35 | } |
---|
36 | if ($indelEndColumn < 0 ){ |
---|
37 | die ("The indel end column number is invalid \n"); |
---|
38 | } |
---|
39 | |
---|
40 | #open the input and output files |
---|
41 | open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); |
---|
42 | open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); |
---|
43 | |
---|
44 | #store the input file in the array @rawData |
---|
45 | my @indelsRawData = <INPUT>; |
---|
46 | |
---|
47 | #iterated through the indels of the input file |
---|
48 | INDEL1: |
---|
49 | foreach $indel1 (@indelsRawData){ |
---|
50 | chomp ($indel1); |
---|
51 | $lineCounter1++; |
---|
52 | |
---|
53 | #get the first indel |
---|
54 | @indelArray1 = split(/\t/, $indel1); |
---|
55 | |
---|
56 | #our purpose is to detect overlapping indels and to store one copy of them only in the output file |
---|
57 | #all other non-overlapping indels will stored in the output file also |
---|
58 | |
---|
59 | $lineCounter2 = 0; |
---|
60 | |
---|
61 | #iterated through the indels of the input file |
---|
62 | INDEL2: |
---|
63 | foreach $indel2 (@indelsRawData){ |
---|
64 | chomp ($indel2); |
---|
65 | $lineCounter2++; |
---|
66 | |
---|
67 | if ($lineCounter2 > $lineCounter1){ |
---|
68 | #get the second indel |
---|
69 | @indelArray2 = split(/\t/, $indel2); |
---|
70 | |
---|
71 | #check if the two indels are overlapping |
---|
72 | if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){ |
---|
73 | #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n"); |
---|
74 | #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n"); |
---|
75 | |
---|
76 | #break out of the loop and go back to the outerloop |
---|
77 | next INDEL1; |
---|
78 | } |
---|
79 | else{ |
---|
80 | #print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n"); |
---|
81 | } |
---|
82 | } |
---|
83 | } |
---|
84 | |
---|
85 | print OUTPUT $indel1 . "\n"; |
---|
86 | $totalNumberofNonOverlappingIndels++; |
---|
87 | } |
---|
88 | |
---|
89 | #print("The total number of indels is: " . $lineCounter1 . "\n"); |
---|
90 | #print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n"); |
---|
91 | |
---|
92 | #close the input and output files |
---|
93 | close(OUTPUT); |
---|
94 | close(INPUT); |
---|