1 | #!/usr/bin/env python |
---|
2 | #guruprasad Ananda |
---|
3 | """ |
---|
4 | Estimates substitution rates from pairwise alignments using JC69 model. |
---|
5 | """ |
---|
6 | |
---|
7 | from galaxy import eggs |
---|
8 | from galaxy.tools.util.galaxyops import * |
---|
9 | from galaxy.tools.util import maf_utilities |
---|
10 | import bx.align.maf |
---|
11 | import sys, fileinput |
---|
12 | |
---|
13 | def stop_err(msg): |
---|
14 | sys.stderr.write(msg) |
---|
15 | sys.exit() |
---|
16 | |
---|
17 | if len(sys.argv) < 3: |
---|
18 | stop_err("Incorrect number of arguments.") |
---|
19 | |
---|
20 | inp_file = sys.argv[1] |
---|
21 | out_file = sys.argv[2] |
---|
22 | fout = open(out_file, 'w') |
---|
23 | int_file = sys.argv[3] |
---|
24 | if int_file != "None": #The user has specified an interval file |
---|
25 | dbkey_i = sys.argv[4] |
---|
26 | chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) |
---|
27 | |
---|
28 | |
---|
29 | def rateEstimator(block): |
---|
30 | global alignlen, mismatches |
---|
31 | |
---|
32 | src1 = block.components[0].src |
---|
33 | sequence1 = block.components[0].text |
---|
34 | start1 = block.components[0].start |
---|
35 | end1 = block.components[0].end |
---|
36 | len1 = int(end1)-int(start1) |
---|
37 | len1_withgap = len(sequence1) |
---|
38 | mismatch = 0.0 |
---|
39 | |
---|
40 | for seq in range (1,len(block.components)): |
---|
41 | src2 = block.components[seq].src |
---|
42 | sequence2 = block.components[seq].text |
---|
43 | start2 = block.components[seq].start |
---|
44 | end2 = block.components[seq].end |
---|
45 | len2 = int(end2)-int(start2) |
---|
46 | for nt in range(len1_withgap): |
---|
47 | if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character |
---|
48 | if sequence1[nt].upper() != sequence2[nt].upper(): |
---|
49 | mismatch += 1 |
---|
50 | |
---|
51 | if int_file == "None": |
---|
52 | p = mismatch/min(len1,len2) |
---|
53 | print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" %(src1,start1,end1,src2,start2,end2,min(len1,len2),mismatch,p) |
---|
54 | else: |
---|
55 | mismatches += mismatch |
---|
56 | alignlen += min(len1,len2) |
---|
57 | |
---|
58 | def main(): |
---|
59 | skipped = 0 |
---|
60 | not_pairwise = 0 |
---|
61 | |
---|
62 | if int_file == "None": |
---|
63 | try: |
---|
64 | maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) |
---|
65 | except: |
---|
66 | stop_err("Your MAF file appears to be malformed.") |
---|
67 | print >>fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp" |
---|
68 | for block in maf_reader: |
---|
69 | if len(block.components) != 2: |
---|
70 | not_pairwise += 1 |
---|
71 | continue |
---|
72 | try: |
---|
73 | rateEstimator(block) |
---|
74 | except: |
---|
75 | skipped += 1 |
---|
76 | else: |
---|
77 | index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] ) |
---|
78 | if index is None: |
---|
79 | print >> sys.stderr, "Your MAF file appears to be malformed." |
---|
80 | sys.exit() |
---|
81 | win = NiceReaderWrapper( fileinput.FileInput( int_file ), |
---|
82 | chrom_col=chr_col_i, |
---|
83 | start_col=start_col_i, |
---|
84 | end_col=end_col_i, |
---|
85 | strand_col=strand_col_i, |
---|
86 | fix_strand=True) |
---|
87 | species=None |
---|
88 | mincols = 0 |
---|
89 | global alignlen, mismatches |
---|
90 | |
---|
91 | for interval in win: |
---|
92 | alignlen = 0 |
---|
93 | mismatches = 0.0 |
---|
94 | src = "%s.%s" % ( dbkey_i, interval.chrom ) |
---|
95 | for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ): |
---|
96 | if len(block.components) != 2: |
---|
97 | not_pairwise += 1 |
---|
98 | continue |
---|
99 | try: |
---|
100 | rateEstimator(block) |
---|
101 | except: |
---|
102 | skipped += 1 |
---|
103 | if alignlen: |
---|
104 | p = mismatches/alignlen |
---|
105 | else: |
---|
106 | p = 'NA' |
---|
107 | interval.fields.append(str(alignlen)) |
---|
108 | interval.fields.append(str(mismatches)) |
---|
109 | interval.fields.append(str(p)) |
---|
110 | print >>fout, "\t".join(interval.fields) |
---|
111 | #num_blocks += 1 |
---|
112 | |
---|
113 | if not_pairwise: |
---|
114 | print "Skipped %d non-pairwise blocks" %(not_pairwise) |
---|
115 | if skipped: |
---|
116 | print "Skipped %d blocks as invalid" %(skipped) |
---|
117 | if __name__ == "__main__": |
---|
118 | main() |
---|