[2] | 1 | #!/usr/bin/env python |
---|
| 2 | #Guruprasad ANanda |
---|
| 3 | """ |
---|
| 4 | Fetches substitutions from pairwise alignments. |
---|
| 5 | """ |
---|
| 6 | |
---|
| 7 | from galaxy import eggs |
---|
| 8 | |
---|
| 9 | from galaxy.tools.util import maf_utilities |
---|
| 10 | |
---|
| 11 | import bx.align.maf |
---|
| 12 | import sys |
---|
| 13 | import os, fileinput |
---|
| 14 | def stop_err(msg): |
---|
| 15 | sys.stderr.write(msg) |
---|
| 16 | sys.exit() |
---|
| 17 | |
---|
| 18 | if len(sys.argv) < 3: |
---|
| 19 | stop_err("Incorrect number of arguments.") |
---|
| 20 | |
---|
| 21 | inp_file = sys.argv[1] |
---|
| 22 | out_file = sys.argv[2] |
---|
| 23 | fout = open(out_file, 'w') |
---|
| 24 | |
---|
| 25 | def fetchSubs(block): |
---|
| 26 | |
---|
| 27 | src1 = block.components[0].src |
---|
| 28 | sequence1 = block.components[0].text |
---|
| 29 | start1 = block.components[0].start |
---|
| 30 | end1 = block.components[0].end |
---|
| 31 | len1 = int(end1)-int(start1) |
---|
| 32 | len1_withgap = len(sequence1) |
---|
| 33 | |
---|
| 34 | for seq in range (1,len(block.components)): |
---|
| 35 | src2 = block.components[seq].src |
---|
| 36 | sequence2 = block.components[seq].text |
---|
| 37 | start2 = block.components[seq].start |
---|
| 38 | end2 = block.components[seq].end |
---|
| 39 | len2 = int(end2)-int(start2) |
---|
| 40 | sub_begin = None |
---|
| 41 | sub_end = None |
---|
| 42 | begin = False |
---|
| 43 | |
---|
| 44 | for nt in range(len1_withgap): |
---|
| 45 | if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character |
---|
| 46 | if sequence1[nt].upper() != sequence2[nt].upper(): |
---|
| 47 | if not(begin): |
---|
| 48 | sub_begin = nt |
---|
| 49 | begin = True |
---|
| 50 | sub_end = nt |
---|
| 51 | else: |
---|
| 52 | if begin: |
---|
| 53 | print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),start1+sub_end-sequence1[0:sub_end].count('-')) |
---|
| 54 | print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),start2+sub_end-sequence2[0:sub_end].count('-')) |
---|
| 55 | begin = False |
---|
| 56 | |
---|
| 57 | else: |
---|
| 58 | if begin: |
---|
| 59 | print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),end1+sub_end-sequence1[0:sub_end].count('-')) |
---|
| 60 | print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),end2+sub_end-sequence2[0:sub_end].count('-')) |
---|
| 61 | begin = False |
---|
| 62 | ended = False |
---|
| 63 | |
---|
| 64 | |
---|
| 65 | def main(): |
---|
| 66 | skipped = 0 |
---|
| 67 | not_pairwise = 0 |
---|
| 68 | try: |
---|
| 69 | maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) |
---|
| 70 | except: |
---|
| 71 | stop_err("Your MAF file appears to be malformed.") |
---|
| 72 | print >>fout, "#Chr\tStart\tEnd" |
---|
| 73 | for block in maf_reader: |
---|
| 74 | if len(block.components) != 2: |
---|
| 75 | not_pairwise += 1 |
---|
| 76 | continue |
---|
| 77 | try: |
---|
| 78 | fetchSubs(block) |
---|
| 79 | except: |
---|
| 80 | skipped += 1 |
---|
| 81 | |
---|
| 82 | if not_pairwise: |
---|
| 83 | print "Skipped %d non-pairwise blocks" %(not_pairwise) |
---|
| 84 | if skipped: |
---|
| 85 | print "Skipped %d blocks" %(skipped) |
---|
| 86 | if __name__ == "__main__": |
---|
| 87 | main() |
---|