1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | import os, sys |
---|
4 | |
---|
5 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
6 | |
---|
7 | def stop_err( msg ): |
---|
8 | sys.stderr.write( "%s\n" % msg ) |
---|
9 | sys.exit() |
---|
10 | |
---|
11 | def reverse_complement(s): |
---|
12 | complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."} |
---|
13 | reversed_s = [] |
---|
14 | for i in s: |
---|
15 | reversed_s.append(complement_dna[i]) |
---|
16 | reversed_s.reverse() |
---|
17 | return "".join(reversed_s) |
---|
18 | |
---|
19 | def __main__(): |
---|
20 | nuc_index = {'a':0,'t':1,'c':2,'g':3} |
---|
21 | diff_hash = {} # key = (chrom, index) |
---|
22 | infile = sys.argv[1] |
---|
23 | outfile = sys.argv[2] |
---|
24 | invalid_lines = 0 |
---|
25 | invalid_chars = 0 |
---|
26 | data_id = '' |
---|
27 | data_seq = '' |
---|
28 | |
---|
29 | for i, line in enumerate( open( infile ) ): |
---|
30 | line = line.rstrip( '\r\n' ) |
---|
31 | if not line or line.startswith( '#' ): |
---|
32 | continue |
---|
33 | fields = line.split() |
---|
34 | if len(fields) != 23: # standard number of pslx columns |
---|
35 | invalid_lines += 1 |
---|
36 | continue |
---|
37 | if not fields[0].isdigit(): |
---|
38 | invalid_lines += 1 |
---|
39 | continue |
---|
40 | read_id = fields[9] |
---|
41 | chrom = fields[13] |
---|
42 | try: |
---|
43 | block_count = int(fields[17]) |
---|
44 | except: |
---|
45 | invalid_lines += 1 |
---|
46 | continue |
---|
47 | block_size = fields[18].split(',') |
---|
48 | read_start = fields[19].split(',') |
---|
49 | chrom_start = fields[20].split(',') |
---|
50 | read_seq = fields[21].split(',') |
---|
51 | chrom_seq = fields[22].split(',') |
---|
52 | |
---|
53 | for j in range(block_count): |
---|
54 | try: |
---|
55 | this_block_size = int(block_size[j]) |
---|
56 | this_read_start = int(read_start[j]) |
---|
57 | this_chrom_start = int(chrom_start[j]) |
---|
58 | except: |
---|
59 | invalid_lines += 1 |
---|
60 | break |
---|
61 | this_read_seq = read_seq[j] |
---|
62 | this_chrom_seq = chrom_seq[j] |
---|
63 | |
---|
64 | if not this_read_seq.isalpha(): |
---|
65 | continue |
---|
66 | if not this_chrom_seq.isalpha(): |
---|
67 | continue |
---|
68 | |
---|
69 | # brut force to check coverage |
---|
70 | for k in range(this_block_size): |
---|
71 | cur_index = this_chrom_start+k |
---|
72 | sub_a = this_read_seq[k:(k+1)].lower() |
---|
73 | sub_b = this_chrom_seq[k:(k+1)].lower() |
---|
74 | if not diff_hash.has_key((chrom, cur_index)): |
---|
75 | try: |
---|
76 | diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()] # a, t, c, g, ref. nuc. |
---|
77 | except Exception, e: |
---|
78 | stop_err( str( e ) ) |
---|
79 | if sub_a in ['a','t','c','g']: |
---|
80 | diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1 |
---|
81 | else: |
---|
82 | invalid_chars += 1 |
---|
83 | |
---|
84 | outputfh = open(outfile, 'w') |
---|
85 | outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" ) |
---|
86 | keys = diff_hash.keys() |
---|
87 | keys.sort() |
---|
88 | for i in keys: |
---|
89 | (chrom, location) = i |
---|
90 | sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ] # did not include N's |
---|
91 | if sum == 0: |
---|
92 | continue |
---|
93 | ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum |
---|
94 | ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum |
---|
95 | ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum |
---|
96 | ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum |
---|
97 | (title_head, title_tail) = os.path.split(chrom) |
---|
98 | result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G ) |
---|
99 | outputfh.write(result) |
---|
100 | outputfh.close() |
---|
101 | |
---|
102 | if invalid_lines: |
---|
103 | print 'Skipped %d invalid lines. ' % ( invalid_lines ) |
---|
104 | if invalid_chars: |
---|
105 | print 'Skipped %d invalid characters in the alignment. ' % (invalid_chars) |
---|
106 | |
---|
107 | if __name__ == '__main__': __main__() |
---|