[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | import os, sys |
---|
| 4 | |
---|
| 5 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 6 | |
---|
| 7 | def stop_err( msg ): |
---|
| 8 | sys.stderr.write( "%s\n" % msg ) |
---|
| 9 | sys.exit() |
---|
| 10 | |
---|
| 11 | def reverse_complement(s): |
---|
| 12 | complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."} |
---|
| 13 | reversed_s = [] |
---|
| 14 | for i in s: |
---|
| 15 | reversed_s.append(complement_dna[i]) |
---|
| 16 | reversed_s.reverse() |
---|
| 17 | return "".join(reversed_s) |
---|
| 18 | |
---|
| 19 | def __main__(): |
---|
| 20 | nuc_index = {'a':0,'t':1,'c':2,'g':3} |
---|
| 21 | diff_hash = {} # key = (chrom, index) |
---|
| 22 | infile = sys.argv[1] |
---|
| 23 | outfile = sys.argv[2] |
---|
| 24 | invalid_lines = 0 |
---|
| 25 | invalid_chars = 0 |
---|
| 26 | data_id = '' |
---|
| 27 | data_seq = '' |
---|
| 28 | |
---|
| 29 | for i, line in enumerate( open( infile ) ): |
---|
| 30 | line = line.rstrip( '\r\n' ) |
---|
| 31 | if not line or line.startswith( '#' ): |
---|
| 32 | continue |
---|
| 33 | fields = line.split() |
---|
| 34 | if len(fields) != 23: # standard number of pslx columns |
---|
| 35 | invalid_lines += 1 |
---|
| 36 | continue |
---|
| 37 | if not fields[0].isdigit(): |
---|
| 38 | invalid_lines += 1 |
---|
| 39 | continue |
---|
| 40 | read_id = fields[9] |
---|
| 41 | chrom = fields[13] |
---|
| 42 | try: |
---|
| 43 | block_count = int(fields[17]) |
---|
| 44 | except: |
---|
| 45 | invalid_lines += 1 |
---|
| 46 | continue |
---|
| 47 | block_size = fields[18].split(',') |
---|
| 48 | read_start = fields[19].split(',') |
---|
| 49 | chrom_start = fields[20].split(',') |
---|
| 50 | read_seq = fields[21].split(',') |
---|
| 51 | chrom_seq = fields[22].split(',') |
---|
| 52 | |
---|
| 53 | for j in range(block_count): |
---|
| 54 | try: |
---|
| 55 | this_block_size = int(block_size[j]) |
---|
| 56 | this_read_start = int(read_start[j]) |
---|
| 57 | this_chrom_start = int(chrom_start[j]) |
---|
| 58 | except: |
---|
| 59 | invalid_lines += 1 |
---|
| 60 | break |
---|
| 61 | this_read_seq = read_seq[j] |
---|
| 62 | this_chrom_seq = chrom_seq[j] |
---|
| 63 | |
---|
| 64 | if not this_read_seq.isalpha(): |
---|
| 65 | continue |
---|
| 66 | if not this_chrom_seq.isalpha(): |
---|
| 67 | continue |
---|
| 68 | |
---|
| 69 | # brut force to check coverage |
---|
| 70 | for k in range(this_block_size): |
---|
| 71 | cur_index = this_chrom_start+k |
---|
| 72 | sub_a = this_read_seq[k:(k+1)].lower() |
---|
| 73 | sub_b = this_chrom_seq[k:(k+1)].lower() |
---|
| 74 | if not diff_hash.has_key((chrom, cur_index)): |
---|
| 75 | try: |
---|
| 76 | diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()] # a, t, c, g, ref. nuc. |
---|
| 77 | except Exception, e: |
---|
| 78 | stop_err( str( e ) ) |
---|
| 79 | if sub_a in ['a','t','c','g']: |
---|
| 80 | diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1 |
---|
| 81 | else: |
---|
| 82 | invalid_chars += 1 |
---|
| 83 | |
---|
| 84 | outputfh = open(outfile, 'w') |
---|
| 85 | outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" ) |
---|
| 86 | keys = diff_hash.keys() |
---|
| 87 | keys.sort() |
---|
| 88 | for i in keys: |
---|
| 89 | (chrom, location) = i |
---|
| 90 | sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ] # did not include N's |
---|
| 91 | if sum == 0: |
---|
| 92 | continue |
---|
| 93 | ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum |
---|
| 94 | ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum |
---|
| 95 | ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum |
---|
| 96 | ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum |
---|
| 97 | (title_head, title_tail) = os.path.split(chrom) |
---|
| 98 | result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G ) |
---|
| 99 | outputfh.write(result) |
---|
| 100 | outputfh.close() |
---|
| 101 | |
---|
| 102 | if invalid_lines: |
---|
| 103 | print 'Skipped %d invalid lines. ' % ( invalid_lines ) |
---|
| 104 | if invalid_chars: |
---|
| 105 | print 'Skipped %d invalid characters in the alignment. ' % (invalid_chars) |
---|
| 106 | |
---|
| 107 | if __name__ == '__main__': __main__() |
---|