root/galaxy-central/tools/metag_tools/blat_coverage_report.py @ 2

リビジョン 2, 3.8 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3import os, sys
4
5assert sys.version_info[:2] >= ( 2, 4 )
6
7def stop_err( msg ):
8    sys.stderr.write( "%s\n" % msg )
9    sys.exit()
10
11def reverse_complement(s):
12    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."}
13    reversed_s = []
14    for i in s:
15        reversed_s.append(complement_dna[i])
16    reversed_s.reverse()
17    return "".join(reversed_s)
18
19def __main__():
20    nuc_index = {'a':0,'t':1,'c':2,'g':3}
21    diff_hash = {}    # key = (chrom, index)
22    infile = sys.argv[1]
23    outfile = sys.argv[2]
24    invalid_lines = 0
25    invalid_chars = 0
26    data_id = ''
27    data_seq = ''
28
29    for i, line in enumerate( open( infile ) ):
30        line = line.rstrip( '\r\n' )
31        if not line or line.startswith( '#' ):
32            continue
33        fields = line.split()
34        if len(fields) != 23:    # standard number of pslx columns
35            invalid_lines += 1
36            continue
37        if not fields[0].isdigit():
38            invalid_lines += 1
39            continue
40        read_id = fields[9]
41        chrom = fields[13]
42        try:
43            block_count = int(fields[17])
44        except:
45            invalid_lines += 1
46            continue
47        block_size = fields[18].split(',')
48        read_start = fields[19].split(',')
49        chrom_start = fields[20].split(',')
50        read_seq = fields[21].split(',')
51        chrom_seq = fields[22].split(',')
52
53        for j in range(block_count):
54            try:
55                this_block_size = int(block_size[j])
56                this_read_start = int(read_start[j])
57                this_chrom_start = int(chrom_start[j])
58            except:
59                invalid_lines += 1
60                break
61            this_read_seq = read_seq[j]
62            this_chrom_seq = chrom_seq[j]
63           
64            if not this_read_seq.isalpha():
65                continue
66            if not this_chrom_seq.isalpha():
67                continue
68           
69            # brut force to check coverage               
70            for k in range(this_block_size):
71                cur_index = this_chrom_start+k
72                sub_a = this_read_seq[k:(k+1)].lower()
73                sub_b = this_chrom_seq[k:(k+1)].lower()
74                if not diff_hash.has_key((chrom, cur_index)):
75                    try:
76                        diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()]    # a, t, c, g, ref. nuc.
77                    except Exception, e:
78                        stop_err( str( e ) )
79                if sub_a in ['a','t','c','g']:
80                    diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1
81                else:
82                    invalid_chars += 1
83                       
84    outputfh = open(outfile, 'w')
85    outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" )
86    keys = diff_hash.keys()
87    keys.sort()
88    for i in keys:
89        (chrom, location) = i
90        sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ]    # did not include N's
91        if sum == 0:
92            continue
93        ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum
94        ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum
95        ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum
96        ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum
97        (title_head, title_tail) = os.path.split(chrom)
98        result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G )
99        outputfh.write(result)
100    outputfh.close()
101
102    if invalid_lines:
103        print 'Skipped %d invalid lines. ' % ( invalid_lines )
104    if invalid_chars:
105        print 'Skipped %d invalid characters in the alignment. ' % (invalid_chars)
106       
107if __name__ == '__main__': __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。