1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | import os, sys |
---|
4 | |
---|
5 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
6 | |
---|
7 | def reverse_complement(s): |
---|
8 | complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."} |
---|
9 | reversed_s = [] |
---|
10 | for i in s: |
---|
11 | reversed_s.append(complement_dna[i]) |
---|
12 | reversed_s.reverse() |
---|
13 | return "".join(reversed_s) |
---|
14 | |
---|
15 | def __main__(): |
---|
16 | nuc_index = {'a':0,'t':1,'c':2,'g':3,'n':4} |
---|
17 | coverage = {} # key = (chrom, index) |
---|
18 | invalid_lines = 0 |
---|
19 | invalid_chrom = 0 |
---|
20 | infile = sys.argv[1] |
---|
21 | outfile = sys.argv[2] |
---|
22 | |
---|
23 | for i, line in enumerate( open( infile ) ): |
---|
24 | line = line.rstrip('\r\n') |
---|
25 | if not line or line.startswith('#'): |
---|
26 | continue |
---|
27 | fields = line.split() |
---|
28 | if len(fields) < 21: # standard number of pslx columns |
---|
29 | invalid_lines += 1 |
---|
30 | continue |
---|
31 | if not fields[0].isdigit(): |
---|
32 | invalid_lines += 1 |
---|
33 | continue |
---|
34 | chrom = fields[13] |
---|
35 | if not chrom.startswith( 'chr' ): |
---|
36 | invalid_lines += 1 |
---|
37 | invalid_chrom += 1 |
---|
38 | continue |
---|
39 | try: |
---|
40 | block_count = int(fields[17]) |
---|
41 | except: |
---|
42 | invalid_lines += 1 |
---|
43 | continue |
---|
44 | block_size = fields[18].split(',') |
---|
45 | chrom_start = fields[20].split(',') |
---|
46 | |
---|
47 | for j in range( block_count ): |
---|
48 | try: |
---|
49 | this_block_size = int(block_size[j]) |
---|
50 | this_chrom_start = int(chrom_start[j]) |
---|
51 | except: |
---|
52 | invalid_lines += 1 |
---|
53 | break |
---|
54 | # brut force coverage |
---|
55 | for k in range( this_block_size ): |
---|
56 | cur_index = this_chrom_start + k |
---|
57 | if coverage.has_key( ( chrom, cur_index ) ): |
---|
58 | coverage[(chrom, cur_index)] += 1 |
---|
59 | else: |
---|
60 | coverage[(chrom, cur_index)] = 1 |
---|
61 | |
---|
62 | # generate a index file |
---|
63 | outputfh = open(outfile, 'w') |
---|
64 | keys = coverage.keys() |
---|
65 | keys.sort() |
---|
66 | previous_chrom = '' |
---|
67 | for i in keys: |
---|
68 | (chrom, location) = i |
---|
69 | sum = coverage[(i)] |
---|
70 | if chrom != previous_chrom: |
---|
71 | outputfh.write( 'variableStep chrom=%s\n' % ( chrom ) ) |
---|
72 | previous_chrom = chrom |
---|
73 | outputfh.write( "%s\t%s\n" % ( location, sum ) ) |
---|
74 | outputfh.close() |
---|
75 | |
---|
76 | if invalid_lines: |
---|
77 | invalid_msg = "Skipped %d invalid lines" % invalid_lines |
---|
78 | if invalid_chrom: |
---|
79 | invalid_msg += ", including %d lines with chrom id errors which must begin with 'chr' to map correctly to the UCSC Genome Browser. " |
---|
80 | |
---|
81 | if __name__ == '__main__': __main__() |
---|