#!/usr/bin/env python
import os
import subprocess
import sys
import tempfile

# Guard: refuse to run on interpreters older than Python 2.4.
assert (2, 4) <= sys.version_info[:2]

def _parse_gtf_attributes(attributes_field):
    """Parse the GTF attributes column into a dict mapping name -> value.

    The column is a ';'-separated list of 'name "value"' pairs. Surrounding
    spaces and double quotes are stripped from each value. Empty pairs (for
    example the one after a trailing ';') are ignored.

    Raises:
        ValueError: if a pair has a name but no value.
    """
    attributes = {}
    for name_value_pair in attributes_field.split(";"):
        # Split on the first run of whitespace only, so that a quoted value
        # containing spaces is kept whole instead of being truncated.
        pair = name_value_pair.strip().split(None, 1)
        if not pair:
            continue
        name, value = pair
        # Need to strip double quote from values.
        attributes[name] = value.strip(" \"")
    return attributes


def _gtf_line_to_bedgraph(line, attribute_name):
    """Convert one tab-separated GTF data line into one BedGraph line.

    GTF format: chrom, source, name, chromStart, chromEnd, score, strand,
    frame, attributes.
    BedGraph format: chrom, chromStart, chromEnd, value.

    Raises:
        IndexError: if the line has fewer than nine columns.
        ValueError: if the start column is not an integer or an attribute
            pair is malformed.
        KeyError: if attribute_name is not among the line's attributes.
    """
    elems = line.split("\t")
    # GTF coordinates are 1-based, BedGraph are 0-based.
    start = int(elems[3]) - 1
    value = _parse_gtf_attributes(elems[8])[attribute_name]
    return "%s\t%d\t%s\t%s\n" % (elems[0], start, elems[4], value)


def __main__():
    """Convert a GTF file to a sorted BedGraph file.

    Command-line arguments (read from sys.argv):
        1: path of the input GTF file
        2: path of the output BedGraph file
        3: name of the GTF attribute whose value becomes the BedGraph value

    Blank lines, lines starting with '#' and lines that cannot be converted
    are skipped and counted. The converted records are sorted by chromosome
    name and numeric start with the external `sort` program and written
    after a 'track type=bedGraph' header. A summary is printed to stdout.
    If writing the output or running `sort` fails, the error is written to
    stderr and the process exits with status 1.

    Uses syntax that requires Python 2.6+ or Python 3.
    """
    # Read parms.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    attribute_name = sys.argv[3]

    converted_lines = 0
    skipped_lines = 0
    first_skipped_line = 0

    # mkstemp creates the file atomically and keeps it until we remove it;
    # NamedTemporaryFile().name would hand back the name of a file that is
    # deleted as soon as the temporary object is garbage collected.
    tmp_fd, tmp_name = tempfile.mkstemp()
    try:
        # Write track data to temporary file.
        with os.fdopen(tmp_fd, "w") as tmp_out:
            with open(input_name) as gtf_in:
                for line_number, line in enumerate(gtf_in, 1):
                    line = line.rstrip("\r\n")
                    if not line or line.startswith("#"):
                        record = None
                    else:
                        try:
                            record = _gtf_line_to_bedgraph(line, attribute_name)
                        except (IndexError, KeyError, ValueError):
                            # Invalid line: reported in the skipped count.
                            record = None

                    if record is None:
                        skipped_lines += 1
                        if not first_skipped_line:
                            first_skipped_line = line_number
                    else:
                        tmp_out.write(record)
                        converted_lines += 1

        # Create bedgraph file by combining track definition with track data
        # ordered by chromosome name and chromosome start.
        try:
            with open(output_name, "w") as out:
                out.write("track type=bedGraph\n")
                # Flush so the header is on disk before `sort` appends its
                # output through the same file descriptor.
                out.flush()
                # Argument list (no shell): paths containing spaces or shell
                # metacharacters are passed through untouched, and a non-zero
                # exit status raises instead of being silently ignored.
                subprocess.check_call(
                    ["sort", "-k1,1", "-k2,2n", tmp_name], stdout=out
                )
        except (EnvironmentError, subprocess.CalledProcessError) as ex:
            sys.stderr.write("%s\n" % ex)
            sys.exit(1)
    finally:
        os.remove(tmp_name)

    # Count converted lines directly; deriving the number from the last loop
    # index reported one converted line for an empty input file.
    info_msg = "%i lines converted to BEDGraph. " % converted_lines
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (
            skipped_lines,
            first_skipped_line,
        )
    print(info_msg)

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()