#!/usr/bin/env python
import os
import subprocess
import sys
import tempfile

# Refuse to run on interpreters older than Python 2.4.
assert (2, 4) <= sys.version_info[:2]

def _parse_gtf_attributes(attribute_field):
    """Parse the GTF attributes column into a ``{name: value}`` dict.

    The column is a ``;``-separated list of ``name value`` pairs; surrounding
    spaces and double quotes are stripped from each value.  Values that
    themselves contain ``;`` are not supported because the column is split
    on every semicolon.

    Raises:
        ValueError: if an attribute has a name but no value.
    """
    attributes = {}
    for name_value_pair in attribute_field.split(";"):
        # Split on the first run of whitespace only, so repeated spaces
        # between name and value do not produce an empty value and values
        # containing spaces are kept whole.
        pair = name_value_pair.strip().split(None, 1)
        if not pair:
            continue  # Empty entry, e.g. after the trailing ';'.
        if len(pair) < 2:
            raise ValueError("GTF attribute %r has no value" % pair[0])
        attributes[pair[0]] = pair[1].strip(' "')
    return attributes


def _gtf_line_to_bedgraph(line, attribute_name):
    """Convert one GTF line to a BedGraph line, or return None if unusable.

    GTF columns: chrom, source, name, chromStart, chromEnd, score, strand,
    frame, attributes.  BedGraph columns: chrom, chromStart, chromEnd, value.

    Args:
        line: one raw line of the GTF file (line ending optional).
        attribute_name: name of the attribute whose value is emitted.

    Returns:
        The newline-terminated BedGraph line, or None for blank lines,
        comment lines (starting with ``#``) and lines that lack the needed
        columns, a numeric start, or the requested attribute.
    """
    line = line.rstrip("\r\n")
    if not line or line.startswith("#"):
        return None
    try:
        elems = line.split("\t")
        # GTF coordinates are 1-based, BedGraph are 0-based.
        start = int(elems[3]) - 1
        value = _parse_gtf_attributes(elems[8])[attribute_name]
        return "%s\t%d\t%s\t%s\n" % (elems[0], start, elems[4], value)
    except (IndexError, KeyError, ValueError):
        # Too few columns, missing attribute, or malformed number/attribute.
        return None


def __main__():
    """Convert a GTF file to a sorted BedGraph file.

    Command-line arguments (read from ``sys.argv``):
        1. path of the input GTF file
        2. path of the BedGraph file to write
        3. name of the GTF attribute whose value becomes the BedGraph value

    Blank, comment and unconvertible lines are skipped and counted.  The
    converted rows are written to a temporary file, sorted by chromosome
    name and numeric start with the external ``sort`` program, and stored in
    the output file after a ``track type=bedGraph`` header.  A one-line
    summary is printed to stdout.  The process exits with status 1 if
    sorting or writing the output fails.
    """
    # Read parms.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    attribute_name = sys.argv[3]

    converted_lines = 0
    skipped_lines = 0
    first_skipped_line = 0

    # mkstemp creates the file atomically and keeps it until we remove it,
    # unlike NamedTemporaryFile().name, which deletes the file immediately
    # and leaves only a guessable, reusable path.
    tmp_fd, tmp_name = tempfile.mkstemp()
    try:
        # Write unsorted track data to the temporary file.
        with os.fdopen(tmp_fd, "w") as unsorted:
            with open(input_name) as gtf:
                for line_number, line in enumerate(gtf, 1):
                    bedgraph_line = _gtf_line_to_bedgraph(line, attribute_name)
                    if bedgraph_line is None:
                        skipped_lines += 1
                        if not first_skipped_line:
                            first_skipped_line = line_number
                    else:
                        unsorted.write(bedgraph_line)
                        converted_lines += 1

        # Create the bedgraph file: track definition followed by the track
        # data ordered by chromosome name and chromosome start.
        try:
            with open(output_name, "w") as out:
                out.write("track type=bedGraph\n")
                # Flush so the header is on disk before sort appends to the
                # same file descriptor.
                out.flush()
                # Argument list (no shell): paths need no quoting, and a
                # non-zero exit status raises instead of being ignored.
                subprocess.check_call(
                    ["sort", "-k1,1", "-k2,2n", tmp_name], stdout=out
                )
        except (EnvironmentError, subprocess.CalledProcessError) as ex:
            sys.stderr.write("%s\n" % ex)
            sys.exit(1)
    finally:
        # Always clean up, including on failure or sys.exit().
        os.remove(tmp_name)

    info_msg = "%i lines converted to BEDGraph. " % converted_lines
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (skipped_lines, first_skipped_line)
    # Single-argument print() behaves identically on Python 2 and 3.
    print(info_msg)

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    __main__()