1 | #!/usr/bin/env python |
---|
2 | # This code exists in 2 places: ~/datatypes/converters and ~/tools/filters |
---|
3 | import sys |
---|
4 | |
---|
# Refuse to load on interpreters older than Python 2.4.
assert not sys.version_info[:2] < (2, 4)
---|
6 | |
---|
def _optional_field(elems, index, default):
    """Return ``elems[index]`` when that column exists, otherwise ``default``."""
    if len(elems) > index:
        return elems[index]
    return default


def _bed_record_to_gff(elems, line_number):
    """Build the GFF2 lines describing one BED record.

    elems       -- the tab-separated columns of a single BED line.
    line_number -- 1-based line number of the record in the input; used to
                   name the feature/group when the BED name column is absent.

    Returns a list of newline-terminated GFF lines: the feature line and, for
    a 12-column BED record, one ``exon`` line per block.

    Raises ValueError or IndexError when a required column is missing or not
    an integer.  Every line is built before any is returned, so a malformed
    record can never end up partially written by the caller.
    """
    # Only a full 12-column record carries the block (exon) description.
    complete_bed = len(elems) == 12
    chrom = elems[0]
    # BED start coordinates are 0-based while GFF ones are 1-based, hence the
    # shift by one; the end coordinate is used unchanged.
    start = int(elems[1]) + 1
    end = int(elems[2])
    score = _optional_field(elems, 4, '0')
    strand = _optional_field(elems, 5, '+')
    group = _optional_field(elems, 3, 'group%d' % line_number)
    if complete_bed:
        feature = 'mRNA'
        attributes = '%s %s;' % (feature, group)
    else:
        feature = _optional_field(elems, 3, 'feature%d' % line_number)
        attributes = '%s;' % group

    gff_lines = ['%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s\n'
                 % (chrom, feature, start, end, score, strand, attributes)]

    if complete_bed:
        # We have all the info necessary to annotate exons for genes and mRNAs
        block_count = int(elems[9])
        block_sizes = elems[10].split(',')
        block_starts = elems[11].split(',')
        for j in range(block_count):
            # Block starts are offsets relative to the feature start.
            exon_start = start + int(block_starts[j])
            exon_end = exon_start + int(block_sizes[j]) - 1
            gff_lines.append('%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n'
                             % (chrom, exon_start, exon_end, score, strand, group))
    return gff_lines


def __main__():
    """Convert a BED file to GFF version 2.

    Reads the input path from ``sys.argv[1]`` and writes the GFF output to
    the path in ``sys.argv[2]``.  Blank lines, ``#`` comments, ``track`` and
    ``browser`` lines, and records that cannot be parsed are skipped.  A
    one-line summary (converted count, skipped count and the first skipped
    line number) is printed to standard output.

    Errors opening, reading or writing the files propagate to the caller
    instead of being counted as skipped lines.
    """
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    total_lines = 0
    skipped_lines = 0
    first_skipped_line = 0
    out = open(output_name, 'w')
    try:
        out.write("##gff-version 2\n")
        out.write("##bed_to_gff_converter.py\n\n")
        # open() rather than the Python-2-only file() builtin.
        bed_file = open(input_name)
        try:
            for i, line in enumerate(bed_file):
                line_number = i + 1
                # Tracked explicitly so an empty input reports 0 lines.
                total_lines = line_number
                line = line.rstrip('\r\n')
                gff_lines = None
                if line and not line.startswith('#') and not line.startswith('track') and not line.startswith('browser'):
                    try:
                        gff_lines = _bed_record_to_gff(line.split('\t'), line_number)
                    except (ValueError, IndexError):
                        # Malformed record: leave gff_lines as None so it is
                        # counted as skipped below and nothing is written.
                        gff_lines = None
                if gff_lines is None:
                    skipped_lines += 1
                    if not first_skipped_line:
                        first_skipped_line = line_number
                else:
                    out.writelines(gff_lines)
        finally:
            bed_file.close()
    finally:
        out.close()
    info_msg = "%i lines converted to GFF version 2. " % (total_lines - skipped_lines)
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (skipped_lines, first_skipped_line)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(info_msg)
---|
72 | |
---|
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()
---|