[2] | 1 | #!/usr/bin/env python |
---|
| 2 | # This code exists in 2 places: ~/datatypes/converters and ~/tools/filters |
---|
| 3 | import sys |
---|
| 4 | |
---|
# Refuse to run on interpreters older than Python 2.4.
assert (2, 4) <= sys.version_info[:2]
---|
| 6 | |
---|
def _bed_line_to_gff(line, line_number):
    """Return the GFF2 lines (newline-terminated) for one BED data line.

    line        -- a tab-separated BED line with the line ending removed.
    line_number -- 1-based position of the line in the input; used to build
                   the default 'featureN' / 'groupN' names when column 4 is
                   absent.

    A line with exactly 12 columns yields one 'mRNA' record followed by one
    'exon' record per block; any other line yields a single record.

    Raises IndexError when a required column or block entry is missing, and
    ValueError when a coordinate or block field is not an integer.  Nothing
    is returned in that case, so callers never see a partial record set.
    """
    elems = line.split('\t')
    complete_bed = len(elems) == 12

    def column(index, default):
        # Optional BED columns fall back to a default when absent.
        if len(elems) > index:
            return elems[index]
        return default

    chrom = elems[0]
    # The start is shifted by one (BED starts are 0-based, GFF 1-based);
    # the end is used unchanged.
    start = int(elems[1]) + 1
    end = int(elems[2])
    score = column(4, '0')
    strand = column(5, '+')
    group = column(3, 'group%d' % line_number)

    if not complete_bed:
        feature = column(3, 'feature%d' % line_number)
        return ['%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n'
                % (chrom, feature, start, end, score, strand, group)]

    feature = 'mRNA'
    records = ['%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n'
               % (chrom, feature, start, end, score, strand, feature, group)]
    # A 12-column line carries the block columns needed to annotate exons.
    block_count = int(elems[9])
    block_sizes = elems[10].split(',')
    block_starts = elems[11].split(',')
    for j in range(block_count):
        exon_start = start + int(block_starts[j])
        exon_end = exon_start + int(block_sizes[j]) - 1
        records.append('%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n'
                       % (chrom, exon_start, exon_end, score, strand, group))
    return records


def __main__():
    """Convert a BED file to GFF version 2.

    Reads two command-line arguments from sys.argv:
        sys.argv[1] -- path of the BED file to read
        sys.argv[2] -- path of the GFF file to write (opened with mode 'w')

    Blank lines, lines starting with '#', 'track' or 'browser', and lines
    that cannot be parsed are skipped and counted.  A summary of converted
    and skipped lines is printed to standard output.

    Raises IndexError when an argument is missing; errors from opening,
    reading or writing the files propagate to the caller instead of being
    counted as skipped lines.
    """
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    skipped_lines = 0
    first_skipped_line = 0
    # Counted explicitly so that an empty input reports 0 lines, not 1.
    line_count = 0
    out = open(output_name, 'w')
    try:
        out.write("##gff-version 2\n")
        out.write("##bed_to_gff_converter.py\n\n")
        bed = open(input_name)
        try:
            for line in bed:
                line_count += 1
                line = line.rstrip('\r\n')
                records = None
                is_data = line and not (line.startswith('#')
                                        or line.startswith('track')
                                        or line.startswith('browser'))
                if is_data:
                    try:
                        # Build every record for the line before writing, so
                        # an invalid line never leaves partial output behind.
                        records = _bed_line_to_gff(line, line_count)
                    except (IndexError, ValueError):
                        records = None
                if records is None:
                    skipped_lines += 1
                    if not first_skipped_line:
                        first_skipped_line = line_count
                else:
                    out.writelines(records)
        finally:
            bed.close()
    finally:
        out.close()
    info_msg = "%i lines converted to GFF version 2. " % (line_count - skipped_lines)
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (skipped_lines, first_skipped_line)
    # Parenthesised single argument prints identically on Python 2 and 3.
    print(info_msg)
---|
| 72 | |
---|
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    __main__()
---|