root/galaxy-central/tools/filters/gff_to_bed_converter.py @ 2

リビジョン 2, 5.0 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2import sys
3from galaxy import eggs
4from galaxy.tools.util.gff_util import parse_gff_attributes
5
6assert sys.version_info[:2] >= ( 2, 4 )
7
8def get_bed_line( chrom, name, strand, blocks ):
9    """ Returns a BED line for given data. """
10
11   
12    if len( blocks ) == 1:
13        # Use simple BED format if there is only a single block:
14        #   chrom, chromStart, chromEnd, name, score, strand
15        #
16        start, end = blocks[0]
17        return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand )
18
19    #
20    # Build lists for transcript blocks' starts, sizes.
21    #
22   
23    # Get transcript start, end.
24    t_start = sys.maxint
25    t_end = -1
26    for block_start, block_end in blocks:
27        if block_start < t_start:
28            t_start = block_start
29        if block_end > t_end:
30            t_end = block_end
31           
32    # Get block starts, sizes.
33    block_starts = []
34    block_sizes = []
35    for block_start, block_end in blocks:
36        block_starts.append( str( block_start - t_start ) )
37        block_sizes.append( str( block_end - block_start ) )
38   
39    #
40    # Create BED entry.
41    # Bed format: chrom, chromStart, chromEnd, name, score, strand, \
42    #               thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
43    #
44    return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \
45            ( chrom, t_start, t_end, name, strand, t_start, t_start, len( block_starts ),
46                ",".join( block_sizes ), ",".join( block_starts ) )
47
48def __main__():
49    input_name = sys.argv[1]
50    output_name = sys.argv[2]
51    skipped_lines = 0
52    first_skipped_line = 0
53    out = open( output_name, 'w' )
54    i = 0
55    cur_transcript_chrom = None
56    cur_transcript_id = None
57    cur_transcript_strand = None
58    cur_transcripts_blocks = [] # (start, end) for each block.
59    for i, line in enumerate( file( input_name ) ):
60        line = line.rstrip( '\r\n' )
61        if line and not line.startswith( '#' ):
62            try:
63                # GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes
64                elems = line.split( '\t' )
65                start = str( long( elems[3] ) - 1 )
66                coords = [ long( start ), long( elems[4] ) ]
67                strand = elems[6]
68                if strand not in ['+', '-']:
69                    strand = '+'
70                attributes = parse_gff_attributes( elems[8] )
71                t_id = attributes.get( "transcript_id", None )
72                   
73                if not t_id:
74                    #
75                    # No transcript ID, so write last transcript and write current line as its own line.
76                    #
77                   
78                    # Write previous transcript.
79                    if cur_transcript_id:
80                        # Write BED entry.
81                        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
82                   
83                    # Replace any spaces in the name with underscores so UCSC will not complain.
84                    name = elems[2].replace(" ", "_")
85                    out.write( get_bed_line( elems[0], name, strand, [ coords ] ) )
86                    continue
87               
88                # There is a transcript ID, so process line at transcript level.
89                if t_id == cur_transcript_id:
90                    # Line is element of transcript and will be a block in the BED entry.
91                    cur_transcripts_blocks.append( coords )
92                    continue
93                   
94                #
95                # Line is part of new transcript; write previous transcript and start
96                # new transcript.
97                #
98               
99                # Write previous transcript.
100                if cur_transcript_id:
101                    # Write BED entry.
102                    out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
103
104                # Start new transcript.
105                cur_transcript_chrome = elems[0]
106                cur_transcript_id = t_id
107                cur_transcript_strand = strand
108                cur_transcripts_blocks = []
109                cur_transcripts_blocks.append( coords )   
110            except:
111                skipped_lines += 1
112                if not first_skipped_line:
113                    first_skipped_line = i + 1
114        else:
115            skipped_lines += 1
116            if not first_skipped_line:
117                first_skipped_line = i + 1
118   
119    # Write last transcript.
120    if cur_transcript_id:
121        # Write BED entry.
122        out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
123    out.close()
124    info_msg = "%i lines converted to BED.  " % ( i + 1 - skipped_lines )
125    if skipped_lines > 0:
126        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
127    print info_msg
128
129if __name__ == "__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。