Context Navigation

gff_to_bed_converter.py @ 2

リビジョン 2, 5.0 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	import sys
3	from galaxy import eggs
4	from galaxy.tools.util.gff_util import parse_gff_attributes
5
6	assert sys.version_info[:2] >= ( 2, 4 )
7
8	def get_bed_line( chrom, name, strand, blocks ):
9	""" Returns a BED line for given data. """
10
11
12	if len( blocks ) == 1:
13	# Use simple BED format if there is only a single block:
14	# chrom, chromStart, chromEnd, name, score, strand
15	#
16	start, end = blocks[0]
17	return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand )
18
19	#
20	# Build lists for transcript blocks' starts, sizes.
21	#
22
23	# Get transcript start, end.
24	t_start = sys.maxint
25	t_end = -1
26	for block_start, block_end in blocks:
27	if block_start < t_start:
28	t_start = block_start
29	if block_end > t_end:
30	t_end = block_end
31
32	# Get block starts, sizes.
33	block_starts = []
34	block_sizes = []
35	for block_start, block_end in blocks:
36	block_starts.append( str( block_start - t_start ) )
37	block_sizes.append( str( block_end - block_start ) )
38
39	#
40	# Create BED entry.
41	# Bed format: chrom, chromStart, chromEnd, name, score, strand, \
42	# thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
43	#
44	return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \
45	( chrom, t_start, t_end, name, strand, t_start, t_start, len( block_starts ),
46	",".join( block_sizes ), ",".join( block_starts ) )
47
48	def __main__():
49	input_name = sys.argv[1]
50	output_name = sys.argv[2]
51	skipped_lines = 0
52	first_skipped_line = 0
53	out = open( output_name, 'w' )
54	i = 0
55	cur_transcript_chrom = None
56	cur_transcript_id = None
57	cur_transcript_strand = None
58	cur_transcripts_blocks = [] # (start, end) for each block.
59	for i, line in enumerate( file( input_name ) ):
60	line = line.rstrip( '\r\n' )
61	if line and not line.startswith( '#' ):
62	try:
63	# GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes
64	elems = line.split( '\t' )
65	start = str( long( elems[3] ) - 1 )
66	coords = [ long( start ), long( elems[4] ) ]
67	strand = elems[6]
68	if strand not in ['+', '-']:
69	strand = '+'
70	attributes = parse_gff_attributes( elems[8] )
71	t_id = attributes.get( "transcript_id", None )
72
73	if not t_id:
74	#
75	# No transcript ID, so write last transcript and write current line as its own line.
76	#
77
78	# Write previous transcript.
79	if cur_transcript_id:
80	# Write BED entry.
81	out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
82
83	# Replace any spaces in the name with underscores so UCSC will not complain.
84	name = elems[2].replace(" ", "_")
85	out.write( get_bed_line( elems[0], name, strand, [ coords ] ) )
86	continue
87
88	# There is a transcript ID, so process line at transcript level.
89	if t_id == cur_transcript_id:
90	# Line is element of transcript and will be a block in the BED entry.
91	cur_transcripts_blocks.append( coords )
92	continue
93
94	#
95	# Line is part of new transcript; write previous transcript and start
96	# new transcript.
97	#
98
99	# Write previous transcript.
100	if cur_transcript_id:
101	# Write BED entry.
102	out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
103
104	# Start new transcript.
105	cur_transcript_chrome = elems[0]
106	cur_transcript_id = t_id
107	cur_transcript_strand = strand
108	cur_transcripts_blocks = []
109	cur_transcripts_blocks.append( coords )
110	except:
111	skipped_lines += 1
112	if not first_skipped_line:
113	first_skipped_line = i + 1
114	else:
115	skipped_lines += 1
116	if not first_skipped_line:
117	first_skipped_line = i + 1
118
119	# Write last transcript.
120	if cur_transcript_id:
121	# Write BED entry.
122	out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
123	out.close()
124	info_msg = "%i lines converted to BED. " % ( i + 1 - skipped_lines )
125	if skipped_lines > 0:
126	info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
127	print info_msg
128
129	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/filters/gff_to_bed_converter.py @ 2

異なるフォーマットでダウンロード: