Context Navigation

gff_to_bed_converter.py @ 2

リビジョン 2, 5.0 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2	import sys
	3	from galaxy import eggs
	4	from galaxy.tools.util.gff_util import parse_gff_attributes
	5
	6	assert sys.version_info[:2] >= ( 2, 4 )
	7
	8	def get_bed_line( chrom, name, strand, blocks ):
	9	""" Returns a BED line for given data. """
	10
	11
	12	if len( blocks ) == 1:
	13	# Use simple BED format if there is only a single block:
	14	# chrom, chromStart, chromEnd, name, score, strand
	15	#
	16	start, end = blocks[0]
	17	return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand )
	18
	19	#
	20	# Build lists for transcript blocks' starts, sizes.
	21	#
	22
	23	# Get transcript start, end.
	24	t_start = sys.maxint
	25	t_end = -1
	26	for block_start, block_end in blocks:
	27	if block_start < t_start:
	28	t_start = block_start
	29	if block_end > t_end:
	30	t_end = block_end
	31
	32	# Get block starts, sizes.
	33	block_starts = []
	34	block_sizes = []
	35	for block_start, block_end in blocks:
	36	block_starts.append( str( block_start - t_start ) )
	37	block_sizes.append( str( block_end - block_start ) )
	38
	39	#
	40	# Create BED entry.
	41	# Bed format: chrom, chromStart, chromEnd, name, score, strand, \
	42	# thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
	43	#
	44	return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \
	45	( chrom, t_start, t_end, name, strand, t_start, t_start, len( block_starts ),
	46	",".join( block_sizes ), ",".join( block_starts ) )
	47
	48	def __main__():
	49	input_name = sys.argv[1]
	50	output_name = sys.argv[2]
	51	skipped_lines = 0
	52	first_skipped_line = 0
	53	out = open( output_name, 'w' )
	54	i = 0
	55	cur_transcript_chrom = None
	56	cur_transcript_id = None
	57	cur_transcript_strand = None
	58	cur_transcripts_blocks = [] # (start, end) for each block.
	59	for i, line in enumerate( file( input_name ) ):
	60	line = line.rstrip( '\r\n' )
	61	if line and not line.startswith( '#' ):
	62	try:
	63	# GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes
	64	elems = line.split( '\t' )
	65	start = str( long( elems[3] ) - 1 )
	66	coords = [ long( start ), long( elems[4] ) ]
	67	strand = elems[6]
	68	if strand not in ['+', '-']:
	69	strand = '+'
	70	attributes = parse_gff_attributes( elems[8] )
	71	t_id = attributes.get( "transcript_id", None )
	72
	73	if not t_id:
	74	#
	75	# No transcript ID, so write last transcript and write current line as its own line.
	76	#
	77
	78	# Write previous transcript.
	79	if cur_transcript_id:
	80	# Write BED entry.
	81	out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
	82
	83	# Replace any spaces in the name with underscores so UCSC will not complain.
	84	name = elems[2].replace(" ", "_")
	85	out.write( get_bed_line( elems[0], name, strand, [ coords ] ) )
	86	continue
	87
	88	# There is a transcript ID, so process line at transcript level.
	89	if t_id == cur_transcript_id:
	90	# Line is element of transcript and will be a block in the BED entry.
	91	cur_transcripts_blocks.append( coords )
	92	continue
	93
	94	#
	95	# Line is part of new transcript; write previous transcript and start
	96	# new transcript.
	97	#
	98
	99	# Write previous transcript.
	100	if cur_transcript_id:
	101	# Write BED entry.
	102	out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
	103
	104	# Start new transcript.
	105	cur_transcript_chrome = elems[0]
	106	cur_transcript_id = t_id
	107	cur_transcript_strand = strand
	108	cur_transcripts_blocks = []
	109	cur_transcripts_blocks.append( coords )
	110	except:
	111	skipped_lines += 1
	112	if not first_skipped_line:
	113	first_skipped_line = i + 1
	114	else:
	115	skipped_lines += 1
	116	if not first_skipped_line:
	117	first_skipped_line = i + 1
	118
	119	# Write last transcript.
	120	if cur_transcript_id:
	121	# Write BED entry.
	122	out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) )
	123	out.close()
	124	info_msg = "%i lines converted to BED. " % ( i + 1 - skipped_lines )
	125	if skipped_lines > 0:
	126	info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
	127	print info_msg
	128
	129	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/filters/gff_to_bed_converter.py @ 2

異なるフォーマットでダウンロード: