Context Navigation

util.py @ 2

リビジョン 2, 9.5 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2	#Dan Blankenberg
	3
	4	import sys
	5
	6	assert sys.version_info[:2] >= ( 2, 4 )
	7
	8	#genbank_to_bed
	9	class Region:
	10	def __init__( self ):
	11	self.qualifiers = {}
	12	self.start = None
	13	self.end = None
	14	self.strand = '+'
	15	def set_coordinates_by_location( self, location ):
	16	location = location.strip().lower().replace( '..', ',' )
	17	if "complement(" in location: #if part of the sequence is on the negative strand, it all is?
	18	self.strand = '-' #default of + strand
	19	for remove_text in ["join(", "order(", "complement(", ")"]:
	20	location = location.replace( remove_text, "" )
	21	for number in location.split( ',' ):
	22	number = number.strip('\n\r\t <>,()')
	23	if number:
	24	if "^" in number:
	25	#a single point
	26	#check that this is correct for points, ie: 413/NC_005027.gbk: misc_feature 6636286^6636287 ===> 6636285,6636286
	27	end = int( number.split( '^' )[0] )
	28	start = end - 1
	29	else:
	30	end = int( number )
	31	start = end - 1 #match BED coordinates
	32	if self.start is None or start < self.start:
	33	self.start = start
	34	if self.end is None or end > self.end:
	35	self.end = end
	36
	37	class GenBankFeatureParser:
	38	"""Parses Features from Single Locus GenBank file"""
	39	def __init__( self, fh, features_list = [] ):
	40	self.fh = fh
	41	self.features = {}
	42	fh.seek(0)
	43	in_features = False
	44	last_feature_name = None
	45	base_indent = 0
	46	last_indent = 0
	47	last_attr_name = None
	48	for line in fh:
	49	if not in_features and line.startswith('FEATURES'):
	50	in_features = True
	51	continue
	52	if in_features:
	53	lstrip = line.lstrip()
	54	if line and lstrip == line:
	55	break #end of feature block
	56	cur_indent = len( line ) - len( lstrip )
	57	if last_feature_name is None:
	58	base_indent = cur_indent
	59	if cur_indent == base_indent:
	60	#a new feature
	61	last_attr_name = None
	62	fields = lstrip.split( None, 1 )
	63	last_feature_name = fields[0].strip()
	64	if not features_list or ( features_list and last_feature_name in features_list ):
	65	if last_feature_name not in self.features:
	66	self.features[last_feature_name] = []
	67	region = Region()
	68	region.set_coordinates_by_location( fields[1] )
	69	self.features[last_feature_name].append( region )
	70	else:
	71	#add info to last known feature
	72	line = line.strip()
	73	if line.startswith( '/' ):
	74	fields = line[1:].split( '=', 1 )
	75	if len( fields ) == 2:
	76	last_attr_name, content = fields
	77	else:
	78	#No data
	79	last_attr_name = line[1:]
	80	content = ""
	81	content = content.strip( '"' )
	82	if last_attr_name not in self.features[last_feature_name][-1].qualifiers:
	83	self.features[last_feature_name][-1].qualifiers[last_attr_name] = []
	84	self.features[last_feature_name][-1].qualifiers[last_attr_name].append( content )
	85	elif last_attr_name is None and last_feature_name:
	86	# must still be working on location
	87	self.features[last_feature_name][-1].set_coordinates_by_location( line )
	88	else:
	89	#continuation of multi-line qualifier content
	90	if last_feature_name.lower() in ['translation']:
	91	self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s%s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
	92	else:
	93	self.features[last_feature_name][-1].qualifiers[last_attr_name][-1] = "%s %s" % ( self.features[last_feature_name][-1].qualifiers[last_attr_name][-1], line.rstrip( '"' ) )
	94
	95
	96	def get_features_by_type( self, feature_type ):
	97	if feature_type not in self.features:
	98	return []
	99	else:
	100	return self.features[feature_type]
	101
	102	# Parse A GenBank file and return arrays of BED regions for the corresponding features
	103	def get_bed_from_genbank(gb_file, chrom, feature_list):
	104	genbank_parser = GenBankFeatureParser( open( gb_file ) )
	105	features = {}
	106	for feature_type in feature_list:
	107	features[feature_type]=[]
	108	for feature in genbank_parser.get_features_by_type( feature_type ):
	109	name = ""
	110	for name_tag in ['gene', 'locus_tag', 'db_xref']:
	111	if name_tag in feature.qualifiers:
	112	if name: name = name + ";"
	113	name = name + feature.qualifiers[name_tag][0].replace(" ","_")
	114	if not name:
	115	name = "unknown"
	116
	117	features[feature_type].append( "%s\t%s\t%s\t%s\t%s\t%s" % ( chrom, feature.start, feature.end, name, 0, feature.strand ) )#append new bed field here
	118	return features
	119
	120	#geneMark to bed
	121	import sys
	122
	123	#converts GeneMarkHMM to bed
	124	#returns an array of bed regions
	125	def get_bed_from_GeneMark(geneMark_filename, chr):
	126	orfs = open(geneMark_filename).readlines()
	127	while True:
	128	line = orfs.pop(0).strip()
	129	if line.startswith("--------"):
	130	orfs.pop(0)
	131	break
	132	orfs = "".join(orfs)
	133	ctr = 0
	134	regions = []
	135	for block in orfs.split("\n\n"):
	136	if block.startswith("List of Regions of interest"): break
	137	best_block = {'start':0,'end':0,'strand':'+','avg_prob':-sys.maxint,'start_prob':-sys.maxint,'name':'DNE'}
	138	ctr+=1
	139	ctr2=0
	140	for line in block.split("\n"):
	141	ctr2+=1
	142	fields = line.split()
	143	start = int(fields.pop(0))-1
	144	end = int(fields.pop(0))
	145	strand = fields.pop(0)
	146	if strand == 'complement':
	147	strand = "-"
	148	else:
	149	strand = "+"
	150	frame = fields.pop(0)
	151	frame = frame + " " + fields.pop(0)
	152	avg_prob = float(fields.pop(0))
	153	try:
	154	start_prob = float(fields.pop(0))
	155	except:
	156	start_prob = 0
	157	name = "orf_"+str(ctr)+"_"+str(ctr2)
	158	if avg_prob >= best_block['avg_prob']:
	159	if start_prob > best_block['start_prob']:
	160	best_block = {'start':start,'end':end,'strand':strand,'avg_prob':avg_prob,'start_prob':start_prob,'name':name}
	161	regions.append(chr+"\t"+str(best_block['start'])+"\t"+str(best_block['end'])+"\t"+best_block['name']+"\t"+str(int(best_block['avg_prob']*1000))+"\t"+best_block['strand'])
	162	return regions
	163
	164	#geneMarkHMM to bed
	165	#converts GeneMarkHMM to bed
	166	#returns an array of bed regions
	167	def get_bed_from_GeneMarkHMM(geneMarkHMM_filename, chr):
	168	orfs = open(geneMarkHMM_filename).readlines()
	169	while True:
	170	line = orfs.pop(0).strip()
	171	if line == "Predicted genes":
	172	orfs.pop(0)
	173	orfs.pop(0)
	174	break
	175	regions = []
	176	for line in orfs:
	177	fields = line.split()
	178	name = "gene_number_"+fields.pop(0)
	179	strand = fields.pop(0)
	180	start = fields.pop(0)
	181	if start.startswith("<"): start = 1
	182	start = int(start)-1
	183	end = fields.pop(0)
	184	if end.startswith(">"): end = end[1:]
	185	end = int(end)
	186	score = 0 # no scores provided
	187	regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(score)+"\t"+strand)
	188	return regions
	189
	190	#glimmer3 to bed
	191	#converts glimmer3 to bed, doing some linear scaling (probably not correct?) on scores
	192	#returns an array of bed regions
	193	def get_bed_from_glimmer3(glimmer3_filename, chr):
	194	max_score = -sys.maxint
	195	min_score = sys.maxint
	196	orfs = []
	197	for line in open(glimmer3_filename).readlines():
	198	if line.startswith(">"): continue
	199	fields = line.split()
	200	name = fields.pop(0)
	201	start = int(fields.pop(0))
	202	end = int(fields.pop(0))
	203	if int(fields.pop(0))<0:
	204	strand = "-"
	205	temp = start
	206	start = end
	207	end = temp
	208	else:
	209	strand = "+"
	210	start = start - 1
	211	score = (float(fields.pop(0)))
	212	if score > max_score: max_score = score
	213	if score < min_score: min_score = score
	214	orfs.append((chr,start,end,name,score,strand))
	215
	216	delta = 0
	217	if min_score < 0: delta = min_score * -1
	218	regions = []
	219	for (chr,start,end,name,score,strand) in orfs:
	220	#need to cast to str because was having the case where 1000.0 was rounded to 999 by int, some sort of precision bug?
	221	my_score = int(float(str( ( (score+delta) * (1000-0-(min_score+delta)) ) / ( (max_score + delta) + 0 ))))
	222
	223	regions.append(chr+"\t"+str(start)+"\t"+str(end)+"\t"+name+"\t"+str(my_score)+"\t"+strand)
	224	return regions

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/scripts/microbes/util.py @ 2

異なるフォーマットでダウンロード: