Context Navigation

ncbi_to_ucsc.py @ 2

リビジョン 2, 5.5 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2
	3	"""
	4	Walk downloaded Genome Projects and Convert, in place, IDs to match the UCSC Archaea browser, where applicable.
	5	Uses UCSC Archaea DSN.
	6	"""
	7
	8	import sys, os
	9	import urllib
	10	from elementtree import ElementTree
	11	from BeautifulSoup import BeautifulSoup
	12	from shutil import move
	13
	14	def __main__():
	15	base_dir = os.path.join( os.getcwd(), "bacteria" )
	16	try:
	17	base_dir = sys.argv[1]
	18	except:
	19	print "using default base_dir:", base_dir
	20
	21	organisms = {}
	22	for result in os.walk(base_dir):
	23	this_base_dir,sub_dirs,files = result
	24	for file in files:
	25	if file[-5:] == ".info":
	26	dict = {}
	27	info_file = open(os.path.join(this_base_dir,file),'r')
	28	info = info_file.readlines()
	29	info_file.close()
	30	for line in info:
	31	fields = line.replace("\n","").split("=")
	32	dict[fields[0]]="=".join(fields[1:])
	33	if 'genome project id' in dict.keys():
	34	if dict['genome project id'] not in organisms.keys():
	35	organisms[dict['genome project id']] = {'chrs':{},'base_dir':this_base_dir}
	36	for key in dict.keys():
	37	organisms[dict['genome project id']][key]=dict[key]
	38	else:
	39	if dict['organism'] not in organisms.keys():
	40	organisms[dict['organism']] = {'chrs':{},'base_dir':this_base_dir}
	41	organisms[dict['organism']]['chrs'][dict['chromosome']]=dict
	42
	43	##get UCSC data
	44
	45	URL = "http://archaea.ucsc.edu/cgi-bin/das/dsn"
	46
	47
	48	try:
	49	page = urllib.urlopen(URL)
	50	except:
	51	print "#Unable to open " + URL
	52	print "?\tunspecified (?)"
	53	sys.exit(1)
	54
	55	text = page.read()
	56	try:
	57	tree = ElementTree.fromstring(text)
	58	except:
	59	print "#Invalid xml passed back from " + URL
	60	print "?\tunspecified (?)"
	61	sys.exit(1)
	62
	63
	64	builds = {}
	65
	66
	67	#print "#Harvested from http://archaea.ucsc.edu/cgi-bin/das/dsn"
	68	#print "?\tunspecified (?)"
	69	for dsn in tree:
	70	build = dsn.find("SOURCE").attrib['id']
	71	try:
	72	org_page = urllib.urlopen("http://archaea.ucsc.edu/cgi-bin/hgGateway?db="+build).read().replace("\n","").split("<table border=2 cellspacing=2 cellpadding=2>")[1].split("</table>")[0].split("</tr>")
	73	except:
	74	print "NO CHROMS FOR",build
	75	continue
	76	org_page.pop(0)
	77	if org_page[-1]=="":
	78	org_page.pop(-1)
	79
	80	for row in org_page:
	81	chr = row.split("</a>")[0].split(">")[-1]
	82	refseq = row.split("</a>")[-2].split(">")[-1]
	83	for org in organisms:
	84	for org_chr in organisms[org]['chrs']:
	85	if organisms[org]['chrs'][org_chr]['chromosome']==refseq:
	86	if org not in builds:
	87	builds[org]={'chrs':{},'build':build}
	88	builds[org]['chrs'][refseq]=chr
	89	#print build,org,chr,refseq
	90
	91	print
	92	ext_to_edit = ['bed', 'info', ]
	93	for org in builds:
	94	print org,"changed to",builds[org]['build']
	95
	96	#org info file
	97	info_file_old = os.path.join(base_dir+org,org+".info")
	98	info_file_new = os.path.join(base_dir+org,builds[org]['build']+".info")
	99
	100
	101	old_dir = base_dir+org
	102	new_dir = base_dir+builds[org]['build']
	103
	104	#open and edit org info file
	105	info_file_contents = open(info_file_old).read()
	106	info_file_contents = info_file_contents+"build="+builds[org]['build']+"\n"
	107	for chrom in builds[org]['chrs']:
	108	info_file_contents = info_file_contents.replace(chrom,builds[org]['chrs'][chrom])
	109	for result in os.walk(base_dir+org):
	110	this_base_dir,sub_dirs,files = result
	111	for file in files:
	112	if file[0:len(chrom)]==chrom:
	113	#rename file
	114	old_name = os.path.join(this_base_dir,file)
	115	new_name = os.path.join(this_base_dir,builds[org]['chrs'][chrom]+file[len(chrom):])
	116	move(old_name,new_name)
	117
	118	#edit contents of file, skiping those in list
	119	if file.split(".")[-1] not in ext_to_edit: continue
	120
	121	file_contents = open(new_name).read()
	122	file_contents = file_contents.replace(chrom,builds[org]['chrs'][chrom])
	123
	124	#special case fixes...
	125	if file[-5:] == ".info":
	126	file_contents = file_contents.replace("organism="+org,"organism="+builds[org]['build'])
	127	file_contents = file_contents.replace("refseq="+builds[org]['chrs'][chrom],"refseq="+chrom)
	128
	129	#write out new file
	130	file_out = open(new_name,'w')
	131	file_out.write(file_contents)
	132	file_out.close()
	133
	134
	135
	136	#write out org info file and remove old file
	137	org_info_out = open(info_file_new,'w')
	138	org_info_out.write(info_file_contents)
	139	org_info_out.close()
	140	os.unlink(info_file_old)
	141
	142	#change org directory name
	143	move(old_dir,new_dir)
	144
	145
	146	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/scripts/microbes/ncbi_to_ucsc.py @ 2

異なるフォーマットでダウンロード: