Context Navigation

ncbi_to_ucsc.py @ 2

リビジョン 2, 5.5 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	"""
4	Walk downloaded Genome Projects and Convert, in place, IDs to match the UCSC Archaea browser, where applicable.
5	Uses UCSC Archaea DSN.
6	"""
7
8	import sys, os
9	import urllib
10	from elementtree import ElementTree
11	from BeautifulSoup import BeautifulSoup
12	from shutil import move
13
14	def __main__():
15	base_dir = os.path.join( os.getcwd(), "bacteria" )
16	try:
17	base_dir = sys.argv[1]
18	except:
19	print "using default base_dir:", base_dir
20
21	organisms = {}
22	for result in os.walk(base_dir):
23	this_base_dir,sub_dirs,files = result
24	for file in files:
25	if file[-5:] == ".info":
26	dict = {}
27	info_file = open(os.path.join(this_base_dir,file),'r')
28	info = info_file.readlines()
29	info_file.close()
30	for line in info:
31	fields = line.replace("\n","").split("=")
32	dict[fields[0]]="=".join(fields[1:])
33	if 'genome project id' in dict.keys():
34	if dict['genome project id'] not in organisms.keys():
35	organisms[dict['genome project id']] = {'chrs':{},'base_dir':this_base_dir}
36	for key in dict.keys():
37	organisms[dict['genome project id']][key]=dict[key]
38	else:
39	if dict['organism'] not in organisms.keys():
40	organisms[dict['organism']] = {'chrs':{},'base_dir':this_base_dir}
41	organisms[dict['organism']]['chrs'][dict['chromosome']]=dict
42
43	##get UCSC data
44
45	URL = "http://archaea.ucsc.edu/cgi-bin/das/dsn"
46
47
48	try:
49	page = urllib.urlopen(URL)
50	except:
51	print "#Unable to open " + URL
52	print "?\tunspecified (?)"
53	sys.exit(1)
54
55	text = page.read()
56	try:
57	tree = ElementTree.fromstring(text)
58	except:
59	print "#Invalid xml passed back from " + URL
60	print "?\tunspecified (?)"
61	sys.exit(1)
62
63
64	builds = {}
65
66
67	#print "#Harvested from http://archaea.ucsc.edu/cgi-bin/das/dsn"
68	#print "?\tunspecified (?)"
69	for dsn in tree:
70	build = dsn.find("SOURCE").attrib['id']
71	try:
72	org_page = urllib.urlopen("http://archaea.ucsc.edu/cgi-bin/hgGateway?db="+build).read().replace("\n","").split("<table border=2 cellspacing=2 cellpadding=2>")[1].split("</table>")[0].split("</tr>")
73	except:
74	print "NO CHROMS FOR",build
75	continue
76	org_page.pop(0)
77	if org_page[-1]=="":
78	org_page.pop(-1)
79
80	for row in org_page:
81	chr = row.split("</a>")[0].split(">")[-1]
82	refseq = row.split("</a>")[-2].split(">")[-1]
83	for org in organisms:
84	for org_chr in organisms[org]['chrs']:
85	if organisms[org]['chrs'][org_chr]['chromosome']==refseq:
86	if org not in builds:
87	builds[org]={'chrs':{},'build':build}
88	builds[org]['chrs'][refseq]=chr
89	#print build,org,chr,refseq
90
91	print
92	ext_to_edit = ['bed', 'info', ]
93	for org in builds:
94	print org,"changed to",builds[org]['build']
95
96	#org info file
97	info_file_old = os.path.join(base_dir+org,org+".info")
98	info_file_new = os.path.join(base_dir+org,builds[org]['build']+".info")
99
100
101	old_dir = base_dir+org
102	new_dir = base_dir+builds[org]['build']
103
104	#open and edit org info file
105	info_file_contents = open(info_file_old).read()
106	info_file_contents = info_file_contents+"build="+builds[org]['build']+"\n"
107	for chrom in builds[org]['chrs']:
108	info_file_contents = info_file_contents.replace(chrom,builds[org]['chrs'][chrom])
109	for result in os.walk(base_dir+org):
110	this_base_dir,sub_dirs,files = result
111	for file in files:
112	if file[0:len(chrom)]==chrom:
113	#rename file
114	old_name = os.path.join(this_base_dir,file)
115	new_name = os.path.join(this_base_dir,builds[org]['chrs'][chrom]+file[len(chrom):])
116	move(old_name,new_name)
117
118	#edit contents of file, skiping those in list
119	if file.split(".")[-1] not in ext_to_edit: continue
120
121	file_contents = open(new_name).read()
122	file_contents = file_contents.replace(chrom,builds[org]['chrs'][chrom])
123
124	#special case fixes...
125	if file[-5:] == ".info":
126	file_contents = file_contents.replace("organism="+org,"organism="+builds[org]['build'])
127	file_contents = file_contents.replace("refseq="+builds[org]['chrs'][chrom],"refseq="+chrom)
128
129	#write out new file
130	file_out = open(new_name,'w')
131	file_out.write(file_contents)
132	file_out.close()
133
134
135
136	#write out org info file and remove old file
137	org_info_out = open(info_file_new,'w')
138	org_info_out.write(info_file_contents)
139	org_info_out.close()
140	os.unlink(info_file_old)
141
142	#change org directory name
143	move(old_dir,new_dir)
144
145
146	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/scripts/microbes/ncbi_to_ucsc.py @ 2

異なるフォーマットでダウンロード: