Context Navigation

build_profile_indexes.py @ 2

リビジョン 2, 16.5 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	#Dan Blankenberg
3
4	VERSION = '1.0.0' # version of this script
5
6	from optparse import OptionParser
7	import os, gzip, struct, time
8	from ftplib import FTP #do we want a diff method than using FTP to determine Chrom Names, eg use local copy
9
10	#import md5 from hashlib; if python2.4 or less, use old md5
11	try:
12	from hashlib import md5
13	except ImportError:
14	from md5 import new as md5
15
16	#import BitSet from bx-python, try using eggs and package resources, fall back to any local installation
17	try:
18	from galaxy import eggs
19	import pkg_resources
20	pkg_resources.require( "bx-python" )
21	except: pass #Maybe there is a local installation available
22	from bx.bitset import BitSet
23
24	#Define constants
25	STRUCT_FMT = '<I'
26	STRUCT_SIZE = struct.calcsize( STRUCT_FMT )
27	DEFAULT_BITSET_SIZE = 300000000
28	CHUNK_SIZE = 1024
29
30	#Headers used to parse .sql files to determine column indexes for chromosome name, start and end
31	alias_spec = {
32	'chromCol' : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name', 'tName' ],
33	'startCol' : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)', 'tStart', 'genoStart' ],
34	'endCol' : [ 'end' , 'END' , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)', 'tEnd', 'genoEnd' ],
35	}
36
37	#Headers used to parse trackDb.txt.gz
38	#TODO: these should be parsed directly from trackDb.sql
39	trackDb_headers = ["tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings"]
40
41	def get_columns( filename ):
42	input_sql = open( filename ).read()
43	input_sql = input_sql.split( 'CREATE TABLE ' )[1].split( ';' )[0]
44	input_sql = input_sql.split( ' (', 1 )
45	table_name = input_sql[0].strip().strip( '`' )
46	input_sql = [ split.strip().split( ' ' )[0].strip().strip( '`' ) for split in input_sql[1].rsplit( ')', 1 )[0].strip().split( '\n' ) ]
47	print input_sql
48	chrom_col = None
49	start_col = None
50	end_col = None
51	for col_name in alias_spec['chromCol']:
52	for i, header_name in enumerate( input_sql ):
53	if col_name == header_name:
54	chrom_col = i
55	break
56	if chrom_col is not None:
57	break
58
59	for col_name in alias_spec['startCol']:
60	for i, header_name in enumerate( input_sql ):
61	if col_name == header_name:
62	start_col = i
63	break
64	if start_col is not None:
65	break
66
67	for col_name in alias_spec['endCol']:
68	for i, header_name in enumerate( input_sql ):
69	if col_name == header_name:
70	end_col = i
71	break
72	if end_col is not None:
73	break
74
75	return table_name, chrom_col, start_col, end_col
76
77
78	def create_grouping_xml( input_dir, output_dir, dbkey ):
79	output_filename = os.path.join( output_dir, '%s_tables.xml' % dbkey )
80	def load_groups( file_name = 'grp.txt.gz' ):
81	groups = {}
82	for line in gzip.open( os.path.join( input_dir, file_name ) ):
83	fields = line.split( '\t' )
84	groups[fields[0]] = { 'desc': fields[1], 'priority': fields[2] }
85	return groups
86	f = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) )
87	out = open( output_filename, 'wb' )
88	tables = {}
89	cur_buf = ''
90	while True:
91	line = f.readline()
92	if not line: break
93	#remove new lines
94	line = line.rstrip( '\n\r' )
95	line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space
96	cur_buf += "%s\n" % line.rstrip( '\\' )
97	if line.endswith( '\\' ):
98	continue #line is wrapped, next line
99	#all fields should be loaded now...
100	fields = cur_buf.split( '\t' )
101	cur_buf = '' #reset buffer
102	assert len( fields ) == len( trackDb_headers ), 'Failed Parsing trackDb.txt.gz; fields: %s' % fields
103	table_name = fields[ 0 ]
104	tables[ table_name ] = {}
105	for field_name, field_value in zip( trackDb_headers, fields ):
106	tables[ table_name ][ field_name ] = field_value
107	#split settings fields into dict
108	fields = fields[-1].split( '\n' )
109	tables[ table_name ][ 'settings' ] = {}
110	for field in fields:
111	setting_fields = field.split( ' ', 1 )
112	setting_name = setting_value = setting_fields[ 0 ]
113	if len( setting_fields ) > 1:
114	setting_value = setting_fields[ 1 ]
115	if setting_name or setting_value:
116	tables[ table_name ][ 'settings' ][ setting_name ] = setting_value
117	#Load Groups
118	groups = load_groups()
119	in_groups = {}
120	for table_name, values in tables.iteritems():
121	if os.path.exists( os.path.join( output_dir, table_name ) ):
122	group = values['grp']
123	if group not in in_groups:
124	in_groups[group]={}
125	#*NAME CHANGE*, 'subTrack' no longer exists as a setting...use 'parent' instead
126	#subTrack = values.get('settings', {} ).get( 'subTrack', table_name )
127	subTrack = values.get('settings', {} ).get( 'parent', table_name ).split( ' ' )[0] #need to split, because could be e.g. 'trackgroup on'
128	if subTrack not in in_groups[group]:
129	in_groups[group][subTrack]=[]
130	in_groups[group][subTrack].append( table_name )
131
132	assigned_tables = []
133	out.write( """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % ( dbkey ) )
134	out.write( " <options>\n" )
135	for group, subTracks in sorted( in_groups.iteritems() ):
136	out.write( """ <option name="%s" value="group-%s">\n""" % ( groups[group]['desc'], group ) )
137	for sub_name, sub_tracks in subTracks.iteritems():
138	if len( sub_tracks ) > 1:
139	out.write( """ <option name="%s" value="subtracks-%s">\n""" % ( sub_name, sub_name ) )
140	sub_tracks.sort()
141	for track in sub_tracks:
142	track_label = track
143	if "$" not in tables[track]['shortLabel']:
144	track_label = tables[track]['shortLabel']
145	out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
146	assigned_tables.append( track )
147	out.write( " </option>\n" )
148	else:
149	track = sub_tracks[0]
150	track_label = track
151	if "$" not in tables[track]['shortLabel']:
152	track_label = tables[track]['shortLabel']
153	out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
154	assigned_tables.append( track )
155	out.write( " </option>\n" )
156	unassigned_tables = list( sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ) )
157	if unassigned_tables:
158	out.write( """ <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" )
159	for table_name in unassigned_tables:
160	out.write( """ <option name="%s" value="%s"/>\n""" % ( table_name, table_name ) )
161	out.write( " </option>\n" )
162	out.write( " </options>\n" )
163	out.write( """</filter>\n""" )
164	out.close()
165
166	def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ):
167	#generate hash for profiled table directories
168	#sort directories off output root (files in output root not hashed, including the profiler_info.txt file)
169	#sort files in each directory and hash file contents
170	profiled_hash = md5()
171	for table_dir in sorted( [ table_dir for table_dir in os.listdir( output_dir ) if os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ):
172	for filename in sorted( os.listdir( os.path.join( output_dir, table_dir ) ) ):
173	f = open( os.path.join( output_dir, table_dir, filename ), 'rb' )
174	while True:
175	hash_chunk = f.read( CHUNK_SIZE )
176	if not hash_chunk:
177	break
178	profiled_hash.update( hash_chunk )
179	profiled_hash = profiled_hash.hexdigest()
180
181	#generate hash for input dir
182	#sort directories off input root
183	#sort files in each directory and hash file contents
184	database_hash = md5()
185	for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ):
186	for filename in sorted( filenames ):
187	f = open( os.path.join( input_dir, dirpath, filename ), 'rb' )
188	while True:
189	hash_chunk = f.read( CHUNK_SIZE )
190	if not hash_chunk:
191	break
192	database_hash.update( hash_chunk )
193	database_hash = database_hash.hexdigest()
194
195	#write out info file
196	out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' )
197	out.write( 'dbkey\t%s\n' % ( dbkey ) )
198	out.write( 'chromosomes\t%s\n' % ( ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in chrom_lengths.iteritems() ] ) ) )
199	out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) )
200	for line in open( os.path.join( input_dir, 'trackDb.sql' ) ):
201	line = line.strip()
202	if line.startswith( '-- Dump completed on ' ):
203	line = line[ len( '-- Dump completed on ' ): ]
204	out.write( 'dump_time\t%s\n' % ( line ) )
205	break
206	out.write( 'dump_hash\t%s\n' % ( database_hash ) )
207	out.write( 'profiler_time\t%s\n' % ( time.time() ) )
208	out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) )
209	out.write( 'profiler_version\t%s\n' % ( VERSION ) )
210	out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) )
211	out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) )
212	out.close()
213
214	def __main__():
215	usage = "usage: %prog options"
216	parser = OptionParser( usage=usage )
217	parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
218	parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' )
219	parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' )
220	parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
221	parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
222	parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' )
223	parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' )
224
225	( options, args ) = parser.parse_args()
226
227	input_dir = options.input_dir
228	if '%' in input_dir:
229	input_dir = input_dir % options.dbkey
230	assert os.path.exists( input_dir ), 'Input directory does not exist'
231	output_dir = options.output_dir
232	if '%' in output_dir:
233	output_dir = output_dir % options.dbkey
234	assert not os.path.exists( output_dir ), 'Output directory already exists'
235	os.makedirs( output_dir )
236	ftp_path = options.ftp_path
237	if '%' in ftp_path:
238	ftp_path = ftp_path % options.dbkey
239
240	#Get chromosome names and lengths
241	chrom_lengths = {}
242	if options.chromosomes:
243	for chrom in options.chromosomes.split( ',' ):
244	fields = chrom.split( '=' )
245	chrom = fields[0]
246	if len( fields ) > 1:
247	chrom_len = int( fields[1] )
248	else:
249	chrom_len = options.bitset_size
250	chrom_lengths[ chrom ] = chrom_len
251	chroms = chrom_lengths.keys()
252	print 'Chrom info taken from command line option.'
253	else:
254	try:
255	for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ):
256	fields = line.strip().split( '\t' )
257	chrom_lengths[ fields[0] ] = int( fields[ 1 ] )
258	chroms = chrom_lengths.keys()
259	print 'Chrom info taken from chromInfo.txt.gz.'
260	except Exception, e:
261	print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.'
262	chrom_lengths = {} #zero out chrom_lengths
263	chroms = []
264	ftp = FTP( options.ftp_site )
265	ftp.login()
266	for name in ftp.nlst( ftp_path ):
267	if name.endswith( '.fa.gz' ):
268	chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] )
269	ftp.close()
270	for chrom in chroms:
271	chrom_lengths[ chrom ] = options.bitset_size
272	#sort chroms by length of name, decending; necessary for when table names start with chrom name
273	chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) )
274
275	#parse tables from local files
276	#loop through directory contents, if file ends in '.sql', process table
277	for filename in os.listdir( input_dir ):
278	if filename.endswith ( '.sql' ):
279	base_filename = filename[ 0:-len( '.sql' ) ]
280	table_out_dir = os.path.join( output_dir, base_filename )
281	#some tables are chromosome specific, lets strip off the chrom name
282	for chrom in chroms:
283	if base_filename.startswith( "%s_" % chrom ):
284	#found chromosome
285	table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] )
286	break
287	#create table dir
288	if not os.path.exists( table_out_dir ):
289	os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables
290	print "Created table dir (%s)." % table_out_dir
291	else:
292	print "Table dir (%s) already exists." % table_out_dir
293	#find column assignments
294	table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) )
295	if chrom_col is None or start_col is None or end_col is None:
296	print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) )
297	if not os.listdir( table_out_dir ):
298	print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir )
299	os.rmdir( table_out_dir )
300	continue
301	#build bitsets from table
302	bitset_dict = {}
303	for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ):
304	fields = line.strip().split( '\t' )
305	chrom = fields[ chrom_col ]
306	start = int( fields[ start_col ] )
307	end = int( fields[ end_col ] )
308	if chrom not in bitset_dict:
309	bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) )
310	bitset_dict[ chrom ].set_range( start, end - start )
311	#write bitsets as profiled annotations
312	for chrom_name, chrom_bits in bitset_dict.iteritems():
313	out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' )
314	end = 0
315	total_regions = 0
316	total_coverage = 0
317	max_size = chrom_lengths.get( chrom_name, options.bitset_size )
318	while True:
319	start = chrom_bits.next_set( end )
320	if start >= max_size:
321	break
322	end = chrom_bits.next_clear( start )
323	out.write( struct.pack( STRUCT_FMT, start ) )
324	out.write( struct.pack( STRUCT_FMT, end ) )
325	total_regions += 1
326	total_coverage += end - start
327	if end >= max_size:
328	break
329	out.close()
330	open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) )
331	open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) )
332
333	#create xml
334	create_grouping_xml( input_dir, output_dir, options.dbkey )
335	#create database dump info file, for database version control
336	write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size )
337
338	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/scripts/tools/annotation_profiler/build_profile_indexes.py @ 2

異なるフォーマットでダウンロード: