#!/usr/bin/env python
#Dan Blankenberg

VERSION = '1.0.0' # version of this script

from optparse import OptionParser
import os, gzip, struct, time
from ftplib import FTP #do we want a different method than using FTP to determine chrom names, e.g. use a local copy?

#import md5 from hashlib; if python2.4 or less, use old md5
try:
    from hashlib import md5
except ImportError:
    from md5 import new as md5

#import BitSet from bx-python, try using eggs and package resources, fall back to any local installation
try:
    from galaxy import eggs
    import pkg_resources
    pkg_resources.require( "bx-python" )
except: pass #Maybe there is a local installation available
from bx.bitset import BitSet

#Define constants
STRUCT_FMT = '<I'
STRUCT_SIZE = struct.calcsize( STRUCT_FMT )
DEFAULT_BITSET_SIZE = 300000000
CHUNK_SIZE = 1024
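#NOTE: STRUCT_FMT packs each region boundary as a little-endian unsigned 32-bit int;
#each '<chrom>.covered' file written below is a sequence of packed (start, end) pairs.
#CHUNK_SIZE is the read size used when md5-hashing the input dump and the profiled output.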

#Headers used to parse .sql files to determine column indexes for chromosome name, start and end
alias_spec = {
    'chromCol' : [ 'chrom', 'CHROMOSOME', 'CHROM', 'Chromosome Name', 'tName' ],
    'startCol' : [ 'start', 'START', 'chromStart', 'txStart', 'Start Position (bp)', 'tStart', 'genoStart' ],
    'endCol'   : [ 'end', 'END', 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)', 'tEnd', 'genoEnd' ],
}

#Headers used to parse trackDb.txt.gz
#TODO: these should be parsed directly from trackDb.sql
trackDb_headers = ["tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings"]

def get_columns( filename ):
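    """Parse a UCSC table .sql dump and return ( table_name, chrom_col, start_col, end_col ).
    Column indexes are located by matching column names against alias_spec;
    any index that cannot be determined is returned as None."""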
    input_sql = open( filename ).read()
    input_sql = input_sql.split( 'CREATE TABLE ' )[1].split( ';' )[0]
    input_sql = input_sql.split( ' (', 1 )
    table_name = input_sql[0].strip().strip( '`' )
    input_sql = [ split.strip().split( ' ' )[0].strip().strip( '`' ) for split in input_sql[1].rsplit( ')', 1 )[0].strip().split( '\n' ) ]
    print input_sql #debug output: the column names parsed from the CREATE TABLE statement
    chrom_col = None
    start_col = None
    end_col = None
    for col_name in alias_spec['chromCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                chrom_col = i
                break
        if chrom_col is not None:
            break

    for col_name in alias_spec['startCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                start_col = i
                break
        if start_col is not None:
            break

    for col_name in alias_spec['endCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                end_col = i
                break
        if end_col is not None:
            break

    return table_name, chrom_col, start_col, end_col


def create_grouping_xml( input_dir, output_dir, dbkey ):
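    """Write <dbkey>_tables.xml for the profiled tables: a Galaxy <filter>/<options>
    block that groups each profiled table directory by its trackDb group and parent
    track; tables not found in trackDb.txt.gz are listed under 'Uncategorized Tables'."""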
    output_filename = os.path.join( output_dir, '%s_tables.xml' % dbkey )
    def load_groups( file_name = 'grp.txt.gz' ):
        groups = {}
        for line in gzip.open( os.path.join( input_dir, file_name ) ):
            fields = line.split( '\t' )
            groups[fields[0]] = { 'desc': fields[1], 'priority': fields[2] }
        return groups
    f = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) )
    out = open( output_filename, 'wb' )
    tables = {}
    cur_buf = ''
    while True:
        line = f.readline()
        if not line: break
        #remove new lines
        line = line.rstrip( '\n\r' )
        line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space
        cur_buf += "%s\n" % line.rstrip( '\\' )
        if line.endswith( '\\' ):
            continue #line is wrapped, next line
        #all fields should be loaded now...
        fields = cur_buf.split( '\t' )
        cur_buf = '' #reset buffer
        assert len( fields ) == len( trackDb_headers ), 'Failed Parsing trackDb.txt.gz; fields: %s' % fields
        table_name = fields[ 0 ]
        tables[ table_name ] = {}
        for field_name, field_value in zip( trackDb_headers, fields ):
            tables[ table_name ][ field_name ] = field_value
        #split settings fields into dict
        fields = fields[-1].split( '\n' )
        tables[ table_name ][ 'settings' ] = {}
        for field in fields:
            setting_fields = field.split( ' ', 1 )
            setting_name = setting_value = setting_fields[ 0 ]
            if len( setting_fields ) > 1:
                setting_value = setting_fields[ 1 ]
            if setting_name or setting_value:
                tables[ table_name ][ 'settings' ][ setting_name ] = setting_value
    #Load Groups
    groups = load_groups()
    in_groups = {}
    for table_name, values in tables.iteritems():
        if os.path.exists( os.path.join( output_dir, table_name ) ):
            group = values['grp']
            if group not in in_groups:
                in_groups[group] = {}
            #***NAME CHANGE***, 'subTrack' no longer exists as a setting...use 'parent' instead
            #subTrack = values.get('settings', {} ).get( 'subTrack', table_name )
            subTrack = values.get('settings', {} ).get( 'parent', table_name ).split( ' ' )[0] #need to split, because could be e.g. 'trackgroup on'
            if subTrack not in in_groups[group]:
                in_groups[group][subTrack] = []
            in_groups[group][subTrack].append( table_name )

    assigned_tables = []
    out.write( """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % ( dbkey ) )
    out.write( "  <options>\n" )
    for group, subTracks in sorted( in_groups.iteritems() ):
        out.write( """    <option name="%s" value="group-%s">\n""" % ( groups[group]['desc'], group ) )
        for sub_name, sub_tracks in subTracks.iteritems():
            if len( sub_tracks ) > 1:
                out.write( """      <option name="%s" value="subtracks-%s">\n""" % ( sub_name, sub_name ) )
                sub_tracks.sort()
                for track in sub_tracks:
                    track_label = track
                    if "$" not in tables[track]['shortLabel']:
                        track_label = tables[track]['shortLabel']
                    out.write( """        <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
                    assigned_tables.append( track )
                out.write( "      </option>\n" )
            else:
                track = sub_tracks[0]
                track_label = track
                if "$" not in tables[track]['shortLabel']:
                    track_label = tables[track]['shortLabel']
                out.write( """      <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
                assigned_tables.append( track )
        out.write( "    </option>\n" )
    unassigned_tables = list( sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ) )
    if unassigned_tables:
        out.write( """    <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" )
        for table_name in unassigned_tables:
            out.write( """      <option name="%s" value="%s"/>\n""" % ( table_name, table_name ) )
        out.write( "    </option>\n" )
    out.write( "  </options>\n" )
    out.write( """</filter>\n""" )
    out.close()

def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ):
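    """Write profiler_info.txt in the output directory, recording the dbkey, chromosome
    lengths, default bitset size, the UCSC dump timestamp (taken from trackDb.sql), md5
    hashes of the input dump and of the profiled output, and the profiler version and
    struct format used for the binary .covered files."""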
    #generate hash for profiled table directories
    #sort directories off output root (files in output root not hashed, including the profiler_info.txt file)
    #sort files in each directory and hash file contents
    profiled_hash = md5()
    for table_dir in sorted( [ table_dir for table_dir in os.listdir( output_dir ) if os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ):
        for filename in sorted( os.listdir( os.path.join( output_dir, table_dir ) ) ):
            f = open( os.path.join( output_dir, table_dir, filename ), 'rb' )
            while True:
                hash_chunk = f.read( CHUNK_SIZE )
                if not hash_chunk:
                    break
                profiled_hash.update( hash_chunk )
    profiled_hash = profiled_hash.hexdigest()

    #generate hash for input dir
    #sort directories off input root
    #sort files in each directory and hash file contents
    database_hash = md5()
    for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ):
        for filename in sorted( filenames ):
            f = open( os.path.join( dirpath, filename ), 'rb' ) #dirpath from os.walk() already includes input_dir
            while True:
                hash_chunk = f.read( CHUNK_SIZE )
                if not hash_chunk:
                    break
                database_hash.update( hash_chunk )
    database_hash = database_hash.hexdigest()

    #write out info file
    out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' )
    out.write( 'dbkey\t%s\n' % ( dbkey ) )
    out.write( 'chromosomes\t%s\n' % ( ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in chrom_lengths.iteritems() ] ) ) )
    out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) )
    for line in open( os.path.join( input_dir, 'trackDb.sql' ) ):
        line = line.strip()
        if line.startswith( '-- Dump completed on ' ):
            line = line[ len( '-- Dump completed on ' ): ]
            out.write( 'dump_time\t%s\n' % ( line ) )
            break
    out.write( 'dump_hash\t%s\n' % ( database_hash ) )
    out.write( 'profiler_time\t%s\n' % ( time.time() ) )
    out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) )
    out.write( 'profiler_version\t%s\n' % ( VERSION ) )
    out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) )
    out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) )
    out.close()

def __main__():
    usage = "usage: %prog options"
    parser = OptionParser( usage=usage )
    parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
    parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path', '%s', 'database' ), help='Input Directory' )
    parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations', '%s' ), help='Output Directory' )
    parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
    parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
    parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' )
    parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' )
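    #Example invocation (the script filename here is illustrative; paths follow the defaults above):
    #  python build_profile_indexes.py --dbkey hg18 --input_dir golden_path/hg18/database --output_dir profiled_annotations/hg18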

    ( options, args ) = parser.parse_args()

    input_dir = options.input_dir
    if '%' in input_dir:
        input_dir = input_dir % options.dbkey
    assert os.path.exists( input_dir ), 'Input directory does not exist'
    output_dir = options.output_dir
    if '%' in output_dir:
        output_dir = output_dir % options.dbkey
    assert not os.path.exists( output_dir ), 'Output directory already exists'
    os.makedirs( output_dir )
    ftp_path = options.ftp_path
    if '%' in ftp_path:
        ftp_path = ftp_path % options.dbkey

    #Get chromosome names and lengths
    chrom_lengths = {}
    if options.chromosomes:
        for chrom in options.chromosomes.split( ',' ):
            fields = chrom.split( '=' )
            chrom = fields[0]
            if len( fields ) > 1:
                chrom_len = int( fields[1] )
            else:
                chrom_len = options.bitset_size
            chrom_lengths[ chrom ] = chrom_len
        chroms = chrom_lengths.keys()
        print 'Chrom info taken from command line option.'
    else:
        try:
            for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ):
                fields = line.strip().split( '\t' )
                chrom_lengths[ fields[0] ] = int( fields[ 1 ] )
            chroms = chrom_lengths.keys()
            print 'Chrom info taken from chromInfo.txt.gz.'
        except Exception, e:
            print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.'
            chrom_lengths = {} #zero out chrom_lengths
            chroms = []
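            #fall back to listing the per-chromosome FASTA files on the UCSC FTP server;
            #no lengths are available this way, so the default bitset size is used for each chrom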
            ftp = FTP( options.ftp_site )
            ftp.login()
            for name in ftp.nlst( ftp_path ):
                if name.endswith( '.fa.gz' ):
                    chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] )
            ftp.close()
            for chrom in chroms:
                chrom_lengths[ chrom ] = options.bitset_size
    #sort chroms by length of name, descending; necessary for when table names start with the chrom name
    chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) )

    #parse tables from local files
    #loop through directory contents, if file ends in '.sql', process table
    for filename in os.listdir( input_dir ):
        if filename.endswith( '.sql' ):
            base_filename = filename[ 0:-len( '.sql' ) ]
            table_out_dir = os.path.join( output_dir, base_filename )
            #some tables are chromosome specific, let's strip off the chrom name
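            #e.g. chr1_rmsk and chr2_rmsk would both be profiled into a single 'rmsk' output directory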
            for chrom in chroms:
                if base_filename.startswith( "%s_" % chrom ):
                    #found chromosome
                    table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] )
                    break
            #create table dir
            if not os.path.exists( table_out_dir ):
                os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables
                print "Created table dir (%s)." % table_out_dir
            else:
                print "Table dir (%s) already exists." % table_out_dir
            #find column assignments
            table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) )
            if chrom_col is None or start_col is None or end_col is None:
                print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) )
                if not os.listdir( table_out_dir ):
                    print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir )
                    os.rmdir( table_out_dir )
                continue
            #build bitsets from table
            bitset_dict = {}
            for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ):
                fields = line.strip().split( '\t' )
                chrom = fields[ chrom_col ]
                start = int( fields[ start_col ] )
                end = int( fields[ end_col ] )
                if chrom not in bitset_dict:
                    bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) )
                bitset_dict[ chrom ].set_range( start, end - start )
            #write bitsets as profiled annotations
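            #walk each chromosome's bitset: next_set() returns the start of the next covered
            #region and next_clear() its end, so each iteration below writes one merged region
            #as a pair of packed little-endian uint32 values (see STRUCT_FMT)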
            for chrom_name, chrom_bits in bitset_dict.iteritems():
                out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' )
                end = 0
                total_regions = 0
                total_coverage = 0
                max_size = chrom_lengths.get( chrom_name, options.bitset_size )
                while True:
                    start = chrom_bits.next_set( end )
                    if start >= max_size:
                        break
                    end = chrom_bits.next_clear( start )
                    out.write( struct.pack( STRUCT_FMT, start ) )
                    out.write( struct.pack( STRUCT_FMT, end ) )
                    total_regions += 1
                    total_coverage += end - start
                    if end >= max_size:
                        break
                out.close()
                open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) )
                open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) )

    #create xml
    create_grouping_xml( input_dir, output_dir, options.dbkey )
    #create database dump info file, for database version control
    write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size )

if __name__ == "__main__": __main__()