Context Navigation

column_join.py @ 2

リビジョン 2, 11.9 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	"""
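4	This tool joins two or more tab-delimited text files on their leading "hinge" columns. Each input is first sorted by its hinge, then the selected columns of every file are written side by side on one output line per distinct hinge value; files that lack a given hinge (or a given column) contribute empty or user-specified fill values instead.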
5
6	usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
7	-o, output=0: the output pileup
8	-1, input1=1: the pileup file to start with
9	-2, input2=2: the second pileup file to join
10	-g, hinge=h: the columns to be used for matching
11	-c, columns=c: the columns that should appear in the output
12	-f, fill_options_file=f: the file specifying the fill value to use
13	other_inputs: the other input files to join
14	"""
15
16	import optparse, os, re, struct, sys, tempfile
17
# Optional Galaxy/simplejson dependencies, needed only to read a fill options
# file.  When any of them is missing, simplejson is set to None and the
# original error is kept in simplejson_exception so it can be reported later.
try:
    # Same name as the one assigned in the except clause, so it is defined on
    # both the success and the failure path.
    simplejson_exception = None
    # Imported before pkg_resources.require(); presumably for its import-time
    # side effects -- TODO confirm against galaxy.eggs.
    from galaxy import eggs
    from galaxy.util.bunch import Bunch
    from galaxy.util import stringify_dictionary_keys
    import pkg_resources
    pkg_resources.require( "simplejson" )
    import simplejson
except Exception:
    # sys.exc_info() instead of "except Exception, e" / "as e" keeps this
    # clause valid syntax on every Python version.
    simplejson_exception = sys.exc_info()[1]
    simplejson = None
29
def stop_err( msg ):
    """
    Writes msg to standard error and terminates the program.

    The process exits with status 1 so that callers which inspect the exit
    code (and not only standard error) also see the failure.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
33
def split_nums( text ):
    """
    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]

    Runs of digits are returned as integers, every other run of characters is
    returned unchanged as a string, and the pieces keep their original order.
    An empty string gives an empty list.
    """
    split_t = []
    # Each match is either a maximal run of digits (group 1) or a maximal run
    # of non-digits (group 2), so the pieces alternate exactly as in the text.
    for match in re.finditer( r'(\d+)|(\D+)', text ):
        digits = match.group( 1 )
        if digits is not None:
            split_t.append( int( digits ) )
        else:
            split_t.append( match.group( 2 ) )
    return split_t
58
def hinge_compare( hinge1, hinge2 ):
    """
    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
    first part handled as text but last part as number

    Both hinges are tab-separated; they are compared column by column, and each
    column is compared piece by piece after splitting it with split_nums().

    Returns 1 if hinge1 sorts after hinge2, -1 if it sorts before, 0 if equal.

    Ordering rules:
      * a hinge whose columns are all empty sorts before any other hinge
      * numeric pieces compare as integers, text pieces as strings
      * numbers are less than letters
      * when one column (or hinge) is a prefix of the other, the shorter one
        sorts first
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    blank1 = not ''.join( split_hinge1 )
    blank2 = not ''.join( split_hinge2 )
    if blank1 and blank2:
        return 0
    if blank1:
        return -1
    if blank2:
        return 1
    # go through all parts of the hinges and compare
    for i, sh1 in enumerate( split_hinge1 ):
        # second hinge has fewer columns and all shared ones matched, so the
        # first is considered larger (indexing here used to raise IndexError)
        if i >= len( split_hinge2 ):
            return 1
        sh2 = split_hinge2[ i ]
        # if these hinge segments are the same, just move on to the next ones
        if sh1 == sh2:
            continue
        # check all parts of each hinge
        h1 = split_nums( sh1 )
        h2 = split_nums( sh2 )
        for j, h in enumerate( h1 ):
            # if second hinge has no more parts, first is considered larger;
            # this also covers an empty second segment (j == 0)
            if j >= len( h2 ):
                return 1
            other = h2[ j ]
            # if these two parts are the same, move on to next
            if h == other:
                continue
            # split_nums() yields only strings and integers, so anything that
            # is not a string is a number (including Python 2 long values)
            h_is_text = isinstance( h, str )
            other_is_text = isinstance( other, str )
            if h_is_text != other_is_text:
                # numbers are less than letters
                if h_is_text:
                    return 1
                return -1
            # same kind and not equal: plain comparison decides
            if h > other:
                return 1
            return -1
        # every part of the first segment matched; if the second segment has
        # parts left over, the first is a prefix of it and therefore smaller
        if len( h1 ) < len( h2 ):
            return -1
    # first hinge ran out of columns before the second one did
    if len( split_hinge1 ) < len( split_hinge2 ):
        return -1
    # if all else has failed (e.g. '007' vs '7'), just do basic string comparison
    if hinge1 > hinge2:
        return 1
    elif hinge1 == hinge2:
        return 0
    return -1
116
def hinge_sort( infile, outfile, hinge ):
    """
    Given input file name, sorts logically (text vs. numeric) into provided output file name.

    infile:  name of the tab-delimited file to sort
    outfile: name of the file that receives the sorted lines
    hinge:   number of leading columns that make up the sort key

    Only the byte offset of each line is kept in memory; the lines themselves
    are re-read from infile while writing.  Lines are ordered by hinge_compare()
    on their hinge, and lines sharing a hinge keep their original order.  Blank
    lines are left out of the output, and every line written ends in a newline.
    """
    hinge_locs = {}
    fin = open( infile, 'rb' )
    try:
        # first pass: record where every non-blank line starts, grouped by hinge
        offset = 0
        line = fin.readline()
        while line:
            # skip blank lines instead of treating the first one as end of file
            if line.strip():
                # the line terminator is not part of the hinge, even when the
                # line has no more columns than the hinge itself
                hinge_parts = line.rstrip( '\r\n' ).split( '\t' )[ :hinge ]
                hinge_locs.setdefault( '\t'.join( hinge_parts ), [] ).append( offset )
            offset += len( line )
            line = fin.readline()
        hinge_locs_sorted = list( hinge_locs.keys() )
        hinge_locs_sorted.sort( hinge_compare )
        # second pass: copy the lines across in sorted hinge order
        fout = open( outfile, 'wb' )
        try:
            for hinge_loc in hinge_locs_sorted:
                for loc in hinge_locs[ hinge_loc ]:
                    fin.seek( loc )
                    out_line = fin.readline()
                    # a final line without a line terminator must not be
                    # glued to whichever line happens to follow it now
                    if not out_line.endswith( '\n' ):
                        out_line += '\n'
                    fout.write( out_line )
        finally:
            fout.close()
    finally:
        fin.close()
145
def _load_file1_fill_columns( fill_options_file ):
    """
    Returns the 'file1_columns' entry of the JSON fill options file.

    Returns None when no file was given (None or the string 'None'), when the
    entry is absent, or when the file cannot be loaded; in the last case a
    warning is written to standard output and the fill options are ignored.
    """
    if fill_options_file is None or fill_options_file == 'None':
        return None
    try:
        if simplejson is None:
            raise simplejson_exception
        fill_file = open( fill_options_file )
        try:
            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( fill_file ) ) )
        finally:
            fill_file.close()
        return getattr( fill_options, 'file1_columns', None )
    except Exception:
        sys.stdout.write( 'Warning: Ignoring fill options due to simplejson error (%s).\n' % ( sys.exc_info()[1], ) )
        return None

def _filler( fill_empty, cols, num_files ):
    """
    Returns the placeholder values for num_files files' worth of output
    columns: empty strings, or the per-column fill values when fill_empty is
    set.  A num_files of zero or less gives an empty list.
    """
    if not fill_empty:
        return [ '' ] * ( num_files * len( cols ) )
    return [ fill_empty[ col ] for col in cols ] * num_files

def _merge_sorted( sorted_files, fout, hinge, cols, fill_empty, delimiter ):
    """
    Merges the hinge-sorted files into fout, one output line per distinct hinge.

    sorted_files: open files already sorted by hinge_sort(), in input order
    fout:         open output file
    hinge:        number of leading columns that make up the hinge
    cols:         1-based numbers of the columns to output from every file
    fill_empty:   dict mapping column number to fill value, or None
    delimiter:    column separator

    Each pass of the loop consumes one line from the file holding the smallest
    current hinge and appends its selected columns to the current output line,
    padding for files that have no line with that hinge.
    """
    num_files = len( sorted_files )
    old_current = ''
    first_line = True
    current_lines = [ f.readline().rstrip( '\r\n' ) for f in sorted_files ]
    last_lines = ''.join( current_lines )
    last_loc = -1
    while last_lines:
        # get the "minimum" hinge, which should come first, and the file location in list
        hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
        # when several files share a hinge, the earliest file is used first
        hinge_dict = {}
        for i, file_hinge in enumerate( hinges ):
            if file_hinge not in hinge_dict:
                hinge_dict[ file_hinge ] = i
        hinges.sort( hinge_compare )
        # exhausted files have an empty current line and therefore an empty hinge
        hinges = [ h for h in hinges if h ]
        # NOTE(review): a remaining line whose hinge is the empty string is
        # filtered out above as well; if only such lines remain this raises
        # IndexError -- confirm how such input should be treated
        current, loc = hinges[0], hinge_dict[ hinges[0] ]
        # first output empty columns for vertical alignment (account for "missing" files)
        # write output for leading and trailing empty columns
        # columns missing from actual file handled further below
        current_data = []
        if current != old_current:
            # a new hinge starts a new output line
            if not first_line:
                # fill trailing empty columns of the previous line with appropriate fill value
                if last_loc < num_files - 1:
                    filler = _filler( fill_empty, cols, num_files - last_loc - 1 )
                    fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
                # insert line break before current line
                fout.write( '\n' )
            # fill leading empty columns with appropriate fill value
            if loc > 0:
                current_data = _filler( fill_empty, cols, loc )
        else:
            # same hinge as before: pad for the files skipped since the last one written
            if loc - last_loc > 1:
                current_data = _filler( fill_empty, cols, loc - last_loc - 1 )
        # now output actual data
        split_line = current_lines[ loc ].split( delimiter )
        # fill empties within actual line if appropriate
        if fill_empty:
            filled_line = []
            for i, item in enumerate( split_line ):
                if not item:
                    # only selected columns have a fill value; others stay empty
                    item = fill_empty.get( i + 1, item )
                filled_line.append( item )
            split_line = filled_line
        # add actual data to be output below
        if ''.join( split_line ):
            for col in cols:
                if col > hinge:
                    # if this column doesn't exist, add the appropriate filler or empty column
                    try:
                        new_item = split_line[ col - 1 ]
                    except IndexError:
                        if fill_empty:
                            new_item = fill_empty[ col ]
                        else:
                            new_item = ''
                    current_data.append( new_item )
            # grab next line for selected file
            current_lines[ loc ] = sorted_files[ loc ].readline().rstrip( '\r\n' )
            # write relevant data to file
            if current == old_current and current_data:
                fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
            elif current_data:
                fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
            last_lines = ''.join( current_lines )
        else:
            last_lines = None
        last_loc = loc
        old_current = current
        first_line = False
    # fill trailing empty columns for final line
    if last_loc < num_files - 1:
        filler = _filler( fill_empty, cols, num_files - last_loc - 1 )
        fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) )
    fout.write( '\n' )

def __main__():
    """
    Command line entry point: joins the input files on their hinge columns.

    Parses the options described in the module docstring, sorts every input
    into a temporary file with hinge_sort(), merges the sorted files into the
    output file, and removes the temporary files again, also when an error
    occurs.  Exits with status 1 if no output column beyond the hinge is
    selected.
    """
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    hinge = int( options.hinge )
    # hinge columns are written once per line, so only columns after them are kept
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    # any positional arguments are further input files
    inputs = [ options.input1, options.input2 ]
    inputs.extend( args )
    file1_columns = _load_file1_fill_columns( options.fill_options_file )
    if file1_columns:
        # fill values are listed per column of the file, starting at column 1
        fill_empty = {}
        for col in cols:
            fill_empty[ col ] = file1_columns[ col - 1 ]
    else:
        fill_empty = None
    if not cols:
        # explicit check rather than assert, which is stripped under -O
        sys.stderr.write( 'You need to select at least one column in addition to the hinge\n' )
        sys.exit( 1 )
    delimiter = '\t'
    tmp_file_names = []
    tmp_input_files = []
    fout = None
    try:
        # make sure all files are sorted in same way, ascending
        for in_file in inputs:
            # mkstemp() creates the file itself, so the name cannot be taken
            # by someone else between choosing it and writing to it
            tmp_fd, tmp_file_name = tempfile.mkstemp()
            os.close( tmp_fd )
            tmp_file_names.append( tmp_file_name )
            hinge_sort( in_file, tmp_file_name, hinge )
            tmp_input_files.append( open( tmp_file_name, 'rb' ) )
        # cycle through files, getting smallest line of all files one at a time
        # also have to keep track of vertical position of extra columns
        fout = open( options.output, 'w' )
        _merge_sorted( tmp_input_files, fout, hinge, cols, fill_empty, delimiter )
    finally:
        if fout is not None:
            fout.close()
        for f in tmp_input_files:
            f.close()
        for tmp_file_name in tmp_file_names:
            if os.path.exists( tmp_file_name ):
                os.unlink( tmp_file_name )
289
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/new_operations/column_join.py @ 2

異なるフォーマットでダウンロード: