Context Navigation

dna_filtering.py

リビジョン 2, 9.3 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	"""
4	This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
5
6	usage: %prog [options]
7	-i, --input=i: tabular input file
8	-o, --output=o: filtered output file
9	-c, --cond=c: conditions to filter on
10	-n, --n_handling=n: how to handle N and X
11	-l, --columns=l: columns
12	-t, --col_types=t: column types
13
14	"""
15
16	#from __future__ import division
17	import os.path, re, string, string, sys
18	from galaxy import eggs
19	import pkg_resources; pkg_resources.require( "bx-python" )
20	from bx.cookbook import doc_optparse
21
22	# Older py compatibility
23	try:
24	set()
25	except:
26	from sets import Set as set
27
28	#assert sys.version_info[:2] >= ( 2, 4 )
29
30	def get_operands( filter_condition ):
31	# Note that the order of all_operators is important
32	items_to_strip = [ '==', '!=', ' and ', ' or ' ]
33	for item in items_to_strip:
34	if filter_condition.find( item ) >= 0:
35	filter_condition = filter_condition.replace( item, ' ' )
36	operands = set( filter_condition.split( ' ' ) )
37	return operands
38
39	def stop_err( msg ):
40	sys.stderr.write( msg )
41	sys.exit()
42
43	def __main__():
44	#Parse Command Line
45	options, args = doc_optparse.parse( __doc__ )
46	input = options.input
47	output = options.output
48	cond = options.cond
49	n_handling = options.n_handling
50	columns = options.columns
51	col_types = options.col_types
52
53	try:
54	in_columns = int( columns )
55	assert col_types #check to see that the column types variable isn't null
56	in_column_types = col_types.split( ',' )
57	except:
58	stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
59
60	# Unescape if input has been escaped
61	cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
62	orig_cond_text = cond_text
63	# Expand to allow for DNA codes
64	dot_letters = [ letter for letter in string.uppercase if letter not in \
65	[ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
66	dot_letters.append( '.' )
67	codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
68	'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
69	'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
70	'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
71	'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
72	'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
73	'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
74	'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
75	'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
76	'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
77	'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
78	'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
79	'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
80	'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
81	'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
82	'.': dot_letters,
83	'-': [ '-' ]}
84	# Add handling for N and X
85	if n_handling == "all":
86	codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
87	codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
88	for code in codes.keys():
89	if code != '.' and code != '-':
90	codes[code].append( 'N' )
91	codes[code].append( 'X' )
92	else:
93	codes[ 'N' ] = dot_letters
94	codes[ 'X' ] = dot_letters
95	codes[ '.' ].extend( [ 'N', 'X' ] )
96	# Expand conditions to allow for DNA codes
97	try:
98	match_replace = {}
99	pat = re.compile( 'c\d+\s[!=]=\s[\w\d"\'+-.]+' )
100	matches = pat.findall( cond_text )
101	for match in matches:
102	if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
103	if match.find( '==' ) >= 0:
104	match_parts = match.split( '==' )
105	elif match.find( '!=' ) >= 0:
106	match_parts = match.split( '!=' )
107	else:
108	raise Exception, "The operators '==' and '!=' were not found."
109	left = match_parts[0].strip()
110	right = match_parts[1].strip()
111	new_match = "(%s)" % ( match )
112	elif match.find( '==' ) > 0:
113	match_parts = match.split( '==' )
114	left = match_parts[0].strip()
115	right = match_parts[1].strip()
116	new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
117	elif match.find( '!=' ) > 0 :
118	match_parts = match.split( '!=' )
119	left = match_parts[0].strip()
120	right = match_parts[1].strip()
121	new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
122	else:
123	raise Exception, "The operators '==' and '!=' were not found."
124	assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
125	if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
126	test = right.replace( "'", '' ).replace( '"', '' )
127	assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
128	or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
129	'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
130	'Use the general filter tool to filter on anything else first'
131	else:
132	assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
133	match_replace[match] = new_match
134	if len( match_replace.keys() ) == 0:
135	raise Exception, 'There do not appear to be any valid conditions'
136	for match in match_replace.keys():
137	cond_text = cond_text.replace( match, match_replace[match] )
138	except Exception, e:
139	stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
140
141	# Attempt to determine if the condition includes executable stuff and, if so, exit
142	secured = dir()
143	operands = get_operands( cond_text )
144	for operand in operands:
145	try:
146	check = int( operand )
147	except:
148	if operand in secured:
149	stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
150
151	# Prepare the column variable names and wrappers for column data types
152	cols, type_casts = [], []
153	for col in range( 1, in_columns + 1 ):
154	col_name = "c%d" % col
155	cols.append( col_name )
156	col_type = in_column_types[ col - 1 ]
157	type_cast = "%s(%s)" % ( col_type, col_name )
158	type_casts.append( type_cast )
159
160	col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
161	type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
162	assign = "%s = line.split( '\\t' )" % col_str
163	wrap = "%s = %s" % ( col_str, type_cast_str )
164	skipped_lines = 0
165	first_invalid_line = 0
166	invalid_line = None
167	lines_kept = 0
168	total_lines = 0
169	out = open( output, 'wt' )
170	# Read and filter input file, skipping invalid lines
171	code = '''
172	for i, line in enumerate( file( input ) ):
173	total_lines += 1
174	line = line.rstrip( '\\r\\n' )
175	if not line or line.startswith( '#' ):
176	skipped_lines += 1
177	if not invalid_line:
178	first_invalid_line = i + 1
179	invalid_line = line
180	continue
181	try:
182	%s = line.split( '\\t' )
183	%s = %s
184	if %s:
185	lines_kept += 1
186	print >> out, line
187	except Exception, e:
188	skipped_lines += 1
189	if not invalid_line:
190	first_invalid_line = i + 1
191	invalid_line = line
192	''' % ( col_str, col_str, type_cast_str, cond_text )
193
194	valid_filter = True
195	try:
196	exec code
197	except Exception, e:
198	out.close()
199	if str( e ).startswith( 'invalid syntax' ):
200	valid_filter = False
201	stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
202	else:
203	stop_err( str( e ) )
204
205	if valid_filter:
206	out.close()
207	valid_lines = total_lines - skipped_lines
208	print 'Filtering with %s, ' % orig_cond_text
209	if valid_lines > 0:
210	print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
211	else:
212	print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
213	if skipped_lines > 0:
214	print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
215
216	if __name__ == "__main__" : __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/stats/dna_filtering.py

異なるフォーマットでダウンロード: