Context Navigation

gff_filter_by_feature_count.py @ 2

リビジョン 2, 4.7 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	"""
3	Filter a gff file using a criterion based on feature counts for a transcript.
4
5	Usage:
6	%prog input_name output_name feature_name condition
7	"""
8	import sys
9	from galaxy import eggs
10	from galaxy.tools.util.gff_util import parse_gff_attributes
11
12	assert sys.version_info[:2] >= ( 2, 4 )
13
14	# Valid operators, ordered so that complex operators (e.g. '>=') are
15	# recognized before simple operators (e.g. '>')
16	ops = [
17	'>=',
18	'<=',
19	'<',
20	'>',
21	'==',
22	'!='
23	]
24
25	# Escape sequences for valid operators.
26	mapped_ops = {
27	'__ge__': ops[0],
28	'__le__': ops[1],
29	'__lt__': ops[2],
30	'__gt__': ops[3],
31	'__eq__': ops[4],
32	'__ne__': ops[5],
33	}
34
35
36	def __main__():
37	# Get args.
38	input_name = sys.argv[1]
39	output_name = sys.argv[2]
40	feature_name = sys.argv[3]
41	condition = sys.argv[4]
42
43	# Unescape operations in condition str.
44	for key, value in mapped_ops.items():
45	condition = condition.replace( key, value )
46
47	# Error checking: condition should be of the form <operator><number>
48	for op in ops:
49	if op in condition:
50	empty, number_str = condition.split( op )
51	try:
52	number = float( number_str )
53	except:
54	number = None
55	if empty != "" or not number:
56	print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
57	return
58	break
59
60	# Do filtering.
61	kept_lines = 0
62	skipped_lines = 0
63	first_skipped_line = 0
64	out = open( output_name, 'w' )
65	i = 0
66	cur_transcript_id = None
67	cur_transcript_lines = []
68	cur_transcript_feature_counts = {} # Key is feature name, value is feature count.
69	for i, line in enumerate( file( input_name ) ):
70	line = line.rstrip( '\r\n' )
71	if line and not line.startswith( '#' ):
72	try:
73	# GFF format: chrom, source, feature, chromStart, chromEnd, score, strand, attributes
74	elems = line.split( '\t' )
75	feature = elems[2]
76	start = str( long( elems[3] ) - 1 )
77	coords = [ long( start ), long( elems[4] ) ]
78	strand = elems[6]
79	attributes = parse_gff_attributes( elems[8] )
80	t_id = attributes.get( "transcript_id", None )
81
82	if not t_id:
83	# No transcript id, so pass line to output.
84	out.write( line )
85	kept_lines += 1
86	continue
87
88	# There is a transcript ID, so process line at transcript level.
89	if t_id == cur_transcript_id:
90	# Line is element of transcript; increment feature count.
91	if not feature in cur_transcript_feature_counts:
92	cur_transcript_feature_counts[feature] = 0
93	cur_transcript_feature_counts[feature] += 1
94	cur_transcript_lines.append( line )
95	continue
96
97	#
98	# Line is part of new transcript; filter previous transcript.
99	#
100
101	# Filter/write previous transcript.
102	result = eval( '%s %s' % ( cur_transcript_feature_counts.get( feature_name, 0 ), condition ) )
103	if cur_transcript_id and result:
104	# Transcript passes filter; write transcript line to file."
105	out.write( "\n".join( cur_transcript_lines ) + "\n" )
106	kept_lines += len( cur_transcript_lines )
107
108	# Start new transcript.
109	cur_transcript_id = t_id
110	cur_transcript_feature_counts = {}
111	cur_transcript_feature_counts[feature] = 1
112	cur_transcript_lines = [ line ]
113	except Exception, e:
114	print e
115	skipped_lines += 1
116	if not first_skipped_line:
117	first_skipped_line = i + 1
118	else:
119	skipped_lines += 1
120	if not first_skipped_line:
121	first_skipped_line = i + 1
122
123	# Write last transcript.
124	if cur_transcript_id and eval( '%s %s' % ( cur_transcript_feature_counts[feature_name], condition ) ):
125	# Transcript passes filter; write transcript lints to file.
126	out.write( "\n".join( cur_transcript_lines ) + "\n" )
127	kept_lines += len( cur_transcript_lines )
128
129	# Clean up.
130	out.close()
131	info_msg = "%i lines kept (%.2f%%) using condition %s. " % ( kept_lines, float(kept_lines)/i, feature_name + condition )
132	if skipped_lines > 0:
133	info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
134	print info_msg
135
136	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/filters/gff/gff_filter_by_feature_count.py @ 2

異なるフォーマットでダウンロード: