1 | #!/usr/bin/env python |
---|
2 | """ |
---|
3 | Filter a gff file using a criterion based on feature counts for a transcript. |
---|
4 | |
---|
5 | Usage: |
---|
6 | %prog input_name output_name feature_name condition |
---|
7 | """ |
---|
8 | import sys |
---|
9 | from galaxy import eggs |
---|
10 | from galaxy.tools.util.gff_util import parse_gff_attributes |
---|
11 | |
---|
12 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
13 | |
---|
14 | # Valid operators, ordered so that complex operators (e.g. '>=') are |
---|
15 | # recognized before simple operators (e.g. '>') |
---|
16 | ops = [ |
---|
17 | '>=', |
---|
18 | '<=', |
---|
19 | '<', |
---|
20 | '>', |
---|
21 | '==', |
---|
22 | '!=' |
---|
23 | ] |
---|
24 | |
---|
25 | # Escape sequences for valid operators. |
---|
26 | mapped_ops = { |
---|
27 | '__ge__': ops[0], |
---|
28 | '__le__': ops[1], |
---|
29 | '__lt__': ops[2], |
---|
30 | '__gt__': ops[3], |
---|
31 | '__eq__': ops[4], |
---|
32 | '__ne__': ops[5], |
---|
33 | } |
---|
34 | |
---|
35 | |
---|
36 | def __main__(): |
---|
37 | # Get args. |
---|
38 | input_name = sys.argv[1] |
---|
39 | output_name = sys.argv[2] |
---|
40 | feature_name = sys.argv[3] |
---|
41 | condition = sys.argv[4] |
---|
42 | |
---|
43 | # Unescape operations in condition str. |
---|
44 | for key, value in mapped_ops.items(): |
---|
45 | condition = condition.replace( key, value ) |
---|
46 | |
---|
47 | # Error checking: condition should be of the form <operator><number> |
---|
48 | for op in ops: |
---|
49 | if op in condition: |
---|
50 | empty, number_str = condition.split( op ) |
---|
51 | try: |
---|
52 | number = float( number_str ) |
---|
53 | except: |
---|
54 | number = None |
---|
55 | if empty != "" or not number: |
---|
56 | print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition |
---|
57 | return |
---|
58 | break |
---|
59 | |
---|
60 | # Do filtering. |
---|
61 | kept_lines = 0 |
---|
62 | skipped_lines = 0 |
---|
63 | first_skipped_line = 0 |
---|
64 | out = open( output_name, 'w' ) |
---|
65 | i = 0 |
---|
66 | cur_transcript_id = None |
---|
67 | cur_transcript_lines = [] |
---|
68 | cur_transcript_feature_counts = {} # Key is feature name, value is feature count. |
---|
69 | for i, line in enumerate( file( input_name ) ): |
---|
70 | line = line.rstrip( '\r\n' ) |
---|
71 | if line and not line.startswith( '#' ): |
---|
72 | try: |
---|
73 | # GFF format: chrom, source, feature, chromStart, chromEnd, score, strand, attributes |
---|
74 | elems = line.split( '\t' ) |
---|
75 | feature = elems[2] |
---|
76 | start = str( long( elems[3] ) - 1 ) |
---|
77 | coords = [ long( start ), long( elems[4] ) ] |
---|
78 | strand = elems[6] |
---|
79 | attributes = parse_gff_attributes( elems[8] ) |
---|
80 | t_id = attributes.get( "transcript_id", None ) |
---|
81 | |
---|
82 | if not t_id: |
---|
83 | # No transcript id, so pass line to output. |
---|
84 | out.write( line ) |
---|
85 | kept_lines += 1 |
---|
86 | continue |
---|
87 | |
---|
88 | # There is a transcript ID, so process line at transcript level. |
---|
89 | if t_id == cur_transcript_id: |
---|
90 | # Line is element of transcript; increment feature count. |
---|
91 | if not feature in cur_transcript_feature_counts: |
---|
92 | cur_transcript_feature_counts[feature] = 0 |
---|
93 | cur_transcript_feature_counts[feature] += 1 |
---|
94 | cur_transcript_lines.append( line ) |
---|
95 | continue |
---|
96 | |
---|
97 | # |
---|
98 | # Line is part of new transcript; filter previous transcript. |
---|
99 | # |
---|
100 | |
---|
101 | # Filter/write previous transcript. |
---|
102 | result = eval( '%s %s' % ( cur_transcript_feature_counts.get( feature_name, 0 ), condition ) ) |
---|
103 | if cur_transcript_id and result: |
---|
104 | # Transcript passes filter; write transcript line to file." |
---|
105 | out.write( "\n".join( cur_transcript_lines ) + "\n" ) |
---|
106 | kept_lines += len( cur_transcript_lines ) |
---|
107 | |
---|
108 | # Start new transcript. |
---|
109 | cur_transcript_id = t_id |
---|
110 | cur_transcript_feature_counts = {} |
---|
111 | cur_transcript_feature_counts[feature] = 1 |
---|
112 | cur_transcript_lines = [ line ] |
---|
113 | except Exception, e: |
---|
114 | print e |
---|
115 | skipped_lines += 1 |
---|
116 | if not first_skipped_line: |
---|
117 | first_skipped_line = i + 1 |
---|
118 | else: |
---|
119 | skipped_lines += 1 |
---|
120 | if not first_skipped_line: |
---|
121 | first_skipped_line = i + 1 |
---|
122 | |
---|
123 | # Write last transcript. |
---|
124 | if cur_transcript_id and eval( '%s %s' % ( cur_transcript_feature_counts[feature_name], condition ) ): |
---|
125 | # Transcript passes filter; write transcript lints to file. |
---|
126 | out.write( "\n".join( cur_transcript_lines ) + "\n" ) |
---|
127 | kept_lines += len( cur_transcript_lines ) |
---|
128 | |
---|
129 | # Clean up. |
---|
130 | out.close() |
---|
131 | info_msg = "%i lines kept (%.2f%%) using condition %s. " % ( kept_lines, float(kept_lines)/i, feature_name + condition ) |
---|
132 | if skipped_lines > 0: |
---|
133 | info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) |
---|
134 | print info_msg |
---|
135 | |
---|
136 | if __name__ == "__main__": __main__() |
---|