root/galaxy-central/tools/filters/gff/gff_filter_by_feature_count.py @ 2

リビジョン 2, 4.7 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2"""
3Filter a gff file using a criterion based on feature counts for a transcript.
4
5Usage:
6%prog input_name output_name feature_name condition
7"""
8import sys
9from galaxy import eggs
10from galaxy.tools.util.gff_util import parse_gff_attributes
11
12assert sys.version_info[:2] >= ( 2, 4 )
13
14# Valid operators, ordered so that complex operators (e.g. '>=') are
15# recognized before simple operators (e.g. '>')
16ops = [
17    '>=',
18    '<=',
19    '<',
20    '>',
21    '==',
22    '!='
23]
24
25# Escape sequences for valid operators.
26mapped_ops = {
27    '__ge__': ops[0],
28    '__le__': ops[1],
29    '__lt__': ops[2],
30    '__gt__': ops[3],
31    '__eq__': ops[4],
32    '__ne__': ops[5],
33}
34
35
36def __main__():
37    # Get args.
38    input_name = sys.argv[1]
39    output_name = sys.argv[2]
40    feature_name = sys.argv[3]
41    condition = sys.argv[4]
42   
43    # Unescape operations in condition str.
44    for key, value in mapped_ops.items():
45        condition = condition.replace( key, value )
46   
47    # Error checking: condition should be of the form <operator><number>
48    for op in ops:
49        if op in condition:
50            empty, number_str = condition.split( op )
51            try:
52                number = float( number_str )
53            except:
54                number = None
55            if empty != "" or not number:
56                print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
57                return
58            break
59
60    # Do filtering.
61    kept_lines = 0
62    skipped_lines = 0
63    first_skipped_line = 0
64    out = open( output_name, 'w' )
65    i = 0
66    cur_transcript_id = None
67    cur_transcript_lines = []
68    cur_transcript_feature_counts = {} # Key is feature name, value is feature count.
69    for i, line in enumerate( file( input_name ) ):
70        line = line.rstrip( '\r\n' )
71        if line and not line.startswith( '#' ):
72            try:
73                # GFF format: chrom, source, feature, chromStart, chromEnd, score, strand, attributes
74                elems = line.split( '\t' )
75                feature = elems[2]
76                start = str( long( elems[3] ) - 1 )
77                coords = [ long( start ), long( elems[4] ) ]
78                strand = elems[6]
79                attributes = parse_gff_attributes( elems[8] )
80                t_id = attributes.get( "transcript_id", None )
81                   
82                if not t_id:
83                    # No transcript id, so pass line to output.
84                    out.write( line )
85                    kept_lines += 1
86                    continue
87               
88                # There is a transcript ID, so process line at transcript level.
89                if t_id == cur_transcript_id:
90                    # Line is element of transcript; increment feature count.
91                    if not feature in cur_transcript_feature_counts:
92                        cur_transcript_feature_counts[feature] = 0
93                    cur_transcript_feature_counts[feature] += 1
94                    cur_transcript_lines.append( line )
95                    continue
96                   
97                #
98                # Line is part of new transcript; filter previous transcript.
99                #
100               
101                # Filter/write previous transcript.
102                result = eval( '%s %s' % ( cur_transcript_feature_counts.get( feature_name, 0 ), condition ) )
103                if cur_transcript_id and result:
104                    # Transcript passes filter; write transcript line to file."
105                    out.write( "\n".join( cur_transcript_lines ) + "\n" )
106                    kept_lines += len( cur_transcript_lines )
107
108                # Start new transcript.
109                cur_transcript_id = t_id
110                cur_transcript_feature_counts = {}
111                cur_transcript_feature_counts[feature] = 1
112                cur_transcript_lines = [ line ]
113            except Exception, e:
114                print e
115                skipped_lines += 1
116                if not first_skipped_line:
117                    first_skipped_line = i + 1
118        else:
119            skipped_lines += 1
120            if not first_skipped_line:
121                first_skipped_line = i + 1
122   
123    # Write last transcript.
124    if cur_transcript_id and eval( '%s %s' % ( cur_transcript_feature_counts[feature_name], condition ) ):
125        # Transcript passes filter; write transcript lints to file.
126        out.write( "\n".join( cur_transcript_lines ) + "\n" )
127        kept_lines += len( cur_transcript_lines )
128
129    # Clean up.
130    out.close()
131    info_msg = "%i lines kept (%.2f%%) using condition %s.  " % ( kept_lines, float(kept_lines)/i, feature_name + condition )
132    if skipped_lines > 0:
133        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
134    print info_msg
135
136if __name__ == "__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。