root/galaxy-central/tools/stats/dna_filtering.py @ 3

リビジョン 2, 9.3 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3"""
4This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties. The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
5
6usage: %prog [options]
7    -i, --input=i: tabular input file
8    -o, --output=o: filtered output file
9    -c, --cond=c: conditions to filter on
10    -n, --n_handling=n: how to handle N and X
11    -l, --columns=l: columns
12    -t, --col_types=t: column types   
13
14"""
15
16#from __future__ import division
17import os.path, re, string, string, sys
18from galaxy import eggs
19import pkg_resources; pkg_resources.require( "bx-python" )
20from bx.cookbook import doc_optparse
21
22# Older py compatibility
23try:
24    set()
25except:
26    from sets import Set as set
27
28#assert sys.version_info[:2] >= ( 2, 4 )
29
30def get_operands( filter_condition ):
31    # Note that the order of all_operators is important
32    items_to_strip = [ '==', '!=', ' and ', ' or ' ]
33    for item in items_to_strip:
34        if filter_condition.find( item ) >= 0:
35            filter_condition = filter_condition.replace( item, ' ' )
36    operands = set( filter_condition.split( ' ' ) )
37    return operands
38
39def stop_err( msg ):
40    sys.stderr.write( msg )
41    sys.exit()
42
43def __main__():
44    #Parse Command Line
45    options, args = doc_optparse.parse( __doc__ )
46    input = options.input
47    output = options.output
48    cond = options.cond
49    n_handling = options.n_handling
50    columns = options.columns
51    col_types = options.col_types
52
53    try:
54        in_columns = int( columns )
55        assert col_types  #check to see that the column types variable isn't null
56        in_column_types = col_types.split( ',' )
57    except:
58        stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
59
60    # Unescape if input has been escaped
61    cond_text = cond.replace( '__eq__', '==' ).replace( '__ne__', '!=' ).replace( '__sq__', "'" )
62    orig_cond_text = cond_text
63    # Expand to allow for DNA codes
64    dot_letters = [ letter for letter in string.uppercase if letter not in \
65                   [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ] ]
66    dot_letters.append( '.' )
67    codes = {'A': [ 'A', 'D', 'H', 'M', 'R', 'V', 'W' ],
68             'C': [ 'C', 'B', 'H', 'M', 'S', 'V', 'Y' ],
69             'G': [ 'G', 'B', 'D', 'K', 'R', 'S', 'V' ],
70             'T': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
71             'U': [ 'T', 'U', 'B', 'D', 'H', 'K', 'W', 'Y' ],
72             'K': [ 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'R', 'S', 'V', 'W', 'Y' ],
73             'M': [ 'A', 'C', 'B', 'D', 'H', 'M', 'R', 'S', 'V', 'W', 'Y' ],
74             'R': [ 'A', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
75             'Y': [ 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'S', 'V', 'W', 'Y' ],
76             'S': [ 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'Y' ],
77             'W': [ 'A', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'V', 'W', 'Y' ],
78             'B': [ 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
79             'V': [ 'A', 'C', 'G', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W' ],
80             'H': [ 'A', 'C', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
81             'D': [ 'A', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'R', 'S', 'V', 'W', 'Y' ],
82             '.': dot_letters,
83             '-': [ '-' ]}
84    # Add handling for N and X
85    if n_handling == "all":
86        codes[ 'N' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
87        codes[ 'X' ] = [ 'A', 'C', 'G', 'T', 'U', 'B', 'D', 'H', 'K', 'M', 'N', 'R', 'S', 'V', 'W', 'X', 'Y' ]
88        for code in codes.keys():
89            if code != '.' and code != '-':
90                codes[code].append( 'N' )
91                codes[code].append( 'X' )
92    else:
93        codes[ 'N' ] = dot_letters
94        codes[ 'X' ] = dot_letters
95        codes[ '.' ].extend( [ 'N', 'X' ] )
96    # Expand conditions to allow for DNA codes
97    try:
98        match_replace = {}
99        pat = re.compile( 'c\d+\s*[!=]=\s*[\w\d"\'+-.]+' )
100        matches = pat.findall( cond_text )
101        for match in matches:
102            if match.find( 'chr' ) >= 0 or match.find( 'scaffold' ) >= 0 or match.find( '+' ) >= 0:
103                if match.find( '==' ) >= 0:
104                    match_parts = match.split( '==' )
105                elif match.find( '!=' ) >= 0:
106                    match_parts = match.split( '!=' )
107                else:
108                    raise Exception, "The operators '==' and '!=' were not found."
109                left = match_parts[0].strip()
110                right = match_parts[1].strip()
111                new_match = "(%s)" % ( match )
112            elif match.find( '==' ) > 0:
113                match_parts = match.split( '==' )
114                left = match_parts[0].strip()
115                right = match_parts[1].strip()
116                new_match = '(%s in codes[%s] and %s in codes[%s])' % ( left, right, right, left )
117            elif match.find( '!=' ) > 0 :
118                match_parts = match.split( '!=' )
119                left = match_parts[0].strip()
120                right = match_parts[1].strip()
121                new_match = '(%s not in codes[%s] or %s not in codes[%s])' % ( left, right, right, left )
122            else:
123                raise Exception, "The operators '==' and '!=' were not found."
124            assert left.startswith( 'c' ), 'The column names should start with c (lowercase)'
125            if right.find( "'" ) >= 0 or right.find( '"' ) >= 0:
126                test = right.replace( "'", '' ).replace( '"', '' )
127                assert test in string.uppercase or test.find( '+' ) >= 0 or test.find( '.' ) >= 0 or test.find( '-' ) >= 0\
128                        or test.startswith( 'chr' ) or test.startswith( 'scaffold' ), \
129                        'The value to search for should be a valid base, code, plus sign, chromosome (like "chr1") or scaffold (like "scaffold5"). ' \
130                        'Use the general filter tool to filter on anything else first'
131            else:
132                assert right.startswith( 'c' ), 'The column names should start with c (lowercase)'
133            match_replace[match] = new_match
134        if len( match_replace.keys() ) == 0:
135            raise Exception, 'There do not appear to be any valid conditions'
136        for match in match_replace.keys():
137            cond_text = cond_text.replace( match, match_replace[match] )
138    except Exception, e:
139        stop_err( "At least one of your conditions is invalid. Make sure to use only '!=' or '==', valid column numbers, and valid base values.\n" + str(e) )
140
141    # Attempt to determine if the condition includes executable stuff and, if so, exit
142    secured = dir()
143    operands = get_operands( cond_text )
144    for operand in operands:
145        try:
146            check = int( operand )
147        except:
148            if operand in secured:
149                stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
150
151    # Prepare the column variable names and wrappers for column data types
152    cols, type_casts = [], []
153    for col in range( 1, in_columns + 1 ):
154        col_name = "c%d" % col
155        cols.append( col_name )
156        col_type = in_column_types[ col - 1 ]
157        type_cast = "%s(%s)" % ( col_type, col_name )
158        type_casts.append( type_cast )
159
160    col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
161    type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
162    assign = "%s = line.split( '\\t' )" % col_str
163    wrap = "%s = %s" % ( col_str, type_cast_str )
164    skipped_lines = 0
165    first_invalid_line = 0
166    invalid_line = None
167    lines_kept = 0
168    total_lines = 0
169    out = open( output, 'wt' )
170    # Read and filter input file, skipping invalid lines
171    code = '''
172for i, line in enumerate( file( input ) ):
173    total_lines += 1
174    line = line.rstrip( '\\r\\n' )
175    if not line or line.startswith( '#' ):
176        skipped_lines += 1
177        if not invalid_line:
178            first_invalid_line = i + 1
179            invalid_line = line
180        continue
181    try:
182        %s = line.split( '\\t' )
183        %s = %s
184        if %s:
185            lines_kept += 1
186            print >> out, line
187    except Exception, e:
188        skipped_lines += 1
189        if not invalid_line:
190            first_invalid_line = i + 1
191            invalid_line = line
192''' % ( col_str, col_str, type_cast_str, cond_text )
193
194    valid_filter = True
195    try:
196        exec code
197    except Exception, e:
198        out.close()
199        if str( e ).startswith( 'invalid syntax' ):
200            valid_filter = False
201            stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % orig_cond_text + ' '+str(e))
202        else:
203            stop_err( str( e ) )
204
205    if valid_filter:
206        out.close()
207        valid_lines = total_lines - skipped_lines
208        print 'Filtering with %s, ' % orig_cond_text
209        if valid_lines > 0:
210            print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
211        else:
212            print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % orig_cond_text
213        if skipped_lines > 0:
214            print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
215   
216if __name__ == "__main__" : __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。