root/galaxy-central/tools/new_operations/column_join.py @ 2

リビジョン 2, 11.9 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3"""
This tool joins two or more tab-delimited text files on a shared set of leading "hinge" columns. Each output line holds one hinge value followed by the selected columns of every input file, side by side; where an input has no line (or no value) for a hinge, the configured fill value or an empty column is written instead.
5
6usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f <fill_options_file> [other_input1 [other_input2 [other_input3 ...]]]
7    -o, output=0: the output pileup
8    -1, input1=1: the pileup file to start with
9    -2, input2=2: the second pileup file to join
10    -g, hinge=h: the columns to be used for matching
11    -c, columns=c: the columns that should appear in the output
12    -f, fill_options_file=f: the file specifying the fill value to use
13    other_inputs: the other input files to join
14"""
15
16import optparse, os, re, struct, sys, tempfile
17
# Optional Galaxy/simplejson support. When any of these imports fails the
# script still loads: simplejson is set to None and the exception is kept in
# simplejson_exception so it can be reported when fill options are requested.
try:
    # same name as the one assigned in the except branch, so it is always bound
    simplejson_exception = None
    from galaxy import eggs
    from galaxy.util.bunch import Bunch
    from galaxy.util import stringify_dictionary_keys
    import pkg_resources
    pkg_resources.require("simplejson")
    import simplejson
except Exception:
    # sys.exc_info() instead of "except Exception, e" so the statement parses
    # under both Python 2 and Python 3
    simplejson_exception = sys.exc_info()[1]
    simplejson = None
29
def stop_err( msg ):
    """
    Reports msg on standard error (written as given, no newline is added)
    and ends the program by raising SystemExit with no exit status.
    """
    err_stream = sys.stderr
    err_stream.write( msg )
    raise SystemExit
33
def split_nums( text ):
    """
    Breaks a string into alternating runs of digits and non-digits, with the
    digit runs converted to integers, e.g. 'abc23B3' --> [ 'abc', 23, 'B', 3 ].
    An empty string gives an empty list.
    """
    pieces = []
    run = ''
    run_is_number = False
    for ch in text:
        # a character counts as a digit exactly when int() accepts it
        try:
            int( ch )
            ch_is_number = True
        except ValueError:
            ch_is_number = False
        # the kind of character changed, so the run collected so far is complete
        if run and ch_is_number != run_is_number:
            if run_is_number:
                run = int( run )
            pieces.append( run )
            run = ''
        run += ch
        run_is_number = ch_is_number
    # emit whatever run was still open at the end of the text
    if run:
        if run_is_number:
            run = int( run )
        pieces.append( run )
    return pieces
58
def hinge_compare( hinge1, hinge2 ):
    """
    Compares two hinges cmp-style so that items like 'chr10' and 'chrM' or
    'scaffold2' and 'scaffold10' order naturally: runs of letters are handled
    as text and runs of digits as numbers.

    hinge1, hinge2: hinge strings; multi-column hinges are tab-separated.

    Returns 1 if hinge1 sorts after hinge2, -1 if it sorts before, 0 if equal.
    A hinge whose columns are all empty sorts before any non-empty hinge, a
    number sorts before text in the same position, and a column that runs out
    of parts first sorts first. Hinges with different numbers of columns, or
    with an empty column, are compared instead of raising IndexError.
    """
    split_hinge1 = hinge1.split( '\t' )
    split_hinge2 = hinge2.split( '\t' )
    # quick check if either hinge is empty
    empty1 = not ''.join( split_hinge1 )
    empty2 = not ''.join( split_hinge2 )
    if empty1 and empty2:
        return 0
    if empty1:
        return -1
    if empty2:
        return 1
    # go through the columns both hinges have; zip() stops at the shorter one
    for sh1, sh2 in zip( split_hinge1, split_hinge2 ):
        # if these hinge segments are the same, just move on to the next ones
        if sh1 == sh2:
            continue
        h1 = split_nums( sh1 )
        h2 = split_nums( sh2 )
        for j, part1 in enumerate( h1 ):
            # if second hinge has no more parts, first is considered larger
            if j >= len( h2 ):
                return 1
            part2 = h2[ j ]
            if part1 == part2:
                continue
            # split_nums() yields text or numbers; anything not text is a number
            text1 = isinstance( part1, str )
            text2 = isinstance( part2, str )
            if text1 != text2:
                # numbers are less than letters
                if text1:
                    return 1
                return -1
            # same kind and not equal: ordinary comparison decides
            if part1 > part2:
                return 1
            return -1
        # first segment is a proper prefix of the second, so it sorts first;
        # deciding here keeps compare( a, b ) and compare( b, a ) consistent
        if len( h1 ) < len( h2 ):
            return -1
    # if all else has failed, just do basic string comparison
    if hinge1 > hinge2:
        return 1
    elif hinge1 < hinge2:
        return -1
    return 0
116
def hinge_sort( infile, outfile, hinge ):
    """
    Given input file name, sorts its lines logically (text vs. numeric, see
    hinge_compare) by hinge into the provided output file name.

    infile: name of the tab-delimited file to read.
    outfile: name of the file to write; it is overwritten.
    hinge: number of leading columns that make up the sort key.

    Only byte offsets are kept in memory; lines are re-read from infile when
    written. Lines sharing a hinge keep their original relative order. Blank
    lines are skipped (they no longer end the read), and every line written
    is newline-terminated so an unterminated last input line cannot be glued
    to the line that follows it in the output.
    """
    hinge_locs = {}
    fin = open( infile, 'rb' )
    try:
        # first pass: remember where each line starts, grouped by hinge
        while True:
            loc = fin.tell()
            line = fin.readline()
            if not line:
                break
            if not line.strip():
                continue
            # drop the line terminator first so a line holding only hinge
            # columns gets the same key as longer lines with that hinge
            key = '\t'.join( line.rstrip( '\r\n' ).split( '\t' )[ :hinge ] )
            hinge_locs.setdefault( key, [] ).append( loc )
        hinge_locs_sorted = list( hinge_locs.keys() )
        hinge_locs_sorted.sort( hinge_compare )
        # second pass: copy the lines out in hinge order
        fout = open( outfile, 'wb' )
        try:
            for key in hinge_locs_sorted:
                for loc in hinge_locs[ key ]:
                    fin.seek( loc )
                    out_line = fin.readline()
                    if not out_line.endswith( '\n' ):
                        out_line += '\n'
                    fout.write( out_line )
        finally:
            fout.close()
    finally:
        fin.close()
145
def __main__():
    """
    Command-line entry point: joins the input files on their hinge columns.

    Parses the options listed in the module docstring, sorts every input into
    its own temporary file with hinge_sort(), then merges the sorted files.
    Each output line holds one hinge followed by the selected columns of every
    input, in input order. An input with no line for a hinge contributes the
    configured fill values, or empty columns when no fill values were loaded.
    Lines whose hinge is empty cannot be matched; they are skipped and counted.
    Temporary files and the output file are released even if the join fails.
    """
    parser = optparse.OptionParser()
    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
    (options, args) = parser.parse_args()
    hinge = int( options.hinge )
    # only columns to the right of the hinge are output as data
    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
    # every positional argument is an additional input file
    inputs = [ options.input1, options.input2 ]
    inputs.extend( args )
    # load the optional fill values; a failure only produces a warning
    file1_columns = None
    if options.fill_options_file != 'None' and options.fill_options_file is not None:
        fill_options = None
        try:
            if simplejson is None:
                raise simplejson_exception
            fill_file = open( options.fill_options_file )
            try:
                fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( fill_file ) ) )
            finally:
                fill_file.close()
        except Exception:
            # sys.exc_info() keeps this valid syntax on Python 2 and Python 3
            e = sys.exc_info()[1]
            print( 'Warning: Ignoring fill options due to simplejson error (%s).' % e )
        if fill_options is not None and 'file1_columns' in fill_options:
            file1_columns = fill_options.file1_columns
    # map each selected column number to its fill value, if any were given
    if file1_columns:
        fill_empty = {}
        for col in cols:
            fill_empty[ col ] = file1_columns[ col - 1 ]
    else:
        fill_empty = None
    assert len( cols ) > 0, 'You need to select at least one column in addition to the hinge'
    delimiter = '\t'
    n_inputs = len( inputs )
    n_cols = len( cols )
    # one-element list so the nested reader can update the count
    skipped = [ 0 ]

    def make_filler( n_files ):
        # Columns standing in for n_files inputs that lack the current hinge.
        if not fill_empty:
            return [ '' for i in range( n_files * n_cols ) ]
        return [ fill_empty[ cols[ i % n_cols ] ] for i in range( n_files * n_cols ) ]

    def read_line( handle ):
        # Next line (terminator removed) with a non-empty hinge, '' at end of file.
        while True:
            raw = handle.readline()
            if not raw:
                return ''
            line = raw.rstrip( '\r\n' )
            if delimiter.join( line.split( delimiter )[ :hinge ] ):
                return line
            # an empty hinge can never be selected by the merge below and
            # would block every later line of this file, so drop it here
            skipped[ 0 ] += 1

    tmp_file_names = []
    tmp_input_files = []
    fout = None
    try:
        # make sure all files are sorted in same way, ascending
        for in_file in inputs:
            # mkstemp() creates the file itself, so the name cannot be taken
            # by someone else between choosing it and writing to it
            fd, tmp_file_name = tempfile.mkstemp()
            os.close( fd )
            tmp_file_names.append( tmp_file_name )
            hinge_sort( in_file, tmp_file_name, hinge )
            tmp_input_files.append( open( tmp_file_name, 'rb' ) )
        # cycle through files, getting smallest line of all files one at a time
        # also have to keep track of vertical position of extra columns
        fout = open( options.output, 'w' )
        old_current = ''
        first_line = True
        current_lines = [ read_line( f ) for f in tmp_input_files ]
        last_lines = ''.join( current_lines )
        last_loc = -1
        while last_lines:
            # get the "minimum" hinge, which should come first, and the index
            # of the first file holding it (exhausted files have an empty hinge)
            hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ]
            current = None
            loc = None
            for i, candidate in enumerate( hinges ):
                if not candidate:
                    continue
                if current is None or hinge_compare( candidate, current ) < 0:
                    current, loc = candidate, i
            # first output empty columns for vertical alignment (account for "missing" files)
            # columns missing from actual file handled further below
            current_data = []
            if current != old_current:
                if not first_line:
                    # fill trailing empty columns of the previous output line
                    if last_loc < n_inputs - 1:
                        fout.write( '%s%s' % ( delimiter, delimiter.join( make_filler( n_inputs - last_loc - 1 ) ) ) )
                    # insert line break before current line
                    fout.write( '\n' )
                # fill leading empty columns with appropriate fill value
                if loc > 0:
                    current_data = make_filler( loc )
            else:
                # same hinge as before: fill the files skipped since the last one
                if loc - last_loc > 1:
                    current_data = make_filler( loc - last_loc - 1 )
            # now output actual data
            split_line = current_lines[ loc ].split( delimiter )
            # fill empties within actual line if appropriate
            if fill_empty:
                filled_line = []
                for i, item in enumerate( split_line ):
                    if not item:
                        # columns without a fill value stay empty
                        item = fill_empty.get( i + 1, item )
                    filled_line.append( item )
                split_line = filled_line
            # add actual data to be output below
            if ''.join( split_line ):
                for col in cols:
                    # if this column doesn't exist, add the appropriate filler or empty column
                    try:
                        new_item = split_line[ col - 1 ]
                    except IndexError:
                        if fill_empty:
                            new_item = fill_empty[ col ]
                        else:
                            new_item = ''
                    current_data.append( new_item )
                # grab next line for selected file
                current_lines[ loc ] = read_line( tmp_input_files[ loc ] )
                # write relevant data to file
                if current == old_current and current_data:
                    fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) )
                elif current_data:
                    fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) )
                last_lines = ''.join( current_lines )
            else:
                last_lines = None
            last_loc = loc
            old_current = current
            first_line = False
        # fill trailing empty columns for final line
        if last_loc < n_inputs - 1:
            fout.write( '%s%s' % ( delimiter, delimiter.join( make_filler( n_inputs - last_loc - 1 ) ) ) )
        fout.write( '\n' )
    finally:
        if fout is not None:
            fout.close()
        for f in tmp_input_files:
            f.close()
        for tmp_file_name in tmp_file_names:
            if os.path.exists( tmp_file_name ):
                os.unlink( tmp_file_name )
    if skipped[ 0 ]:
        print( 'Skipped %d line(s) with an empty hinge.' % skipped[ 0 ] )
289
# run the join only when this file is executed as a script, not when imported
if __name__ == "__main__":
    __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。