| 1 | #!/usr/bin/env python | 
|---|
| 2 |  | 
|---|
| 3 | """ | 
|---|
| 4 | Combines several interval files containing indels with counts. All input files need to have the same number of columns. | 
|---|
| 5 |  | 
|---|
| 6 | usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]] | 
|---|
| 7 | -1, --input1=1: The first input file | 
|---|
| 8 | -s, --sum1=s: Whether or not to include the totals from first file in overall total | 
|---|
| 9 | -2, --input2=2: The second input file | 
|---|
| 10 | -S, --sum2=S: Whether or not to include the totals from second file in overall total | 
|---|
| 11 | -o, --output=o: The interval output file for the combined files | 
|---|
| 12 | """ | 
|---|
| 13 |  | 
|---|
| 14 | import re, sys | 
|---|
| 15 | from galaxy import eggs | 
|---|
| 16 | import pkg_resources; pkg_resources.require( "bx-python" ) | 
|---|
| 17 | from bx.cookbook import doc_optparse | 
|---|
| 18 |  | 
|---|
| 19 |  | 
|---|
def stop_err( msg ):
    """
    Report a fatal error and terminate the script.

    Writes msg followed by a newline to standard error, then raises
    SystemExit with status 1 so that whatever launched the script sees a
    failing exit code (a bare sys.exit() would report success).
    """
    sys.stderr.write( '%s\n' % msg )
    sys.exit( 1 )
|---|
| 23 |  | 
|---|
def numeric_sort( text1, text2 ):
    """
    cmp-style comparison of two items containing whitespace-separated text.

    The items are split on whitespace and compared piece by piece:
      - two all-digit pieces are compared as integers
      - an all-digit piece sorts before a piece that is not all digits
      - two non-digit pieces are compared as text
    If every shared piece is equal, the item with fewer pieces sorts first.
    An item with no pieces sorts after any non-empty item; two items with no
    pieces are equal.

    Returns -1 if text1 sorts first, 1 if text2 sorts first, 0 if equal.
    """
    pieces1 = text1.split()
    pieces2 = text2.split()
    # Items without any pieces go last; two such items must compare equal so
    # that the comparator stays consistent in both directions
    if not pieces1 and not pieces2:
        return 0
    if not pieces1:
        return 1
    if not pieces2:
        return -1
    for pc1, pc2 in zip( pieces1, pieces2 ):
        is_num1 = pc1.isdigit()
        is_num2 = pc2.isdigit()
        if is_num1 and is_num2:
            val1, val2 = int( pc1 ), int( pc2 )
        elif is_num1:
            # numeric pieces sort before textual ones
            return -1
        elif is_num2:
            return 1
        else:
            val1, val2 = pc1, pc2
        if val1 > val2:
            return 1
        if val1 < val2:
            return -1
    # All shared pieces are equal: the shorter item sorts first
    if len( pieces1 ) > len( pieces2 ):
        return 1
    if len( pieces1 ) < len( pieces2 ):
        return -1
    return 0
|---|
| 57 |  | 
|---|
def __main__():
    """
    Combine the indel counts of several interval files into one table.

    Input files and their include-in-total flags come from the command line
    (see the module docstring, which is what gets parsed): the first two
    pairs through options, any further pairs as alternating positional
    arguments (file, flag, file, flag, ...).

    Each input line is tab-separated; its last column must be an integer
    count and the preceding columns together identify the indel. The first
    line read must have at least 4 columns and every later line, in any
    file, must have the same number of columns.

    The output holds one line per distinct indel, ordered by numeric_sort:
    the indel columns, the total over the files whose flag is "true", then
    the count from each input file in order ('0' if the indel is absent from
    that file).

    Any failure while reading or writing is reported through stop_err.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    inputs = [ options.input1, options.input2 ]
    includes = [ options.sum1, options.sum2 ]
    # remaining positional arguments alternate: input file, include flag
    inputs.extend( args[0::2] )
    includes.extend( args[1::2] )
    num_cols = 0
    counts = {}
    # read in data from all files and get total counts
    try:
        for i, fname in enumerate( inputs ):
            # 'with' guarantees the handle is closed even if a line is bad
            with open( fname, 'rb' ) as handle:
                for line in handle:
                    sp_line = line.strip().split( '\t' )
                    # set num_cols on first pass
                    if num_cols == 0:
                        if len( sp_line ) < 4:
                            raise Exception( 'There need to be at least 4 columns in the file: Chrom, Start, End, and Count' )
                        num_cols = len( sp_line )
                    # deal with differing number of columns
                    elif len( sp_line ) != num_cols:
                        raise Exception( 'All of the files need to have the same number of columns (current %s != %s of first line)' % ( len( sp_line ), num_cols ) )
                    # get actual counts for each indel
                    indel = '\t'.join( sp_line[:-1] )
                    try:
                        count = int( sp_line[-1] )
                    except ValueError as err:
                        raise Exception( 'The last column of each file must be numeric, with the count of the number of instances of that indel: %s' % str( err ) )
                    # Always create the entry with a zero total first, so an
                    # indel that first appears in a file excluded from the
                    # total still gets recorded instead of raising KeyError
                    entry = counts.setdefault( indel, { 'tot': 0 } )
                    # total across all included files
                    if includes[i] == "true":
                        entry[ 'tot' ] += count
                    # counts for ith file (a repeated indel within one file
                    # keeps the last count seen, while the total adds them all)
                    entry[i] = count
    except Exception as e:
        stop_err( 'Failed to read all input files:\n%s' % str( e ) )
    # output combined results to table file
    try:
        with open( options.output, 'wb' ) as output:
            count_keys = counts.keys()
            count_keys.sort( numeric_sort )
            for indel in count_keys:
                entry = counts[ indel ]
                count_out = [ str( entry[ 'tot' ] ) ]
                # one column per input file, 0 where the indel was not seen
                count_out.extend( str( entry.get( i, 0 ) ) for i in range( len( inputs ) ) )
                output.write( '%s\t%s\n' % ( indel, '\t'.join( count_out ) ) )
    except Exception as e:
        stop_err( 'Failed to output data: %s' % str( e ) )
|---|
| 112 |  | 
|---|
# Run the tool only when executed as a script, not when imported
if __name__ == "__main__":
    __main__()
|---|