[2] | 1 | # Filename: uniq.py |
---|
| 2 | # Author: Ian N. Schenck |
---|
| 3 | # Version: 19/12/2005 |
---|
| 4 | # |
---|
| 5 | # This script accepts an input file, an output file, a column |
---|
| 6 | # delimiter, and a list of columns. The script then grabs unique |
---|
| 7 | # lines based on the columns, and returns those records with a count |
---|
| 8 | # of occurrences of each unique column, inserted before the columns. |
---|
| 9 | # |
---|
| 10 | # This executes the command pipeline: |
---|
| 11 | # cut -f $fields | sort | uniq -c |
---|
| 12 | # |
---|
| 13 | # -i Input file |
---|
| 14 | # -o Output file |
---|
| 15 | # -d Delimiter |
---|
| 16 | # -c Column list (Comma Separated) |
---|
| 17 | |
---|
| 18 | import sys |
---|
| 19 | import re |
---|
| 20 | import string |
---|
| 21 | import commands |
---|
| 22 | |
---|
| 23 | # This function is exceedingly useful, perhaps package for reuse? |
---|
def getopts(argv):
    """Collect ``-flag value`` pairs from an argument list into a dict.

    Every argument that starts with ``-`` is treated as a flag and the
    argument right after it as that flag's value, even if the value itself
    starts with ``-``.  Arguments that are not flags and were not consumed
    as a value are ignored.  If a flag is repeated, the last value wins.

    Args:
        argv: Sequence of argument strings, without the program name.

    Returns:
        Dict mapping each flag (leading dash included) to its value.

    Raises:
        IndexError: If the final argument is a flag with no value after it.
    """
    args = list(argv or ())
    opts = {}
    position = 0
    while position < len(args):
        current = args[position]
        # startswith() rather than current[0], so an empty-string argument
        # is skipped instead of raising IndexError.
        if current.startswith('-'):
            # Intentionally unguarded: a trailing flag with no value raises
            # IndexError for the caller to handle.
            opts[current] = args[position + 1]
            position += 2
        else:
            position += 1
    return opts
---|
| 33 | |
---|
# Maps each -d code accepted on the command line to the character passed to
# `cut -d`.  "T" maps to None: no -d option is emitted for it, which leaves
# cut on its default field delimiter (tab, per cut's documentation).
_DELIMITER_CHARS = {
    "T": None,
    "C": ",",
    "D": "-",
    "U": "_",
    "P": "|",
    "Dt": ".",
    "Sp": " ",
}

# Help text printed when the arguments cannot be parsed into flag/value pairs.
_USAGE_LINES = (
    "Usage:",
    " -i Input file",
    " -o Output file",
    " -c Column list (comma seperated)",
    " -d Delimiter:",
    " T Tab",
    " C Comma",
    " D Dash",
    " U Underscore",
    " P Pipe",
    " Dt Dot",
    " Sp Space",
)


def main():
    """Validate the command-line options and run the cut/sort/uniq pipeline.

    Reads ``-i``, ``-o``, ``-d`` and ``-c`` from ``sys.argv``, checks each
    value, then builds and runs a shell pipeline that cuts the requested
    columns from the input file, counts the unique values and writes
    ``count<TAB>value`` lines to the output file.

    Every value spliced into the shell command is validated first, because
    the command is executed through a shell.

    Returns:
        0 if the usage text was shown (arguments could not be parsed);
        -1 no output file; -2 no input file; -3 no delimiter;
        -4 columns missing or malformed; -5 illegal output filename;
        -6 illegal input filename; -7 unrecognised delimiter code;
        otherwise the status reported by ``commands.getstatusoutput`` for
        the pipeline.
    """
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        # getopts raises IndexError when a flag has no value after it.
        for usage_line in _USAGE_LINES:
            print(usage_line)
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # All inputs are present; now validate them.  Both patterns are anchored
    # at the start and end with \Z (not $, which also matches just before a
    # trailing newline), so no shell metacharacter or newline can reach the
    # command line built below.
    file_pattern = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
    column_pattern = re.compile(r"^[0-9]+(,[0-9]+)*\Z")

    if not column_pattern.match(columns):
        print("Illegal column specification.")
        return -4
    if not file_pattern.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not file_pattern.match(inputfile):
        print("Illegal input filename.")
        return -6
    if delim not in _DELIMITER_CHARS:
        # An unknown code used to fall through silently to cut's default
        # delimiter and give misleading counts; reject it instead.
        print("Illegal delimiter.")
        return -7

    columns_for_display = "".join(
        ["c" + col + ", " for col in columns.split(",")]
    )

    delim_char = _DELIMITER_CHARS[delim]
    if delim_char is None:
        delim_option = ""
    else:
        delim_option = "-d \"" + delim_char + "\" "

    # Pipeline stages: cut the columns, delete spaces, sort, count duplicates,
    # strip uniq's leading padding, then turn the space after the count into
    # a tab.
    commandline = (
        "cut " + delim_option + "-f " + columns + " " + inputfile
        + " | sed s/\\ //g | sort | uniq -c | sed s/^\\ *//"
        + " | tr \" \" \"\t\" > " + outputfile
    )
    errorcode, output = commands.getstatusoutput(commandline)

    # The pipeline's stdout goes to the output file, so any captured text is
    # presumably diagnostics from one of the stages; show it, don't drop it.
    if output:
        print(output)

    print("Count of unique values in " + columns_for_display)
    return errorcode
---|
| 115 | |
---|
if __name__ == "__main__":
    # Pass main()'s return value to the shell instead of discarding it, so a
    # failed run is visible in the process exit status.
    sys.exit(main())
---|