1 | # Filename: uniq.py |
---|
2 | # Author: Ian N. Schenck |
---|
3 | # Version: 19/12/2005 |
---|
4 | # |
---|
5 | # This script accepts an input file, an output file, a column |
---|
6 | # delimiter, and a list of columns. The script then grabs unique |
---|
7 | # lines based on the columns, and returns those records with a count |
---|
8 | # of occurrences of each unique column, inserted before the columns. |
---|
9 | # |
---|
10 | # This executes the command pipeline: |
---|
11 | # cut -f $fields | sort | uniq -c |
---|
12 | # |
---|
13 | # -i Input file |
---|
14 | # -o Output file |
---|
15 | # -d Delimiter |
---|
16 | # -c Column list (Comma Separated) |
---|
17 | |
---|
18 | import sys |
---|
19 | import re |
---|
20 | import string |
---|
21 | import commands |
---|
22 | |
---|
23 | # This function is exceedingly useful, perhaps package for reuse? |
---|
def getopts(argv):
    """Collect ``-flag value`` pairs from an argument list.

    Every element that begins with '-' is taken as an option name, and the
    element right after it as that option's value (even when the value
    itself begins with '-'). Elements that are not option names, including
    empty strings, are skipped. When an option is repeated, the last value
    wins.

    argv -- sequence of argument strings (e.g. sys.argv[1:]); not modified.

    Returns a dict mapping each option name, leading '-' included, to its
    value.

    Raises IndexError when the final element is an option name with no
    value after it; main() relies on this to show the usage text.
    """
    opts = {}
    pos = 0
    total = len(argv)
    while pos < total:
        token = argv[pos]
        # startswith() is safe on an empty string, whereas indexing token[0]
        # would raise IndexError and be mistaken for a missing option value.
        if token.startswith('-'):
            # Deliberately unguarded: a trailing option without a value must
            # raise IndexError for the caller.
            opts[token] = argv[pos + 1]
            pos += 2
        else:
            pos += 1
    return opts
---|
33 | |
---|
# Character handed to `cut -d` for each -d code. 'T' (tab) is deliberately
# absent: cut splits on tabs when no -d flag is given.
_DELIMITER_CHARS = {
    'C': ',',
    'D': '-',
    'U': '_',
    'P': '|',
    'Dt': '.',
    'Sp': ' ',
}


def _print_usage():
    """Print the command-line option summary to standard output."""
    print("Usage:")
    print(" -i Input file")
    print(" -o Output file")
    print(" -c Column list (comma seperated)")
    print(" -d Delimiter:")
    print(" T Tab")
    print(" C Comma")
    print(" D Dash")
    print(" U Underscore")
    print(" P Pipe")
    print(" Dt Dot")
    print(" Sp Space")


def _build_pipeline(inputfile, outputfile, delim, columns):
    """Return the shell pipeline that writes the unique-value counts.

    The pipeline cuts the requested fields, removes every space, sorts,
    counts duplicates with `uniq -c`, strips the leading spaces of each
    count line and turns the remaining spaces into tabs, redirecting the
    result to outputfile.

    All arguments are spliced into the command text unquoted, so the caller
    must have validated them first.
    """
    commandline = "cut "
    # Codes without an entry (including 'T') add no -d flag at all.
    delim_char = _DELIMITER_CHARS.get(delim)
    if delim_char is not None:
        commandline += '-d "%s" ' % delim_char
    commandline += "-f " + columns
    commandline += (" " + inputfile
                    + " | sed s/\\ //g | sort | uniq -c | sed s/^\\ *//"
                    + " | tr \" \" \"\t\" > " + outputfile)
    return commandline


def main():
    """Validate the command-line options and run the counting pipeline.

    Reads the -i, -o, -d and -c options from sys.argv, checks them, then
    runs the cut/sort/uniq shell pipeline, which writes its result to the
    output file.

    Returns 0 after printing the usage text; a negative code when an option
    is missing or rejected (-1 no output file, -2 no input file, -3 no
    delimiter, -4 columns missing or illegal, -5 illegal output filename,
    -6 illegal input filename); otherwise the status reported by
    commands.getstatusoutput() for the pipeline.
    """
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        # getopts() raises IndexError when an option has no value after it.
        _print_usage()
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # Everything below is pasted into a shell command, so each value must
    # match its pattern in full. \Z is used rather than $ because $ also
    # matches just before a trailing newline.
    file_regex = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
    # Comma-separated field numbers; each item may also be a cut range such
    # as "2-5" or "3-".
    column_regex = re.compile(r"^[0-9]+(-[0-9]*)?(,[0-9]+(-[0-9]*)?)*\Z")

    if not column_regex.match(columns):
        print("Illegal column specification.")
        return -4
    if not file_regex.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not file_regex.match(inputfile):
        print("Illegal input filename.")
        return -6

    columns_for_display = "".join(
        ["c" + col + ", " for col in columns.split(",")])

    commandline = _build_pipeline(inputfile, outputfile, delim, columns)
    errorcode, output = commands.getstatusoutput(commandline)
    # The pipeline's normal output goes to the output file; anything that
    # was captured here is a diagnostic from one of the commands, so show
    # it instead of silently dropping it.
    if output:
        print(output)

    print("Count of unique values in " + columns_for_display)
    return errorcode
---|
115 | |
---|
if __name__ == "__main__":
    # main() returns 0 when nothing went wrong and a nonzero code otherwise.
    # Those codes may be negative or larger than 255, which a process exit
    # status cannot carry faithfully, so any failure is reported to the
    # calling shell as exit status 1 instead of being discarded.
    if main():
        sys.exit(1)
---|