root/galaxy-central/tools/filters/uniq.py

リビジョン 2, 3.3 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
# Filename: uniq.py
# Author: Ian N. Schenck
# Version: 19/12/2005
#
# This script accepts an input file, an output file, a column
# delimiter, and a list of columns.  The script then grabs unique
# lines based on the columns, and returns those records with a count
# of occurrences of each unique column, inserted before the columns.
#
# This executes the command pipeline:
#       cut -f $fields | sort  | uniq -c
#
# -i            Input file
# -o            Output file
# -d            Delimiter
# -c            Column list (Comma Separated)

import sys
import re
import string
import commands

# This function is exceedingly useful, perhaps package for reuse?
def getopts(argv):
    """Collect ``-flag value`` pairs from a command-line argument list.

    Every argument that starts with ``-`` is treated as an option name and
    the argument immediately after it as that option's value (even if the
    value itself starts with ``-``).  Arguments that do not start with ``-``
    and are not consumed as a value are ignored, including empty strings.

    Parameters:
        argv: sequence of argument strings (typically ``sys.argv[1:]``).
            The sequence is not modified.

    Returns:
        dict mapping each option name (leading ``-`` included) to its value.
        If an option is repeated, the last value wins.

    Raises:
        IndexError: if the final argument is an option with no value after
            it.  main() catches this to print the usage text.
    """
    remaining = list(argv or ())
    opts = {}
    index = 0
    while index < len(remaining):
        arg = remaining[index]
        if arg.startswith('-'):
            # Deliberately unguarded: a trailing flag without a value must
            # surface as IndexError so the caller can show usage.
            opts[arg] = remaining[index + 1]
            index += 2
        else:
            index += 1
    return opts

def _print_usage():
    """Print the command-line usage summary to standard output."""
    print("Usage:")
    print(" -i        Input file")
    print(" -o        Output file")
    print(" -c        Column list (comma seperated)")
    print(" -d        Delimiter:")
    print("                     T   Tab")
    print("                     C   Comma")
    print("                     D   Dash")
    print("                     U   Underscore")
    print("                     P   Pipe")
    print("                     Dt  Dot")
    print("                     Sp  Space")


def _build_commandline(inputfile, outputfile, delim, columns):
    """Assemble the shell pipeline that counts unique column values.

    The pipeline cuts the requested fields, strips every space, sorts,
    counts duplicates with ``uniq -c``, removes the leading padding that
    ``uniq -c`` emits, and turns the space after the count into a tab.

    All arguments are interpolated into a shell string unquoted, so they
    must already have been validated by the caller.
    """
    # Delimiter codes with an explicit ``cut -d`` flag.  Any other code
    # (including 'T') adds no flag, leaving cut on its default tab delimiter.
    delimiter_flags = {
        'C': '-d "," ',
        'D': '-d "-" ',
        'U': '-d "_" ',
        'P': '-d "|" ',
        'Dt': '-d "." ',
        'Sp': '-d " " ',
    }
    commandline = "cut " + delimiter_flags.get(delim, "")
    commandline += "-f " + columns
    commandline += (" " + inputfile + " | sed s/\\ //g | sort | uniq -c"
                    " | sed s/^\\ *// | tr \" \" \"\t\" > " + outputfile)
    return commandline


def main():
    """Parse ``sys.argv``, validate the options and run the uniq pipeline.

    Reads -i (input file), -o (output file), -d (delimiter code) and
    -c (comma-separated column numbers) from the command line, builds a
    ``cut | sed | sort | uniq -c | sed | tr`` shell pipeline and runs it,
    redirecting the result into the output file.  All messages are printed
    to standard output.

    Returns:
        0 after printing usage (an option was given without a value);
        -1 no output file, -2 no input file, -3 no delimiter,
        -4 columns missing or malformed, -5 illegal output filename,
        -6 illegal input filename; otherwise the status reported by
        ``commands.getstatusoutput`` for the pipeline.
    """
    args = sys.argv[1:]

    try:
        opts = getopts(args)
    except IndexError:
        _print_usage()
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # All inputs have been specified at this point, now validate.
    # These values are pasted into a shell command line, so the patterns
    # must cover the WHOLE string: \Z (unlike $) also rejects a trailing
    # newline, and the column pattern avoids nested quantifiers so that a
    # long invalid value cannot trigger exponential backtracking.
    fileRegEx = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
    columnRegEx = re.compile(r"^[0-9]+(,[0-9]+)*\Z")

    if not columnRegEx.match(columns):
        print("Illegal column specification.")
        return -4
    if not fileRegEx.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not fileRegEx.match(inputfile):
        print("Illegal input filename.")
        return -6

    # Every entry keeps its trailing ", " so the summary line is emitted
    # exactly as before (e.g. "c1, c2, ").
    columns_for_display = "".join(
        ["c" + col + ", " for col in columns.split(",")])

    commandline = _build_commandline(inputfile, outputfile, delim, columns)
    # The pipeline redirects its result to the output file, so the captured
    # text is not used here.
    errorcode, _output = commands.getstatusoutput(commandline)

    print("Count of unique values in " + columns_for_display)
    return errorcode

# Script entry point: run the tool only when executed directly, not on import.
# NOTE(review): main()'s return code is discarded here, so the process exit
# status does not reflect validation or pipeline failures -- confirm whether
# callers rely on that before wiring it to sys.exit().
if __name__ == "__main__":
    main()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。