1 | #!/usr/bin/env python |
---|
2 | #Guruprasad Ananda |
---|
3 | """ |
---|
4 | This tool provides the UNIX "join" functionality. |
---|
5 | """ |
---|
6 | import sys, os, tempfile, subprocess |
---|
7 | |
---|
def stop_err(msg):
    """Write *msg* to standard error and terminate the process.

    The message is written exactly as given (no newline is appended).

    Raises:
        SystemExit: always, with exit status 1 so that callers and shells
            see the run as failed rather than successful.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would exit with status 0 (success) despite the error.
    sys.exit(1)
---|
11 | |
---|
# Field separator handed to sort(1) and join(1). It must be the same character
# that is used to count the columns of the first input, so it is defined once.
_DELIMITER = "\t"


def _run(argv, stdout=None):
    """Run *argv* without a shell; raise RuntimeError on a non-zero exit status."""
    status = subprocess.call(argv, stdout=stdout)
    if status != 0:
        raise RuntimeError("'%s' exited with status %d" % (argv[0], status))


def _output_columns(path):
    """Return the join(1) ``-o`` list ("1.1,1.2,...") for every column of *path*.

    The column count is taken from the first non-blank line. An empty string
    is returned when the file contains no such line.
    """
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue
            # Strip only the line terminator: leading/trailing delimiters mark
            # empty fields, which still count as columns.
            width = len(line.rstrip("\r\n").split(_DELIMITER))
            return ",".join("1.%d" % idx for idx in range(1, width + 1))
    return ""


def _join_flags():
    """Return extra command-line flags required by the installed join(1).

    GNU join supports ``--version`` (exit status 0); BSD join does not
    (non-zero status). GNU join from major version 7 on checks the input order
    and raises an error with duplicated items in the two files, so
    ``--nocheck-order`` is requested there. Any version string that cannot be
    parsed results in no extra flags.
    """
    devnull = open(os.devnull, "w")
    try:
        proc = subprocess.Popen(["join", "--version"],
                                stdout=subprocess.PIPE, stderr=devnull)
        stdout, _ = proc.communicate()
    finally:
        devnull.close()
    if proc.returncode != 0:
        return []
    lines = stdout.decode("ascii", "replace").splitlines()
    tokens = lines[0].split() if lines else []
    if not tokens:
        return []
    # The last token of the first line is the version, e.g. "8.32".
    major = tokens[-1].split(".")[0]
    if major.isdigit() and int(major) >= 7:
        return ["--nocheck-order"]
    return []


def main():
    """Join two tab-delimited files on the given columns via sort(1) and join(1).

    Command-line arguments (``sys.argv[1:]``):
        infile1, infile2: paths of the datasets to join.
        field1, field2:   1-based join column in infile1 / infile2.
        mode:             "V" writes the lines of infile1 that have no match
                          in infile2 (``join -v 1``); any other value writes
                          the lines of infile1 that do have a match.
        outfile:          path the result is written to.

    Only the columns of infile1 appear in the output. Both inputs are sorted
    into temporary files first; the originals are not modified. If infile1
    has no non-blank line, an empty outfile is written.

    Errors (bad arguments, missing programs, failing sort/join) are reported
    through stop_err(), which terminates the process.
    """
    if len(sys.argv) < 7:
        stop_err("Usage: %s infile1 infile2 field1 field2 mode outfile\n" % sys.argv[0])
    infile1 = sys.argv[1]
    infile2 = sys.argv[2]
    try:
        field1 = int(sys.argv[3])
        field2 = int(sys.argv[4])
    except ValueError as exc:
        stop_err('Initialization error -> %s' % str(exc))
    mode = sys.argv[5]
    outfile = sys.argv[6]

    tmpfile1 = tempfile.NamedTemporaryFile()
    tmpfile2 = tempfile.NamedTemporaryFile()
    try:
        try:
            # Sort the two files based on the specified fields, as join(1)
            # expects its inputs ordered on the join field.
            for field, source, target in ((field1, infile1, tmpfile1),
                                          (field2, infile2, tmpfile2)):
                key = "%d,%d" % (field, field)
                _run(["sort", "-t", _DELIMITER, "-k", key,
                      "-o", target.name, source])
            columns = _output_columns(tmpfile1.name)
            flags = _join_flags()
        except (EnvironmentError, RuntimeError) as exc:
            stop_err('Initialization error -> %s' % str(exc))

        # Arguments are passed as a list (no shell), so file names containing
        # spaces or shell metacharacters are handled safely.
        cmd = ["join"] + flags + ["-t", _DELIMITER]
        if mode == "V":
            cmd += ["-v", "1"]
        cmd += ["-o", columns, "-1", str(field1), "-2", str(field2),
                tmpfile1.name, tmpfile2.name]

        try:
            with open(outfile, "wb") as out:
                # Without any column there is nothing to join; opening the
                # file above already produced the (empty) result.
                if columns:
                    _run(cmd, stdout=out)
        except (EnvironmentError, RuntimeError) as exc:
            stop_err('Error joining the two datasets -> %s' % str(exc))
    finally:
        # Closing the NamedTemporaryFile objects also deletes the files.
        tmpfile1.close()
        tmpfile2.close()
---|
69 | |
---|
# Script entry point: perform the join only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
---|