#!/usr/bin/env python
#Guruprasad Ananda
"""
This tool provides the UNIX "join" functionality.
"""
---|
| 6 | import sys, os, tempfile, subprocess |
---|
| 7 | |
---|
def stop_err(msg):
    """Report a fatal error and terminate the script.

    Writes ``msg`` to standard error exactly as given (no newline is
    appended) and raises ``SystemExit`` with status 1, so that callers
    inspecting the exit code see the failure and not only the stderr text.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would exit with status 0, i.e. report success.
    sys.exit(1)
---|
| 11 | |
---|
def _run(argv, stdout=None):
    """Run ``argv`` without a shell; raise RuntimeError if it cannot run or fails."""
    try:
        status = subprocess.call(argv, stdout=stdout)
    except OSError as exc:
        raise RuntimeError("could not execute %s: %s" % (argv[0], exc))
    if status != 0:
        raise RuntimeError("%s exited with status %d" % (argv[0], status))


def _count_columns(path):
    """Return the tab-separated field count of the first non-blank line, or 0."""
    # Binary mode: only tab and newline bytes matter, so no decoding is needed.
    with open(path, "rb") as handle:
        for line in handle:
            if line.strip():
                # Strip only the line terminator so that empty leading or
                # trailing columns are still counted.
                return len(line.rstrip(b"\r\n").split(b"\t"))
    return 0


def _join_order_flags():
    """Return ``["--nocheck-order"]`` when join is GNU join >= 7, else ``[]``.

    GNU join accepts ``--version`` (exit status 0); BSD join rejects it with
    a non-zero status. From version 7 on, GNU join raises an error for
    duplicated items in the two files being joined unless order checking is
    switched off.
    """
    devnull = open(os.devnull, "w")
    try:
        try:
            proc = subprocess.Popen(["join", "--version"],
                                    stdout=subprocess.PIPE, stderr=devnull)
            stdout, _ = proc.communicate()
        except OSError:
            # join is not runnable at all; the real join call reports that.
            return []
    finally:
        devnull.close()
    if proc.returncode != 0:
        return []
    if not isinstance(stdout, str):
        # Python 3 hands back bytes.
        stdout = stdout.decode("utf-8", "replace")
    lines = stdout.splitlines()
    tokens = lines[0].split() if lines else []
    if not tokens:
        return []
    # The version is the last token of the first line, e.g. "8.32" or
    # "5.2.1"; only the major component matters.
    major = tokens[-1].split(".")[0]
    if major.isdigit() and int(major) >= 7:
        return ["--nocheck-order"]
    return []


def main():
    """Join two tab-delimited files on one column each using sort and join.

    Command-line arguments (``sys.argv``):
        1. path of the first input file
        2. path of the second input file
        3. 1-based join column in the first file
        4. 1-based join column in the second file
        5. mode: ``"V"`` writes the lines of the first file that have no
           match in the second (``join -v 1``); any other value writes the
           lines of the first file that do have a match
        6. path of the output file

    Only columns of the first file are written. The column count is taken
    from the first non-blank line of the sorted first file.

    Calls ``stop_err`` (which terminates the script) when ``sort`` or
    ``join`` cannot be run or exits with a non-zero status.
    """
    infile1 = sys.argv[1]
    infile2 = sys.argv[2]
    field1 = int(sys.argv[3])
    field2 = int(sys.argv[4])
    mode = sys.argv[5]
    outfile = sys.argv[6]

    # Fields are tab-separated: the -o column list below is derived by
    # splitting on tabs, so sort and join must use the same delimiter.
    delimiter = "\t"

    tmpfile1 = tempfile.NamedTemporaryFile()
    tmpfile2 = tempfile.NamedTemporaryFile()
    try:
        try:
            # Sort the two files based on the specified fields.
            _run(["sort", "-t", delimiter, "-k", "%d,%d" % (field1, field1),
                  "-o", tmpfile1.name, infile1])
            _run(["sort", "-t", delimiter, "-k", "%d,%d" % (field2, field2),
                  "-o", tmpfile2.name, infile2])
        except Exception as exc:
            stop_err('Initialization error -> %s' % str(exc))

        columns = _count_columns(tmpfile1.name)
        if columns == 0:
            # Nothing in the first file can be emitted in either mode, and
            # join cannot be given an empty -o list: write an empty result.
            open(outfile, "wb").close()
            return
        option = ",".join("1.%d" % j for j in range(1, columns + 1))

        cmd = ["join"] + _join_order_flags() + ["-t", delimiter]
        if mode == "V":
            cmd += ["-v", "1"]
        cmd += ["-o", option, "-1", str(field1), "-2", str(field2),
                tmpfile1.name, tmpfile2.name]

        try:
            with open(outfile, "wb") as out:
                _run(cmd, stdout=out)
        except Exception as exj:
            stop_err('Error joining the two datasets -> %s' % str(exj))
    finally:
        # Closing a NamedTemporaryFile also removes it from disk.
        tmpfile1.close()
        tmpfile2.close()
---|
| 69 | |
---|
# Script entry point: perform the join only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
---|