| 1 | #!/usr/bin/python2.6 | 
|---|
| 2 |  | 
|---|
| 3 | """ | 
|---|
| 4 | Application to convert AXT file to LAV file. Reads an AXT file from standard | 
|---|
| 5 | input and writes a LAV file to standard out; some statistics are written to | 
|---|
| 6 | standard error. | 
|---|
| 7 |  | 
|---|
| 8 | usage: %prog primary_spec secondary_spec [--silent] < axt_file > lav_file | 
|---|
| 9 |  | 
|---|
| 10 | Each spec is of the form seq_file[:species_name]:lengths_file. | 
|---|
| 11 |  | 
|---|
| 12 | - seq_file should be a format string for the file names for the individual | 
|---|
| 13 | sequences, with %s to be replaced by the alignment's src field.  For | 
|---|
| 14 | example, "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", | 
|---|
| 15 | "hg18/chr2.nib", etc. | 
|---|
| 16 |  | 
|---|
| 17 | - species_name is optional.  If present, it is prepended to the alignment's | 
|---|
| 18 | src field. | 
|---|
| 19 |  | 
|---|
| 20 | - Lengths files provide the length of each chromosome (lav format needs this | 
|---|
| 21 | information but axt file does not contain it).  The format is a series of | 
|---|
| 22 | lines of the form: | 
|---|
| 23 |  | 
|---|
| 24 | <chromosome name> <length> | 
|---|
| 25 |  | 
|---|
| 26 | The chromosome field in each axt block must match some <chromosome name> in | 
|---|
| 27 | the lengths file. | 
|---|
| 28 | """ | 
|---|
| 29 |  | 
|---|
| 30 | __author__ = "Bob Harris (rsharris@bx.psu.edu)" | 
|---|
| 31 |  | 
|---|
| 32 | import sys | 
|---|
| 33 | import copy | 
|---|
| 34 | import bx.align.axt | 
|---|
| 35 | import bx.align.lav | 
|---|
| 36 |  | 
|---|
| 37 |  | 
|---|
| 38 | def usage(s=None): | 
|---|
| 39 | message = __doc__ | 
|---|
| 40 | if (s == None): sys.exit (message) | 
|---|
| 41 | else:           sys.exit ("%s\n%s" % (s,message)) | 
|---|
| 42 |  | 
|---|
| 43 |  | 
|---|
| 44 | def main(): | 
|---|
| 45 | global debug | 
|---|
| 46 |  | 
|---|
| 47 | # parse the command line | 
|---|
| 48 |  | 
|---|
| 49 | primary   = None | 
|---|
| 50 | secondary = None | 
|---|
| 51 | silent    = False | 
|---|
| 52 |  | 
|---|
| 53 | # pick off options | 
|---|
| 54 |  | 
|---|
| 55 | args = sys.argv[1:] | 
|---|
| 56 | while (len(args) > 0): | 
|---|
| 57 | arg = args.pop(0) | 
|---|
| 58 | val = None | 
|---|
| 59 | fields = arg.split("=",1) | 
|---|
| 60 | if (len(fields) == 2): | 
|---|
| 61 | arg = fields[0] | 
|---|
| 62 | val = fields[1] | 
|---|
| 63 | if (val == ""): | 
|---|
| 64 | usage("missing a value in %s=" % arg) | 
|---|
| 65 |  | 
|---|
| 66 | if (arg == "--silent") and (val == None): | 
|---|
| 67 | silent = True | 
|---|
| 68 | elif (primary == None) and (val == None): | 
|---|
| 69 | primary = arg | 
|---|
| 70 | elif (secondary == None) and (val == None): | 
|---|
| 71 | secondary = arg | 
|---|
| 72 | else: | 
|---|
| 73 | usage("unknown argument: %s" % arg) | 
|---|
| 74 |  | 
|---|
| 75 | if (primary == None): | 
|---|
| 76 | usage("missing primary file name and length") | 
|---|
| 77 |  | 
|---|
| 78 | if (secondary == None): | 
|---|
| 79 | usage("missing secondary file name and length") | 
|---|
| 80 |  | 
|---|
| 81 | try: | 
|---|
| 82 | (primaryFile,primary,primaryLengths) = parse_spec(primary) | 
|---|
| 83 | except: | 
|---|
| 84 | usage("bad primary spec (must be seq_file[:species_name]:lengths_file") | 
|---|
| 85 |  | 
|---|
| 86 | try: | 
|---|
| 87 | (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary) | 
|---|
| 88 | except: | 
|---|
| 89 | usage("bad secondary spec (must be seq_file[:species_name]:lengths_file") | 
|---|
| 90 |  | 
|---|
| 91 | # read the lengths | 
|---|
| 92 |  | 
|---|
| 93 | speciesToLengths = {} | 
|---|
| 94 | speciesToLengths[primary]   = read_lengths (primaryLengths) | 
|---|
| 95 | speciesToLengths[secondary] = read_lengths (secondaryLengths) | 
|---|
| 96 |  | 
|---|
| 97 | # read the alignments | 
|---|
| 98 |  | 
|---|
| 99 | out = bx.align.lav.Writer(sys.stdout, \ | 
|---|
| 100 | attributes = { "name_format_1" : primaryFile, | 
|---|
| 101 | "name_format_2" : secondaryFile }) | 
|---|
| 102 |  | 
|---|
| 103 | axtsRead = 0 | 
|---|
| 104 | axtsWritten = 0 | 
|---|
| 105 | for axtBlock in bx.align.axt.Reader(sys.stdin, \ | 
|---|
| 106 | species_to_lengths = speciesToLengths, | 
|---|
| 107 | species1           = primary, | 
|---|
| 108 | species2           = secondary, | 
|---|
| 109 | support_ids        = True): | 
|---|
| 110 | axtsRead += 1 | 
|---|
| 111 | out.write (axtBlock) | 
|---|
| 112 | axtsWritten += 1 | 
|---|
| 113 |  | 
|---|
| 114 | out.close() | 
|---|
| 115 |  | 
|---|
| 116 | if (not silent): | 
|---|
| 117 | sys.stderr.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten)) | 
|---|
| 118 |  | 
|---|
| 119 | def parse_spec(spec): # returns (seq_file,species_name,lengths_file) | 
|---|
| 120 | fields = spec.split(":") | 
|---|
| 121 | if   (len(fields) == 2): return (fields[0],"",fields[1]) | 
|---|
| 122 | elif (len(fields) == 3): return (fields[0],fields[1],fields[2]) | 
|---|
| 123 | else:                    raise ValueError | 
|---|
| 124 |  | 
|---|
| 125 | def read_lengths (fileName): | 
|---|
| 126 |  | 
|---|
| 127 | chromToLength = {} | 
|---|
| 128 |  | 
|---|
| 129 | f = file (fileName, "r") | 
|---|
| 130 |  | 
|---|
| 131 | for lineNumber,line in enumerate(f): | 
|---|
| 132 | line = line.strip() | 
|---|
| 133 | if (line == ""): continue | 
|---|
| 134 | if (line.startswith("#")): continue | 
|---|
| 135 |  | 
|---|
| 136 | fields = line.split () | 
|---|
| 137 | if (len(fields) != 2): | 
|---|
| 138 | raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) | 
|---|
| 139 |  | 
|---|
| 140 | chrom = fields[0] | 
|---|
| 141 | try: | 
|---|
| 142 | length = int(fields[1]) | 
|---|
| 143 | except: | 
|---|
| 144 | raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) | 
|---|
| 145 |  | 
|---|
| 146 | if (chrom in chromToLength): | 
|---|
| 147 | raise "%s appears more than once (%s:%d): %s" \ | 
|---|
| 148 | % (chrom,fileName,lineNumber) | 
|---|
| 149 |  | 
|---|
| 150 | chromToLength[chrom] = length | 
|---|
| 151 |  | 
|---|
| 152 | f.close () | 
|---|
| 153 |  | 
|---|
| 154 | return chromToLength | 
|---|
| 155 |  | 
|---|
| 156 |  | 
|---|
| 157 | if __name__ == "__main__": main() | 
|---|
| 158 |  | 
|---|