root/galaxy-central/tools/filters/axt_to_lav.py @ 2

リビジョン 2, 5.2 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2"""
3Application to convert AXT file to LAV file
4-------------------------------------------
5
6:Author: Bob Harris (rsharris@bx.psu.edu)
7:Version: $Revision: $
8
9The application reads an AXT file from standard input and writes a LAV file to
10standard out;  some statistics are written to standard error.
11"""
12
13import sys, copy
14from galaxy import eggs
15import pkg_resources
16pkg_resources.require( "bx-python" )
17import bx.align.axt
18import bx.align.lav
19
20assert sys.version_info[:2] >= ( 2, 4 )
21
22def usage(s=None):
23    message = """
24axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
25  Each spec is of the form seq_file[:species_name]:lengths_file.
26
27  seq_file should be a format string for the file names for the individual
28  sequences, with %s to be replaced by the alignment's src field.  For example,
29  "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
30  etc.
31
32  species_name is optional.  If present, it is prepended to the alignment's src
33  field.
34
35  Lengths files provide the length of each chromosome (lav format needs this
36  information but axt file does not contain it).  The format is a series of
37  lines of the form
38    <chromosome name> <length>
39  The chromosome field in each axt block must match some <chromosome name> in
40  the lengths file.
41"""
42    if (s == None): sys.exit (message)
43    else:           sys.exit ("%s\n%s" % (s,message))
44
45
46def main():
47    global debug
48
49    # parse the command line
50
51    primary   = None
52    secondary = None
53    silent    = False
54
55    # pick off options
56
57    args = sys.argv[1:]
58    seq_file2 = open(args.pop(-1),'w')
59    seq_file1 = open(args.pop(-1),'w')
60    lav_out = args.pop(-1)
61    axt_in = args.pop(-1)
62    while (len(args) > 0):
63        arg = args.pop(0)
64        val = None
65        fields = arg.split("=",1)
66        if (len(fields) == 2):
67            arg = fields[0]
68            val = fields[1]
69            if (val == ""):
70                usage("missing a value in %s=" % arg)
71
72        if (arg == "--silent") and (val == None):
73            silent = True
74        elif (primary == None) and (val == None):
75            primary = arg
76        elif (secondary == None) and (val == None):
77            secondary = arg
78        else:
79            usage("unknown argument: %s" % arg)
80
81    if (primary == None):
82        usage("missing primary file name and length")
83
84    if (secondary == None):
85        usage("missing secondary file name and length")
86
87    try:
88        (primaryFile,primary,primaryLengths) = parse_spec(primary)
89    except:
90        usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
91
92    try:
93        (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
94    except:
95        usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
96
97    # read the lengths
98
99    speciesToLengths = {}
100    speciesToLengths[primary]   = read_lengths (primaryLengths)
101    speciesToLengths[secondary] = read_lengths (secondaryLengths)
102
103    # read the alignments
104
105    out = bx.align.lav.Writer(open(lav_out,'w'), \
106            attributes = { "name_format_1" : primaryFile,
107                           "name_format_2" : secondaryFile })
108
109    axtsRead = 0
110    axtsWritten = 0
111    for axtBlock in bx.align.axt.Reader(open(axt_in), \
112            species_to_lengths = speciesToLengths,
113            species1           = primary,
114            species2           = secondary,
115            support_ids        = True):
116        axtsRead += 1
117        out.write (axtBlock)
118        primary_c = axtBlock.get_component_by_src_start(primary)
119        secondary_c = axtBlock.get_component_by_src_start(secondary)
120       
121        print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
122        print >>seq_file1,primary_c.text
123        print >>seq_file1
124       
125        print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
126        print >>seq_file2,secondary_c.text
127        print >>seq_file2
128        axtsWritten += 1
129
130    out.close()
131    seq_file1.close()
132    seq_file2.close()
133
134    if (not silent):
135        sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))
136
137def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
138    fields = spec.split(":")
139    if   (len(fields) == 2): return (fields[0],"",fields[1])
140    elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
141    else:                    raise ValueError
142
143def read_lengths (fileName):
144
145    chromToLength = {}
146
147    f = file (fileName, "r")
148
149    for lineNumber,line in enumerate(f):
150        line = line.strip()
151        if (line == ""): continue
152        if (line.startswith("#")): continue
153
154        fields = line.split ()
155        if (len(fields) != 2):
156            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
157
158        chrom = fields[0]
159        try:
160            length = int(fields[1])
161        except:
162            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
163
164        if (chrom in chromToLength):
165            raise "%s appears more than once (%s:%d): %s" \
166                % (chrom,fileName,lineNumber)
167
168        chromToLength[chrom] = length
169
170    f.close ()
171
172    return chromToLength
173
174
175if __name__ == "__main__": main()
176
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。