root/galaxy-central/tools/ncbi_blast_plus/blastxml_to_tabular.py

リビジョン 2, 4.5 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2"""Convert a BLAST XML file to 12 column tabular output
3
4Takes two command line options, input BLAST XML filename and output tabular
5BLAST filename.
6
7The 12 colums output are 'qseqid sseqid pident length mismatch gapopen qstart
8qend sstart send evalue bitscore' which mean:
9   
10 * qseqid   - Query Seq-id
11 * sseqid   - Subject Seq-id
12 * pident   - Percentage of identical matches
13 * length   - Alignment length
14 * mismatch - Number of mismatches
15 * gapopen  - Number of gap openings
16 * qstart   - Start of alignment in query
17 * qend     - End of alignment in query
18 * sstart   - Start of alignment in subject
19 * send     - End of alignment in subject
20 * evalue   - Expect value
21 * bitscore - Bit score
22
23Most of these fields are given explicitly in the XML file, others some like
24the percentage identity and the number of gap openings must be calculated.
25
26This script attempts to produce idential output to what BLAST+ would have done.
27However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
28space character (probably a bug).
29"""
30import sys
31
32#Parse Command Line
33in_file, out_file = sys.argv[1:]
34
35assert sys.version_info[:2] >= ( 2, 4 )
36if sys.version_info[:2] >= ( 2, 5 ):
37    import xml.etree.cElementTree as cElementTree
38else:
39    import cElementTree
40
41def stop_err( msg ):
42    sys.stderr.write("%s\n" % msg)
43    sys.exit(1)
44
45tags = ["Hsp_identity",
46        "Hsp_align-len",
47        "Hsp_gaps",
48        "Hsp_query-from",
49        "Hsp_query-to",
50        "Hsp_hit-from",
51        "Hsp_hit-to",
52        "Hsp_evalue",
53        "Hsp_bit-score"]
54
55# get an iterable
56try:
57    context = cElementTree.iterparse(in_file, events=("start", "end"))
58except:
59    stop_err("Invalid data format.")
60# turn it into an iterator
61context = iter(context)
62# get the root element
63try:
64    event, root = context.next()
65except:
66    stop_err( "Invalid data format." )
67
68outfile = open(out_file, 'w')
69for event, elem in context:
70    # for every <Iteration> tag
71    if event == "end" and elem.tag == "Iteration":
72        qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
73        # for every <Hit> within <Iteration>
74        for hit in elem.findall("Iteration_hits/Hit/"):
75            sseqid = hit.findtext("Hit_id").split(None,1)[0]
76            # for every <Hsp> within <Hit>
77            for hsp in hit.findall("Hit_hsps/Hsp"):
78                identity = hsp.findtext("Hsp_identity")
79                length = hsp.findtext("Hsp_align-len")
80                pident = "%0.2f" % (100*float(identity)/float(length))
81
82                q_seq = hsp.findtext("Hsp_qseq")
83                h_seq = hsp.findtext("Hsp_hseq")
84                assert len(q_seq) == len(h_seq) == int(length)
85                gapopen = str(len(q_seq.replace('-', ' ').split())-1  + \
86                              len(h_seq.replace('-', ' ').split())-1)
87                mismatch = str(len(q_seq) - sum(1 for q,h in zip(q_seq, h_seq) \
88                                                if q == h or q == "-" or h == "-"))
89                assert int(identity) == sum(1 for q,h in zip(q_seq, h_seq) if q == h)
90
91                evalue = hsp.findtext("Hsp_evalue")
92                if evalue == "0":
93                    evalue = "0.0"
94                else:
95                    evalue = "%0.0e" % float(evalue)
96               
97                bitscore = float(hsp.findtext("Hsp_bit-score"))
98                if bitscore < 100:
99                    #Seems to show one decimal place for lower scores
100                    bitscore = "%0.1f" % bitscore
101                else:
102                    #Note BLAST does not round to nearest int, it truncates
103                    bitscore = "%i" % bitscore
104
105                values = [qseqid,
106                          sseqid,
107                          pident,
108                          length, #hsp.findtext("Hsp_align-len")
109                          mismatch,
110                          gapopen,
111                          hsp.findtext("Hsp_query-from"), #qstart,
112                          hsp.findtext("Hsp_query-to"), #qend,
113                          hsp.findtext("Hsp_hit-from"), #sstart,
114                          hsp.findtext("Hsp_hit-to"), #send,
115                          evalue, #hsp.findtext("Hsp_evalue") in scientific notation
116                          bitscore, #hsp.findtext("Hsp_bit-score") rounded
117                          ]
118                #print "\t".join(values)
119                outfile.write("\t".join(values) + "\n")
120        # prevents ElementTree from growing large datastructure
121        root.clear()
122        elem.clear()
123outfile.close()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。