root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/sitemask/quality.py @ 3

リビジョン 3, 5.6 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Support for masking out sites in alignments based on sequence quality. Both
3simple masking of regions below some threshold and masking using the
4neighborhood quality standard (NQS) are supported. Uses sequence quality
5values stored in a `bx.binned_array.FileBinnedArray`.
6"""
7
8from bx.align.sitemask import Masker
9from bx.align import *
10from bx.binned_array import FileBinnedArray
11
12# This class implements simple rules for masking quality, if base <
13# minqual, mask
14class Simple( Masker ):
15    # keys should be:
16    # qualspecies: dictionary of species as key, lengths
17    #              dict by chromosome or chromosome list as value
18    # qualfiles: prefix for quality file for each species in qualspecies
19    # mask: mask character (default is '?')
20    # minqual: minimum quality
21    # cache: optional, but sets the number of megabytes allowed in cache per quality masked species
22    def __init__( self, qualfiles = None, qualspecies = None, minqual = None, mask = "?", cache=100):
23        if not qualfiles:
24            raise Exception("No quality files.")
25        if not qualspecies:
26            raise Exception("No species dictionary.")
27        if not minqual:
28            raise Exception("No minimum quality specified.")
29        self.mask = "?"
30        self.minqual = minqual
31        self.mask = mask
32        self.total = 0
33        self.masked = 0
34       
35        self.qualfiles = qualfiles
36        self.qualspecies = qualspecies
37        self.cache = cache * 2 # typical bin size is 512K
38        # load quality files into FileBinnedArray
39        self.qualities = {}
40        for species, qualfile in self.qualfiles.items():
41            specdict = {}
42            for chrom in self.qualspecies[species]:
43                specdict[chrom] = FileBinnedArray( \
44                    open(qualfile + "." + chrom + ".bqv", "rb"), \
45                    cache = self.cache/len(qualfiles) )
46            self.qualities[species] = specdict
47       
48    def __call__( self, block ):
49        if not block: return
50        for qualspec in self.qualities:
51            comp = block.get_component_by_src_start(qualspec)
52            if not comp: continue
53            chrom = comp.src.split(".")[1]
54            start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
55            # get quality slice, for + strand
56            qual = self.qualities[qualspec][chrom][start:end]
57            x = 0
58            while start+x < end:
59                self.total += 1
60                # got the column in the alignment for this particular base
61                if qual[x] < self.minqual:
62                    col = comp.coord_to_col(start+x)
63                    self.masked += 1
64                    for component in block.components:
65                        if component.text[col] != "-":
66                            component.text = component.text[0:col] + \
67                                             self.mask + \
68                                             component.text[col+1:len(component.text)]
69                # iterate through quality
70                x += 1
71        return block
72   
73class NQS( Masker ):
74    # keys should be:
75    # qualspecies: dictionary of species as key, lengths
76    #              dict by chromosome or chromosome list as value
77    # qualfiles: prefix for quality file for each species in qualspecies
78    # mask: mask character (default is '?')
79    # minqual: minimum quality
80    # neighborqual: neighborhood minimum quality (bases within 5 bps are masked)
81    # cache: optional, but sets the number of megabytes allowed in cache per quality masked species
82    def __init__( self, qualfiles = None, qualspecies = None, minqual = None, mask = "?", cache=100):
83        if not qualfiles:
84            raise Exception("No quality files.")
85        if not qualspecies:
86            raise Exception("No species dictionary.")
87        if not minqual:
88            raise Exception("No minimum quality specified.")
89        self.mask = "?"
90        self.minqual = minqual
91        self.mask = mask
92        self.total = 0
93        self.masked = 0
94       
95        self.qualfiles = qualfiles
96        self.qualspecies = qualspecies
97        self.cache = cache * 2 # typical bin size is 512K
98        # load quality files into FileBinnedArray
99        self.qualities = {}
100        for species, qualfile in self.qualfiles.items():
101            specdict = {}
102            for chrom in self.qualspecies[species]:
103                specdict[chrom] = FileBinnedArray( \
104                    open(qualfile + "." + chrom + ".bqv", "rb"), \
105                    cache = self.cache/len(qualfiles) )
106            self.qualities[species] = specdict
107       
108    def __call__( self, block ):
109        if not block: return
110        for qualspec in self.qualities:
111            comp = block.get_component_by_src_start(qualspec)
112            chrom = comp.src.split(".")[1]
113            start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
114            # get quality slice, for + strand
115            qual = self.qualities[qualspec][chrom][start:end]
116            x = 0
117            while start+x < end:
118                self.total += 1
119                # got the column in the alignment for this particular base
120                if qual[x] < self.minqual:
121                    col = comp.coord_to_col(start+x)
122                    self.masked += 1
123                    for component in block.components:
124                        if component.text[col] != "-":
125                            component.text = component.text[0:col] + \
126                                             self.mask + \
127                                             component.text[col+1:len(component.text)]
128                # iterate through quality
129                x += 1
130        return block
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。