Context Navigation

quality_filter.py

リビジョン 2, 9.0 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	#Guruprasad Ananda
3	"""
4	Filter based on nucleotide quality (PHRED score).
5
6	usage: %prog input out_file primary_species mask_species score mask_char mask_region mask_region_length
7	"""
8
9
10	from __future__ import division
11	from galaxy import eggs
12	import pkg_resources
13	pkg_resources.require( "bx-python" )
14	pkg_resources.require( "lrucache" )
15	try:
16	pkg_resources.require("numpy")
17	except:
18	pass
19
20	import psyco_full
21	import sys
22	import os, os.path
23	from UserDict import DictMixin
24	from bx.binned_array import BinnedArray, FileBinnedArray
25	from bx.bitset import *
26	from bx.bitset_builders import *
27	from fpconst import isNaN
28	from bx.cookbook import doc_optparse
29	from galaxy.tools.exception_handling import *
30	import bx.align.maf
31
class FileBinnedArrayDir( DictMixin ):
    """
    Adapter that makes a directory of FileBinnedArray files look like
    a regular dict of BinnedArray objects.

    A key maps to the file "<dir>/<key>.qa.bqv". Every file is opened at
    most once; the resulting FileBinnedArray is kept in `self.cache` and
    returned again on later lookups of the same key.

    Only `__getitem__` is defined here, so only lookup-style access is
    supported.
    """
    def __init__( self, dir ):
        # Directory that holds the "<key>.qa.bqv" files.
        self.dir = dir
        # key -> FileBinnedArray that has already been opened.
        self.cache = dict()

    def __getitem__( self, key ):
        """
        Return the FileBinnedArray for `key`, opening its file on first use.

        Raises KeyError when "<dir>/<key>.qa.bqv" does not exist.
        """
        try:
            return self.cache[key]
        except KeyError:
            pass
        fname = os.path.join( self.dir, "%s.qa.bqv" % key )
        if not os.path.exists( fname ):
            raise KeyError( "File does not exist: " + fname )
        # The score files hold binary data, so open them in binary mode; text
        # mode can corrupt reads on platforms that translate line endings.
        # NOTE(review): the handle is deliberately left open - presumably
        # FileBinnedArray reads from it lazily; confirm against bx-python.
        value = FileBinnedArray( open( fname, "rb" ) )
        self.cache[key] = value
        return value
52
def stop_err(msg):
    """
    Report a fatal error and terminate the program.

    `msg` is written to stderr unchanged (no newline is appended), then
    SystemExit is raised with status 1 so that the failure is also visible
    through the process exit code, not only through stderr.
    """
    sys.stderr.write(msg)
    sys.exit(1)
56
def load_scores_ba_dir( dir ):
    """
    Wrap the score directory `dir` in a FileBinnedArrayDir.

    The returned dict-like object is indexed by key (main() uses chromosome
    names) and hands back the FileBinnedArray read from "<key>.qa.bqv"
    inside `dir`.
    """
    score_lookup = FileBinnedArrayDir( dir )
    return score_lookup
63
def bitwise_and ( string1, string2, maskch ):
    """
    Filter `string2` through the per-position flags in `string1`.

    For every position i of `string1`:
      * the flag is `string1[i]` converted to int when possible; a character
        that is not a number is used as-is (and is therefore true),
      * a '-' in `string2[i]` forces the flag on, so gaps are never masked,
      * a true flag copies `string2[i]` to the result, a false flag writes
        `maskch` instead.

    Returns a string with one output item per position of `string1`.

    Raises IndexError when `string2` is shorter than `string1`; main()
    relies on that to stop combining status strings of unequal length.
    """
    result = []
    for i, flag in enumerate(string1):
        try:
            flag = int(flag)
        except (TypeError, ValueError):
            # Not a number: keep the raw value and let its truthiness decide.
            pass
        symbol = string2[i]
        if symbol == '-':
            flag = 1
        if flag and symbol:
            result.append(symbol)
        else:
            result.append(maskch)
    return ''.join(result)
78
def _parse_mask_window( mask_region, mask_length ):
    """
    Turn the mask_region code and the raw mask_length argument into
    (left, right): how many alignment columns before and after a low
    quality position are masked together with it.

    mask_length is a single integer, except for region 3 where it is
    "right,left". It is always parsed, even for region 0 where the value
    is not used. Raises ValueError / IndexError on malformed input.
    """
    if mask_region == 3:
        lengths = mask_length.split( ',' )
        right = int( lengths[0] )
        left = int( lengths[1] )
        return left, right
    length = int( mask_length )
    if mask_region == 1:
        return 0, length
    if mask_region == 2:
        return length, 0
    return 0, 0


def _find_score_dirs( loc_file, wanted_builds ):
    """
    Look up the score directory of every wanted genome build in loc_file.

    loc_file is tab separated: build name in the first column, directory in
    the second; lines starting with '#' and lines without a second column
    are skipped. Returns (found_builds, directories, missing_builds), the
    first two lists being parallel.
    """
    remaining = list( wanted_builds )
    found = []
    directories = []
    handle = open( loc_file )
    try:
        for line in handle:
            if not remaining:
                break
            if line.startswith( '#' ):
                continue
            fields = line.split( '\t' )
            if len( fields ) < 2:
                continue
            build = fields[0]
            if build in remaining:
                found.append( build )
                directories.append( fields[1].strip() )
                remaining.remove( build )
    finally:
        handle.close()
    return found, directories, remaining


def _quality_status( sequence, scores, n_bases, qual_cutoff, mask_left, mask_right ):
    """
    Build the status string of one aligned sequence: one character per
    alignment column, '1' to keep it and '0' to mask it.

    sequence is the aligned text including '-' gaps, scores holds one score
    per ungapped base and n_bases is the number of ungapped bases. A base
    is low quality when its score is below qual_cutoff or is zero; the
    column of such a base plus mask_left columns before and mask_right
    columns after it are marked '0'.
    """
    size = len( sequence )
    status = ['1'] * size
    col = 0   # index into the alignment columns
    pos = 0   # index into the ungapped bases / scores
    while pos < n_bases and col < size:
        if sequence[col] == '-':
            # Gaps carry no score.
            col += 1
            continue
        score = scores[pos]
        if score < qual_cutoff or not score:
            # Clamp the window to the sequence: a negative index would wrap
            # around and mask columns at the far end of the alignment.
            first = max( 0, col - mask_left )
            last = min( size, col + mask_right + 1 )
            for masked in range( first, last ):
                status[masked] = '0'
            # Columns masked downstream are not examined again.
            # NOTE(review): both indexes skip by the same amount, so gaps
            # inside the skipped columns look like they shift the scores
            # against the bases; kept as in the original - confirm intent.
            col += mask_right + 1
            pos += mask_right + 1
        else:
            col += 1
            pos += 1
    return ''.join( status )


def main():
    """
    Mask low quality nucleotides in a MAF file.

    Positional arguments: input MAF, output MAF, comma separated primary
    species, comma separated species to mask, quality cutoff, mask
    character code, mask region code, mask region length and the loc file
    that maps genome builds to quality score directories.

    For every alignment block the quality scores of the primary species
    present in it are checked; a column is masked when any of them is below
    the cutoff there. The masked columns are then replaced by the mask
    character in every component that belongs to a mask species. Blocks
    without any primary species are not written to the output. A summary
    line is printed at the end.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse( __doc__ )

    mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'}
    mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'}

    try:
        inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args
        qual_cutoff = int( qual_cutoff )
        mask_chr = int( mask_chr )
        mask_region = int( mask_region )
        mask_left, mask_right = _parse_mask_window( mask_region, mask_length )
    except ( IndexError, TypeError, ValueError ):
        stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." )

    if pri_species == 'None':
        stop_err( "No primary species selected, try again by selecting at least one primary species." )
    if mask_species == 'None':
        stop_err( "No mask species selected, try again by selecting at least one species to mask." )
    # Reject codes the tables above do not know: an unknown region would
    # never advance past a low quality base, an unknown character code
    # would only fail once the first block is masked.
    if mask_chr not in mask_chr_dict:
        stop_err( "Unknown mask character code: %s" % mask_chr )
    if mask_region not in mask_reg_dict:
        stop_err( "Unknown mask region code: %s" % mask_region )
    if mask_left < 0 or mask_right < 0:
        stop_err( "The mask region length must not be negative." )

    mask_char = mask_chr_dict[mask_chr]
    mask_builds = set( mask_species.split( ',' ) )

    #ensure dbkey is present in the twobit loc file
    try:
        pspecies, filepaths, missing = _find_score_dirs( loc_file, pri_species.split( ',' ) )
    except EnvironmentError:
        # sys.exc_info() instead of binding the exception keeps this line
        # valid on every Python version.
        stop_err( 'Initialization error: %s' % str( sys.exc_info()[1] ) )

    if len( pspecies ) == 0:
        stop_err( "Quality scores are not available for the following genome builds: %s" % ( missing, ) )
    elif missing:
        print( "Quality scores are not available for the following genome builds: %s" % ( missing, ) )

    #Get scores for all the primary species
    scores_by_build = {}
    for build, score_dir in zip( pspecies, filepaths ):
        if build not in scores_by_build:
            scores_by_build[build] = load_scores_ba_dir( score_dir )

    try:
        maf_reader = bx.align.maf.Reader( open( inp_file, 'r' ) )
        maf_writer = bx.align.maf.Writer( open( out_file, 'w' ) )
    except Exception:
        stop_err( "Your MAF file appears to be malformed: %s" % str( sys.exc_info()[1] ) )

    mask_chr_count = 0
    maf_count = 0
    for block in maf_reader:
        status_strings = []
        for component in block.components:
            src_fields = component.src.split( '.' )
            build = src_fields[0]
            if build not in scores_by_build:
                continue
            #enter if the species is a primary species
            if not status_strings:
                # Seed with "keep everything" so a primary species without
                # usable scores still lets the block through unmasked.
                status_strings.append( '1' * len( component.text ) )
            #Get scores for the entire sequence
            try:
                chrom = src_fields[1]
                scores = scores_by_build[build][chrom][component.start:component.end]
            except Exception:
                # No chromosome in the source name or no scores for it:
                # this component contributes nothing to the mask.
                continue
            status_strings.append( _quality_status( component.text, scores,
                                                    component.end - component.start,
                                                    qual_cutoff, mask_left, mask_right ) )

        if not status_strings: #this block has no primary species
            continue

        # A column stays unmasked only if every primary species passed there.
        output_status_str = status_strings[0]
        for stat in status_strings[1:]:
            try:
                output_status_str = bitwise_and( output_status_str, stat, '0' )
            except IndexError:
                # Status strings of unequal length; stop combining.
                break

        for component in block.components:
            if component.src.split( '.' )[0] not in mask_builds:
                continue
            component.text = bitwise_and( output_status_str, component.text, mask_char )
            mask_chr_count += output_status_str.count( '0' )
        maf_writer.write( block )
        maf_count += 1

    maf_reader.close()
    maf_writer.close()
    print( "No. of blocks = %d; No. of masked nucleotides = %s; Mask character = %s; Mask region = %s; Cutoff used = %d" % ( maf_count, mask_chr_count, mask_char, mask_reg_dict[mask_region], qual_cutoff ) )


if __name__ == "__main__":
    main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/regVariation/quality_filter.py

異なるフォーマットでダウンロード: