Context Navigation

quality_filter.py @ 2

リビジョン 2, 9.0 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2	#Guruprasad Ananda
	3	"""
	4	Filter based on nucleotide quality (PHRED score).
	5
	6	usage: %prog input out_file primary_species mask_species score mask_char mask_region mask_region_length
	7	"""
	8
	9
	10	from __future__ import division
	11	from galaxy import eggs
	12	import pkg_resources
	13	pkg_resources.require( "bx-python" )
	14	pkg_resources.require( "lrucache" )
	15	try:
	16	pkg_resources.require("numpy")
	17	except:
	18	pass
	19
	20	import psyco_full
	21	import sys
	22	import os, os.path
	23	from UserDict import DictMixin
	24	from bx.binned_array import BinnedArray, FileBinnedArray
	25	from bx.bitset import *
	26	from bx.bitset_builders import *
	27	from fpconst import isNaN
	28	from bx.cookbook import doc_optparse
	29	from galaxy.tools.exception_handling import *
	30	import bx.align.maf
	31
	32	class FileBinnedArrayDir( DictMixin ):
	33	"""
	34	Adapter that makes a directory of FileBinnedArray files look like
	35	a regular dict of BinnedArray objects.
	36	"""
	37	def __init__( self, dir ):
	38	self.dir = dir
	39	self.cache = dict()
	40	def __getitem__( self, key ):
	41	value = None
	42	if key in self.cache:
	43	value = self.cache[key]
	44	else:
	45	fname = os.path.join( self.dir, "%s.qa.bqv" % key )
	46	if os.path.exists( fname ):
	47	value = FileBinnedArray( open( fname ) )
	48	self.cache[key] = value
	49	if value is None:
	50	raise KeyError( "File does not exist: " + fname )
	51	return value
	52
	53	def stop_err(msg):
	54	sys.stderr.write(msg)
	55	sys.exit()
	56
	57	def load_scores_ba_dir( dir ):
	58	"""
	59	Return a dict-like object (keyed by chromosome) that returns
	60	FileBinnedArray objects created from "key.ba" files in `dir`
	61	"""
	62	return FileBinnedArrayDir( dir )
	63
	64	def bitwise_and ( string1, string2, maskch ):
	65	result=[]
	66	for i,ch in enumerate(string1):
	67	try:
	68	ch = int(ch)
	69	except:
	70	pass
	71	if string2[i] == '-':
	72	ch = 1
	73	if ch and string2[i]:
	74	result.append(string2[i])
	75	else:
	76	result.append(maskch)
	77	return ''.join(result)
	78
	79	def main():
	80	# Parsing Command Line here
	81	options, args = doc_optparse.parse( __doc__ )
	82
	83	try:
	84	#chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols )
	85	inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args
	86	qual_cutoff = int(qual_cutoff)
	87	mask_chr = int(mask_chr)
	88	mask_region = int(mask_region)
	89	if mask_region != 3:
	90	mask_length = int(mask_length)
	91	else:
	92	mask_length_r = int(mask_length.split(',')[0])
	93	mask_length_l = int(mask_length.split(',')[1])
	94	except:
	95	stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." )
	96
	97	if pri_species == 'None':
	98	stop_err( "No primary species selected, try again by selecting at least one primary species." )
	99	if mask_species == 'None':
	100	stop_err( "No mask species selected, try again by selecting at least one species to mask." )
	101
	102	mask_chr_count = 0
	103	mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'}
	104	mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'}
	105
	106	#ensure dbkey is present in the twobit loc file
	107	filepath = None
	108	try:
	109	pspecies_all = pri_species.split(',')
	110	pspecies_all2 = pri_species.split(',')
	111	pspecies = []
	112	filepaths = []
	113	for line in open(loc_file):
	114	if pspecies_all2 == []:
	115	break
	116	if line[0:1] == "#":
	117	continue
	118	fields = line.split('\t')
	119	try:
	120	build = fields[0]
	121	for i,dbkey in enumerate(pspecies_all2):
	122	if dbkey == build:
	123	pspecies.append(build)
	124	filepaths.append(fields[1])
	125	del pspecies_all2[i]
	126	else:
	127	continue
	128	except:
	129	pass
	130	except Exception, exc:
	131	stop_err( 'Initialization errorL %s' % str( exc ) )
	132
	133	if len(pspecies) == 0:
	134	stop_err( "Quality scores are not available for the following genome builds: %s" % ( pspecies_all2 ) )
	135	if len(pspecies) < len(pspecies_all):
	136	print "Quality scores are not available for the following genome builds: %s" %(pspecies_all2)
	137
	138	scores_by_chrom = []
	139	#Get scores for all the primary species
	140	for file in filepaths:
	141	scores_by_chrom.append(load_scores_ba_dir( file.strip() ))
	142
	143	try:
	144	maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
	145	maf_writer = bx.align.maf.Writer( open(out_file,'w') )
	146	except Exception, e:
	147	stop_err( "Your MAF file appears to be malformed: %s" % str( e ) )
	148
	149	maf_count = 0
	150	for block in maf_reader:
	151	status_strings = []
	152	for seq in range (len(block.components)):
	153	src = block.components[seq].src
	154	dbkey = src.split('.')[0]
	155	chr = src.split('.')[1]
	156	if not (dbkey in pspecies):
	157	continue
	158	else: #enter if the species is a primary species
	159	index = pspecies.index(dbkey)
	160	sequence = block.components[seq].text
	161	s_start = block.components[seq].start
	162	size = len(sequence) #this includes the gaps too
	163	status_str = '1'*size
	164	status_list = list(status_str)
	165	if status_strings == []:
	166	status_strings.append(status_str)
	167	ind = 0
	168	s_end = block.components[seq].end
	169	#Get scores for the entire sequence
	170	try:
	171	scores = scores_by_chrom[index][chr][s_start:s_end]
	172	except:
	173	continue
	174	pos = 0
	175	while pos < (s_end-s_start):
	176	if sequence[ind] == '-': #No score for GAPS
	177	ind += 1
	178	continue
	179	score = scores[pos]
	180	if score < qual_cutoff:
	181	score = 0
	182
	183	if not(score):
	184	if mask_region == 0: #Mask Corresponding position only
	185	status_list[ind] = '0'
	186	ind += 1
	187	pos += 1
	188	elif mask_region == 1: #Mask Corresponding position + downstream neighbors
	189	for n in range(mask_length+1):
	190	try:
	191	status_list[ind+n] = '0'
	192	except:
	193	pass
	194	ind = ind + mask_length + 1
	195	pos = pos + mask_length + 1
	196	elif mask_region == 2: #Mask Corresponding position + upstream neighbors
	197	for n in range(mask_length+1):
	198	try:
	199	status_list[ind-n] = '0'
	200	except:
	201	pass
	202	ind += 1
	203	pos += 1
	204	elif mask_region == 3: #Mask Corresponding position + neighbors on both sides
	205	for n in range(-mask_length_l,mask_length_r+1):
	206	try:
	207	status_list[ind+n] = '0'
	208	except:
	209	pass
	210	ind = ind + mask_length_r + 1
	211	pos = pos + mask_length_r + 1
	212	else:
	213	pos += 1
	214	ind += 1
	215
	216	status_strings.append(''.join(status_list))
	217
	218	if status_strings == []: #this block has no primary species
	219	continue
	220	output_status_str = status_strings[0]
	221	for stat in status_strings[1:]:
	222	try:
	223	output_status_str = bitwise_and (status_strings[0], stat, '0')
	224	except Exception, e:
	225	break
	226
	227	for seq in range (len(block.components)):
	228	src = block.components[seq].src
	229	dbkey = src.split('.')[0]
	230	if dbkey not in mask_species.split(','):
	231	continue
	232	sequence = block.components[seq].text
	233	sequence = bitwise_and (output_status_str, sequence, mask_chr_dict[mask_chr])
	234	block.components[seq].text = sequence
	235	mask_chr_count += output_status_str.count('0')
	236	maf_writer.write(block)
	237	maf_count += 1
	238
	239	maf_reader.close()
	240	maf_writer.close()
	241	print "No. of blocks = %d; No. of masked nucleotides = %s; Mask character = %s; Mask region = %s; Cutoff used = %d" %(maf_count, mask_chr_count, mask_chr_dict[mask_chr], mask_reg_dict[mask_region], qual_cutoff)
	242
	243
# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/regVariation/quality_filter.py @ 2

異なるフォーマットでダウンロード: