Context Navigation

indel_sam2interval.py @ 2

リビジョン 2, 5.7 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2
	3	"""
	4	Allows user to filter out non-indels from SAM.
	5
	6	usage: %prog [options]
	7	-i, --input=i: The input SAM file
	8	-u, --include_base=u: Whether or not to include the base for insertions
	9	-c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown
	10	-o, --int_out=o: The interval output file for the converted SAM file
	11	-b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file
	12	-d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file
	13	"""
	14
	15	import re, sys
	16	from galaxy import eggs
	17	import pkg_resources; pkg_resources.require( "bx-python" )
	18	from bx.cookbook import doc_optparse
	19
	20
	21	def stop_err( msg ):
	22	sys.stderr.write( '%s\n' % msg )
	23	sys.exit()
	24
	25	def numeric_sort( text1, text2 ):
	26	"""
	27	For two items containing space-separated text, compares equivalent pieces
	28	numerically if both numeric or as text otherwise
	29	"""
	30	pieces1 = text1.split()
	31	pieces2 = text2.split()
	32	if len( pieces1 ) == 0:
	33	return 1
	34	if len( pieces2 ) == 0:
	35	return -1
	36	for i, pc1 in enumerate( pieces1 ):
	37	if i == len( pieces2 ):
	38	return 1
	39	if not pieces2[i].isdigit():
	40	if pc1.isdigit():
	41	return -1
	42	else:
	43	if pc1 > pieces2[i]:
	44	return 1
	45	elif pc1 < pieces2[i]:
	46	return -1
	47	else:
	48	if not pc1.isdigit():
	49	return 1
	50	else:
	51	if int( pc1 ) > int( pieces2[i] ):
	52	return 1
	53	elif int( pc1 ) < int( pieces2[i] ):
	54	return -1
	55	if i < len( pieces2 ) - 1:
	56	return -1
	57	return 0
	58
	59	def __main__():
	60	#Parse Command Line
	61	options, args = doc_optparse.parse( __doc__ )
	62
	63	# open up output files
	64	output = open( options.int_out, 'wb' )
	65	if options.bed_ins_out != 'None':
	66	output_bed_ins = open( options.bed_ins_out, 'wb' )
	67	else:
	68	output_bed_ins = None
	69	if options.bed_del_out != 'None':
	70	output_bed_del = open( options.bed_del_out, 'wb' )
	71	else:
	72	output_bed_del = None
	73
	74	# the pattern to match, assuming just one indel per cigar string
	75	pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
	76	pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
	77
	78	# go through all lines in input file
	79	out_data = {}
	80	multi_indel_lines = 0
	81	for line in open( options.input, 'rb' ):
	82	if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
	83	split_line = line.split( '\t' )
	84	if split_line < 12:
	85	continue
	86	# grab relevant pieces
	87	cigar = split_line[5].strip()
	88	pos = int( split_line[3] )
	89	chr = split_line[2]
	90	base_string = split_line[9]
	91	# parse cigar string
	92	m = pat_indel.match( cigar )
	93	if not m:
	94	m = pat_multi.match( cigar )
	95	# skip this line if no match
	96	if not m:
	97	continue
	98	# account for multiple indels or operations we don't process
	99	else:
	100	multi_indel_lines += 1
	101	continue
	102	else:
	103	match = m.groupdict()
	104	left = int( match[ 'lmatch' ] )
	105	middle = int( match[ 'ins_del_width' ] )
	106	middle_type = match[ 'ins_del' ]
	107	bases = base_string[ left : left + middle ]
	108	# calculate start and end positions, and output to insertion or deletion file
	109	start = left + pos
	110	if middle_type == 'D':
	111	end = start + middle
	112	data = [ chr, start, end, 'D' ]
	113	if options.include_base == "true":
	114	data.append( '-' )
	115	else:
	116	end = start + 1
	117	data = [ chr, start, end, 'I' ]
	118	if options.include_base == "true":
	119	data.append( bases )
	120	location = '\t'.join( [ '%s' % d for d in data ] )
	121	try:
	122	out_data[ location ] += 1
	123	except KeyError:
	124	out_data[ location ] = 1
	125	# output to interval file
	126	# get all locations and sort
	127	locations = out_data.keys()
	128	locations.sort( numeric_sort )
	129	last_line = ''
	130	# output each location, either with counts or each occurrence
	131	for loc in locations:
	132	sp_loc = loc.split( '\t' )
	133	cur_line = '\t'.join( sp_loc[:3] )
	134	if options.collapse == 'true':
	135	output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
	136	if output_bed_del and sp_loc[3] == 'D':
	137	output_bed_del.write( '%s\n' % cur_line )
	138	if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
	139	output_bed_ins.write( '%s\n' % cur_line )
	140	last_line = cur_line
	141	else:
	142	for i in range( out_data[ loc ] ):
	143	output.write( '%s\n' % loc )
	144	if output_bed_del or output_bed_ins:
	145	if output_bed_del and sp_loc[3] == 'D':
	146	output_bed_del.write( '%s\n' % cur_line )
	147	if output_bed_ins and sp_loc[3] == 'I':
	148	output_bed_ins.write( '%s\n' % cur_line )
	149
	150	# cleanup, close files
	151	if output_bed_ins:
	152	output_bed_ins.close()
	153	if output_bed_del:
	154	output_bed_del.close()
	155	output.close()
	156
	157	# if skipped lines because of more than one indel, output message
	158	if multi_indel_lines > 0:
	159	sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
	160
	161	if __name__=="__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/indels/indel_sam2interval.py @ 2

異なるフォーマットでダウンロード: