Context Navigation

indel_sam2interval.py @ 2

リビジョン 2, 5.7 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	"""
4	Allows user to filter out non-indels from SAM.
5
6	usage: %prog [options]
7	-i, --input=i: The input SAM file
8	-u, --include_base=u: Whether or not to include the base for insertions
9	-c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown
10	-o, --int_out=o: The interval output file for the converted SAM file
11	-b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file
12	-d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file
13	"""
14
15	import re, sys
16	from galaxy import eggs
17	import pkg_resources; pkg_resources.require( "bx-python" )
18	from bx.cookbook import doc_optparse
19
20
21	def stop_err( msg ):
22	sys.stderr.write( '%s\n' % msg )
23	sys.exit()
24
25	def numeric_sort( text1, text2 ):
26	"""
27	For two items containing space-separated text, compares equivalent pieces
28	numerically if both numeric or as text otherwise
29	"""
30	pieces1 = text1.split()
31	pieces2 = text2.split()
32	if len( pieces1 ) == 0:
33	return 1
34	if len( pieces2 ) == 0:
35	return -1
36	for i, pc1 in enumerate( pieces1 ):
37	if i == len( pieces2 ):
38	return 1
39	if not pieces2[i].isdigit():
40	if pc1.isdigit():
41	return -1
42	else:
43	if pc1 > pieces2[i]:
44	return 1
45	elif pc1 < pieces2[i]:
46	return -1
47	else:
48	if not pc1.isdigit():
49	return 1
50	else:
51	if int( pc1 ) > int( pieces2[i] ):
52	return 1
53	elif int( pc1 ) < int( pieces2[i] ):
54	return -1
55	if i < len( pieces2 ) - 1:
56	return -1
57	return 0
58
59	def __main__():
60	#Parse Command Line
61	options, args = doc_optparse.parse( __doc__ )
62
63	# open up output files
64	output = open( options.int_out, 'wb' )
65	if options.bed_ins_out != 'None':
66	output_bed_ins = open( options.bed_ins_out, 'wb' )
67	else:
68	output_bed_ins = None
69	if options.bed_del_out != 'None':
70	output_bed_del = open( options.bed_del_out, 'wb' )
71	else:
72	output_bed_del = None
73
74	# the pattern to match, assuming just one indel per cigar string
75	pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
76	pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
77
78	# go through all lines in input file
79	out_data = {}
80	multi_indel_lines = 0
81	for line in open( options.input, 'rb' ):
82	if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
83	split_line = line.split( '\t' )
84	if split_line < 12:
85	continue
86	# grab relevant pieces
87	cigar = split_line[5].strip()
88	pos = int( split_line[3] )
89	chr = split_line[2]
90	base_string = split_line[9]
91	# parse cigar string
92	m = pat_indel.match( cigar )
93	if not m:
94	m = pat_multi.match( cigar )
95	# skip this line if no match
96	if not m:
97	continue
98	# account for multiple indels or operations we don't process
99	else:
100	multi_indel_lines += 1
101	continue
102	else:
103	match = m.groupdict()
104	left = int( match[ 'lmatch' ] )
105	middle = int( match[ 'ins_del_width' ] )
106	middle_type = match[ 'ins_del' ]
107	bases = base_string[ left : left + middle ]
108	# calculate start and end positions, and output to insertion or deletion file
109	start = left + pos
110	if middle_type == 'D':
111	end = start + middle
112	data = [ chr, start, end, 'D' ]
113	if options.include_base == "true":
114	data.append( '-' )
115	else:
116	end = start + 1
117	data = [ chr, start, end, 'I' ]
118	if options.include_base == "true":
119	data.append( bases )
120	location = '\t'.join( [ '%s' % d for d in data ] )
121	try:
122	out_data[ location ] += 1
123	except KeyError:
124	out_data[ location ] = 1
125	# output to interval file
126	# get all locations and sort
127	locations = out_data.keys()
128	locations.sort( numeric_sort )
129	last_line = ''
130	# output each location, either with counts or each occurrence
131	for loc in locations:
132	sp_loc = loc.split( '\t' )
133	cur_line = '\t'.join( sp_loc[:3] )
134	if options.collapse == 'true':
135	output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
136	if output_bed_del and sp_loc[3] == 'D':
137	output_bed_del.write( '%s\n' % cur_line )
138	if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
139	output_bed_ins.write( '%s\n' % cur_line )
140	last_line = cur_line
141	else:
142	for i in range( out_data[ loc ] ):
143	output.write( '%s\n' % loc )
144	if output_bed_del or output_bed_ins:
145	if output_bed_del and sp_loc[3] == 'D':
146	output_bed_del.write( '%s\n' % cur_line )
147	if output_bed_ins and sp_loc[3] == 'I':
148	output_bed_ins.write( '%s\n' % cur_line )
149
150	# cleanup, close files
151	if output_bed_ins:
152	output_bed_ins.close()
153	if output_bed_del:
154	output_bed_del.close()
155	output.close()
156
157	# if skipped lines because of more than one indel, output message
158	if multi_indel_lines > 0:
159	sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
160
161	if __name__=="__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/indels/indel_sam2interval.py @ 2

異なるフォーマットでダウンロード: