Context Navigation

microsats_alignment_level.py @ 2

リビジョン 2, 18.3 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2	#Guruprasad Ananda
	3	"""
	4	Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output.
	5	"""
	6	from galaxy import eggs
	7	import sys, os, tempfile, string, math, re
	8
	9	def reverse_complement(text):
	10	DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
	11	comp = [ch for ch in text.translate(DNA_COMP)]
	12	comp.reverse()
	13	return "".join(comp)
	14
	15	def main():
	16	if len(sys.argv) != 8:
	17	print >>sys.stderr, "Insufficient number of arguments."
	18	sys.exit()
	19
	20	infile = open(sys.argv[1],'r')
	21	separation = int(sys.argv[2])
	22	outfile = sys.argv[3]
	23	align_type = sys.argv[4]
	24	if align_type == "2way":
	25	align_type_len = 2
	26	elif align_type == "3way":
	27	align_type_len = 3
	28	mono_threshold = int(sys.argv[5])
	29	non_mono_threshold = int(sys.argv[6])
	30	allow_different_units = int(sys.argv[7])
	31
	32	print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" %(separation, mono_threshold, non_mono_threshold, allow_different_units==1)
	33	try:
	34	fout = open(outfile, "w")
	35	print >>fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit"
	36	#sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik")
	37	sputnik_cmd = "sputnik"
	38	input = infile.read()
	39	skipped = 0
	40	block_num = 0
	41	input = input.replace('\r','\n')
	42	for block in input.split('\n\n'):
	43	block_num += 1
	44	tmpin = tempfile.NamedTemporaryFile()
	45	tmpout = tempfile.NamedTemporaryFile()
	46	tmpin.write(block.strip())
	47	blk = tmpin.read()
	48	cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name
	49	try:
	50	os.system(cmdline)
	51	except Exception, es:
	52	continue
	53	sputnik_out = tmpout.read()
	54	tmpin.close()
	55	tmpout.close()
	56	if sputnik_out != "":
	57	if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')):
	58	skipped += 1
	59	continue
	60	align_block = block.strip().split('>')
	61
	62	lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6}
	63	blockdict={}
	64	r=0
	65	namelist=[]
	66	for k,sput_block in enumerate(sputnik_out.split('>')[1:]):
	67	whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip()
	68	p = re.compile('\n(\S*nucleotide)')
	69	repeats = p.split(sput_block.strip())
	70	repeats_count = len(repeats)
	71	j = 1
	72	name = repeats[0].strip()
	73	try:
	74	coords = re.search('\d+[-_:]\d+',name).group()
	75	coords = coords.replace('_','-').replace(':','-')
	76	except Exception, e:
	77	coords = '0-0'
	78	pass
	79	r += 1
	80	blockdict[r]={}
	81	try:
	82	sp_name = name[:name.index('.')]
	83	chr_name = name[name.index('.'):name.index('(')]
	84	namelist.append(sp_name + chr_name)
	85	except:
	86	namelist.append(name[:20])
	87	while j < repeats_count:
	88	try:
	89	if repeats[j].strip() not in lendict:
	90	j += 2
	91	continue
	92
	93	if blockdict[r].has_key('types'):
	94	blockdict[r]['types'].append(repeats[j].strip()) #type of microsat
	95	else:
	96	blockdict[r]['types'] = [repeats[j].strip()] #type of microsat
	97
	98	sequence = ''.join(align_block[r].split('\n')[1:]).replace('\n','').strip()
	99	start = int(repeats[j+1].split('--')[0].split(':')[0].strip())
	100	#check to see if there are gaps before the start of the repeat, and change the start accordingly
	101	sgaps = 0
	102	ch_pos = start - 1
	103	while ch_pos >= 0:
	104	if whole_seq[ch_pos] == '-':
	105	sgaps += 1
	106	else:
	107	break #break at the 1st non-gap character
	108	ch_pos -= 1
	109	if blockdict[r].has_key('starts'):
	110	blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS
	111	else:
	112	blockdict[r]['starts'] = [start+sgaps]
	113
	114	end = int(repeats[j+1].split('--')[0].split(':')[1].strip())
	115	#check to see if there are gaps after the end of the repeat, and change the end accordingly
	116	egaps = 0
	117	for ch in whole_seq[end:]:
	118	if ch == '-':
	119	egaps += 1
	120	else:
	121	break #break at the 1st non-gap character
	122	if blockdict[r].has_key('ends'):
	123	blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS
	124	else:
	125	blockdict[r]['ends'] = [end+egaps]
	126
	127	repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence
	128	repeat_len = repeats[j+1].split('--')[1].split()[1].strip()
	129	gap_count = repeat_seq.count('-')
	130	#print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count
	131	repeat_len = str(int(repeat_len) - gap_count)
	132
	133	rel_start = blockdict[r]['starts'][-1]
	134	gaps_before_start = whole_seq[:rel_start].count('-')
	135
	136	if blockdict[r].has_key('gaps_before_start'):
	137	blockdict[r]['gaps_before_start'].append(gaps_before_start) #lengths
	138	else:
	139	blockdict[r]['gaps_before_start'] = [gaps_before_start] #lengths
	140
	141	whole_seq_start= int(coords.split('-')[0])
	142	if blockdict[r].has_key('whole_seq_start'):
	143	blockdict[r]['whole_seq_start'].append(whole_seq_start) #lengths
	144	else:
	145	blockdict[r]['whole_seq_start'] = [whole_seq_start] #lengths
	146
	147	if blockdict[r].has_key('lengths'):
	148	blockdict[r]['lengths'].append(repeat_len) #lengths
	149	else:
	150	blockdict[r]['lengths'] = [repeat_len] #lengths
	151
	152	if blockdict[r].has_key('counts'):
	153	blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #Repeat Unit
	154	else:
	155	blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #Repeat Unit
	156
	157	if blockdict[r].has_key('units'):
	158	blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit
	159	else:
	160	blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit
	161
	162	except Exception, eh:
	163	pass
	164	j+=2
	165	#check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'.
	166	delete_index_list = []
	167	for ind, item in enumerate(blockdict[r]['ends']):
	168	try:
	169	if blockdict[r]['starts'][ind+1]-item < separation:
	170	if ind not in delete_index_list:
	171	delete_index_list.append(ind)
	172	if ind+1 not in delete_index_list:
	173	delete_index_list.append(ind+1)
	174	except Exception, ek:
	175	pass
	176	for index in delete_index_list: #mark them for deletion
	177	try:
	178	blockdict[r]['starts'][index] = 'marked'
	179	blockdict[r]['ends'][index] = 'marked'
	180	blockdict[r]['types'][index] = 'marked'
	181	blockdict[r]['gaps_before_start'][index] = 'marked'
	182	blockdict[r]['whole_seq_start'][index] = 'marked'
	183	blockdict[r]['lengths'][index] = 'marked'
	184	blockdict[r]['counts'][index] = 'marked'
	185	blockdict[r]['units'][index] = 'marked'
	186	except Exception, ej:
	187	pass
	188	#remove 'marked' elements from all the lists
	189	"""
	190	for key in blockdict[r].keys():
	191	for elem in blockdict[r][key]:
	192	if elem == 'marked':
	193	blockdict[r][key].remove(elem)
	194	"""
	195	#print blockdict
	196
	197	#make sure that the blockdict has keys for both the species
	198	if (1 not in blockdict) or (2 not in blockdict):
	199	continue
	200
	201	visited_2 = [0 for x in range(len(blockdict[2]['starts']))]
	202	for ind1,coord_s1 in enumerate(blockdict[1]['starts']):
	203	if coord_s1 == 'marked':
	204	continue
	205	coord_e1 = blockdict[1]['ends'][ind1]
	206	out = []
	207	for ind2,coord_s2 in enumerate(blockdict[2]['starts']):
	208	if coord_s2 == 'marked':
	209	visited_2[ind2] = 1
	210	continue
	211	coord_e2 = blockdict[2]['ends'][ind2]
	212	#skip if the 2 repeats are not of the same type or don't have the same repeating unit.
	213	if allow_different_units == 0:
	214	if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
	215	continue
	216	else:
	217	if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]2):
	218	continue
	219	#print >>sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2)
	220	#skip if the repeat number thresholds are not met
	221	if blockdict[1]['types'][ind1] == 'mononucleotide':
	222	if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
	223	continue
	224	else:
	225	if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
	226	continue
	227
	228	if blockdict[2]['types'][ind2] == 'mononucleotide':
	229	if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
	230	continue
	231	else:
	232	if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
	233	continue
	234	#print "s1,e1=%s,%s; s2,e2=%s,%s" %(coord_s1,coord_e1,coord_s2,coord_e2)
	235	if (coord_s1 in range(coord_s2,coord_e2)) or (coord_e1 in range(coord_s2,coord_e2)):
	236	out.append(str(block_num))
	237	out.append(namelist[0])
	238	rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
	239	rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
	240	out.append(str(rel_start))
	241	out.append(str(rel_end))
	242	out.append(blockdict[1]['types'][ind1])
	243	out.append(blockdict[1]['lengths'][ind1])
	244	out.append(blockdict[1]['counts'][ind1])
	245	out.append(blockdict[1]['units'][ind1])
	246	out.append(namelist[1])
	247	rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
	248	rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
	249	out.append(str(rel_start))
	250	out.append(str(rel_end))
	251	out.append(blockdict[2]['types'][ind2])
	252	out.append(blockdict[2]['lengths'][ind2])
	253	out.append(blockdict[2]['counts'][ind2])
	254	out.append(blockdict[2]['units'][ind2])
	255	print >>fout, '\t'.join(out)
	256	visited_2[ind2] = 1
	257	out=[]
	258
	259	if 0 in visited_2: #there are still some elements in 2nd set which haven't found orthologs yet.
	260	for ind2, coord_s2 in enumerate(blockdict[2]['starts']):
	261	if coord_s2 == 'marked':
	262	continue
	263	if visited_2[ind] != 0:
	264	continue
	265	coord_e2 = blockdict[2]['ends'][ind2]
	266	out = []
	267	for ind1,coord_s1 in enumerate(blockdict[1]['starts']):
	268	if coord_s1 == 'marked':
	269	continue
	270	coord_e1 = blockdict[1]['ends'][ind1]
	271	#skip if the 2 repeats are not of the same type or don't have the same repeating unit.
	272	if allow_different_units == 0:
	273	if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
	274	continue
	275	else:
	276	if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]2:
	277	continue
	278	#skip if the repeat number thresholds are not met
	279	if blockdict[1]['types'][ind1] == 'mononucleotide':
	280	if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
	281	continue
	282	else:
	283	if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
	284	continue
	285
	286	if blockdict[2]['types'][ind2] == 'mononucleotide':
	287	if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
	288	continue
	289	else:
	290	if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
	291	continue
	292
	293	if (coord_s2 in range(coord_s1,coord_e1)) or (coord_e2 in range(coord_s1,coord_e1)):
	294	out.append(str(block_num))
	295	out.append(namelist[0])
	296	rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
	297	rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
	298	out.append(str(rel_start))
	299	out.append(str(rel_end))
	300	out.append(blockdict[1]['types'][ind1])
	301	out.append(blockdict[1]['lengths'][ind1])
	302	out.append(blockdict[1]['counts'][ind1])
	303	out.append(blockdict[1]['units'][ind1])
	304	out.append(namelist[1])
	305	rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
	306	rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
	307	out.append(str(rel_start))
	308	out.append(str(rel_end))
	309	out.append(blockdict[2]['types'][ind2])
	310	out.append(blockdict[2]['lengths'][ind2])
	311	out.append(blockdict[2]['counts'][ind2])
	312	out.append(blockdict[2]['units'][ind2])
	313	print >>fout, '\t'.join(out)
	314	visited_2[ind2] = 1
	315	out=[]
	316
	317	#print >>fout, blockdict
	318	except Exception, exc:
	319	print >>sys.stderr, "type(exc),args,exc: %s, %s, %s" %(type(exc), exc.args, exc)
	320
	321	if __name__ == "__main__":
	322	main()
	323

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/regVariation/microsats_alignment_level.py @ 2

異なるフォーマットでダウンロード: