[2] | 1 | #!/usr/bin/env python |
---|
| 2 | #Guruprasad Ananda |
---|
| 3 | """ |
---|
| 4 | Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output. |
---|
| 5 | """ |
---|
| 6 | from galaxy import eggs |
---|
| 7 | import sys, os, tempfile, string, math, re |
---|
| 8 | |
---|
| 9 | def reverse_complement(text): |
---|
| 10 | DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) |
---|
| 11 | comp = [ch for ch in text.translate(DNA_COMP)] |
---|
| 12 | comp.reverse() |
---|
| 13 | return "".join(comp) |
---|
| 14 | |
---|
| 15 | def main(): |
---|
| 16 | if len(sys.argv) != 8: |
---|
| 17 | print >>sys.stderr, "Insufficient number of arguments." |
---|
| 18 | sys.exit() |
---|
| 19 | |
---|
| 20 | infile = open(sys.argv[1],'r') |
---|
| 21 | separation = int(sys.argv[2]) |
---|
| 22 | outfile = sys.argv[3] |
---|
| 23 | align_type = sys.argv[4] |
---|
| 24 | if align_type == "2way": |
---|
| 25 | align_type_len = 2 |
---|
| 26 | elif align_type == "3way": |
---|
| 27 | align_type_len = 3 |
---|
| 28 | mono_threshold = int(sys.argv[5]) |
---|
| 29 | non_mono_threshold = int(sys.argv[6]) |
---|
| 30 | allow_different_units = int(sys.argv[7]) |
---|
| 31 | |
---|
| 32 | print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" %(separation, mono_threshold, non_mono_threshold, allow_different_units==1) |
---|
| 33 | try: |
---|
| 34 | fout = open(outfile, "w") |
---|
| 35 | print >>fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit" |
---|
| 36 | #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik") |
---|
| 37 | sputnik_cmd = "sputnik" |
---|
| 38 | input = infile.read() |
---|
| 39 | skipped = 0 |
---|
| 40 | block_num = 0 |
---|
| 41 | input = input.replace('\r','\n') |
---|
| 42 | for block in input.split('\n\n'): |
---|
| 43 | block_num += 1 |
---|
| 44 | tmpin = tempfile.NamedTemporaryFile() |
---|
| 45 | tmpout = tempfile.NamedTemporaryFile() |
---|
| 46 | tmpin.write(block.strip()) |
---|
| 47 | blk = tmpin.read() |
---|
| 48 | cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name |
---|
| 49 | try: |
---|
| 50 | os.system(cmdline) |
---|
| 51 | except Exception, es: |
---|
| 52 | continue |
---|
| 53 | sputnik_out = tmpout.read() |
---|
| 54 | tmpin.close() |
---|
| 55 | tmpout.close() |
---|
| 56 | if sputnik_out != "": |
---|
| 57 | if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')): |
---|
| 58 | skipped += 1 |
---|
| 59 | continue |
---|
| 60 | align_block = block.strip().split('>') |
---|
| 61 | |
---|
| 62 | lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6} |
---|
| 63 | blockdict={} |
---|
| 64 | r=0 |
---|
| 65 | namelist=[] |
---|
| 66 | for k,sput_block in enumerate(sputnik_out.split('>')[1:]): |
---|
| 67 | whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip() |
---|
| 68 | p = re.compile('\n(\S*nucleotide)') |
---|
| 69 | repeats = p.split(sput_block.strip()) |
---|
| 70 | repeats_count = len(repeats) |
---|
| 71 | j = 1 |
---|
| 72 | name = repeats[0].strip() |
---|
| 73 | try: |
---|
| 74 | coords = re.search('\d+[-_:]\d+',name).group() |
---|
| 75 | coords = coords.replace('_','-').replace(':','-') |
---|
| 76 | except Exception, e: |
---|
| 77 | coords = '0-0' |
---|
| 78 | pass |
---|
| 79 | r += 1 |
---|
| 80 | blockdict[r]={} |
---|
| 81 | try: |
---|
| 82 | sp_name = name[:name.index('.')] |
---|
| 83 | chr_name = name[name.index('.'):name.index('(')] |
---|
| 84 | namelist.append(sp_name + chr_name) |
---|
| 85 | except: |
---|
| 86 | namelist.append(name[:20]) |
---|
| 87 | while j < repeats_count: |
---|
| 88 | try: |
---|
| 89 | if repeats[j].strip() not in lendict: |
---|
| 90 | j += 2 |
---|
| 91 | continue |
---|
| 92 | |
---|
| 93 | if blockdict[r].has_key('types'): |
---|
| 94 | blockdict[r]['types'].append(repeats[j].strip()) #type of microsat |
---|
| 95 | else: |
---|
| 96 | blockdict[r]['types'] = [repeats[j].strip()] #type of microsat |
---|
| 97 | |
---|
| 98 | sequence = ''.join(align_block[r].split('\n')[1:]).replace('\n','').strip() |
---|
| 99 | start = int(repeats[j+1].split('--')[0].split(':')[0].strip()) |
---|
| 100 | #check to see if there are gaps before the start of the repeat, and change the start accordingly |
---|
| 101 | sgaps = 0 |
---|
| 102 | ch_pos = start - 1 |
---|
| 103 | while ch_pos >= 0: |
---|
| 104 | if whole_seq[ch_pos] == '-': |
---|
| 105 | sgaps += 1 |
---|
| 106 | else: |
---|
| 107 | break #break at the 1st non-gap character |
---|
| 108 | ch_pos -= 1 |
---|
| 109 | if blockdict[r].has_key('starts'): |
---|
| 110 | blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS |
---|
| 111 | else: |
---|
| 112 | blockdict[r]['starts'] = [start+sgaps] |
---|
| 113 | |
---|
| 114 | end = int(repeats[j+1].split('--')[0].split(':')[1].strip()) |
---|
| 115 | #check to see if there are gaps after the end of the repeat, and change the end accordingly |
---|
| 116 | egaps = 0 |
---|
| 117 | for ch in whole_seq[end:]: |
---|
| 118 | if ch == '-': |
---|
| 119 | egaps += 1 |
---|
| 120 | else: |
---|
| 121 | break #break at the 1st non-gap character |
---|
| 122 | if blockdict[r].has_key('ends'): |
---|
| 123 | blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS |
---|
| 124 | else: |
---|
| 125 | blockdict[r]['ends'] = [end+egaps] |
---|
| 126 | |
---|
| 127 | repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence |
---|
| 128 | repeat_len = repeats[j+1].split('--')[1].split()[1].strip() |
---|
| 129 | gap_count = repeat_seq.count('-') |
---|
| 130 | #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count |
---|
| 131 | repeat_len = str(int(repeat_len) - gap_count) |
---|
| 132 | |
---|
| 133 | rel_start = blockdict[r]['starts'][-1] |
---|
| 134 | gaps_before_start = whole_seq[:rel_start].count('-') |
---|
| 135 | |
---|
| 136 | if blockdict[r].has_key('gaps_before_start'): |
---|
| 137 | blockdict[r]['gaps_before_start'].append(gaps_before_start) #lengths |
---|
| 138 | else: |
---|
| 139 | blockdict[r]['gaps_before_start'] = [gaps_before_start] #lengths |
---|
| 140 | |
---|
| 141 | whole_seq_start= int(coords.split('-')[0]) |
---|
| 142 | if blockdict[r].has_key('whole_seq_start'): |
---|
| 143 | blockdict[r]['whole_seq_start'].append(whole_seq_start) #lengths |
---|
| 144 | else: |
---|
| 145 | blockdict[r]['whole_seq_start'] = [whole_seq_start] #lengths |
---|
| 146 | |
---|
| 147 | if blockdict[r].has_key('lengths'): |
---|
| 148 | blockdict[r]['lengths'].append(repeat_len) #lengths |
---|
| 149 | else: |
---|
| 150 | blockdict[r]['lengths'] = [repeat_len] #lengths |
---|
| 151 | |
---|
| 152 | if blockdict[r].has_key('counts'): |
---|
| 153 | blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #Repeat Unit |
---|
| 154 | else: |
---|
| 155 | blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #Repeat Unit |
---|
| 156 | |
---|
| 157 | if blockdict[r].has_key('units'): |
---|
| 158 | blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit |
---|
| 159 | else: |
---|
| 160 | blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit |
---|
| 161 | |
---|
| 162 | except Exception, eh: |
---|
| 163 | pass |
---|
| 164 | j+=2 |
---|
| 165 | #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'. |
---|
| 166 | delete_index_list = [] |
---|
| 167 | for ind, item in enumerate(blockdict[r]['ends']): |
---|
| 168 | try: |
---|
| 169 | if blockdict[r]['starts'][ind+1]-item < separation: |
---|
| 170 | if ind not in delete_index_list: |
---|
| 171 | delete_index_list.append(ind) |
---|
| 172 | if ind+1 not in delete_index_list: |
---|
| 173 | delete_index_list.append(ind+1) |
---|
| 174 | except Exception, ek: |
---|
| 175 | pass |
---|
| 176 | for index in delete_index_list: #mark them for deletion |
---|
| 177 | try: |
---|
| 178 | blockdict[r]['starts'][index] = 'marked' |
---|
| 179 | blockdict[r]['ends'][index] = 'marked' |
---|
| 180 | blockdict[r]['types'][index] = 'marked' |
---|
| 181 | blockdict[r]['gaps_before_start'][index] = 'marked' |
---|
| 182 | blockdict[r]['whole_seq_start'][index] = 'marked' |
---|
| 183 | blockdict[r]['lengths'][index] = 'marked' |
---|
| 184 | blockdict[r]['counts'][index] = 'marked' |
---|
| 185 | blockdict[r]['units'][index] = 'marked' |
---|
| 186 | except Exception, ej: |
---|
| 187 | pass |
---|
| 188 | #remove 'marked' elements from all the lists |
---|
| 189 | """ |
---|
| 190 | for key in blockdict[r].keys(): |
---|
| 191 | for elem in blockdict[r][key]: |
---|
| 192 | if elem == 'marked': |
---|
| 193 | blockdict[r][key].remove(elem) |
---|
| 194 | """ |
---|
| 195 | #print blockdict |
---|
| 196 | |
---|
| 197 | #make sure that the blockdict has keys for both the species |
---|
| 198 | if (1 not in blockdict) or (2 not in blockdict): |
---|
| 199 | continue |
---|
| 200 | |
---|
| 201 | visited_2 = [0 for x in range(len(blockdict[2]['starts']))] |
---|
| 202 | for ind1,coord_s1 in enumerate(blockdict[1]['starts']): |
---|
| 203 | if coord_s1 == 'marked': |
---|
| 204 | continue |
---|
| 205 | coord_e1 = blockdict[1]['ends'][ind1] |
---|
| 206 | out = [] |
---|
| 207 | for ind2,coord_s2 in enumerate(blockdict[2]['starts']): |
---|
| 208 | if coord_s2 == 'marked': |
---|
| 209 | visited_2[ind2] = 1 |
---|
| 210 | continue |
---|
| 211 | coord_e2 = blockdict[2]['ends'][ind2] |
---|
| 212 | #skip if the 2 repeats are not of the same type or don't have the same repeating unit. |
---|
| 213 | if allow_different_units == 0: |
---|
| 214 | if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): |
---|
| 215 | continue |
---|
| 216 | else: |
---|
| 217 | if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2): |
---|
| 218 | continue |
---|
| 219 | #print >>sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2) |
---|
| 220 | #skip if the repeat number thresholds are not met |
---|
| 221 | if blockdict[1]['types'][ind1] == 'mononucleotide': |
---|
| 222 | if (int(blockdict[1]['counts'][ind1]) < mono_threshold): |
---|
| 223 | continue |
---|
| 224 | else: |
---|
| 225 | if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): |
---|
| 226 | continue |
---|
| 227 | |
---|
| 228 | if blockdict[2]['types'][ind2] == 'mononucleotide': |
---|
| 229 | if (int(blockdict[2]['counts'][ind2]) < mono_threshold): |
---|
| 230 | continue |
---|
| 231 | else: |
---|
| 232 | if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): |
---|
| 233 | continue |
---|
| 234 | #print "s1,e1=%s,%s; s2,e2=%s,%s" %(coord_s1,coord_e1,coord_s2,coord_e2) |
---|
| 235 | if (coord_s1 in range(coord_s2,coord_e2)) or (coord_e1 in range(coord_s2,coord_e2)): |
---|
| 236 | out.append(str(block_num)) |
---|
| 237 | out.append(namelist[0]) |
---|
| 238 | rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] |
---|
| 239 | rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) |
---|
| 240 | out.append(str(rel_start)) |
---|
| 241 | out.append(str(rel_end)) |
---|
| 242 | out.append(blockdict[1]['types'][ind1]) |
---|
| 243 | out.append(blockdict[1]['lengths'][ind1]) |
---|
| 244 | out.append(blockdict[1]['counts'][ind1]) |
---|
| 245 | out.append(blockdict[1]['units'][ind1]) |
---|
| 246 | out.append(namelist[1]) |
---|
| 247 | rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] |
---|
| 248 | rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) |
---|
| 249 | out.append(str(rel_start)) |
---|
| 250 | out.append(str(rel_end)) |
---|
| 251 | out.append(blockdict[2]['types'][ind2]) |
---|
| 252 | out.append(blockdict[2]['lengths'][ind2]) |
---|
| 253 | out.append(blockdict[2]['counts'][ind2]) |
---|
| 254 | out.append(blockdict[2]['units'][ind2]) |
---|
| 255 | print >>fout, '\t'.join(out) |
---|
| 256 | visited_2[ind2] = 1 |
---|
| 257 | out=[] |
---|
| 258 | |
---|
| 259 | if 0 in visited_2: #there are still some elements in 2nd set which haven't found orthologs yet. |
---|
| 260 | for ind2, coord_s2 in enumerate(blockdict[2]['starts']): |
---|
| 261 | if coord_s2 == 'marked': |
---|
| 262 | continue |
---|
| 263 | if visited_2[ind] != 0: |
---|
| 264 | continue |
---|
| 265 | coord_e2 = blockdict[2]['ends'][ind2] |
---|
| 266 | out = [] |
---|
| 267 | for ind1,coord_s1 in enumerate(blockdict[1]['starts']): |
---|
| 268 | if coord_s1 == 'marked': |
---|
| 269 | continue |
---|
| 270 | coord_e1 = blockdict[1]['ends'][ind1] |
---|
| 271 | #skip if the 2 repeats are not of the same type or don't have the same repeating unit. |
---|
| 272 | if allow_different_units == 0: |
---|
| 273 | if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): |
---|
| 274 | continue |
---|
| 275 | else: |
---|
| 276 | if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2: |
---|
| 277 | continue |
---|
| 278 | #skip if the repeat number thresholds are not met |
---|
| 279 | if blockdict[1]['types'][ind1] == 'mononucleotide': |
---|
| 280 | if (int(blockdict[1]['counts'][ind1]) < mono_threshold): |
---|
| 281 | continue |
---|
| 282 | else: |
---|
| 283 | if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): |
---|
| 284 | continue |
---|
| 285 | |
---|
| 286 | if blockdict[2]['types'][ind2] == 'mononucleotide': |
---|
| 287 | if (int(blockdict[2]['counts'][ind2]) < mono_threshold): |
---|
| 288 | continue |
---|
| 289 | else: |
---|
| 290 | if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): |
---|
| 291 | continue |
---|
| 292 | |
---|
| 293 | if (coord_s2 in range(coord_s1,coord_e1)) or (coord_e2 in range(coord_s1,coord_e1)): |
---|
| 294 | out.append(str(block_num)) |
---|
| 295 | out.append(namelist[0]) |
---|
| 296 | rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] |
---|
| 297 | rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) |
---|
| 298 | out.append(str(rel_start)) |
---|
| 299 | out.append(str(rel_end)) |
---|
| 300 | out.append(blockdict[1]['types'][ind1]) |
---|
| 301 | out.append(blockdict[1]['lengths'][ind1]) |
---|
| 302 | out.append(blockdict[1]['counts'][ind1]) |
---|
| 303 | out.append(blockdict[1]['units'][ind1]) |
---|
| 304 | out.append(namelist[1]) |
---|
| 305 | rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] |
---|
| 306 | rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) |
---|
| 307 | out.append(str(rel_start)) |
---|
| 308 | out.append(str(rel_end)) |
---|
| 309 | out.append(blockdict[2]['types'][ind2]) |
---|
| 310 | out.append(blockdict[2]['lengths'][ind2]) |
---|
| 311 | out.append(blockdict[2]['counts'][ind2]) |
---|
| 312 | out.append(blockdict[2]['units'][ind2]) |
---|
| 313 | print >>fout, '\t'.join(out) |
---|
| 314 | visited_2[ind2] = 1 |
---|
| 315 | out=[] |
---|
| 316 | |
---|
| 317 | #print >>fout, blockdict |
---|
| 318 | except Exception, exc: |
---|
| 319 | print >>sys.stderr, "type(exc),args,exc: %s, %s, %s" %(type(exc), exc.args, exc) |
---|
| 320 | |
---|
| 321 | if __name__ == "__main__": |
---|
| 322 | main() |
---|
| 323 | |
---|