Context Navigation

bed_rand_intersect.py

リビジョン 3, 5.7 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	#!/usr/bin/python2.6
2
3	"""
4	From a set of regions and two sets of intervals inside those regions
5	compute (for each region separately) the overlap between the two sets
6	of intervals and the overlap in `nsamples` random coverings of the
7	regions with intervals having the same lengths. Prints the z-score relative
8	to the mean and sample stdev of the random coverings.
9
10	Currently intervals must be in bed 3+ format.
11
12	TODO: There are a few versions of this floating around, including a
13	better/faster one using gap lists instead of bitsets. Need to track
14	that down and merge as necessary.
15
16	usage: %prog bounding_region_file mask_file nsamples intervals1 intervals2 [intervals2 ...]
17	"""
18
19	from __future__ import division
20
21	import sys, random
22	import bisect
23	from bx_extras import stats
24	from Numeric import *
25	from bx.bitset import *
26	from bx.intervals.random import *
27
# Upper bound on the number of attempts `throw_random` makes.
maxtries = 10


class MaxtriesException(Exception):
    """
    Exception type that `throw_random` catches in order to retry.

    NOTE(review): `from bx.intervals.random import *` may export a class
    with this same name; if so, this definition shadows it -- confirm that
    the exception actually raised by `throw_random_bits` is this class.
    """
32
def bit_clone(bits):
    """
    Return a new bitset of the same size as `bits` with the same bits set.
    """
    duplicate = BitSet(bits.size)
    # OR the source into the fresh bitset (assumed to start all-clear --
    # TODO confirm), which copies every set bit.
    duplicate.ior(bits)
    return duplicate
40
def throw_random(lengths, mask):
    """
    Call `throw_random_bits( lengths, mask )` up to `maxtries` times.

    Returns the result of the first attempt that does not raise
    `MaxtriesException`.  If every attempt raises it, the exception from
    the last attempt is re-raised.  If no attempt is made at all
    (`maxtries` <= 0) a fresh `MaxtriesException` is raised.
    """
    saved = None
    for _ in range(maxtries):
        try:
            return throw_random_bits(lengths, mask)
        except MaxtriesException as e:
            # Keep the exception explicitly: the `except` target name is
            # not guaranteed to outlive the clause (it is deleted on
            # Python 3), so it cannot be re-raised by that name later.
            saved = e
    if saved is None:
        # The loop body never ran, so there is nothing to re-raise.
        saved = MaxtriesException("no attempts made (maxtries = %r)" % maxtries)
    raise saved
53
def as_bits(region_start, region_length, intervals):
    """
    Build a bitset of `region_length` bits for a single region and set the
    bits covered by each `( chrom, start, stop )` tuple in `intervals`.

    Interval coordinates are shifted by `region_start` so that bit 0
    corresponds to the first position of the region.
    """
    region_bits = BitSet(region_length)
    for interval in intervals:
        _chrom, begin, end = interval
        offset = begin - region_start
        span = end - begin
        region_bits.set_range(offset, span)
    return region_bits
64
def interval_lengths(bits):
    """
    Yield the length of every contiguous run of set bits in `bits`,
    scanning from position 0 upwards.

    `bits` must provide `size`, `next_set( pos )` and `next_clear( pos )`;
    the scan stops when `next_set` returns `bits.size`.
    """
    cursor = 0
    while True:
        run_start = bits.next_set(cursor)
        if run_start == bits.size:
            # No further set bit: the generator is exhausted.
            return
        cursor = bits.next_clear(run_start)
        yield cursor - run_start
75
def count_overlap(bits1, bits2):
    """
    Count the number of positions set in both `bits1` and `bits2`.

    The intersection is accumulated in a scratch bitset of size
    `bits1.size`, so the result is computed without using either argument
    as the accumulator.
    """
    b = BitSet(bits1.size)
    # Copy bits1 into the scratch bitset, then keep only bits also in bits2.
    b |= bits1
    b &= bits2
    return b.count_range(0, b.size)
84
def overlapping_in_bed(fname, r_chr, r_start, r_stop):
    """
    Get from a bed file all intervals that overlap the region defined by
    r_chr, r_start, r_stop.

    Lines starting with "#" or "track" and blank lines are skipped.  Both
    the region and the intervals are treated as half-open (`start`
    inclusive, `stop` exclusive), so an interval that merely touches the
    region boundary is not reported.

    Returns a list of `( chrom, start, stop )` tuples in file order, with
    `start` and `stop` clipped to the region.
    """
    rval = []
    with open(fname) as bed:
        for line in bed:
            if line.startswith(("#", "track")):
                continue
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
            # Strict comparisons on both sides: an interval ending exactly
            # at r_start shares no position with the region.
            if chrom == r_chr and start < r_stop and stop > r_start:
                rval.append((chrom, max(start, r_start), min(stop, r_stop)))
    return rval
99
def main():
    """
    Command-line entry point.

    Positional arguments, read from `sys.argv`:
      1   bed file of bounding regions
      2   bed file of masked intervals
      3   number of random samples to draw per region
      4   bed file holding the first interval set
      5+  bed files, each one a "second" interval set (one feature each)

    For every region, both interval sets are restricted to the unmasked
    part of the region, the observed overlap is counted, and `nsamples`
    random placements of the second set's run lengths are intersected with
    the first set.  Totals are accumulated over all regions.

    Writes progress to stderr.  Writes to stdout: the feature file names,
    the observed overlap divided by total second-set length per feature,
    one row per sample of the sampled overlap fractions, and finally the
    summary lines (observed / mean / stdev, z-score, percentile).
    """
    if len(sys.argv) < 5:
        sys.exit("usage: %s bounding_region_file mask_file nsamples "
                 "intervals1 [intervals2 ...]" % sys.argv[0])
    region_fname = sys.argv[1]
    mask_fname = sys.argv[2]
    nsamples = int(sys.argv[3])
    intervals1_fname = sys.argv[4]
    intervals2_fnames = sys.argv[5:]
    nfeatures = len(intervals2_fnames)
    total_actual = zeros(nfeatures)
    total_lengths2 = zeros(nfeatures)
    total_samples = zeros((nsamples, nfeatures))
    with open(region_fname) as regions:
        for line in regions:
            # Skip the same non-data lines that overlapping_in_bed skips.
            if line.startswith(("#", "track")):
                continue
            fields = line.split()
            if not fields:
                continue
            r_chr, r_start, r_stop = fields[0], int(fields[1]), int(fields[2])
            r_length = r_stop - r_start
            # The name column is optional in bed 3+; fall back to coordinates.
            if len(fields) > 3:
                region_name = fields[3]
            else:
                region_name = "%s:%d-%d" % (r_chr, r_start, r_stop)
            sys.stderr.write("Processing region: %s\n" % region_name)
            # Load the mask
            mask = overlapping_in_bed(mask_fname, r_chr, r_start, r_stop)
            bits_mask = as_bits(r_start, r_length, mask)
            bits_not_masked = bit_clone(bits_mask)
            bits_not_masked.invert()
            # Load the first set and restrict it to unmasked positions
            intervals1 = overlapping_in_bed(intervals1_fname, r_chr, r_start, r_stop)
            bits1 = as_bits(r_start, r_length, intervals1)
            bits1.iand(bits_not_masked)
            # Sanity check
            assert count_overlap(bits1, bits_mask) == 0
            # For each data set
            for featnum, intervals2_fname in enumerate(intervals2_fnames):
                sys.stderr.write(str(intervals2_fname) + "\n")
                intervals2 = overlapping_in_bed(intervals2_fname, r_chr, r_start, r_stop)
                bits2 = as_bits(r_start, r_length, intervals2)
                bits2.iand(bits_not_masked)
                assert count_overlap(bits2, bits_mask) == 0
                # Observed values
                actual_overlap = count_overlap(bits1, bits2)
                total_actual[featnum] += actual_overlap
                # Sample
                lengths2 = list(interval_lengths(bits2))
                total_lengths2[featnum] += sum(lengths2)
                for i in range(nsamples):
                    # Build randomly covered bitmask for second set
                    random2 = throw_random(lengths2, bits_mask)
                    # Find intersection with the first set
                    random2 &= bits1
                    # Accumulate and report the amount intersecting
                    total_samples[i, featnum] += random2.count_range(0, random2.size)
                    sys.stderr.write(str(total_samples[i, featnum]) + "\n")
    fraction_overlap = total_samples / total_lengths2
    sys.stdout.write("\t".join(intervals2_fnames) + "\n")
    sys.stdout.write("\t".join(map(str, total_actual / total_lengths2)) + "\n")
    for row in fraction_overlap:
        sys.stdout.write("\t".join(map(str, row)) + "\n")
    # NOTE(review): the summary below formats whole arrays with %d and
    # calls stats.amean / stats.asamplestdev without an axis; this looks
    # like it was written for a single feature -- confirm behaviour when
    # more than one intervals2 file is given.
    sys.stdout.write(
        "observed overlap: %d, sample mean: %d, sample stdev: %d"
        % (total_actual, stats.amean(total_samples), stats.asamplestdev(total_samples))
        + "\n")
    z_score = (total_actual - stats.amean(total_samples)) / stats.asamplestdev(total_samples)
    sys.stdout.write("z-score: " + str(z_score) + "\n")
    percentile = sum(total_actual > total_samples) / nsamples
    sys.stdout.write("percentile: " + str(percentile) + "\n")


if __name__ == "__main__":
    main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/EGG-INFO/scripts/bed_rand_intersect.py

異なるフォーマットでダウンロード: