Context Navigation

interval_to_bedstrict_converter.py @ 2

リビジョン 2, 8.0 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	#Dan Blankenberg
3
4	import sys
5	from galaxy import eggs
6	import pkg_resources; pkg_resources.require( "bx-python" )
7	import bx.intervals.io
8
9	assert sys.version_info[:2] >= ( 2, 4 )
10
11	def stop_err( msg ):
12	sys.stderr.write( msg )
13	sys.exit()
14
15	def force_bed_field_count( fields, region_count, force_num_columns ):
16	if force_num_columns >= 4 and len( fields ) < 4:
17	fields.append( 'region_%i' % ( region_count ) )
18	if force_num_columns >= 5 and len( fields ) < 5:
19	fields.append( '0' )
20	if force_num_columns >= 6 and len( fields ) < 6:
21	fields.append( '+' )
22	if force_num_columns >= 7 and len( fields ) < 7:
23	fields.append( fields[1] )
24	if force_num_columns >= 8 and len( fields ) < 8:
25	fields.append( fields[2] )
26	if force_num_columns >= 9 and len( fields ) < 9:
27	fields.append( '0' )
28	if force_num_columns >= 10 and len( fields ) < 10:
29	fields.append( '0' )
30	if force_num_columns >= 11 and len( fields ) < 11:
31	fields.append( ',' )
32	if force_num_columns >= 12 and len( fields ) < 12:
33	fields.append( ',' )
34	return fields[:force_num_columns]
35
36	def __main__():
37	output_name = sys.argv[1]
38	input_name = sys.argv[2]
39	try:
40	chromCol = int( sys.argv[3] ) - 1
41	except:
42	stop_err( "'%s' is an invalid chrom column, correct the column settings before attempting to convert the data format." % str( sys.argv[3] ) )
43	try:
44	startCol = int( sys.argv[4] ) - 1
45	except:
46	stop_err( "'%s' is an invalid start column, correct the column settings before attempting to convert the data format." % str( sys.argv[4] ) )
47	try:
48	endCol = int( sys.argv[5] ) - 1
49	except:
50	stop_err( "'%s' is an invalid end column, correct the column settings before attempting to convert the data format." % str( sys.argv[5] ) )
51	try:
52	strandCol = int( sys.argv[6] ) - 1
53	except:
54	strandCol = -1
55	try:
56	nameCol = int( sys.argv[7] ) - 1
57	except:
58	nameCol = -1
59	try:
60	extension = sys.argv[8]
61	except:
62	extension = 'interval' #default extension
63	try:
64	force_num_columns = int( sys.argv[9] )
65	except:
66	force_num_columns = None
67
68	skipped_lines = 0
69	first_skipped_line = None
70	out = open( output_name,'w' )
71	count = 0
72	#does file already conform to bed strict?
73	#if so, we want to keep extended columns, otherwise we'll create a generic 6 column bed file
74	strict_bed = True
75	if extension in [ 'bed', 'bedstrict', 'bed6', 'bed12' ] and ( chromCol, startCol, endCol) == ( 0, 1, 2) and ( nameCol < 0 or nameCol == 3 ) and ( strandCol < 0 or strandCol == 5 ):
76	for count, line in enumerate( open( input_name ) ):
77	line = line.rstrip( '\n\r' )
78	if line == "" or line.startswith("#"):
79	skipped_lines += 1
80	if first_skipped_line is None:
81	first_skipped_line = count + 1
82	continue
83	fields = line.split('\t')
84	try:
85	assert len( fields ) >= 3, 'A BED file requires at least 3 columns' #we can't fix this
86	if len(fields) > 12:
87	strict_bed = False
88	break
89	#name (fields[3]) can be anything, no verification needed
90	if len( fields ) > 4:
91	float( fields[4] ) #score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray).
92	if len( fields ) > 5:
93	assert fields[5] in [ '+', '-' ], 'Invalid strand' #strand - Defines the strand - either '+' or '-'.
94	if len( fields ) > 6:
95	int( fields[6] ) #thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
96	if len( fields ) > 7:
97	int( fields[7] ) #thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
98	if len( fields ) > 8:
99	if fields[8] != '0': #itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
100	fields2 = fields[8].split( ',' )
101	assert len( fields2 ) == 3, 'RGB value must be 0 or have length of 3'
102	for field in fields2:
103	int( field ) #rgb values are integers
104	if len( fields ) > 9:
105	int( fields[9] ) #blockCount - The number of blocks (exons) in the BED line.
106	if len( fields ) > 10:
107	if fields[10] != ',': #blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
108	fields2 = fields[10].rstrip( "," ).split( "," ) #remove trailing comma and split on comma
109	for field in fields2:
110	int( field )
111	if len( fields ) > 11:
112	if fields[11] != ',': #blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
113	fields2 = fields[11].rstrip( "," ).split( "," ) #remove trailing comma and split on comma
114	for field in fields2:
115	int( field )
116	except:
117	strict_bed = False
118	break
119	if force_num_columns is not None and len( fields ) != force_num_columns:
120	line = '\t'.join( force_bed_field_count( fields, count, force_num_columns ) )
121	out.write( "%s\n" % line )
122	else:
123	strict_bed = False
124	out.close()
125
126	if not strict_bed:
127	skipped_lines = 0
128	first_skipped_line = None
129	out = open( output_name,'w' )
130	count = 0
131	for count, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_name, 'r' ), chrom_col=chromCol, start_col=startCol, end_col=endCol, strand_col=strandCol, fix_strand=True, return_header=False, return_comments=False ) ):
132	try:
133	if nameCol >= 0:
134	name = region.fields[nameCol]
135	else:
136	raise IndexError
137	except:
138	name = "region_%i" % count
139	try:
140	fields = map( str, [ region.chrom, region.start, region.end, name, 0, region.strand ] )
141	if force_num_columns is not None and len( fields ) != force_num_columns:
142	fields = force_bed_field_count( fields, count, force_num_columns )
143	out.write( "%s\n" % '\t'.join( fields ) )
144	except:
145	skipped_lines += 1
146	if first_skipped_line is None:
147	first_skipped_line = count + 1
148	out.close()
149	print "%i regions converted to BED." % ( count + 1 - skipped_lines )
150	if skipped_lines > 0:
151	print "Skipped %d blank or invalid lines starting with line # %d." % ( skipped_lines, first_skipped_line )
152
153	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/lib/galaxy/datatypes/converters/interval_to_bedstrict_converter.py @ 2

異なるフォーマットでダウンロード: