Context Navigation

short_reads_figure_high_quality_length.py @ 2

リビジョン 2, 5.7 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	import os, sys, math, tempfile, zipfile, re
4	from rpy import *
5
6	assert sys.version_info[:2] >= ( 2, 4 )
7
8	def stop_err( msg ):
9	sys.stderr.write( "%s\n" % msg )
10	sys.exit()
11
12	def unzip( filename ):
13	zip_file = zipfile.ZipFile( filename, 'r' )
14	tmpfilename = tempfile.NamedTemporaryFile().name
15	for name in zip_file.namelist():
16	file( tmpfilename, 'a' ).write( zip_file.read( name ) )
17	zip_file.close()
18	return tmpfilename
19
20	def __main__():
21	infile_score_name = sys.argv[1].strip()
22	outfile_R_name = sys.argv[2].strip()
23
24	try:
25	score_threshold = int( sys.argv[3].strip() )
26	except:
27	stop_err( 'Threshold for quality score must be numerical.' )
28
29	infile_is_zipped = False
30	if zipfile.is_zipfile( infile_score_name ):
31	infile_is_zipped = True
32	infile_name = unzip( infile_score_name )
33	else:
34	infile_name = infile_score_name
35
36	# detect whether it's tabular or fasta format
37	seq_method = None
38	data_type = None
39	for i, line in enumerate( file( infile_name ) ):
40	line = line.rstrip( '\r\n' )
41	if not line or line.startswith( '#' ):
42	continue
43	if data_type == None:
44	if line.startswith( '>' ):
45	data_type = 'fasta'
46	continue
47	elif len( line.split( '\t' ) ) > 0:
48	fields = line.split()
49	for score in fields:
50	try:
51	int( score )
52	data_type = 'tabular'
53	seq_method = 'solexa'
54	break
55	except:
56	break
57	elif data_type == 'fasta':
58	fields = line.split()
59	for score in fields:
60	try:
61	int( score )
62	seq_method = '454'
63	break
64	except:
65	break
66	if i == 100:
67	break
68
69	if data_type is None:
70	stop_err( 'This tool can only use fasta data or tabular data.' )
71	if seq_method is None:
72	stop_err( 'Invalid data for fasta format.')
73
74	cont_high_quality = []
75	invalid_lines = 0
76	invalid_scores = 0
77	if seq_method == 'solexa':
78	for i, line in enumerate( open( infile_name ) ):
79	line = line.rstrip( '\r\n' )
80	if not line or line.startswith( '#' ):
81	continue
82	locs = line.split( '\t' )
83	for j, base in enumerate( locs ):
84	nuc_errors = base.split()
85	try:
86	nuc_errors[0] = int( nuc_errors[0] )
87	nuc_errors[1] = int( nuc_errors[1] )
88	nuc_errors[2] = int( nuc_errors[2] )
89	nuc_errors[3] = int( nuc_errors[3] )
90	big = max( nuc_errors )
91	except:
92	invalid_scores += 1
93	big = 0
94	if j == 0:
95	cont_high_quality.append(1)
96	else:
97	if big >= score_threshold:
98	cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
99	else:
100	cont_high_quality.append(1)
101	else: # seq_method == '454'
102	tmp_score = ''
103	for i, line in enumerate( open( infile_name ) ):
104	line = line.rstrip( '\r\n' )
105	if not line or line.startswith( '#' ):
106	continue
107	if line.startswith( '>' ):
108	if len( tmp_score ) > 0:
109	locs = tmp_score.split()
110	for j, base in enumerate( locs ):
111	try:
112	base = int( base )
113	except:
114	invalid_scores += 1
115	base = 0
116	if j == 0:
117	cont_high_quality.append(1)
118	else:
119	if base >= score_threshold:
120	cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
121	else:
122	cont_high_quality.append(1)
123	tmp_score = ''
124	else:
125	tmp_score = "%s %s" % ( tmp_score, line )
126	if len( tmp_score ) > 0:
127	locs = tmp_score.split()
128	for j, base in enumerate( locs ):
129	try:
130	base = int( base )
131	except:
132	invalid_scores += 1
133	base = 0
134	if j == 0:
135	cont_high_quality.append(1)
136	else:
137	if base >= score_threshold:
138	cont_high_quality[ len( cont_high_quality ) - 1 ] += 1
139	else:
140	cont_high_quality.append(1)
141
142	# generate pdf figures
143	cont_high_quality = array ( cont_high_quality )
144	outfile_R_pdf = outfile_R_name
145	r.pdf( outfile_R_pdf )
146	title = "Histogram of continuous high quality scores"
147	xlim_range = [ 1, max( cont_high_quality ) ]
148	nclass = max( cont_high_quality )
149	if nclass > 100:
150	nclass = 100
151	r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass)
152	r.dev_off()
153
154	if infile_is_zipped and os.path.exists( infile_name ):
155	# Need to delete temporary file created when we unzipped the infile archive
156	os.remove( infile_name )
157
158	if invalid_lines > 0:
159	print 'Skipped %d invalid lines. ' % invalid_lines
160	if invalid_scores > 0:
161	print 'Skipped %d invalid scores. ' % invalid_scores
162
163	r.quit( save="no" )
164
165	if __name__=="__main__":__main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/metag_tools/short_reads_figure_high_quality_length.py @ 2

異なるフォーマットでダウンロード: