Context Navigation

sanitize_html.py @ 2

リビジョン 2, 20.6 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	"""
2	HTML Sanitizer (ripped from feedparser)
3	"""
4
5	import re, sgmllib
6
# Reversible htmlentitydefs mappings.  Python 2.2 only ships
# `htmlentitydefs.entitydefs`, so when the two lookup tables cannot be
# imported they are rebuilt from it by hand.
try:
    from htmlentitydefs import name2codepoint, codepoint2name
except ImportError:
    # Only a failed import should trigger the fallback; any other error is
    # a real problem and must not be masked.
    import htmlentitydefs
    name2codepoint = {}
    codepoint2name = {}
    for (name, codepoint) in htmlentitydefs.entitydefs.iteritems():
        # Values are either a single character or a numeric reference of
        # the form '&#NNN;'; turn the latter into the character it names.
        if codepoint.startswith('&#'):
            codepoint = unichr(int(codepoint[2:-1]))
        name2codepoint[name] = ord(codepoint)
        codepoint2name[ord(codepoint)] = name
18
19	_cp1252 = {
20	unichr(128): unichr(8364), # euro sign
21	unichr(130): unichr(8218), # single low-9 quotation mark
22	unichr(131): unichr( 402), # latin small letter f with hook
23	unichr(132): unichr(8222), # double low-9 quotation mark
24	unichr(133): unichr(8230), # horizontal ellipsis
25	unichr(134): unichr(8224), # dagger
26	unichr(135): unichr(8225), # double dagger
27	unichr(136): unichr( 710), # modifier letter circumflex accent
28	unichr(137): unichr(8240), # per mille sign
29	unichr(138): unichr( 352), # latin capital letter s with caron
30	unichr(139): unichr(8249), # single left-pointing angle quotation mark
31	unichr(140): unichr( 338), # latin capital ligature oe
32	unichr(142): unichr( 381), # latin capital letter z with caron
33	unichr(145): unichr(8216), # left single quotation mark
34	unichr(146): unichr(8217), # right single quotation mark
35	unichr(147): unichr(8220), # left double quotation mark
36	unichr(148): unichr(8221), # right double quotation mark
37	unichr(149): unichr(8226), # bullet
38	unichr(150): unichr(8211), # en dash
39	unichr(151): unichr(8212), # em dash
40	unichr(152): unichr( 732), # small tilde
41	unichr(153): unichr(8482), # trade mark sign
42	unichr(154): unichr( 353), # latin small letter s with caron
43	unichr(155): unichr(8250), # single right-pointing angle quotation mark
44	unichr(156): unichr( 339), # latin small ligature oe
45	unichr(158): unichr( 382), # latin small letter z with caron
46	unichr(159): unichr( 376)} # latin capital letter y with diaeresis
47
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGML parser that re-serialises the markup it is fed.

    Every handler appends a reconstructed fragment to ``self.pieces``;
    ``output()`` joins the fragments into the resulting document.
    Subclasses override the handlers to filter what gets re-emitted.
    """

    special = re.compile('''[<>'"]''')
    # An ampersand that does not start a decimal, hexadecimal or named
    # reference.
    bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding, type):
        """Remember the byte encoding and content type, then initialise the parser."""
        self.encoding = encoding
        self.type = type
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        """Discard collected output and reset the underlying parser."""
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        """Expand a self-closing tag matched by the regex used in ``feed``.

        Elements listed in ``elements_no_end_tag`` become ``<tag />``;
        all others become an explicit ``<tag></tag>`` pair.
        """
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def parse_starttag(self, i):
        """Parse a start tag; for XHTML input also close ``<tag/>`` immediately."""
        j = sgmllib.SGMLParser.parse_starttag(self, i)
        if self.type == 'application/xhtml+xml':
            if j > 2 and self.rawdata[j-2:j] == '/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        """Pre-process ``data``, run it through the parser and close the parser."""
        # Escape any '<!' that does not introduce a DOCTYPE, a comment or a
        # marked section, so that it is emitted as text.
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        # Expand self-closing tags (pattern fixed for bug [ 1399464 ]
        # "Bad regexp for _shorttag_replace").
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and isinstance(data, unicode):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)

    def normalize_attrs(self, attrs):
        """Return ``attrs`` with lower-cased, de-duplicated keys, sorted.

        Utility method to be called by descendants.  The values of the
        ``rel`` and ``type`` attributes are lower-cased as well.  A falsy
        ``attrs`` is returned unchanged.
        """
        if not attrs:
            return attrs
        normalized = []
        for key, value in dict([(k.lower(), v) for k, v in attrs]).items():
            if key in ('rel', 'type'):
                value = value.lower()
            normalized.append((key, value))
        normalized.sort()
        return normalized

    def _to_unicode(self, text):
        """Decode ``text`` with ``self.encoding``, falling back to iso-8859-1.

        Text that is already unicode is returned as is.
        """
        if isinstance(text, unicode):
            return text
        try:
            return unicode(text, self.encoding)
        except (UnicodeError, LookupError, TypeError):
            # Undecodable bytes, an unknown codec name, or no encoding at all.
            return unicode(text, 'iso-8859-1')

    def unknown_starttag(self, tag, attrs):
        """Re-emit a start tag.

        Called for each start tag; ``attrs`` is a list of (attr, value)
        tuples, e.g. for <pre class='screen'>, tag='pre' and
        attrs=[('class', 'screen')].
        """
        uattrs = []
        strattrs = ''
        if attrs:
            for key, value in attrs:
                value = value.replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
                value = self.bare_ampersand.sub("&amp;", value)
                # thanks to Kevin Marks for this breathtaking hack to deal
                # with (valid) high-bit attribute values in UTF-8 feeds
                uattrs.append((self._to_unicode(key), self._to_unicode(value)))
            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
            if self.encoding:
                try:
                    strattrs = strattrs.encode(self.encoding)
                except (UnicodeError, LookupError):
                    # Best effort: keep the unicode form if it cannot be encoded.
                    pass
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%s%s />' % (tag, strattrs))
        else:
            self.pieces.append('<%s%s>' % (tag, strattrs))

    def unknown_endtag(self, tag):
        """Re-emit an end tag, e.g. for </pre>, tag will be 'pre'.

        Nothing is emitted for elements in ``elements_no_end_tag``.
        """
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%s>" % tag)

    def handle_charref(self, ref):
        """Re-emit a character reference, e.g. for '&#160;', ref will be '160'.

        References to characters listed in ``_cp1252`` are rewritten to
        point at the mapped character instead.
        """
        if ref.startswith('x'):
            value = unichr(int(ref[1:], 16))
        else:
            value = unichr(int(ref))

        if value in _cp1252:
            # hex() yields '0x....'; dropping the leading '0' leaves 'x....',
            # i.e. a hexadecimal character reference.
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%s;' % ref)

    def handle_entityref(self, ref):
        """Re-emit an entity reference, e.g. for '&copy;', ref will be 'copy'.

        Names not present in ``name2codepoint`` are emitted without the
        trailing semicolon.
        """
        if ref in name2codepoint:
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&%s' % ref)

    def handle_data(self, text):
        """Store a block of plain text verbatim.

        Called for each block of text outside of any tag that contains no
        character or entity references.
        """
        self.pieces.append(text)

    def handle_comment(self, text):
        """Re-emit an HTML comment, e.g. <!-- insert Javascript code here -->."""
        self.pieces.append('<!--%s-->' % text)

    def handle_pi(self, text):
        """Re-emit a processing instruction, e.g. <?instruction>."""
        self.pieces.append('<?%s>' % text)

    def handle_decl(self, text):
        """Re-emit a declaration such as the DOCTYPE, e.g.

        <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
            "http://www.w3.org/TR/html4/loose.dtd">
        """
        self.pieces.append('<!%s>' % text)

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match

    def _scan_name(self, i, declstartpos):
        """Scan a declaration name starting at offset ``i`` of ``self.rawdata``.

        Returns ``(name, end)`` with the lower-cased name and the offset
        just past it, or ``(None, -1)`` when the buffer ends first or no
        name matches.  In the no-match case the raw buffer is passed to
        ``handle_data``.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            return None, -1

    def convert_charref(self, name):
        """Return the character reference for ``name`` unchanged in form."""
        return '&#%s;' % name

    def convert_entityref(self, name):
        """Return the entity reference for ``name`` unchanged in form."""
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
204
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Whitelist filter built on ``_BaseHTMLProcessor``.

    Only elements and attributes named in the lists below are re-emitted.
    Text inside ``unacceptable_elements_with_end_tag`` elements is dropped.
    MathML and SVG vocabularies are accepted while inside a ``math`` or
    ``svg`` element that declares the corresponding namespace.
    """

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article',
      'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
      'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
      'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
      'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
      'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
      'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
      'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
      'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
      'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
      'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead',
      'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
      'xml:lang']

    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']

    acceptable_css_properties = ['azimuth', 'background-color',
      'border-bottom-color', 'border-collapse', 'border-color',
      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
      'white-space', 'width']

    # survey of common keywords found in feeds
    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
      'transparent', 'underline', 'white', 'yellow']

    # A single CSS value token: a hex colour, an rgb(...) fragment, or a
    # short number with an optional unit / separator.
    valid_css_values = re.compile(r'^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
      r'\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
      'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
      'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
      'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
      'munderover', 'none', 'semantics']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
      'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
      'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
      'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
      'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
      'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
      'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
      'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
      'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
      'svg', 'switch', 'text', 'title', 'tspan', 'use']

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
       'arabic-form', 'ascent', 'attributeName', 'attributeType',
       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
       'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
       'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
       'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
       'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
       'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
       'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
       'min', 'name', 'offset', 'opacity', 'orient', 'origin',
       'overline-position', 'overline-thickness', 'panose-1', 'path',
       'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
       'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
       'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
       'stop-color', 'stop-opacity', 'strikethrough-position',
       'strikethrough-thickness', 'stroke', 'stroke-dasharray',
       'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
       'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
       'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
       'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
       'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
       'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
       'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
       'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
       'y2', 'zoomAndPan']

    # Lower-case -> camelCase lookup tables, built on first use in
    # unknown_starttag.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity']

    def reset(self):
        """Reset the parser and the nesting counters used for filtering."""
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def unknown_starttag(self, tag, attrs):
        """Re-emit ``tag`` with whitelisted attributes, or drop it.

        Tags that are neither acceptable HTML nor part of an active
        MathML/SVG context are discarded.  ``style`` attributes are always
        discarded.
        """
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea.  Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower = [attr.lower() for attr in self.svg_attributes]
                    mix = [a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(), a) for a in mix])

                    lower = [attr.lower() for attr in self.svg_elements]
                    mix = [a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(), a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if [n for n, v in attrs if n.startswith('xlink:')]:
                if not ('xmlns:xlink', 'http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in acceptable_attributes:
                key = keymap.get(key, key)
                clean_attrs.append((key, value))
            # Any other attribute, including 'style', is dropped; inline
            # styles are not passed through sanitize_style here.
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Re-emit an end tag if its element is acceptable in the current context."""
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # Never let the counter go negative: a stray closing tag
                # would otherwise suppress all following text and make the
                # next real opening tag bring the counter back to zero.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        """Drop processing instructions."""
        pass

    def handle_decl(self, text):
        """Drop declarations."""
        pass

    def handle_data(self, text):
        """Emit text unless it lies inside an unacceptable element."""
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
        """Return the whitelisted declarations of a CSS ``style`` string.

        ``url(...)`` values are removed first.  If the remaining text does
        not pass the two structural checks, '' is returned.  Otherwise the
        accepted ``prop: value;`` declarations are joined with spaces.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']:
                # Shorthand families: accept only when every token is a
                # known keyword or a well-formed value.
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                        not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
432
433
def sanitize_html(htmlSource, encoding, type):
    """Run ``htmlSource`` through ``_HTMLSanitizer`` and return the result.

    ``encoding`` and ``type`` are handed to the sanitizer's constructor.
    The output is stripped of surrounding whitespace and has every
    '\\r\\n' sequence replaced by '\\n'.
    """
    sanitizer = _HTMLSanitizer(encoding, type)
    sanitizer.feed(htmlSource)
    cleaned = sanitizer.output().strip()
    return cleaned.replace('\r\n', '\n')

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/lib/galaxy/util/sanitize_html.py @ 2

異なるフォーマットでダウンロード: