# lexer.py
# Copyright (C) 2006, 2007, 2008, 2009 Michael Bayer mike_mp@zzzcomputing.com
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
6 | |
---|
7 | """provides the Lexer class for parsing template strings into parse trees.""" |
---|
8 | |
---|
9 | import re, codecs |
---|
10 | from mako import parsetree, exceptions |
---|
11 | from mako.pygen import adjust_whitespace |
---|
12 | |
---|
# Compiled-pattern cache shared by every Lexer instance, keyed by
# (pattern string, flags).
_regexp_cache = {}


class Lexer(object):
    """Scan template source text and build a ``parsetree.TemplateNode`` tree.

    The lexer keeps a cursor (``match_position``) into ``text``.  Each
    ``match_*`` method tries to recognise one construct at the cursor and, on
    success, advances the cursor and appends a node to the tree.  ``parse()``
    drives those methods until the text is exhausted.
    """

    def __init__(self, text, filename=None, disable_unicode=False,
                 input_encoding=None, preprocessor=None):
        """Prepare a lexer for ``text``.

        text -- the template source.
        filename -- stored on the root node and on every node / error.
        disable_unicode -- when true, ``parse()`` does not decode ``text``
            and ``escape_code()`` returns code unchanged.
        input_encoding -- encoding used to decode ``text``; replaced by a
            magic encoding comment or utf-8 BOM if ``parse()`` finds one.
        preprocessor -- None, a single callable, or an iterable of callables;
            each is applied to the text, in order, at the start of ``parse()``.
        """
        self.text = text
        self.filename = filename
        self.template = parsetree.TemplateNode(self.filename)
        # line / column at which the most recent successful match started
        self.matched_lineno = 1
        self.matched_charpos = 0
        # line number of the cursor, and the cursor itself
        self.lineno = 1
        self.match_position = 0
        # stacks of currently open tags and control lines
        self.tag = []
        self.control_line = []
        self.disable_unicode = disable_unicode
        self.encoding = input_encoding
        if preprocessor is None:
            self.preprocessor = []
        elif not hasattr(preprocessor, '__iter__'):
            # a lone preprocessor is wrapped so parse() can always iterate
            self.preprocessor = [preprocessor]
        else:
            self.preprocessor = preprocessor

    def _exception_kwargs(self):
        return {
            'source': self.text,
            'lineno': self.matched_lineno,
            'pos': self.matched_charpos,
            'filename': self.filename,
        }

    exception_kwargs = property(
        _exception_kwargs,
        doc="Location keyword arguments (source, lineno, pos, filename) "
            "describing the start of the most recent match; passed to the "
            "exceptions raised by this class.")

    def match(self, regexp, flags=None):
        """Match ``regexp`` (a pattern string) with ``flags`` at the cursor.

        On success the cursor is moved past the match, ``matched_lineno`` /
        ``matched_charpos`` are set to where the match started, ``lineno`` is
        advanced by the number of newlines consumed, and the match object is
        returned.  On failure nothing changes and ``None`` is returned.
        """
        mp = self.match_position
        cache_key = (regexp, flags)
        try:
            reg = _regexp_cache[cache_key]
        except KeyError:
            if flags:
                reg = re.compile(regexp, flags)
            else:
                reg = re.compile(regexp)
            _regexp_cache[cache_key] = reg

        match = reg.match(self.text, mp)
        if match:
            (start, end) = match.span()
            if end == start:
                # a zero-width match still moves the cursor by one so that
                # callers looping on match() cannot stall at one position
                self.match_position = end + 1
            else:
                self.match_position = end
            self.matched_lineno = self.lineno

            # Walk back from the match start to the preceding newline (or the
            # start of the text) to get the column.  The length is taken from
            # the text itself so match() also works before parse() has
            # assigned self.textlength.
            textlength = len(self.text)
            cp = mp - 1
            while cp >= 0 and cp < textlength and self.text[cp] != '\n':
                cp -= 1
            self.matched_charpos = mp - cp
            self.lineno += self.text.count('\n', mp, self.match_position)
        return match

    def parse_until_text(self, *text):
        """Consume text up to the first of the regex fragments in ``text``.

        ``#`` comments running to a newline and quoted strings (single,
        double or triple quoted) are skipped, so a terminator inside them is
        ignored.  A string is closed by the next occurrence of its opening
        quote; backslash escapes are not interpreted.

        Returns ``(consumed_text, terminator)`` with the terminator excluded
        from ``consumed_text``.  Raises ``exceptions.SyntaxException`` for an
        unclosed quote or when no terminator can be found.
        """
        startpos = self.match_position
        terminators = r'|'.join(text)
        while True:
            if self.match(r'#.*\n'):
                continue
            match = self.match(r'(\"\"\"|\'\'\'|\"|\')')
            if match:
                quote = match.group(1)
                if not self.match(r'.*?%s' % quote, re.S):
                    raise exceptions.SyntaxException("Unmatched '%s'" % quote, **self.exception_kwargs)
                continue
            match = self.match(r'(%s)' % terminators)
            if match:
                found = match.group(1)
                return (self.text[startpos:self.match_position - len(found)], found)
            # skip ahead to the next quote, comment or terminator
            if not self.match(r".*?(?=\"|\'|#|%s)" % terminators, re.S):
                raise exceptions.SyntaxException("Expected: %s" % ','.join(text), **self.exception_kwargs)

    def append_node(self, nodecls, *args, **kwargs):
        """Instantiate ``nodecls`` and attach it to the tree.

        ``source``, ``lineno`` and ``pos`` default to the lexer's current
        match location; ``filename`` is always the lexer's.  The node goes
        under the innermost open tag, or under the template root when no tag
        is open.  Tag nodes are pushed on the tag stack; control lines update
        the control-line stack.

        Raises ``exceptions.SyntaxException`` when a non-primary control
        keyword is not accepted by the innermost open control line.
        """
        kwargs.setdefault('source', self.text)
        kwargs.setdefault('lineno', self.matched_lineno)
        kwargs.setdefault('pos', self.matched_charpos)
        kwargs['filename'] = self.filename
        node = nodecls(*args, **kwargs)
        if self.tag:
            self.tag[-1].nodes.append(node)
        else:
            self.template.nodes.append(node)

        if isinstance(node, parsetree.Tag):
            if self.tag:
                node.parent = self.tag[-1]
            self.tag.append(node)
        elif isinstance(node, parsetree.ControlLine):
            if node.isend:
                self.control_line.pop()
            elif node.is_primary:
                self.control_line.append(node)
            elif self.control_line and not self.control_line[-1].is_ternary(node.keyword):
                raise exceptions.SyntaxException("Keyword '%s' not a legal ternary for keyword '%s'" % (node.keyword, self.control_line[-1].keyword), **self.exception_kwargs)

    def escape_code(self, text):
        """Return ``text`` encoded to ascii with backslash escapes.

        The text is returned unchanged when unicode handling is disabled or
        no encoding is known.
        """
        if not self.disable_unicode and self.encoding:
            return text.encode('ascii', 'backslashreplace')
        else:
            return text

    def _decode_text(self):
        """Detect the template's encoding and decode ``self.text``.

        A utf-8 BOM is stripped and forces utf-8.  A magic encoding comment at
        the cursor is consumed and, when present, overrides
        ``self.encoding``.  Unless unicode is disabled, byte text is then
        decoded with that encoding, or with the default codec when none is
        known.

        Raises ``exceptions.CompileException`` when a BOM conflicts with the
        magic comment or when decoding fails.
        """
        if not isinstance(self.text, unicode) and self.text.startswith(codecs.BOM_UTF8):
            self.text = self.text[len(codecs.BOM_UTF8):]
            parsed_encoding = 'utf-8'
            me = self.match_encoding()
            # Compare spelling-insensitively: "UTF-8", "utf8" and "utf_8" all
            # name the codec the BOM implies and are not a conflict.
            if me is not None and me.lower().replace('_', '-') not in ('utf-8', 'utf8'):
                raise exceptions.CompileException("Found utf-8 BOM in file, with conflicting magic encoding comment of '%s'" % me, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
        else:
            parsed_encoding = self.match_encoding()
        if parsed_encoding:
            self.encoding = parsed_encoding

        if not self.disable_unicode and not isinstance(self.text, unicode):
            if self.encoding:
                try:
                    self.text = self.text.decode(self.encoding)
                except UnicodeDecodeError:
                    raise exceptions.CompileException("Unicode decode operation of encoding '%s' failed" % self.encoding, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
            else:
                try:
                    self.text = self.text.decode()
                except UnicodeDecodeError:
                    raise exceptions.CompileException("Could not read template using encoding of 'ascii'. Did you forget a magic encoding comment?", self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)

    def parse(self):
        """Run the preprocessors, decode the text and lex it into a tree.

        Returns the populated ``parsetree.TemplateNode``.  Raises
        ``exceptions.CompileException`` for encoding problems and
        ``exceptions.SyntaxException`` for malformed templates, including
        tags or control lines still open at the end of the text.
        """
        for preproc in self.preprocessor:
            self.text = preproc(self.text)
        self._decode_text()
        self.textlength = len(self.text)

        # The order below is significant: each matcher only sees input that
        # every matcher before it declined.
        while True:
            if self.match_position > self.textlength:
                break

            if self.match_end():
                break
            if self.match_expression():
                continue
            if self.match_control_line():
                continue
            if self.match_comment():
                continue
            if self.match_tag_start():
                continue
            if self.match_tag_end():
                continue
            if self.match_python_block():
                continue
            if self.match_text():
                continue

            if self.match_position > self.textlength:
                break
            # Nothing matched: report it with the same location arguments
            # every other raise site in this class supplies.
            raise exceptions.CompileException("assertion failed", **self.exception_kwargs)

        if self.tag:
            raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
        if self.control_line:
            raise exceptions.SyntaxException("Unterminated control keyword: '%s'" % self.control_line[-1].keyword, self.text, self.control_line[-1].lineno, self.control_line[-1].pos, self.filename)
        return self.template

    def match_encoding(self):
        """Consume a magic encoding comment line at the cursor.

        Returns the declared encoding name, or ``None`` when the line at the
        cursor is not such a comment.
        """
        match = self.match(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
        if match:
            return match.group(1)
        else:
            return None

    def match_tag_start(self):
        """Match an opening tag such as ``<%name attr="value">`` at the cursor.

        Appends a ``parsetree.Tag`` built from the keyword and its quoted
        attributes.  A self-closing tag (``/>``) is closed immediately.  The
        body of a ``<%text>`` tag is taken verbatim up to ``</%text>`` and
        appended as a single ``parsetree.Text`` node.

        Returns True when a tag was consumed and False otherwise; for
        ``<%text>`` the result is that of ``match_tag_end()``.  Raises
        ``exceptions.SyntaxException`` for an unclosed ``<%text>``.
        """
        match = self.match(r'''
            \<%     # opening tag

            ([\w\.\:]+)   # keyword

            ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*)  # attrname, = sign, string expression

            \s*     # more whitespace

            (/)?>   # closing

            ''',

            re.I | re.S | re.X)

        if not match:
            return False

        (keyword, attr, isend) = (match.group(1), match.group(2), match.group(3))
        self.keyword = keyword
        attributes = {}
        if attr:
            for (key, val1, val2) in re.findall(r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
                # exactly one of the two quote styles captured the value
                text = val1 or val2
                text = text.replace('\r\n', '\n')
                attributes[key] = self.escape_code(text)
        self.append_node(parsetree.Tag, keyword, attributes)
        if isend:
            self.tag.pop()
        elif keyword == 'text':
            match = self.match(r'(.*?)(?=\</%text>)', re.S)
            if not match:
                raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
            self.append_node(parsetree.Text, match.group(1))
            return self.match_tag_end()
        return True

    def match_tag_end(self):
        """Match a closing ``</%name>`` tag and pop the tag stack.

        Returns True when a closing tag was consumed and False otherwise.
        Raises ``exceptions.SyntaxException`` when no tag is open or the name
        differs from the innermost open tag.
        """
        match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
        if not match:
            return False
        if not self.tag:
            raise exceptions.SyntaxException("Closing tag without opening tag: </%%%s>" % match.group(1), **self.exception_kwargs)
        elif self.tag[-1].keyword != match.group(1):
            raise exceptions.SyntaxException("Closing tag </%%%s> does not match tag: <%%%s>" % (match.group(1), self.tag[-1].keyword), **self.exception_kwargs)
        self.tag.pop()
        return True

    def match_end(self):
        """Return a true value when the cursor is at the end of the text."""
        match = self.match(r'\Z', re.S)
        if match:
            return match.group() or True
        else:
            return False

    def match_text(self):
        """Match plain text up to the next template construct.

        Appends the text as a ``parsetree.Text`` node.  Returns True when the
        pattern matched and False otherwise.
        """
        match = self.match(r"""
                (.*?)         # anything, followed by:
                (
                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based comment preceded by a consumed \n and whitespace
                 |
                 (?=\${)   # an expression
                 |
                 (?=\#\*) # multiline comment
                 |
                 (?=</?[%&])  # a substitution or block or call start or end
                                              # - don't consume
                 |
                 (\\\r?\n)         # an escaped newline  - throw away
                 |
                 \Z           # end of string
                )""", re.X | re.S)

        if match:
            self.append_node(parsetree.Text, match.group(1))
            return True
        else:
            return False

    def match_python_block(self):
        """Match a ``<% ... %>`` or ``<%! ... %>`` block of Python code.

        Appends a ``parsetree.Code`` node whose second argument is True for
        the ``<%!`` form.  Returns True when a block was consumed and False
        otherwise.
        """
        match = self.match(r"<%(!)?")
        if not match:
            return False
        # remember where the block opened; parse_until_text() moves the
        # matched location while it scans for the terminator
        (line, pos) = (self.matched_lineno, self.matched_charpos)
        (text, end) = self.parse_until_text(r'%>')
        # the trailing newline helps compiler.parse() not complain about indentation
        text = adjust_whitespace(text) + "\n"
        self.append_node(parsetree.Code, self.escape_code(text), match.group(1) == '!', lineno=line, pos=pos)
        return True

    def match_expression(self):
        """Match a ``${expression}`` or ``${expression | filters}`` construct.

        Appends a ``parsetree.Expression`` node holding the expression text
        and the stripped filter string (empty when there is no ``|``).
        Returns True when an expression was consumed and False otherwise.
        """
        match = self.match(r"\${")
        if not match:
            return False
        (line, pos) = (self.matched_lineno, self.matched_charpos)
        (text, end) = self.parse_until_text(r'\|', r'}')
        if end == '|':
            (escapes, end) = self.parse_until_text(r'}')
        else:
            escapes = ""
        text = text.replace('\r\n', '\n')
        self.append_node(parsetree.Expression, self.escape_code(text), escapes.strip(), lineno=line, pos=pos)
        return True

    def match_control_line(self):
        """Match a ``%`` control line or a ``##`` comment line.

        A ``%`` line becomes a ``parsetree.ControlLine``; an ``end<keyword>``
        line must close the innermost open control line.  A ``##`` line
        becomes a ``parsetree.Comment``.  Returns True when a line was
        consumed and False otherwise.  Raises ``exceptions.SyntaxException``
        for an invalid control line or an unmatched ``end`` keyword.
        """
        # NOTE(review): "\\r?\n" below is a backslash, an optional letter
        # "r" and a newline, whereas match_text() uses "\\\r?\n" (optional
        # carriage return).  Backslash + CRLF continuations are therefore not
        # recognised here -- confirm intent before changing the pattern.
        match = self.match(r"(?<=^)[\t ]*(%|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)(?:\r?\n|\Z)", re.M)
        if not match:
            return False

        operator = match.group(1)
        text = match.group(2)
        if operator == '%':
            m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
            if not m2:
                raise exceptions.SyntaxException("Invalid control line: '%s'" % text, **self.exception_kwargs)
            (isend, keyword) = m2.group(1, 2)
            isend = (isend is not None)

            if isend:
                if not self.control_line:
                    raise exceptions.SyntaxException("No starting keyword '%s' for '%s'" % (keyword, text), **self.exception_kwargs)
                elif self.control_line[-1].keyword != keyword:
                    raise exceptions.SyntaxException("Keyword '%s' doesn't match keyword '%s'" % (text, self.control_line[-1].keyword), **self.exception_kwargs)
            self.append_node(parsetree.ControlLine, keyword, isend, self.escape_code(text))
        else:
            self.append_node(parsetree.Comment, text)
        return True

    def match_comment(self):
        """Match a multiline ``<%doc> ... </%doc>`` comment.

        Appends a ``parsetree.Comment`` with the enclosed text.  Returns True
        when a comment was consumed and False otherwise.
        """
        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
        if match:
            self.append_node(parsetree.Comment, match.group(1))
            return True
        else:
            return False
---|