[3] | 1 | # lexer.py |
---|
| 2 | # Copyright (C) 2006, 2007, 2008, 2009 Michael Bayer mike_mp@zzzcomputing.com |
---|
| 3 | # |
---|
| 4 | # This module is part of Mako and is released under |
---|
| 5 | # the MIT License: http://www.opensource.org/licenses/mit-license.php |
---|
| 6 | |
---|
| 7 | """provides the Lexer class for parsing template strings into parse trees.""" |
---|
| 8 | |
---|
| 9 | import re, codecs |
---|
| 10 | from mako import parsetree, exceptions |
---|
| 11 | from mako.pygen import adjust_whitespace |
---|
| 12 | |
---|
# Module-level cache of compiled regular expressions, keyed by the
# (pattern string, flags) pair; filled lazily by Lexer.match().
_regexp_cache = {}
---|
| 14 | |
---|
class Lexer(object):
    """Scan template source text and build a ``parsetree.TemplateNode`` tree.

    The lexer keeps a cursor (``match_position``) into ``text``.  ``parse()``
    repeatedly tries a fixed sequence of ``match_*`` methods at the cursor;
    each successful match advances the cursor and usually appends a node to
    the innermost open tag, or to the template root when no tag is open.
    """

    def __init__(self, text, filename=None, disable_unicode=False, input_encoding=None, preprocessor=None):
        """Set up lexer state for *text*.

        :param text: the template source.
        :param filename: passed to created nodes and raised exceptions.
        :param disable_unicode: when true, ``parse()`` does not decode the
         text and ``escape_code()`` returns code unchanged.
        :param input_encoding: encoding used by ``parse()`` to decode the
         text unless a BOM or magic encoding comment overrides it.
        :param preprocessor: ``None``, a single callable, or an iterable of
         callables; each is applied to the text, in order, by ``parse()``.
        """
        self.text = text
        self.filename = filename
        self.template = parsetree.TemplateNode(self.filename)
        # line / column at which the most recent successful match started
        self.matched_lineno = 1
        self.matched_charpos = 0
        # line number at the current cursor position
        self.lineno = 1
        self.match_position = 0
        # stacks of currently open tags and open primary control lines
        self.tag = []
        self.control_line = []
        self.disable_unicode = disable_unicode
        self.encoding = input_encoding
        if preprocessor is None:
            self.preprocessor = []
        elif not hasattr(preprocessor, '__iter__'):
            # a single callable; wrap it so parse() can always iterate
            self.preprocessor = [preprocessor]
        else:
            self.preprocessor = preprocessor

    @property
    def exception_kwargs(self):
        """Keyword arguments locating the most recent match, for exceptions."""
        return {
            'source': self.text,
            'lineno': self.matched_lineno,
            'pos': self.matched_charpos,
            'filename': self.filename,
        }

    def match(self, regexp, flags=None):
        """Match the given regular expression string and flags at the cursor.

        If a match occurs, advance the cursor past it and update the line
        and column bookkeeping.  A zero-width match still advances the
        cursor by one character.

        Returns the match object, or ``None`` when the pattern does not
        match at the current position.
        """
        mp = self.match_position
        cache_key = (regexp, flags)
        try:
            reg = _regexp_cache[cache_key]
        except KeyError:
            if flags:
                reg = re.compile(regexp, flags)
            else:
                reg = re.compile(regexp)
            _regexp_cache[cache_key] = reg

        match = reg.match(self.text, mp)
        if match:
            (start, end) = match.span()
            if end == start:
                self.match_position = end + 1
            else:
                self.match_position = end
            self.matched_lineno = self.lineno
            # Column of the match start: distance back to the newline that
            # precedes it (or to index -1 when there is none).  The length
            # is taken from the text itself so that match() also works
            # before parse() has run.
            cp = mp - 1
            if 0 <= cp < len(self.text):
                cp = self.text.rfind('\n', 0, cp + 1)
            self.matched_charpos = mp - cp
            # count newlines in the consumed span without copying it
            self.lineno += self.text.count('\n', mp, self.match_position)
        return match

    def parse_until_text(self, *text):
        """Consume source up to the first of the given terminators.

        Each item of *text* is a regular expression fragment; the fragments
        are joined with ``|``.  ``#`` comments running to end of line and
        quoted strings are skipped, so a terminator inside them is ignored.
        A quoted string ends at the next occurrence of its opening quote;
        backslash-escaped quotes are not treated specially.

        Returns a ``(consumed_text, terminator)`` tuple, where
        *consumed_text* excludes the terminator itself.

        Raises ``exceptions.SyntaxException`` for an unterminated quote or
        when no terminator can be found.
        """
        startpos = self.match_position
        terminators = r'|'.join(text)
        while True:
            # a comment through end of line
            if self.match(r'#.*\n'):
                continue
            quote = self.match(r'(\"\"\"|\'\'\'|\"|\')')
            if quote:
                # skip to the matching closing quote
                if not self.match(r'.*?%s' % quote.group(1), re.S):
                    raise exceptions.SyntaxException("Unmatched '%s'" % quote.group(1), **self.exception_kwargs)
                continue
            found = self.match(r'(%s)' % terminators)
            if found:
                end_text = found.group(1)
                return (self.text[startpos:self.match_position - len(end_text)], end_text)
            # advance to the next quote, comment or terminator candidate
            if not self.match(r".*?(?=\"|\'|#|%s)" % terminators, re.S):
                raise exceptions.SyntaxException("Expected: %s" % ','.join(text), **self.exception_kwargs)

    def append_node(self, nodecls, *args, **kwargs):
        """Construct a node of *nodecls* and attach it to the parse tree.

        ``source``, ``lineno`` and ``pos`` default to the position of the
        most recent match unless given in *kwargs*; ``filename`` is always
        set from the lexer.  The node is appended to the innermost open tag,
        or to the template root.  ``parsetree.Tag`` nodes are pushed onto
        the tag stack; ``parsetree.ControlLine`` nodes push, pop or are
        validated against the control-line stack.

        Raises ``exceptions.SyntaxException`` when a non-primary control
        keyword is not accepted by the enclosing control line.
        """
        kwargs.setdefault('source', self.text)
        kwargs.setdefault('lineno', self.matched_lineno)
        kwargs.setdefault('pos', self.matched_charpos)
        kwargs['filename'] = self.filename
        node = nodecls(*args, **kwargs)
        if self.tag:
            self.tag[-1].nodes.append(node)
        else:
            self.template.nodes.append(node)
        if isinstance(node, parsetree.Tag):
            if self.tag:
                node.parent = self.tag[-1]
            self.tag.append(node)
        elif isinstance(node, parsetree.ControlLine):
            if node.isend:
                self.control_line.pop()
            elif node.is_primary:
                self.control_line.append(node)
            elif self.control_line and not self.control_line[-1].is_ternary(node.keyword):
                raise exceptions.SyntaxException("Keyword '%s' not a legal ternary for keyword '%s'" % (node.keyword, self.control_line[-1].keyword), **self.exception_kwargs)

    def escape_code(self, text):
        """Return *text* prepared for embedding as Python code.

        When unicode handling is enabled and an encoding is known, the text
        is encoded to ASCII with backslash escapes for other characters;
        otherwise it is returned unchanged.
        """
        if not self.disable_unicode and self.encoding:
            return text.encode('ascii', 'backslashreplace')
        else:
            return text

    def parse(self):
        """Run the preprocessors, decode the text and lex it into a tree.

        Uses the ``unicode`` builtin, i.e. this method targets Python 2.

        Returns the populated ``parsetree.TemplateNode``.

        Raises ``exceptions.CompileException`` for encoding problems and
        ``exceptions.SyntaxException`` for unclosed tags or control lines.
        """
        for preproc in self.preprocessor:
            self.text = preproc(self.text)
        if not isinstance(self.text, unicode) and self.text.startswith(codecs.BOM_UTF8):
            # strip the BOM; it fixes the encoding to utf-8
            self.text = self.text[len(codecs.BOM_UTF8):]
            parsed_encoding = 'utf-8'
            me = self.match_encoding()
            if me is not None and me != 'utf-8':
                raise exceptions.CompileException("Found utf-8 BOM in file, with conflicting magic encoding comment of '%s'" % me, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
        else:
            parsed_encoding = self.match_encoding()
        if parsed_encoding:
            self.encoding = parsed_encoding
        if not self.disable_unicode and not isinstance(self.text, unicode):
            if self.encoding:
                try:
                    self.text = self.text.decode(self.encoding)
                except UnicodeDecodeError:
                    raise exceptions.CompileException("Unicode decode operation of encoding '%s' failed" % self.encoding, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
            else:
                try:
                    self.text = self.text.decode()
                except UnicodeDecodeError:
                    raise exceptions.CompileException("Could not read template using encoding of 'ascii'.  Did you forget a magic encoding comment?", self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)

        self.textlength = len(self.text)

        while True:
            if self.match_position > self.textlength:
                break

            if self.match_end():
                break
            if self.match_expression():
                continue
            if self.match_control_line():
                continue
            if self.match_comment():
                continue
            if self.match_tag_start():
                continue
            if self.match_tag_end():
                continue
            if self.match_python_block():
                continue
            if self.match_text():
                continue

            if self.match_position > self.textlength:
                break
            # Nothing matched and input remains.  Pass the same positional
            # location arguments as the other CompileException calls in
            # this method.
            raise exceptions.CompileException("assertion failed", self.text, self.matched_lineno, self.matched_charpos, self.filename)

        if self.tag:
            raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
        if self.control_line:
            raise exceptions.SyntaxException("Unterminated control keyword: '%s'" % self.control_line[-1].keyword, self.text, self.control_line[-1].lineno, self.control_line[-1].pos, self.filename)
        return self.template

    def match_encoding(self):
        """Consume a magic encoding comment line at the cursor.

        Returns the encoding name it declares, or ``None`` if the text at
        the cursor is not such a comment.
        """
        match = self.match(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
        if match:
            return match.group(1)
        return None

    def match_tag_start(self):
        """Match an opening ``<%keyword attr="value" ...>`` tag at the cursor.

        Appends a ``parsetree.Tag`` node.  A self-closing tag (``/>``) is
        popped from the tag stack again immediately.  For ``<%text>`` the
        body up to ``</%text>`` is taken verbatim as a ``parsetree.Text``
        node and the closing tag is consumed as well.

        Returns ``True`` if a tag was matched, else ``False``.  Raises
        ``exceptions.SyntaxException`` for an unclosed ``<%text>``.
        """
        match = self.match(r'''
            \<%     # opening tag

            ([\w\.\:]+)   # keyword

            ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*)  # attrname, = sign, string expression

            \s*     # more whitespace

            (/)?>   # closing

            ''',

            re.I | re.S | re.X)

        if not match:
            return False

        (keyword, attr, isend) = match.group(1, 2, 3)
        self.keyword = keyword
        attributes = {}
        if attr:
            for (key, val1, val2) in re.findall(r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
                # exactly one of the two quoting alternatives captured a value
                text = (val1 or val2).replace('\r\n', '\n')
                attributes[key] = self.escape_code(text)
        self.append_node(parsetree.Tag, keyword, attributes)
        if isend:
            self.tag.pop()
        elif keyword == 'text':
            match = self.match(r'(.*?)(?=\</%text>)', re.S)
            if not match:
                raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
            self.append_node(parsetree.Text, match.group(1))
            return self.match_tag_end()
        return True

    def match_tag_end(self):
        """Match a closing ``</%keyword>`` tag and pop the tag stack.

        Returns ``True`` if a closing tag was matched, else ``False``.
        Raises ``exceptions.SyntaxException`` when no tag is open or the
        keyword differs from that of the innermost open tag.
        """
        match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
        if not match:
            return False
        if not self.tag:
            raise exceptions.SyntaxException("Closing tag without opening tag: </%%%s>" % match.group(1), **self.exception_kwargs)
        elif self.tag[-1].keyword != match.group(1):
            raise exceptions.SyntaxException("Closing tag </%%%s> does not match tag: <%%%s>" % (match.group(1), self.tag[-1].keyword), **self.exception_kwargs)
        self.tag.pop()
        return True

    def match_end(self):
        """Return ``True`` if the cursor is at the end of the text."""
        # \Z is zero-width, so the matched string is always empty and the
        # outcome is purely a yes/no answer.
        return self.match(r'\Z', re.S) is not None

    def match_text(self):
        """Match plain text up to the next construct and append it as Text.

        Returns ``True`` if the pattern matched, else ``False``.
        """
        match = self.match(r"""
                (.*?)         # anything, followed by:
                (
                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based comment preceded by a consumed \n and whitespace
                 |
                 (?=\${)   # an expression
                 |
                 (?=\#\*) # multiline comment
                 |
                 (?=</?[%&])  # a substitution or block or call start or end
                                              # - don't consume
                 |
                 (\\\r?\n)    # an escaped newline  - throw away
                 |
                 \Z           # end of string
                )""", re.X | re.S)

        if not match:
            return False
        self.append_node(parsetree.Text, match.group(1))
        return True

    def match_python_block(self):
        """Match a ``<% ... %>`` or ``<%! ... %>`` block of Python code.

        Appends a ``parsetree.Code`` node whose second argument is ``True``
        for the ``<%!`` form.  Returns ``True`` if a block was matched.
        """
        match = self.match(r"<%(!)?")
        if not match:
            return False
        (line, pos) = (self.matched_lineno, self.matched_charpos)
        (text, end) = self.parse_until_text(r'%>')
        text = adjust_whitespace(text) + "\n"   # the trailing newline helps compiler.parse() not complain about indentation
        self.append_node(parsetree.Code, self.escape_code(text), match.group(1) == '!', lineno=line, pos=pos)
        return True

    def match_expression(self):
        """Match a ``${expression | escapes}`` substitution.

        Appends a ``parsetree.Expression`` node carrying the expression
        text and the stripped escapes string (empty when no ``|`` section
        is present).  Returns ``True`` if an expression was matched.
        """
        match = self.match(r"\${")
        if not match:
            return False
        (line, pos) = (self.matched_lineno, self.matched_charpos)
        (text, end) = self.parse_until_text(r'\|', r'}')
        if end == '|':
            (escapes, end) = self.parse_until_text(r'}')
        else:
            escapes = ""
        text = text.replace('\r\n', '\n')
        self.append_node(parsetree.Expression, self.escape_code(text), escapes.strip(), lineno=line, pos=pos)
        return True

    def match_control_line(self):
        """Match a ``%`` control line or a ``##`` comment line.

        A backslash directly before the line ending (LF or CRLF) continues
        the line.  ``%`` lines become ``parsetree.ControlLine`` nodes and
        ``##`` lines become ``parsetree.Comment`` nodes.

        Returns ``True`` if a line was matched, else ``False``.  Raises
        ``exceptions.SyntaxException`` for a malformed control line or an
        ``end`` keyword that does not match the innermost open control line.
        """
        # The continuation alternative is backslash, optional CR, LF -- the
        # same form match_text() uses for escaped newlines.
        match = self.match(r"(?<=^)[\t ]*(%|##)[\t ]*((?:(?:\\\r?\n)|[^\r\n])*)(?:\r?\n|\Z)", re.M)
        if not match:
            return False
        operator = match.group(1)
        text = match.group(2)
        if operator == '%':
            m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
            if not m2:
                raise exceptions.SyntaxException("Invalid control line: '%s'" % text, **self.exception_kwargs)
            (isend, keyword) = m2.group(1, 2)
            isend = (isend is not None)

            if isend:
                if not self.control_line:
                    raise exceptions.SyntaxException("No starting keyword '%s' for '%s'" % (keyword, text), **self.exception_kwargs)
                elif self.control_line[-1].keyword != keyword:
                    raise exceptions.SyntaxException("Keyword '%s' doesn't match keyword '%s'" % (text, self.control_line[-1].keyword), **self.exception_kwargs)
            self.append_node(parsetree.ControlLine, keyword, isend, self.escape_code(text))
        else:
            self.append_node(parsetree.Comment, text)
        return True

    def match_comment(self):
        """Match the multiline ``<%doc>...</%doc>`` version of a comment.

        Appends a ``parsetree.Comment`` node; returns ``True`` on a match.
        """
        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
        if not match:
            return False
        self.append_node(parsetree.Comment, match.group(1))
        return True
---|
| 328 | |
---|