| 1 | # -*- coding: utf-8 -*- | 
|---|
| 2 | # | 
|---|
| 3 | # Copyright (C) 2008 Edgewall Software | 
|---|
| 4 | # All rights reserved. | 
|---|
| 5 | # | 
|---|
| 6 | # This software is licensed as described in the file COPYING, which | 
|---|
| 7 | # you should have received as part of this distribution. The terms | 
|---|
| 8 | # are also available at http://babel.edgewall.org/wiki/License. | 
|---|
| 9 | # | 
|---|
| 10 | # This software consists of voluntary contributions made by many | 
|---|
| 11 | # individuals. For the exact contribution history, see the revision | 
|---|
| 12 | # history and logs, available at http://babel.edgewall.org/log/. | 
|---|
| 13 |  | 
|---|
| 14 | """A simple JavaScript 1.5 lexer which is used for the JavaScript | 
|---|
| 15 | extractor. | 
|---|
| 16 | """ | 
|---|
| 17 |  | 
|---|
| 18 | import re | 
|---|
| 19 | from operator import itemgetter | 
|---|
| 20 |  | 
|---|
| 21 |  | 
|---|
| 22 | operators = [ | 
|---|
| 23 |     '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=', | 
|---|
| 24 |     '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=', | 
|---|
| 25 |     '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')', | 
|---|
| 26 |     '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':' | 
|---|
| 27 | ] | 
|---|
| 28 | operators.sort(lambda a, b: cmp(-len(a), -len(b))) | 
|---|
| 29 |  | 
|---|
| 30 | escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'} | 
|---|
| 31 |  | 
|---|
| 32 | rules = [ | 
|---|
| 33 |     (None, re.compile(r'\s+(?u)')), | 
|---|
| 34 |     (None, re.compile(r'<!--.*')), | 
|---|
| 35 |     ('linecomment', re.compile(r'//.*')), | 
|---|
| 36 |     ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')), | 
|---|
| 37 |     ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')), | 
|---|
| 38 |     ('number', re.compile(r'''(?x)( | 
|---|
| 39 |         (?:0|[1-9]\d*) | 
|---|
| 40 |         (\.\d+)? | 
|---|
| 41 |         ([eE][-+]?\d+)? | | 
|---|
| 42 |         (0x[a-fA-F0-9]+) | 
|---|
| 43 |     )''')), | 
|---|
| 44 |     ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))), | 
|---|
| 45 |     ('string', re.compile(r'''(?xs)( | 
|---|
| 46 |         '(?:[^'\\]*(?:\\.[^'\\]*)*)'  | | 
|---|
| 47 |         "(?:[^"\\]*(?:\\.[^"\\]*)*)" | 
|---|
| 48 |     )''')) | 
|---|
| 49 | ] | 
|---|
| 50 |  | 
|---|
| 51 | division_re = re.compile(r'/=?') | 
|---|
| 52 | regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)') | 
|---|
| 53 | line_re = re.compile(r'(\r\n|\n|\r)') | 
|---|
| 54 | line_join_re = re.compile(r'\\' + line_re.pattern) | 
|---|
| 55 | uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}') | 
|---|
| 56 |  | 
|---|
| 57 |  | 
|---|
class Token(tuple):
    """A lexer token: an immutable ``(type, value, lineno)`` triple.

    Instances are plain tuples with read-only named access to the three
    fields; `tokenize` yields them.
    """
    # No per-instance ``__dict__``: a token is nothing but its tuple data.
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        return super(Token, cls).__new__(cls, (type, value, lineno))

    @property
    def type(self):
        """First field of the tuple: the token type."""
        return self[0]

    @property
    def value(self):
        """Second field of the tuple: the token value."""
        return self[1]

    @property
    def lineno(self):
        """Third field of the tuple: the line number."""
        return self[2]
|---|
| 68 |  | 
|---|
| 69 |  | 
|---|
def indicates_division(token):
    """Tell whether a ``/`` right after `token` should be read as a
    division operator.

    :param token: the token most recently produced by the tokenizer
    :return: `True` if `token` is a closing bracket, ``++`` or ``--``, or a
             name, number, string or regexp token; `False` otherwise
    """
    kind = token.type
    if kind != 'operator':
        return kind in ('name', 'number', 'string', 'regexp')
    # Among operators only closing brackets and ++/-- qualify.
    return token.value in (')', ']', '}', '++', '--')
|---|
| 77 |  | 
|---|
| 78 |  | 
|---|
def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start and
    end with the same string delimiter (``'`` or ``"``).

    A backslash followed by a line terminator is reduced to the line
    terminator alone.  The escapes listed in `escapes` are replaced by the
    character they denote, ``\\uXXXX`` (exactly four hexadecimal digits) by
    the corresponding character, and for any other escape the backslash is
    simply removed.  A lone backslash at the very end of the string has
    nothing to escape and is dropped as well.

    :param string: the string literal including its delimiters
    :return: a string
    :raise AssertionError: if the string is not properly delimited
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])

    # ``unichr`` only exists on Python 2; on Python 3 ``chr`` does the job.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    result = []
    add = result.append
    pos = 0
    length = len(string)

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # dangling backslash at the end of the string: there is no
        # character left to escape, so treat it like a bogus escape and
        # drop it instead of failing with an IndexError.
        if escape_pos + 1 >= length:
            pos = length
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  try to consume up to four characters of
        # hexadecimal characters and try to interpret them as unicode
        # character point.  If there is no such character point, put
        # all the consumed characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(to_char(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        # skip the backslash, the ``u`` and four digits
                        pos = escape_pos + 6
                        continue
                # fewer than four digits (or no such character): keep the
                # ``u`` and the digits verbatim, minus the backslash
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < length:
        add(string[pos:])

    return u''.join(result)
|---|
| 135 |  | 
|---|
| 136 |  | 
|---|
def tokenize(source):
    """Split JavaScript source code into tokens.

    Whitespace and ``<!--`` lines are consumed silently; characters that no
    pattern can match are skipped one at a time.

    :param source: the JavaScript source as a string
    :return: generator of `Token` objects
    """
    may_divide = False
    lineno = 1
    cursor = 0
    source_length = len(source)

    while cursor < source_length:
        # Try the ordinary rules in order; the first one that matches wins.
        token_type = match = None
        for candidate_type, pattern in rules:
            match = pattern.match(source, cursor)
            if match is not None:
                token_type = candidate_type
                break

        if match is None:
            # None of the ordinary rules matched.  The text may still be a
            # division operator or a regular expression literal; which one
            # is attempted depends on `may_divide`, i.e. on what
            # `indicates_division` said about the last yielded token.
            if may_divide:
                token_type, fallback = 'operator', division_re
            else:
                token_type, fallback = 'regexp', regex_re
            match = fallback.match(source, cursor)
            if match is None:
                # Invalid syntax: skip a single character and retry.
                cursor += 1
                continue

        text = match.group()
        if token_type is not None:
            token = Token(token_type, text, lineno)
            may_divide = indicates_division(token)
            yield token
        # Skipped text (e.g. whitespace) advances the line counter too.
        lineno += len(line_re.findall(text))
        cursor = match.end()
|---|