# -*- coding: utf-8 -*-
#
# Copyright (C) 2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.
"""Basic infrastructure for extracting localizable messages from source files.

This module defines an extensible system for collecting localizable message
strings from a variety of sources. A native extractor for Python source files
is built in; extractors for other sources can be added using very simple
plugins.

The main entry points into the extraction functionality are the functions
`extract_from_dir` and `extract_from_file`.
"""
import os
import sys
from textwrap import dedent
from tokenize import generate_tokens, COMMENT, NAME, OP, STRING

from babel.util import parse_encoding, pathmatch, relpath

try:
    set
except NameError:
    # Fall back to the ``sets`` module when there is no builtin ``set``
    from sets import Set as set
# Public API of this module
__all__ = ['extract', 'extract_from_dir', 'extract_from_file']
__docformat__ = 'restructuredtext en'
# Entry point group in which extraction methods are looked up by name
GROUP_NAME = 'babel.extractors'

# Maps the names of translation functions to the 1-based positions of the
# arguments that hold localizable strings; ``None`` means "first argument"
DEFAULT_KEYWORDS = {
    '_': None,
    'gettext': None,
    'ngettext': (1, 2),
    'ugettext': None,
    'ungettext': (1, 2),
    'dgettext': (2,),
    'dngettext': (2, 3),
    'N_': None
}

# Default ``(pattern, method)`` mapping used by `extract_from_dir`
DEFAULT_MAPPING = [('**.py', 'python')]

# Written to stderr by `extract` when a message has an empty msgid; the
# placeholder receives the "filename:lineno" location
empty_msgid_warning = (
'%s: warning: Empty msgid.  It is reserved by GNU gettext: gettext("") '
'returns the header entry with meta information, not the empty string.')
def _strip_comment_tags(comments, tags):
    """Helper function for `extract` that strips comment tags from strings
    in a list of comment lines.  This function operates in-place.

    :param comments: the list of comment lines; it is modified in place
    :param tags: the comment tags to remove from the start of each line
    """
    def _strip(line):
        for tag in tags:
            if line.startswith(tag):
                # Only the first matching tag is removed
                return line[len(tag):].strip()
        return line
    # Slice assignment keeps the caller's list object, so the stripped lines
    # are visible through every reference to it
    comments[:] = [_strip(line) for line in comments]
def extract_from_dir(dirname=None, method_map=DEFAULT_MAPPING,
                     options_map=None, keywords=DEFAULT_KEYWORDS,
                     comment_tags=(), callback=None, strip_comment_tags=False):
    """Extract messages from any source files found in the given directory.

    This function generates tuples of the form:

        ``(filename, lineno, message, comments)``

    Which extraction method is used per file is determined by the `method_map`
    parameter, which maps extended glob patterns to extraction method names.
    For example, the following is the default mapping:

    >>> method_map = [
    ...     ('**.py', 'python')
    ... ]

    This basically says that files with the filename extension ".py" at any
    level inside the directory should be processed by the "python" extraction
    method. Files that don't match any of the mapping patterns are ignored. See
    the documentation of the `pathmatch` function for details on the pattern
    syntax.

    The following extended mapping would also use the "genshi" extraction
    method on any file in "templates" subdirectory:

    >>> method_map = [
    ...     ('**/templates/**.*', 'genshi'),
    ...     ('**.py', 'python')
    ... ]

    The dictionary provided by the optional `options_map` parameter augments
    these mappings. It uses extended glob patterns as keys, and the values are
    dictionaries mapping options names to option values (both strings).

    The glob patterns of the `options_map` do not necessarily need to be the
    same as those used in the method mapping. For example, while all files in
    the ``templates`` folders in an application may be Genshi applications, the
    options for those files may differ based on extension:

    >>> options_map = {
    ...     '**/templates/**.txt': {
    ...         'template_class': 'genshi.template:TextTemplate',
    ...         'encoding': 'latin-1'
    ...     },
    ...     '**/templates/**.html': {
    ...         'include_attrs': ''
    ...     }
    ... }

    Sub-directories whose name starts with ``.`` or ``_`` are not searched.

    :param dirname: the path to the directory to extract messages from; if
                    omitted, the current working directory at the time of the
                    call is used
    :param method_map: a list of ``(pattern, method)`` tuples that map
                       extended glob patterns to extraction method names; the
                       first matching pattern decides the method for a file
    :param options_map: a dictionary of additional options (optional)
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of tags of translator comments to search for
                         and include in the results
    :param callback: a function that is called for every file that messages
                     are extracted from, just before the extraction itself is
                     performed; the function is passed the filename, the name
                     of the extraction method and the options dictionary as
                     positional arguments, in that order
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :return: an iterator over ``(filename, lineno, message, comments)`` tuples
    :rtype: ``iterator``
    :see: `pathmatch`
    """
    if dirname is None:
        # Resolved per call; a ``os.getcwd()`` default in the signature would
        # be frozen at import time
        dirname = os.getcwd()
    if options_map is None:
        options_map = {}

    absname = os.path.abspath(dirname)
    for root, dirnames, filenames in os.walk(absname):
        # Prune hidden/private directories in place so that os.walk() does not
        # descend into them.  The list is rebuilt rather than mutated while it
        # is being iterated, which would skip the entry following each removal.
        dirnames[:] = [subdir for subdir in dirnames
                       if not (subdir.startswith('.') or
                               subdir.startswith('_'))]
        dirnames.sort()
        filenames.sort()
        for filename in filenames:
            # Path of the file relative to `dirname`, with "/" separators so
            # that it can be matched against the glob patterns
            filename = relpath(
                os.path.join(root, filename).replace(os.sep, '/'),
                dirname
            )
            for pattern, method in method_map:
                if pathmatch(pattern, filename):
                    filepath = os.path.join(absname, filename)
                    options = {}
                    # If several option patterns match, the one visited last
                    # in the dictionary's iteration order wins
                    for opattern, odict in options_map.items():
                        if pathmatch(opattern, filename):
                            options = odict
                    if callback:
                        callback(filename, method, options)
                    for lineno, message, comments in \
                          extract_from_file(method, filepath,
                                            keywords=keywords,
                                            comment_tags=comment_tags,
                                            options=options,
                                            strip_comment_tags=
                                                strip_comment_tags):
                        yield filename, lineno, message, comments
                    # Only the first matching method is applied to a file
                    break
def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS,
                      comment_tags=(), options=None, strip_comment_tags=False):
    """Extract messages from a specific file.

    This function returns a list of tuples of the form:

        ``(lineno, message, comments)``

    :param method: a string specifying the extraction method (e.g. "python")
    :param filename: the path to the file to extract messages from
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :return: the list of extracted messages
    :rtype: `list`
    """
    # 'U' requests universal newline handling from the Python 2 ``open()``
    fileobj = open(filename, 'U')
    try:
        # `extract` is lazy; the list must be built before the file is closed
        return list(extract(method, fileobj, keywords, comment_tags, options,
                            strip_comment_tags))
    finally:
        fileobj.close()
def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(),
            options=None, strip_comment_tags=False):
    """Extract messages from the given file-like object using the specified
    extraction method.

    This function generates tuples of the form:

        ``(lineno, message, comments)``

    The implementation dispatches the actual extraction to plugins, based on the
    value of the ``method`` parameter.

    >>> source = '''# foo module
    ... def run(argv):
    ...    print _('Hello, world!')
    ... '''

    >>> from StringIO import StringIO
    >>> for message in extract('python', StringIO(source)):
    ...     print message
    (3, u'Hello, world!', [])

    Messages that do not provide all the arguments required by the keyword's
    specification are skipped.  Messages whose first msgid is empty are skipped
    as well, after a warning has been written to ``sys.stderr``.

    :param method: a string specifying the extraction method (e.g. "python");
                   if this is a simple name, the extraction function will be
                   looked up by entry point; if it is an explicit reference
                   to a function (of the form ``package.module:funcname`` or
                   ``package.module.funcname``), the corresponding function
                   will be imported and used
    :param fileobj: the file-like object the messages should be extracted from
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :return: an iterator over the extracted messages
    :rtype: ``iterator``
    :raise ValueError: if the extraction method is not registered; as this
                       function is a generator, the error surfaces when the
                       result is first iterated
    """
    func = None
    if ':' in method or '.' in method:
        # Explicit reference to an extraction function
        if ':' not in method:
            lastdot = method.rfind('.')
            module, attrname = method[:lastdot], method[lastdot + 1:]
        else:
            module, attrname = method.split(':', 1)
        # The non-empty fromlist makes __import__ return the leaf module
        func = getattr(__import__(module, {}, {}, [attrname]), attrname)
    else:
        try:
            from pkg_resources import working_set
        except ImportError:
            # pkg_resources is not available, so we resort to looking up the
            # builtin extractors directly
            builtin = {'ignore': extract_nothing, 'python': extract_python}
            func = builtin.get(method)
        else:
            # Use the first entry point registered under the method name
            for entry_point in working_set.iter_entry_points(GROUP_NAME,
                                                             method):
                func = entry_point.load(require=True)
                break
    if func is None:
        raise ValueError('Unknown extraction method %r' % method)

    results = func(fileobj, keywords.keys(), comment_tags,
                   options=options or {})

    for lineno, funcname, messages, comments in results:
        # The spec lists the 1-based argument positions holding the messages;
        # without one, the first argument is used
        if funcname:
            spec = keywords[funcname] or (1,)
        else:
            spec = (1,)
        if not isinstance(messages, (list, tuple)):
            messages = [messages]
        if not messages:
            continue

        # Validate the messages against the keyword's specification
        msgs = []
        invalid = False
        # last_index is 1 based like the keyword spec
        last_index = len(messages)
        for index in spec:
            if last_index < index:
                # Not enough arguments
                invalid = True
                break
            message = messages[index - 1]
            if message is None:
                # The argument was not a string literal
                invalid = True
                break
            msgs.append(message)
        if invalid:
            continue

        first_msg_index = spec[0] - 1
        if not messages[first_msg_index]:
            # An empty string msgid isn't valid, emit a warning
            where = '%s:%i' % (getattr(fileobj, 'name', None) or '(unknown)',
                               lineno)
            sys.stderr.write(empty_msgid_warning % where + '\n')
            continue

        # A single message is yielded as a plain string, several as a tuple
        messages = tuple(msgs)
        if len(messages) == 1:
            messages = messages[0]

        if strip_comment_tags:
            _strip_comment_tags(comments, comment_tags)
        yield lineno, messages, comments
def extract_nothing(fileobj, keywords, comment_tags, options):
    """Pseudo extractor that does not actually extract anything, but simply
    returns an empty list.

    All parameters are accepted for compatibility with the other extraction
    functions and are ignored.

    :return: an empty list
    :rtype: `list`
    """
    return []
def extract_python(fileobj, keywords, comment_tags, options):
    """Extract messages from Python source code.

    The source is tokenized and scanned for calls to any of the functions
    named in `keywords`; the string literals found among the top-level
    arguments of such a call are collected as its messages.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional); the
                    ``encoding`` option is used when `parse_encoding` returns
                    nothing for the file, with ``iso-8859-1`` as the default
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    funcname = lineno = message_lineno = None
    # -1: not inside a keyword call; 0: at the top level of a keyword call's
    # argument list; > 0: inside nested parentheses within that call
    call_stack = -1
    # Pieces of the string argument currently being collected
    buf = []
    messages = []
    translator_comments = []
    in_def = in_translator_comments = False
    comment_tag = None

    encoding = parse_encoding(fileobj) or options.get('encoding', 'iso-8859-1')

    tokens = generate_tokens(fileobj.readline)
    for tok, value, (lineno, _), _, _ in tokens:
        if call_stack == -1 and tok == NAME and value in ('def', 'class'):
            in_def = True
        elif tok == OP and value == '(':
            if in_def:
                # Avoid false positives for declarations such as:
                # def gettext(arg='message'):
                in_def = False
                continue
            if funcname:
                message_lineno = lineno
                call_stack += 1
        elif in_def and tok == OP and value == ':':
            # End of a class definition without parens
            in_def = False
            continue
        elif call_stack == -1 and tok == COMMENT:
            # Strip the comment token from the line
            value = value.decode(encoding)[1:].strip()
            if in_translator_comments and \
                    translator_comments[-1][0] == lineno - 1:
                # We're already inside a translator comment, continue appending
                translator_comments.append((lineno, value))
                continue
            # If execution reaches this point, let's see if comment line
            # starts with one of the comment tags
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    in_translator_comments = True
                    translator_comments.append((lineno, value))
                    break
        elif funcname and call_stack == 0:
            if tok == OP and value == ')':
                # End of the keyword call: flush the last argument; an
                # argument without string literals is recorded as None
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)

                if len(messages) > 1:
                    messages = tuple(messages)
                else:
                    messages = messages[0]
                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                        translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                yield (message_lineno, funcname, messages,
                       [comment[1] for comment in translator_comments])

                # Reset the scanner state for the next call
                funcname = lineno = message_lineno = None
                call_stack = -1
                messages = []
                translator_comments = []
                in_translator_comments = False
            elif tok == STRING:
                # Unwrap quotes in a safe manner, maintaining the string's
                # encoding: the coding line prepended to the literal tells
                # eval() how to decode it (see the SourceForge tracker item
                # aid=617979, group_id=5470)
                # NOTE(review): eval() runs on text taken from the scanned
                # file; it is only applied to STRING tokens and with empty
                # builtins, but confirm that this is acceptable for untrusted
                # sources
                value = eval('# coding=%s\n%s' % (encoding, value),
                             {'__builtins__':{}}, {})
                if isinstance(value, str):
                    value = value.decode(encoding)
                buf.append(value)
            elif tok == OP and value == ',':
                # End of one argument
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)
                if translator_comments:
                    # We have translator comments, and since we're on a
                    # comma(,) user is allowed to break into a new line
                    # Let's increase the last comment's lineno in order
                    # for the comment to still be a valid one
                    old_lineno, old_comment = translator_comments.pop()
                    translator_comments.append((old_lineno+1, old_comment))
        elif call_stack > 0 and tok == OP and value == ')':
            call_stack -= 1
        elif funcname and call_stack == -1:
            # The keyword was not followed by an opening parenthesis
            funcname = None
        elif tok == NAME and value in keywords:
            funcname = value
def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional); the
                    ``encoding`` option (default ``utf-8``) is used to decode
                    the source
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    from babel.messages.jslexer import tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    # String value of the argument currently being collected, if any
    last_argument = None
    translator_comments = []
    # Set after a "+" so that the next string literal is appended to the
    # current argument instead of replacing it
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    # -1: not inside a keyword call; 0: at the top level of a keyword call's
    # argument list; > 0: inside nested parentheses within that call
    call_stack = -1

    for token in tokenize(fileobj.read().decode(encoding)):
        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            # Drop the leading "//"
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                # Continuation of a translator comment from the previous line
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            # Drop the "/*" and "*/" delimiters
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        # Remove the indentation shared by the following lines
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append((token.lineno + offset,
                                                        line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                # End of the keyword call: flush the pending argument
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                # Reset the scanner state for the next call
                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type == 'string':
                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    # End of one argument; an argument without a string
                    # literal is recorded as None
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
             and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            # The keyword was not followed by an opening parenthesis
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
             token.value in keywords and \
             (last_token is None or last_token.type != 'name' or
              last_token.value != 'function'):
            # A keyword directly after "function" is a declaration, not a call
            funcname = token.value

        last_token = token
