#!/usr/bin/env python # _*_ coding: latin1 _*_ """This is Textile A Humane Web Text Generator TODO: * Make it work with Python 2.1. * Make it work with Python 1.5.2? Or that's too optimistic? --- To get an overview of all PyTextile's features, simply type 'tell me about textile.' in a single line. """ __authors__ = ["Roberto A. F. De Almeida (roberto@dealmeida.net)", "Mark Pilgrim (f8dy@diveintomark.org)"] __version__ = "2.0.10" __date__ = "2004/10/06" __copyright__ = """ Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/ All rights reserved. Original PHP version: Version 1.0 21 Feb, 2003 Copyright (c) 2003, Dean Allen, www.textism.com All rights reserved. Parts of the documentation and some of the regular expressions are (c) Brad Choate, http://bradchoate.com/. Thanks, Brad! """ __license__ = """ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name Textile nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ __history__ = """ 1.0 - 2003/03/19 - MAP - initial release 1.01 - 2003/03/19 - MAP - don't strip whitespace within
 tags;
  map high-bit ASCII to HTML numeric entities
1.02 - 2003/03/19 - MAP - changed hyperlink qtag expression to only
  match valid URL characters (per RFC 2396); fixed preg_replace to
  not match across line breaks (solves lots of problems with
  mistakenly matching overlapping inline markup); fixed whitespace
  stripping to only strip whitespace from beginning and end of lines,
  not immediately before and after HTML tags.
1.03 - 2003/03/20 - MAP - changed hyperlink qtag again to more
  closely match original Textile (fixes problems with links
  immediately followed by punctuation -- somewhere Dean is
  grinning right now); handle curly apostrophe with "ve"
  contraction; clean up empty titles at end.
1.04 - 2003/03/23 - MAP - lstrip input to deal with extra spaces at
  beginning of first line; tweaked list loop to handle consecutive lists
1.1 - 2003/06/06 - MAP - created initial test suite for links and images,
  and fixed a bunch of related bugs to pass them
1.11 - 2003/07/20 - CL - don't demoronise unicode strings; handle
  "they're" properly
1.12 - 2003/07/23 - GW - print debug messages to stderr; handle bq(cite).
1.13 - 2003/07/23 - MAP - wrap bq. text in 

...

2 - 2004/03/26 - RAFA - rewritten from (almost) scratch to include all features from Textile 2 and a little bit more. 2.0.1 - 2004/04/02 - RAFA - Fixed validating function that uses uTidyLib. 2.0.2 - 2004/04/02 - RAFA - Fixed problem with caps letters in URLs. 2.0.3 - 2004/04/19 - RAFA - Multiple classes are allowed, thanks to Dave Anderson. The "lang" attribute is now removed from , to be valid XHTML. Fixed UCAS problem. 2.0.4 - 2004/05/20 - RAFA, CLB - Added inline formatting to table cells. Curt Bergmann fixed a bug with the colspan formatting. Added Amazon Associated id. 2.0.5 - 2004/06/01 - CL - Applied patch from Chris Lawrence to (1) fix that Amazon associates ID was being added to all search URIs, (2) customize the Amazon site used with the AMAZON variable, and (3) added an "isbn" URI type that links directly to an Amazon product by ISBN or Amazon ASIN. 2.0.6 - 2004/06/02 - RAFA - Fixed CAPS problem, again. I hope this is the last time. 2.0.7 - 2004/06/04 - RAFA, MW - Fixed bullet macro, thanks to Adam Messinger. Added patch from Michal Wallace changing {}.pop() for compatibility with Python 2.2.x. 2.0.8 - 2004/06/25 - RAFA - Strip tags when adding the content from a footnote to the reference link. Escaped '<' and '>' in the self- generated documentation. 2.0.9 - 2004/10/04 - RAFA - In images, if ALT is not defined, add an empty attribute. Added "LaTeX" style open/close quotes. Fixed a bug where the acronym definition was being formatted with inline rules. Handle "broken" lines correctly, removing the
from inside split HTML tags. 2.0.10 - 2004/10/06 - RAFA, LO - Escape all non-escaped ampersands. Applied "trivial patch" from Ludvig Omholt to remove newline right after the
 tag.
"""

# Set your encoding here: the charset the input text is assumed to
# be in. The text is decoded with this codec before being re-encoded
# to OUTPUT (see Textiler.process()).
ENCODING = 'latin-1'

# Output? Non-ASCII characters will be automatically
# converted to XML entities if you choose ASCII.
OUTPUT = 'ascii'

# PyTextile can optionally validate the generated
# XHTML code. We can use either mxTidy or uTidyLib.
# You can change the default behaviour here (0 = off, 1 = on);
# it can also be overridden per call via process(validate=...).
VALIDATE = 0

# If you want h1. to be translated to something other
# than <h1>, change this offset. You can also pass it
# as an argument to textile().
HEAD_OFFSET = 0

# If you want to use itex2mml, specify the full path
# to the binary here. You can download it from here:
# http://golem.ph.utexas.edu/~distler/blog/files/itexToMML.tar.gz
itex2mml = None
#itex2mml = '/usr/local/bin/itex2MML'
#itex2mml = '/usr/people/almeida/bin/itex2MML'

# PyTextile can optionally sanitize the generated XHTML,
# which is good for weblog comments or if you don't trust
# yourself.
SANITIZE = 0

# Turn debug on?
DEBUGLEVEL = 0

# Amazon associate for links: "keywords":amazon
# If you don't have one, please consider leaving mine here as
# a small compensation for writing PyTextile. It's commented
# off as default.
#amazon_associate_id = 'bomtempo-21'
amazon_associate_id = None

#AMAZON = 'www.amazon.co.uk'
AMAZON = 'www.amazon.com'

import re
import sys
import os
import sgmllib
import unicodedata


def _in_tag(text, tag):
    """Extract the text from inside the first occurrence of *tag*.

    This function extracts the text from inside a given tag.
    It's useful to get the text between <body> or <pre> when
    using the validators or the colorizer.

    If the opening tag is not found, *text* is returned unchanged;
    if the closing tag is missing, everything after the opening tag
    is returned.
    """
    # Split off everything up to and including the opening tag
    # (the '<%s' prefix match also catches tags with attributes,
    # e.g. '<pre class="...">').
    if text.count('<%s' % tag):
        text = text.split('<%s' % tag, 1)[1]
        if text.count('>'):
            text = text.split('>', 1)[1]
    # Keep only what precedes the closing tag, if any.
    if text.count('</%s' % tag):
        text = text.split('</%s' % tag)[0]
    return text
from input. code = _in_tag(code_out.getvalue(), 'pre') # Fix newlines. code = code.replace('\n', '\n') return code except ImportError: htmlizer = None # PyTextile can optionally validate the generated # XHTML code using either mxTidy or uTidyLib. try: # This is mxTidy. from mx.Tidy import Tidy def _tidy1(text): """mxTidy's XHTML validator. This function is a wrapper to mxTidy's validator. """ nerrors, nwarnings, text, errortext = Tidy.tidy(text, output_xhtml=1, numeric_entities=1, wrap=0) return _in_tag(text, 'body') _tidy = _tidy1 except ImportError: try: # This is uTidyLib. import tidy def _tidy2(text): """uTidyLib's XHTML validator. This function is a wrapper to uTidyLib's validator. """ text = tidy.parseString(text, output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0) return _in_tag(str(text), 'body') _tidy = _tidy2 except ImportError: _tidy = None # This is good for debugging. def _debug(s, level=1): """Outputs debug information to sys.stderr. This function outputs debug information if DEBUGLEVEL is higher than a given treshold. """ if DEBUGLEVEL >= level: print >> sys.stderr, s ############################# # Useful regular expressions. parameters = { # Horizontal alignment. 'align': r'''(?:(?:<>|[<>=]) # Either '<>', '<', '>' or '=' (?![^\s]*(?:<>|[<>=]))) # Look-ahead to ensure it happens once ''', # Horizontal padding. 'padding': r'''(?:[\(\)]+) # Any number of '(' and/or ')' ''', # Class and/or id. 'classid': r'''( # (?:\(\#[\w]+\)) # (#id) | # (?:\((?:[\w]+(?:\s[\w]+)*) # (?:\#[\w]+)?\)) # (class1 class2 ... classn#id) or (class1 class2 ... classn) ) # (?![^\s]*(?:\([\w#]+\))) # must happen once ''', # Language. 'lang': r'''(?:\[[\w-]+\]) # [lang] (?![^\s]*(?:\[.*?\])) # must happen once ''', # Style. 'style': r'''(?:{[^\}]+}) # {style} (?![^\s]*(?:{.*?})) # must happen once ''', } res = { # Punctuation. 'punct': r'''[\!"#\$%&'()\*\+,\-\./:;<=>\?@\[\\\]\^_`{\|}\~]''', # URL regular expression. 
'url': r'''(?=[a-zA-Z0-9./#]) # Must start correctly (?: # Match the leading part (proto://hostname, or just hostname) (?:ftp|https?|telnet|nntp) # protocol :// # :// (?: # Optional 'username:password@' \w+ # username (?::\w+)? # optional :password @ # @ )? # [-\w]+(?:\.\w[-\w]*)+ # hostname (sub.example.com) | # (?:mailto:)? # Optional mailto: [-\+\w]+ # username \@ # at [-\w]+(?:\.\w[-\w]*)+ # hostname | # (?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+ # domain without protocol (?:com\b # TLD | edu\b # | biz\b # | gov\b # | in(?:t|fo)\b # .int or .info | mil\b # | net\b # | org\b # | museum\b # | aero\b # | coop\b # | name\b # | pro\b # | [a-z][a-z]\b # two-letter country codes ) # )? # (?::\d+)? # Optional port number (?: # Rest of the URL, optional /? # Start with '/' [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]* # Can't start with these (?: # [.!,?;:]+ # One or more of these [^.!,?;:"'<>()\[\]{}\s\x7F-\xFF]+ # Can't finish with these #'" # # or ' or " )* # )? # ''', # Block attributes. 'battr': r'''(?P # (?: %(align)s # alignment | %(classid)s # class and/or id | %(padding)s # padding tags | %(lang)s # [lang] | %(style)s # {style} )+ # )? # ''' % parameters, # (Un)ordered list attributes. 'olattr': r'''(?P # (?: %(align)s # alignment | ((?:\(\#[\w]+\)) # (#id) | # (?:\((?:[\w]+(?:\s[\w]+)*) # (?:\#[\w]+)?\)) # (class1 class2 ... classn#id) or (class1 class2 ... classn) ) # | %(padding)s # padding tags | %(lang)s # [lang] | %(style)s # {style} )+ # )? # ''' % parameters, # List item attributes. 'liattr': r'''(?P # (?: %(align)s # alignment | %(classid)s # class and/or id | %(padding)s # padding tags | %(lang)s # [lang] | %(style)s # {style} )+ # )? # ''' % parameters, # Qtag attributes. 'qattr': r'''(?P # (?: %(classid)s # class and/or id | %(lang)s # [lang] | %(style)s # {style} )+ # )? # ''' % parameters, # Link attributes. 'lattr': r'''(?P # Links attributes (?: %(align)s # alignment | %(classid)s # class and/or id | %(lang)s # [lang] | %(style)s # {style} )+ # )? 
# ''' % parameters, # Image attributes. 'iattr': r'''(?P # (?: # (?: [<>]+ # horizontal alignment tags (?![^\s]*(?:[<>]))) # (must happen once) | # (?: [\-\^~]+ # vertical alignment tags (?![^\s]*(?:[\-\^~]))) # (must happen once) | %(classid)s # class and/or id | %(padding)s # padding tags | %(style)s # {style} )+ # )? # ''' % parameters, # Resize attributes. 'resize': r'''(?: # (?:([\d]+%?)x([\d]+%?)) # 20x10 | # (?: # or (?:([\d]+)%?w\s([\d]+)%?h) # 10h 20w | # or (?:([\d]+)%?h\s([\d]+)%?w) # 20w 10h ) # )? # ''', # Table attributes. 'tattr': r'''(?P # (?: # (?: [\^~] # vertical alignment (?![^\s]*(?:[\^~]))) # (must happen once) | %(align)s # alignment | %(lang)s # [lang] | %(style)s # {style} | %(classid)s # class and/or id | %(padding)s # padding | _ # is this a header row/cell? | \\\d+ # colspan | /\d+ # rowspan )+ # )? # ''' % parameters, } def preg_replace(pattern, replacement, text): """Alternative re.sub that handles empty groups. This acts like re.sub, except it replaces empty groups with '' instead of raising an exception. """ def replacement_func(matchobj): counter = 1 rc = replacement _debug(matchobj.groups()) for matchitem in matchobj.groups(): if not matchitem: matchitem = '' rc = rc.replace(r'\%s' % counter, matchitem) counter += 1 return rc p = re.compile(pattern) _debug(pattern) return p.sub(replacement_func, text) def html_replace(pattern, replacement, text): """Replacement outside HTML tags. Does a preg_replace only outside HTML tags. """ # If there is no html, do a simple search and replace. if not re.search(r'''<.*>''', text): return preg_replace(pattern, replacement, text) else: lines = [] # Else split the text into an array at <>. for line in re.split('(<.*?>)', text): if not re.match('<.*?>', line): line = preg_replace(pattern, replacement, line) lines.append(line) return ''.join(lines) # PyTextile can optionally sanitize the generated XHTML, # which is good for weblog comments. This code is from # Mark Pilgrim's feedparser. 
class _BaseHTMLProcessor(sgmllib.SGMLParser): elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param'] def __init__(self): sgmllib.SGMLParser.__init__(self) def reset(self): self.pieces = [] sgmllib.SGMLParser.reset(self) def normalize_attrs(self, attrs): # utility method to be called by descendants attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs] attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] return attrs def unknown_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for
, tag="pre", attrs=[("class", "screen")]
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        if tag in self.elements_no_end_tag:
            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
        else:
            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
        
    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for 
, tag will be "pre" # Reconstruct the original end tag. if tag not in self.elements_no_end_tag: self.pieces.append("" % locals()) def handle_charref(self, ref): # called for each character reference, e.g. for " ", ref will be "160" # Reconstruct the original character reference. self.pieces.append("&#%(ref)s;" % locals()) def handle_entityref(self, ref): # called for each entity reference, e.g. for "©", ref will be "copy" # Reconstruct the original entity reference. self.pieces.append("&%(ref)s;" % locals()) def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. self.pieces.append(text) def handle_comment(self, text): # called for each HTML comment, e.g. # Reconstruct the original comment. self.pieces.append("" % locals()) def handle_pi(self, text): # called for each processing instruction, e.g. # Reconstruct original processing instruction. self.pieces.append("" % locals()) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. 
# # Reconstruct original DOCTYPE self.pieces.append("" % locals()) def output(self): """Return processed HTML as a single string""" return "".join(self.pieces) class _HTMLSanitizer(_BaseHTMLProcessor): acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var'] acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'] unacceptable_elements_with_end_tag = ['script', 'applet'] # This if for MathML. 
mathml_elements = ['math', 'mi', 'mn', 'mo', 'mrow', 'msup'] mathml_attributes = ['mode', 'xmlns'] acceptable_elements = acceptable_elements + mathml_elements acceptable_attributes = acceptable_attributes + mathml_attributes def reset(self): _BaseHTMLProcessor.reset(self) self.unacceptablestack = 0 def unknown_starttag(self, tag, attrs): if not tag in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack += 1 return attrs = self.normalize_attrs(attrs) attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) def unknown_endtag(self, tag): if not tag in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 return _BaseHTMLProcessor.unknown_endtag(self, tag) def handle_pi(self, text): pass def handle_decl(self, text): pass def handle_data(self, text): if not self.unacceptablestack: _BaseHTMLProcessor.handle_data(self, text) class Textiler: """Textile formatter. This is the base class for the PyTextile text processor. """ def __init__(self, text=''): """Instantiate the class, passing the text to be formatted. Here we pre-process the text and collect all the link lookups for later. """ self.text = text # Basic regular expressions. self.res = res # Smart searches. 
self.searches = {} self.searches['imdb'] = 'http://www.imdb.com/Find?for=%s' self.searches['google'] = 'http://www.google.com/search?q=%s' self.searches['python'] = 'http://www.python.org/doc/current/lib/module-%s.html' if amazon_associate_id: self.searches['isbn'] = ''.join(['http://', AMAZON, '/exec/obidos/ASIN/%s/', amazon_associate_id]) self.searches['amazon'] = ''.join(['http://', AMAZON, '/exec/obidos/external-search?mode=blended&keyword=%s&tag=', amazon_associate_id]) else: self.searches['isbn'] = ''.join(['http://', AMAZON, '/exec/obidos/ASIN/%s']) self.searches['amazon'] = ''.join(['http://', AMAZON, '/exec/obidos/external-search?mode=blended&keyword=%s']) # These are the blocks we know. self.signatures = [ # Paragraph. (r'''^p # Paragraph signature %(battr)s # Paragraph attributes (?P\.) # . (?P\.)? # Extended paragraph denoted by a second dot \s # whitespace (?P.*) # text ''' % self.res, self.paragraph), # Pre-formatted text. (r'''^pre # Pre signature %(battr)s # Pre attributes (?P\.) # . (?P\.)? # Extended pre denoted by a second dot \s # whitespace (?P.*) # text ''' % self.res, self.pre), # Block code. (r'''^bc # Blockcode signature %(battr)s # Blockcode attributes (?P\.) # . (?P\.)? # Extended blockcode denoted by a second dot \s # whitespace (?P.*) # text ''' % self.res, self.bc), # Blockquote. (r'''^bq # Blockquote signature %(battr)s # Blockquote attributes (?P\.) # . (?P\.)? # Extended blockquote denoted by a second dot (:(?P # Optional cite attribute ( # %(url)s # URL | "[\w]+(?:\s[\w]+)*" # "Name inside quotes" )) # )? # \s # whitespace (?P.*) # text ''' % self.res, self.blockquote), # Header. (r'''^h # Header signature (?P
\d) # Header number %(battr)s # Header attributes (?P\.) # . (?P\.)? # Extended header denoted by a second dot \s # whitespace (?P.*) # text ''' % self.res, self.header), # Footnote. (r'''^fn # Footnote signature (?P[\d]+) # Footnote number (?P\.) # . (?P\.)? # Extended footnote denoted by a second dot \s # whitespace (?P.*) # text ''', self.footnote), # Definition list. (r'''^dl # Definition list signature %(battr)s # Definition list attributes (?P\.) # . (?P\.)? # Extended definition list denoted by a second dot \s # whitespace (?P.*) # text ''' % self.res, self.dl), # Ordered list (attributes to first
  • ). (r'''^%(olattr)s # Ordered list attributes \# # Ordered list signature %(liattr)s # List item attributes (?P\.)? # . \s # whitespace (?P.*) # text ''' % self.res, self.ol), # Unordered list (attributes to first
  • ). (r'''^%(olattr)s # Unrdered list attributes \* # Unordered list signature %(liattr)s # Unordered list attributes (?P\.)? # . \s # whitespace (?P.*) # text ''' % self.res, self.ul), # Escaped text. (r'''^==?(?P.*?)(==)?$ # Escaped text ''', self.escape), (r'''^(?P<.*)$ # XHTML tag ''', self.escape), # itex code. (r'''^(?P # itex code \\\[ # starts with \[ .*? # complicated mathematical equations go here \\\]) # ends with \] ''', self.itex), # Tables. (r'''^table # Table signature %(tattr)s # Table attributes (?P\.) # . (?P\.)? # Extended blockcode denoted by a second dot \s # whitespace (?P.*) # text ''' % self.res, self.table), # Simple tables. (r'''^(?P \| .*) ''', self.table), # About. (r'''^(?Ptell\sme\sabout\stextile\.)$''', self.about), ] def preprocess(self): """Pre-processing of the text. Remove whitespace, fix carriage returns. """ # Remove whitespace. self.text = self.text.strip() # Zap carriage returns. self.text = self.text.replace("\r\n", "\n") self.text = self.text.replace("\r", "\n") # Minor sanitizing. self.text = self.sanitize(self.text) def grab_links(self): """Grab link lookups. Check the text for link lookups, store them in a dictionary, and clean them up. """ # Grab links like this: '[id]example.com' links = {} p = re.compile(r'''(?:^|\n)\[([\w]+?)\](%(url)s)(?:$|\n)''' % self.res, re.VERBOSE) for key, link in p.findall(self.text): links[key] = link # And clear them from the text. self.text = p.sub('', self.text) return links def process(self, head_offset=HEAD_OFFSET, validate=VALIDATE, sanitize=SANITIZE, output=OUTPUT, encoding=ENCODING): """Process the text. Here we actually process the text, splitting the text in blocks and applying the corresponding function to each one of them. """ # Basic global changes. self.preprocess() # Grab lookup links and clean them from the text. self._links = self.grab_links() # Offset for the headers. self.head_offset = head_offset # Process each block. 
self.blocks = self.split_text() text = [] for [function, captures] in self.blocks: text.append(function(**captures)) text = '\n\n'.join(text) # Add titles to footnotes. text = self.footnotes(text) # Convert to desired output. text = unicode(text, encoding) text = text.encode(output, 'xmlcharrefreplace') # Sanitize? if sanitize: p = _HTMLSanitizer() p.feed(text) text = p.output() # Validate output. if _tidy and validate: text = _tidy(text) return text def sanitize(self, text): """Fix single tags. Fix tags like ,
    and
    . --- h1. Sanitizing Textile can help you generate valid XHTML(eXtensible HyperText Markup Language). It will fix any single tags that are not properly closed, like @@, @
    @ and @
    @. If you have "mx.Tidy":http://www.egenix.com/files/python/mxTidy.html and/or "µTidyLib":http://utidylib.sourceforge.net/ installed, it also can optionally validate the generated code with these wrappers to ensure 100% valid XHTML(eXtensible HyperText Markup Language). """ # Fix single tags like and
    . text = preg_replace(r'''<(img|br|hr)(.*?)(?:\s*/?\s*)?>''', r'''<\1\2 />''', text) # Remove ampersands. text = preg_replace(r'''&(?!#?[xX]?(?:[0-9a-fA-F]+|\w{1,8});)''', r'''&''', text) return text def split_text(self): """Process the blocks from the text. Split the blocks according to the signatures, join extended blocks and associate each one of them with a function to process them. --- h1. Blocks Textile process your text by dividing it in blocks. Each block is identified by a signature and separated from other blocks by an empty line. All signatures should end with a period followed by a space. A header @

    @ can be done this way: pre. h1. This is a header 1. Blocks may continue for multiple paragraphs of text. If you want a block signature to stay "active", use two periods after the signature instead of one. For example: pre.. bq.. This is paragraph one of a block quote. This is paragraph two of a block quote. =p. Now we're back to a regular paragraph. p. Becomes: pre..

    This is paragraph one of a block quote.

    This is paragraph two of a block quote.

    Now we’re back to a regular paragraph.

    p. The blocks can be customised by adding parameters between the signature and the period. These include: dl. {style rule}:A CSS(Cascading Style Sheets) style rule. [ll]:A language identifier (for a "lang" attribute). (class) or (#id) or (class#id):For CSS(Cascading Style Sheets) class and id attributes. >, <, =, <>:Modifier characters for alignment. Right-justification, left-justification, centered, and full-justification. The paragraph will also receive the class names "right", "left", "center" and "justify", respectively. ( (one or more):Adds padding on the left. 1em per "(" character is applied. When combined with the align-left or align-right modifier, it makes the block float. ) (one or more):Adds padding on the right. 1em per ")" character is applied. When combined with the align-left or align-right modifier, it makes the block float. Here's an overloaded example: pre. p(())>(class#id)[en]{color:red}. A simple paragraph. Becomes: pre.

    A simple paragraph.

    """ # Clear signature. clear_sig = r'''^clear(?P[<>])?\.$''' clear = None extending = 0 # We capture the \n's because they are important inside "pre..". blocks = re.split(r'''(\n{2,})''', self.text) output = [] for block in blocks: # Check for the clear signature. m = re.match(clear_sig, block) if m: clear = m.group('alignment') if clear: clear = {'<': 'clear:left;', '>': 'clear:right;'}[clear] else: clear = 'clear:both;' else: # Check each of the code signatures. for regexp, function in self.signatures: p = re.compile(regexp, (re.VERBOSE | re.DOTALL)) m = p.match(block) if m: # Put everything in a dictionary. captures = m.groupdict() # If we are extending a block, we require a dot to # break it, so we can start lines with '#' inside # an extended
     without matching an ordered list.
                            if extending and not captures.get('dot', None):
                                output[-1][1]['text'] += block
                                break
                            # 'dot' is only a signature marker; remove it so it
                            # is not passed as a keyword to the block handler.
                            elif captures.has_key('dot'):
                                del captures['dot']
                                
                            # If a signature matches, we are not extending a block.
                            extending = 0
    
                            # Check if we should extend this block ('extend'
                            # captured the second dot of a '..' signature).
                            if captures.has_key('extend'):
                                extending = captures['extend']
                                del captures['extend']
                                
                            # Apply head_offset.
                            if captures.has_key('header'):
                                captures['header'] = int(captures['header']) + self.head_offset
    
                            # Apply clear.
                            if clear:
                                captures['clear'] = clear
                                clear = None
    
                            # Save the block to be processed later.
                            output.append([function, captures])
    
                            break
    
                    # for/else: no signature matched this block.
                    else:
                        if extending:
                            # Append the text to the last block.
                            output[-1][1]['text'] += block
                        elif block.strip():
                            # Plain text with no signature defaults to a paragraph.
                            output.append([self.paragraph, {'text': block}])
        
            return output
    
    
        def parse_params(self, parameters, clear=None, align_type='block'):
            """Parse the parameters from a block signature.
    
            This function parses the parameters from a block signature,
            splitting the information about class, id, language and
            style. The positioning (indentation and alignment) is parsed
            and stored in the style.
    
            A paragraph like:
    
                p>(class#id){color:red}[en]. Paragraph.
    
            or:
                
                p{color:red}[en](class#id)>. Paragraph.
    
            will have its parameters parsed to:
    
                output = {'lang' : 'en',
                          'class': 'class',
                          'id'   : 'id',
                          'style': 'color:red;text-align:right;'}
    
            Note that order is not important.
            """
            if not parameters:
                if clear:
                    return {'style': clear}
                else:
                    return {}
    
            output = {}
            
            # Match class from (class) or (class#id).
            m = re.search(r'''\((?P[\w]+(\s[\w]+)*)(\#[\w]+)?\)''', parameters)
            if m: output['class'] = m.group('class')
    
            # Match id from (#id) or (class#id).
            m = re.search(r'''\([\w]*(\s[\w]+)*\#(?P[\w]+)\)''', parameters)
            if m: output['id'] = m.group('id')
    
            # Match [language].
            m = re.search(r'''\[(?P[\w-]+)\]''', parameters)
            if m: output['lang'] = m.group('lang')
    
            # Match {style}.
            m = re.search(r'''{(?P