source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/contrib/markdown/markdown2.py

main
Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago

Historial Limpio

  • Property mode set to 100755
File size: 96.7 KB
Line 
1#!/usr/bin/env python
2# Copyright (c) 2012 Trent Mick.
3# Copyright (c) 2007-2008 ActiveState Corp.
4# License: MIT (http://www.opensource.org/licenses/mit-license.php)
5
6from __future__ import generators
7from __future__ import print_function
8
9r"""A fast and complete Python implementation of Markdown.
10
11[from http://daringfireball.net/projects/markdown/]
12> Markdown is a text-to-HTML filter; it translates an easy-to-read /
13> easy-to-write structured text format into HTML.  Markdown's text
14> format is most similar to that of plain text email, and supports
15> features such as headers, *emphasis*, code blocks, blockquotes, and
16> links.
17>
18> Markdown's syntax is designed not as a generic markup language, but
19> specifically to serve as a front-end to (X)HTML. You can use span-level
20> HTML tags anywhere in a Markdown document, and you can use block level
21> HTML tags (like <div> and <table> as well).
22
23Module usage:
24
25    >>> import markdown2
26    >>> markdown2.markdown("*boo!*")  # or use `html = markdown_path(PATH)`
27    u'<p><em>boo!</em></p>\n'
28
29    >>> markdowner = Markdown()
30    >>> markdowner.convert("*boo!*")
31    u'<p><em>boo!</em></p>\n'
32    >>> markdowner.convert("**boom!**")
33    u'<p><strong>boom!</strong></p>\n'
34
35This implementation of Markdown implements the full "core" syntax plus a
36number of extras (e.g., code syntax coloring, footnotes) as described on
37<https://github.com/trentm/python-markdown2/wiki/Extras>.
38"""
39
# Long-form help text for the command-line driver (optparse is imported
# below; presumably used as the -h/--help description by the CLI entry
# point, which is outside this chunk -- TODO confirm).
cmdln_desc = """A fast and complete Python implementation of Markdown, a
text-to-HTML conversion tool for web writers.

Supported extra syntax options (see -x|--extras option below and
see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):

* code-friendly: Disable _ and __ for em and strong.
* cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
* fenced-code-blocks: Allows a code block to not have to be indented
  by fencing it with '```' on a line before and after. Based on
  <http://github.github.com/github-flavored-markdown/> with support for
  syntax highlighting.
* footnotes: Support footnotes as in use on daringfireball.net and
  implemented in other Markdown processors (tho not in Markdown.pl v1.0.1).
* header-ids: Adds "id" attributes to headers. The id value is a slug of
  the header text.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
  string to use for a "class" tag attribute. Currently only supports "img",
  "table", "pre" and "code" tags. Add an issue if you require this for other
  tags.
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
  have markdown processing be done on its contents. Similar to
  <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
  some limitations.
* metadata: Extract metadata from a leading '---'-fenced block.
  See <https://github.com/trentm/python-markdown2/issues/77> for details.
* nofollow: Add `rel="nofollow"` to add `<a>` tags with an href. See
  <http://en.wikipedia.org/wiki/Nofollow>.
* pyshell: Treats unindented Python interactive shell sessions as <code>
  blocks.
* link-patterns: Auto-link given regex patterns in text (e.g. bug number
  references, revision number references).
* smarty-pants: Replaces ' and " with curly quotation marks or curly
  apostrophes.  Replaces --, ---, ..., and . . . with en dashes, em dashes,
  and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
  click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* toc: The returned HTML string gets a new "toc_html" attribute which is
  a Table of Contents for the document. (experimental)
* xml: Passes one-liner processing instructions and namespaced XML tags.
* tables: Tables using the same format as GFM
  <https://help.github.com/articles/github-flavored-markdown#tables> and
  PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* wiki-tables: Google Code Wiki-style tables. See
  <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
"""
86
87# Dev Notes:
88# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
89#   not yet sure if there implications with this. Compare 'pydoc sre'
90#   and 'perldoc perlre'.
91
# Package version kept as a tuple for programmatic comparison; the dotted
# string form is derived from it so the two can never drift apart.
__version_info__ = (2, 3, 1)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"
95
96import sys
97import re
98import logging
99try:
100    from hashlib import md5
101except ImportError:
102    from md5 import md5
103import optparse
104from random import random, randint
105import codecs
106
107
#---- Python version compat

# `reversed` became a builtin in Python 2.4; provide a generator
# fallback for older interpreters (it only needs to support sequences).
if sys.version_info[:2] < (2,4):
    def reversed(sequence):
        for i in sequence[::-1]:
            yield i

# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
if sys.version_info[0] <= 2:
    py3 = False
    try:
        bytes
    except NameError:
        # Very old Python 2 without the `bytes` alias for `str`.
        bytes = str
    # `basestring` only exists on Python 2.
    base_string_type = basestring
elif sys.version_info[0] >= 3:
    py3 = True
    unicode = str
    base_string_type = str
127
128
129
130#---- globals
131
DEBUG = False
log = logging.getLogger("markdown")

DEFAULT_TAB_WIDTH = 4


# Per-process random salt mixed into every placeholder hash so crafted
# input cannot collide with the hash keys of real blocks.
#
# BUG FIX: this was `bytes(randint(0, 1000000))`, which on Python 3 does
# NOT produce the digits of the number -- `bytes(n)` creates `n` NUL
# bytes, i.e. a zero-filled buffer of random length up to ~1MB. Encoding
# the decimal string gives the intended short salt on both Python 2
# (where `bytes is str`) and Python 3.
SECRET_SALT = str(randint(0, 1000000)).encode("utf-8")

def _hash_text(s):
    """Return a salted placeholder key for `s`, e.g. 'md5-<32 hex chars>'."""
    return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()

# Table of hash values for escaped characters:
g_escape_table = dict([(ch, _hash_text(ch))
    for ch in '\\`*_{}[]()>#+-.!'])
145
146
147
148#---- exceptions
149
class MarkdownError(Exception):
    """Exception type for this module (raised by processing code
    outside this chunk -- no raise sites are visible here)."""
    pass
152
153
154
155#---- public api
156
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Read the file at `path` (decoded with `encoding`) and convert its
    contents from Markdown to HTML.

    All other arguments are passed straight through to the `Markdown`
    constructor; see it for their meaning. Returns the converted HTML.
    """
    # `with` guarantees the handle is closed even if read() raises
    # (the original leaked the file object on a read error).
    with codecs.open(path, 'r', encoding) as fp:
        text = fp.read()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
168
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """One-shot convenience: build a `Markdown` converter from the given
    options and return its `convert(text)` result.
    """
    converter = Markdown(html4tags=html4tags, tab_width=tab_width,
                         safe_mode=safe_mode, extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
176
class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    # Per-conversion state, (re)initialized by reset():
    urls = None         # link-definition id -> URL (see _extract_link_def_sub)
    titles = None       # link-definition id -> optional title
    html_blocks = None  # placeholder hash -> raw block-level HTML
    html_spans = None   # placeholder hash -> raw span HTML (safe mode only)
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    # Lines containing only spaces/tabs; convert() blanks these out so
    # later regexes can match blank-line runs with a plain `\n+`.
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
197
198    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
199                 extras=None, link_patterns=None, use_file_vars=False):
200        if html4tags:
201            self.empty_element_suffix = ">"
202        else:
203            self.empty_element_suffix = " />"
204        self.tab_width = tab_width
205
206        # For compatibility with earlier markdown2.py and with
207        # markdown.py's safe_mode being a boolean,
208        #   safe_mode == True -> "replace"
209        if safe_mode is True:
210            self.safe_mode = "replace"
211        else:
212            self.safe_mode = safe_mode
213
214        # Massaging and building the "extras" info.
215        if self.extras is None:
216            self.extras = {}
217        elif not isinstance(self.extras, dict):
218            self.extras = dict([(e, None) for e in self.extras])
219        if extras:
220            if not isinstance(extras, dict):
221                extras = dict([(e, None) for e in extras])
222            self.extras.update(extras)
223        assert isinstance(self.extras, dict)
224        if "toc" in self.extras and not "header-ids" in self.extras:
225            self.extras["header-ids"] = None   # "toc" implies "header-ids"
226        self._instance_extras = self.extras.copy()
227
228        self.link_patterns = link_patterns
229        self.use_file_vars = use_file_vars
230        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
231
232        self._escape_table = g_escape_table.copy()
233        if "smarty-pants" in self.extras:
234            self._escape_table['"'] = _hash_text('"')
235            self._escape_table["'"] = _hash_text("'")
236
237    def reset(self):
238        self.urls = {}
239        self.titles = {}
240        self.html_blocks = {}
241        self.html_spans = {}
242        self.list_level = 0
243        self.extras = self._instance_extras.copy()
244        if "footnotes" in self.extras:
245            self.footnotes = {}
246            self.footnote_ids = []
247        if "header-ids" in self.extras:
248            self._count_from_header_id = {} # no `defaultdict` in Python 2.4
249        if "metadata" in self.extras:
250            self.metadata = {}
251
    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    # Captures the tag name and everything through `href=` so convert()
    # can splice ` rel="nofollow"` in between for the "nofollow" extra.
    _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
255
    def convert(self, text):
        """Convert the given Markdown `text` to HTML.

        Returns a `UnicodeWithAttrs` (a unicode/str subclass) so that
        extras can attach attributes to the result: "toc" attaches the
        table of contents, "metadata" attaches the extracted metadata
        dict.
        """
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # strip metadata from head and extract
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        text = self.preprocess(text)

        # In non-safe mode fenced code blocks are handled up front;
        # in safe mode they are handled only after HTML spans/blocks
        # have been hashed (see below).
        if "fenced-code-blocks" in self.extras and not self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras and self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        text += "\n"

        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            # `self._toc` is presumably populated during header
            # processing (defined outside this chunk) -- TODO confirm.
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv
353
354    def postprocess(self, text):
355        """A hook for subclasses to do some postprocessing of the html, if
356        desired. This is called before unescaping of special chars and
357        unhashing of raw HTML spans.
358        """
359        return text
360
361    def preprocess(self, text):
362        """A hook for subclasses to do some preprocessing of the Markdown, if
363        desired. This is called after basic formatting of the text, but prior
364        to any extras, safe mode, etc. processing.
365        """
366        return text
367
    # Is metadata if the content starts with '---'-fenced `key: value`
    # pairs. E.g. (indented for presentation):
    #   ---
    #   foo: bar
    #   another-var: blah blah
    #   ---
    # Group 1 captures the `key: value` lines between the two fences.
    _metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
375
376    def _extract_metadata(self, text):
377        # fast test
378        if not text.startswith("---"):
379            return text
380        match = self._metadata_pat.match(text)
381        if not match:
382            return text
383
384        tail = text[len(match.group(0)):]
385        metadata_str = match.group(1).strip()
386        for line in metadata_str.split('\n'):
387            key, value = line.split(':', 1)
388            self.metadata[key.strip()] = value.strip()
389
390        return tail
391
392
    # Matches an emacs one-liner such as `-*- mode: markdown -*-`,
    # capturing the text between the `-*-` markers as group 1.
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #    PREFIX Local Variables: SUFFIX
    #    PREFIX mode: Tcl SUFFIX
    #    PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    # - `\1` in the content group backreferences the captured `prefix`,
    #   so the block only matches through a repeated-prefix "End:" line.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
408
409    def _get_emacs_vars(self, text):
410        """Return a dictionary of emacs-style local variables.
411
412        Parsing is done loosely according to this spec (and according to
413        some in-practice deviations from this):
414        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
415        """
416        emacs_vars = {}
417        SIZE = pow(2, 13) # 8kB
418
419        # Search near the start for a '-*-'-style one-liner of variables.
420        head = text[:SIZE]
421        if "-*-" in head:
422            match = self._emacs_oneliner_vars_pat.search(head)
423            if match:
424                emacs_vars_str = match.group(1)
425                assert '\n' not in emacs_vars_str
426                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
427                                  if s.strip()]
428                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
429                    # While not in the spec, this form is allowed by emacs:
430                    #   -*- Tcl -*-
431                    # where the implied "variable" is "mode". This form
432                    # is only allowed if there are no other variables.
433                    emacs_vars["mode"] = emacs_var_strs[0].strip()
434                else:
435                    for emacs_var_str in emacs_var_strs:
436                        try:
437                            variable, value = emacs_var_str.strip().split(':', 1)
438                        except ValueError:
439                            log.debug("emacs variables error: malformed -*- "
440                                      "line: %r", emacs_var_str)
441                            continue
442                        # Lowercase the variable name because Emacs allows "Mode"
443                        # or "mode" or "MoDe", etc.
444                        emacs_vars[variable.lower()] = value.strip()
445
446        tail = text[-SIZE:]
447        if "Local Variables" in tail:
448            match = self._emacs_local_vars_pat.search(tail)
449            if match:
450                prefix = match.group("prefix")
451                suffix = match.group("suffix")
452                lines = match.group("content").splitlines(0)
453                #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
454                #      % (prefix, suffix, match.group("content"), lines)
455
456                # Validate the Local Variables block: proper prefix and suffix
457                # usage.
458                for i, line in enumerate(lines):
459                    if not line.startswith(prefix):
460                        log.debug("emacs variables error: line '%s' "
461                                  "does not use proper prefix '%s'"
462                                  % (line, prefix))
463                        return {}
464                    # Don't validate suffix on last line. Emacs doesn't care,
465                    # neither should we.
466                    if i != len(lines)-1 and not line.endswith(suffix):
467                        log.debug("emacs variables error: line '%s' "
468                                  "does not use proper suffix '%s'"
469                                  % (line, suffix))
470                        return {}
471
472                # Parse out one emacs var per line.
473                continued_for = None
474                for line in lines[:-1]: # no var on the last line ("PREFIX End:")
475                    if prefix: line = line[len(prefix):] # strip prefix
476                    if suffix: line = line[:-len(suffix)] # strip suffix
477                    line = line.strip()
478                    if continued_for:
479                        variable = continued_for
480                        if line.endswith('\\'):
481                            line = line[:-1].rstrip()
482                        else:
483                            continued_for = None
484                        emacs_vars[variable] += ' ' + line
485                    else:
486                        try:
487                            variable, value = line.split(':', 1)
488                        except ValueError:
489                            log.debug("local variables error: missing colon "
490                                      "in local variables entry: '%s'" % line)
491                            continue
492                        # Do NOT lowercase the variable name, because Emacs only
493                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
494                        value = value.strip()
495                        if value.endswith('\\'):
496                            value = value[:-1].rstrip()
497                            continued_for = variable
498                        else:
499                            continued_for = None
500                        emacs_vars[variable] = value
501
502        # Unquote values.
503        for var, val in list(emacs_vars.items()):
504            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
505               or val.startswith('"') and val.endswith('"')):
506                emacs_vars[var] = val[1:-1]
507
508        return emacs_vars
509
    # Cribbed from a post by Bart Lateur:
    # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
    # One match per tab; `.` does not cross newlines, and because subn()
    # consumes matches sequentially, group 1 is the text since the last
    # tab (or line start).
    _detab_re = re.compile(r'(.*?)\t', re.M)
    def _detab_sub(self, match):
        # Replace the tab with enough spaces to reach the next tab stop
        # (replacement lengths are multiples of tab_width, so column
        # arithmetic stays consistent across successive matches).
        g1 = match.group(1)
        return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
516    def _detab(self, text):
517        r"""Remove (leading?) tabs from a file.
518
519            >>> m = Markdown()
520            >>> m._detab("\tfoo")
521            '    foo'
522            >>> m._detab("  \tfoo")
523            '    foo'
524            >>> m._detab("\t  foo")
525            '      foo'
526            >>> m._detab("  foo")
527            '  foo'
528            >>> m._detab("  foo\n\tbar\tblam")
529            '  foo\n    bar blam'
530        """
531        if '\t' not in text:
532            return text
533        return self._detab_re.subn(self._detab_sub, text)[0]
534
    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b.  This way html5 tags are easy to keep track of.
    # Note: starts with '|' so it can be appended directly onto the
    # alternation strings below.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    # "Strict" form: the closing tag must sit at the start of a line
    # (it directly follows full `.*\n` lines), so same-name nested tags
    # must be indented in order not to terminate the match early.
    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    # "Liberal" form: like the strict one, but the end tag need not be
    # at the start of a line (note the `.*` before `</\2>`).
    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    # Matches a `markdown="1"` / `markdown='1'` attribute (used by the
    # "markdown-in-html" extra).
    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
573    def _hash_html_block_sub(self, match, raw=False):
574        html = match.group(1)
575        if raw and self.safe_mode:
576            html = self._sanitize_html(html)
577        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
578            first_line = html.split('\n', 1)[0]
579            m = self._html_markdown_attr_re.search(first_line)
580            if m:
581                lines = html.split('\n')
582                middle = '\n'.join(lines[1:-1])
583                last_line = lines[-1]
584                first_line = first_line[:m.start()] + first_line[m.end():]
585                f_key = _hash_text(first_line)
586                self.html_blocks[f_key] = first_line
587                l_key = _hash_text(last_line)
588                self.html_blocks[l_key] = last_line
589                return ''.join(["\n\n", f_key,
590                    "\n\n", middle, "\n\n",
591                    l_key, "\n\n"])
592        key = _hash_text(html)
593        self.html_blocks[key] = html
594        return "\n\n" + key + "\n\n"
595
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    # No more comment openers: done.
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    # Unterminated comment: stop scanning.
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        # NOTE(review): this `break` aborts the entire
                        # comment scan, so any later standalone comments in
                        # the document are left unhashed -- confirm this
                        # mirrors Markdown.pl rather than being an
                        # accidental early exit (a `continue` would keep
                        # scanning).
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be following by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    # Not a standalone comment; skip it and keep scanning.
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text
703
704    def _strip_link_definitions(self, text):
705        # Strips link definitions from text, stores the URLs and titles in
706        # hash references.
707        less_than_tab = self.tab_width - 1
708
709        # Link defs are in the form:
710        #   [id]: url "optional title"
711        _link_def_re = re.compile(r"""
712            ^[ ]{0,%d}\[(.+)\]: # id = \1
713              [ \t]*
714              \n?               # maybe *one* newline
715              [ \t]*
716            <?(.+?)>?           # url = \2
717              [ \t]*
718            (?:
719                \n?             # maybe one newline
720                [ \t]*
721                (?<=\s)         # lookbehind for whitespace
722                ['"(]
723                ([^\n]*)        # title = \3
724                ['")]
725                [ \t]*
726            )?  # title is optional
727            (?:\n+|\Z)
728            """ % less_than_tab, re.X | re.M | re.U)
729        return _link_def_re.sub(self._extract_link_def_sub, text)
730
731    def _extract_link_def_sub(self, match):
732        id, url, title = match.groups()
733        key = id.lower()    # Link IDs are case-insensitive
734        self.urls[key] = self._encode_amps_and_angles(url)
735        if title:
736            self.titles[key] = title
737        return ""
738
739    def _extract_footnote_def_sub(self, match):
740        id, text = match.groups()
741        text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
742        normed_id = re.sub(r'\W', '-', id)
743        # Ensure footnote text ends with a couple newlines (for some
744        # block gamut matches).
745        self.footnotes[normed_id] = text + "\n\n"
746        return ""
747
748    def _strip_footnote_definitions(self, text):
749        """A footnote definition looks like this:
750
751            [^note-id]: Text of the note.
752
753                May include one or more indented paragraphs.
754
755        Where,
756        - The 'note-id' can be pretty much anything, though typically it
757          is the number of the footnote.
758        - The first paragraph may start on the next line, like so:
759
760            [^note-id]:
761                Text of the note.
762        """
763        less_than_tab = self.tab_width - 1
764        footnote_def_re = re.compile(r'''
765            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
766            [ \t]*
767            (                       # footnote text = \2
768              # First line need not start with the spaces.
769              (?:\s*.*\n+)
770              (?:
771                (?:[ ]{%d} | \t)  # Subsequent lines must be indented.
772                .*\n+
773              )*
774            )
775            # Lookahead for non-space at line-start, or end of doc.
776            (?:(?=^[ ]{0,%d}\S)|\Z)
777            ''' % (less_than_tab, self.tab_width, self.tab_width),
778            re.X | re.M)
779        return footnote_def_re.sub(self._extract_footnote_def_sub, text)
780
781    _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
782
    def _run_block_gamut(self, text):
        """Apply all block-level transformations (headers, hr's, lists,
        code blocks, blockquotes, optional tables) to *text*.

        NB: the ordering of the steps below matters; paragraphs are
        formed last, after newly-generated block HTML has been hashed
        out of the way.
        """
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        text = self._do_headers(text)

        # Do Horizontal Rules:
        # On the number of spaces in horizontal rules: The spec is fuzzy: "If
        # you wish, you may use spaces between the hyphens or asterisks."
        # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
        # hr chars to one or two. We'll reproduce that limit here.
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        text = re.sub(self._hr_re, hr, text)

        text = self._do_lists(text)

        # Optional table/pyshell extras run between lists and code blocks.
        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)
        if "wiki-tables" in self.extras:
            text = self._do_wiki_tables(text)
        if "tables" in self.extras:
            text = self._do_tables(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text
822
823    def _pyshell_block_sub(self, match):
824        lines = match.group(0).splitlines(0)
825        _dedentlines(lines)
826        indent = ' ' * self.tab_width
827        s = ('\n' # separate from possible cuddled paragraph
828             + indent + ('\n'+indent).join(lines)
829             + '\n\n')
830        return s
831
832    def _prepare_pyshell_blocks(self, text):
833        """Ensure that Python interactive shell sessions are put in
834        code blocks -- even if not properly indented.
835        """
836        if ">>>" not in text:
837            return text
838
839        less_than_tab = self.tab_width - 1
840        _pyshell_block_re = re.compile(r"""
841            ^([ ]{0,%d})>>>[ ].*\n   # first line
842            ^(\1.*\S+.*\n)*         # any number of subsequent lines
843            ^\n                     # ends with a blank line
844            """ % less_than_tab, re.M | re.X)
845
846        return _pyshell_block_re.sub(self._pyshell_block_sub, text)
847
848    def _table_sub(self, match):
849        trim_space_re = '^[ \t\n]+|[ \t\n]+$'
850        trim_bar_re = '^\||\|$'
851
852        head, underline, body = match.groups()
853
854        # Determine aligns for columns.
855        cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)).split('|')]
856        align_from_col_idx = {}
857        for col_idx, col in enumerate(cols):
858            if col[0] == ':' and col[-1] == ':':
859                align_from_col_idx[col_idx] = ' align="center"'
860            elif col[0] == ':':
861                align_from_col_idx[col_idx] = ' align="left"'
862            elif col[-1] == ':':
863                align_from_col_idx[col_idx] = ' align="right"'
864
865        # thead
866        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
867        cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')]
868        for col_idx, col in enumerate(cols):
869            hlines.append('  <th%s>%s</th>' % (
870                align_from_col_idx.get(col_idx, ''),
871                self._run_span_gamut(col)
872            ))
873        hlines.append('</tr>')
874        hlines.append('</thead>')
875
876        # tbody
877        hlines.append('<tbody>')
878        for line in body.strip('\n').split('\n'):
879            hlines.append('<tr>')
880            cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')]
881            for col_idx, col in enumerate(cols):
882                hlines.append('  <td%s>%s</td>' % (
883                    align_from_col_idx.get(col_idx, ''),
884                    self._run_span_gamut(col)
885                ))
886            hlines.append('</tr>')
887        hlines.append('</tbody>')
888        hlines.append('</table>')
889
890        return '\n'.join(hlines) + '\n'
891
892    def _do_tables(self, text):
893        """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
894        https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
895        """
896        less_than_tab = self.tab_width - 1
897        table_re = re.compile(r'''
898                (?:(?<=\n\n)|\A\n?)             # leading blank line
899
900                ^[ ]{0,%d}                      # allowed whitespace
901                (.*[|].*)  \n                   # $1: header row (at least one pipe)
902
903                ^[ ]{0,%d}                      # allowed whitespace
904                (                               # $2: underline row
905                    # underline row with leading bar
906                    (?:  \|\ *:?-+:?\ *  )+  \|?  \n
907                    |
908                    # or, underline row without leading bar
909                    (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )?  \n
910                )
911
912                (                               # $3: data rows
913                    (?:
914                        ^[ ]{0,%d}(?!\ )         # ensure line begins with 0 to less_than_tab spaces
915                        .*\|.*  \n
916                    )+
917                )
918            ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
919        return table_re.sub(self._table_sub, text)
920
921    def _wiki_table_sub(self, match):
922        ttext = match.group(0).strip()
923        #print 'wiki table: %r' % match.group(0)
924        rows = []
925        for line in ttext.splitlines(0):
926            line = line.strip()[2:-2].strip()
927            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
928            rows.append(row)
929        #pprint(rows)
930        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
931        for row in rows:
932            hrow = ['<tr>']
933            for cell in row:
934                hrow.append('<td>')
935                hrow.append(self._run_span_gamut(cell))
936                hrow.append('</td>')
937            hrow.append('</tr>')
938            hlines.append(''.join(hrow))
939        hlines += ['</tbody>', '</table>']
940        return '\n'.join(hlines) + '\n'
941
942    def _do_wiki_tables(self, text):
943        # Optimization.
944        if "||" not in text:
945            return text
946
947        less_than_tab = self.tab_width - 1
948        wiki_table_re = re.compile(r'''
949            (?:(?<=\n\n)|\A\n?)            # leading blank line
950            ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n  # first line
951            (^\1\|\|.+?\|\|\n)*        # any number of subsequent lines
952            ''' % less_than_tab, re.M | re.X)
953        return wiki_table_re.sub(self._wiki_table_sub, text)
954
    def _run_span_gamut(self, text):
        """Apply all span-level transformations (code spans, links,
        emphasis, hard breaks, ...) to *text*.

        NB: the ordering of the steps below matters; see the inline
        comments (e.g. auto-links must run after inline links).
        """
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.

        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        if "strike" in self.extras:
            text = self._do_strike(text)

        text = self._do_italics_and_bold(text)

        if "smarty-pants" in self.extras:
            text = self._do_smart_punctuation(text)

        # Do hard breaks:
        # "break-on-newline" turns *every* line end into a <br>; otherwise
        # only lines ending in two-or-more spaces get one (core Markdown).
        if "break-on-newline" in self.extras:
            text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
        else:
            text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text
991
    # "Sorta" because auto-links are identified as "tag" tokens.
    # The whole pattern is one capturing group, so re.split() with it
    # yields a list that alternates between non-markup text and HTML-ish
    # tokens (tags, auto-links, comments, processing instructions),
    # starting with non-markup text; see _escape_special_chars() and
    # _hash_html_spans().
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)
1009
1010    def _escape_special_chars(self, text):
1011        # Python markdown note: the HTML tokenization here differs from
1012        # that in Markdown.pl, hence the behaviour for subtle cases can
1013        # differ (I believe the tokenizer here does a better job because
1014        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
1015        # Note, however, that '>' is not allowed in an auto-link URL
1016        # here.
1017        escaped = []
1018        is_html_markup = False
1019        for token in self._sorta_html_tokenize_re.split(text):
1020            if is_html_markup:
1021                # Within tags/HTML-comments/auto-links, encode * and _
1022                # so they don't conflict with their use in Markdown for
1023                # italics and strong.  We're replacing each such
1024                # character with its corresponding MD5 checksum value;
1025                # this is likely overkill, but it should prevent us from
1026                # colliding with the escape values by accident.
1027                escaped.append(token.replace('*', self._escape_table['*'])
1028                                    .replace('_', self._escape_table['_']))
1029            else:
1030                escaped.append(self._encode_backslash_escapes(token))
1031            is_html_markup = not is_html_markup
1032        return ''.join(escaped)
1033
1034    def _hash_html_spans(self, text):
1035        # Used for safe_mode.
1036
1037        def _is_auto_link(s):
1038            if ':' in s and self._auto_link_re.match(s):
1039                return True
1040            elif '@' in s and self._auto_email_link_re.match(s):
1041                return True
1042            return False
1043
1044        tokens = []
1045        is_html_markup = False
1046        for token in self._sorta_html_tokenize_re.split(text):
1047            if is_html_markup and not _is_auto_link(token):
1048                sanitized = self._sanitize_html(token)
1049                key = _hash_text(sanitized)
1050                self.html_spans[key] = sanitized
1051                tokens.append(key)
1052            else:
1053                tokens.append(token)
1054            is_html_markup = not is_html_markup
1055        return ''.join(tokens)
1056
1057    def _unhash_html_spans(self, text):
1058        for key, sanitized in list(self.html_spans.items()):
1059            text = text.replace(key, sanitized)
1060        return text
1061
1062    def _sanitize_html(self, s):
1063        if self.safe_mode == "replace":
1064            return self.html_removed_text
1065        elif self.safe_mode == "escape":
1066            replacements = [
1067                ('&', '&amp;'),
1068                ('<', '&lt;'),
1069                ('>', '&gt;'),
1070            ]
1071            for before, after in replacements:
1072                s = s.replace(before, after)
1073            return s
1074        else:
1075            raise MarkdownError("invalid value for 'safe_mode': %r (must be "
1076                                "'escape' or 'replace')" % self.safe_mode)
1077
    # Tail of an inline link: the optional quoted title plus the closing
    # ')', e.g. the ` "Title")` part of `(url "Title")`.  Used by
    # _extract_url_and_title().
    _inline_link_title = re.compile(r'''
            (                   # \1
              [ \t]+
              (['"])            # quote char = \2
              (?P<title>.*?)
              \2
            )?                  # title is optional
          \)$
        ''', re.X | re.S)
    _tail_of_reference_link_re = re.compile(r'''
          # Match tail of: [text][id]
          [ ]?          # one optional space
          (?:\n[ ]*)?   # one optional newline followed by spaces
          \[
            (?P<id>.*?)
          \]
        ''', re.X | re.S)

    # A (possibly empty) run of whitespace; see _find_non_whitespace().
    _whitespace = re.compile(r'\s*')

    # Strips one pair of surrounding angle brackets: '<url>...' -> 'url'.
    _strip_anglebrackets = re.compile(r'<(.*)>.*')
1099
1100    def _find_non_whitespace(self, text, start):
1101        """Returns the index of the first non-whitespace character in text
1102        after (and including) start
1103        """
1104        match = self._whitespace.match(text, start)
1105        return match.end()
1106
1107    def _find_balanced(self, text, start, open_c, close_c):
1108        """Returns the index where the open_c and close_c characters balance
1109        out - the same number of open_c and close_c are encountered - or the
1110        end of string if it's reached before the balance point is found.
1111        """
1112        i = start
1113        l = len(text)
1114        count = 1
1115        while count > 0 and i < l:
1116            if text[i] == open_c:
1117                count += 1
1118            elif text[i] == close_c:
1119                count -= 1
1120            i += 1
1121        return i
1122
    def _extract_url_and_title(self, text, start):
        """Extract the url and (optional) title from the tail of a link.

        *start* must index the opening parenthesis of the link tail.
        Returns (url, title, end_idx) -- title may be None, end_idx is
        just past the closing paren -- or (None, None, None) if no valid
        link tail is found.
        """
        # text[start] equals the opening parenthesis
        idx = self._find_non_whitespace(text, start+1)
        if idx == len(text):
            return None, None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            # Skip past a '<...>'-wrapped URL before balancing parens.
            end_idx = self._find_balanced(text, end_idx+1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        # Look for the optional quoted title plus closing ')' within the
        # balanced span.
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None, None
        # The URL is everything before the (optional) title.
        url, title = text[idx:match.start()], match.group("title")
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, title, end_idx
1141
1142    def _do_links(self, text):
1143        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
1144
1145        This is a combination of Markdown.pl's _DoAnchors() and
1146        _DoImages(). They are done together because that simplified the
1147        approach. It was necessary to use a different approach than
1148        Markdown.pl because of the lack of atomic matching support in
1149        Python's regex engine used in $g_nested_brackets.
1150        """
1151        MAX_LINK_TEXT_SENTINEL = 3000  # markdown2 issue 24
1152
1153        # `anchor_allowed_pos` is used to support img links inside
1154        # anchors, but not anchors inside anchors. An anchor's start
1155        # pos must be `>= anchor_allowed_pos`.
1156        anchor_allowed_pos = 0
1157
1158        curr_pos = 0
1159        while True: # Handle the next link.
1160            # The next '[' is the start of:
1161            # - an inline anchor:   [text](url "title")
1162            # - a reference anchor: [text][id]
1163            # - an inline img:      ![text](url "title")
1164            # - a reference img:    ![text][id]
1165            # - a footnote ref:     [^id]
1166            #   (Only if 'footnotes' extra enabled)
1167            # - a footnote defn:    [^id]: ...
1168            #   (Only if 'footnotes' extra enabled) These have already
1169            #   been stripped in _strip_footnote_definitions() so no
1170            #   need to watch for them.
1171            # - a link definition:  [id]: url "title"
1172            #   These have already been stripped in
1173            #   _strip_link_definitions() so no need to watch for them.
1174            # - not markup:         [...anything else...
1175            try:
1176                start_idx = text.index('[', curr_pos)
1177            except ValueError:
1178                break
1179            text_length = len(text)
1180
1181            # Find the matching closing ']'.
1182            # Markdown.pl allows *matching* brackets in link text so we
1183            # will here too. Markdown.pl *doesn't* currently allow
1184            # matching brackets in img alt text -- we'll differ in that
1185            # regard.
1186            bracket_depth = 0
1187            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
1188                                            text_length)):
1189                ch = text[p]
1190                if ch == ']':
1191                    bracket_depth -= 1
1192                    if bracket_depth < 0:
1193                        break
1194                elif ch == '[':
1195                    bracket_depth += 1
1196            else:
1197                # Closing bracket not found within sentinel length.
1198                # This isn't markup.
1199                curr_pos = start_idx + 1
1200                continue
1201            link_text = text[start_idx+1:p]
1202
1203            # Possibly a footnote ref?
1204            if "footnotes" in self.extras and link_text.startswith("^"):
1205                normed_id = re.sub(r'\W', '-', link_text[1:])
1206                if normed_id in self.footnotes:
1207                    self.footnote_ids.append(normed_id)
1208                    result = '<sup class="footnote-ref" id="fnref-%s">' \
1209                             '<a href="#fn-%s">%s</a></sup>' \
1210                             % (normed_id, normed_id, len(self.footnote_ids))
1211                    text = text[:start_idx] + result + text[p+1:]
1212                else:
1213                    # This id isn't defined, leave the markup alone.
1214                    curr_pos = p+1
1215                continue
1216
1217            # Now determine what this is by the remainder.
1218            p += 1
1219            if p == text_length:
1220                return text
1221
1222            # Inline anchor or img?
1223            if text[p] == '(': # attempt at perf improvement
1224                url, title, url_end_idx = self._extract_url_and_title(text, p)
1225                if url is not None:
1226                    # Handle an inline anchor or img.
1227                    is_img = start_idx > 0 and text[start_idx-1] == "!"
1228                    if is_img:
1229                        start_idx -= 1
1230
1231                    # We've got to encode these to avoid conflicting
1232                    # with italics/bold.
1233                    url = url.replace('*', self._escape_table['*']) \
1234                             .replace('_', self._escape_table['_'])
1235                    if title:
1236                        title_str = ' title="%s"' % (
1237                            _xml_escape_attr(title)
1238                                .replace('*', self._escape_table['*'])
1239                                .replace('_', self._escape_table['_']))
1240                    else:
1241                        title_str = ''
1242                    if is_img:
1243                        img_class_str = self._html_class_str_from_tag("img")
1244                        result = '<img src="%s" alt="%s"%s%s%s' \
1245                            % (url.replace('"', '&quot;'),
1246                               _xml_escape_attr(link_text),
1247                               title_str, img_class_str, self.empty_element_suffix)
1248                        if "smarty-pants" in self.extras:
1249                            result = result.replace('"', self._escape_table['"'])
1250                        curr_pos = start_idx + len(result)
1251                        text = text[:start_idx] + result + text[url_end_idx:]
1252                    elif start_idx >= anchor_allowed_pos:
1253                        result_head = '<a href="%s"%s>' % (url, title_str)
1254                        result = '%s%s</a>' % (result_head, link_text)
1255                        if "smarty-pants" in self.extras:
1256                            result = result.replace('"', self._escape_table['"'])
1257                        # <img> allowed from curr_pos on, <a> from
1258                        # anchor_allowed_pos on.
1259                        curr_pos = start_idx + len(result_head)
1260                        anchor_allowed_pos = start_idx + len(result)
1261                        text = text[:start_idx] + result + text[url_end_idx:]
1262                    else:
1263                        # Anchor not allowed here.
1264                        curr_pos = start_idx + 1
1265                    continue
1266
1267            # Reference anchor or img?
1268            else:
1269                match = self._tail_of_reference_link_re.match(text, p)
1270                if match:
1271                    # Handle a reference-style anchor or img.
1272                    is_img = start_idx > 0 and text[start_idx-1] == "!"
1273                    if is_img:
1274                        start_idx -= 1
1275                    link_id = match.group("id").lower()
1276                    if not link_id:
1277                        link_id = link_text.lower()  # for links like [this][]
1278                    if link_id in self.urls:
1279                        url = self.urls[link_id]
1280                        # We've got to encode these to avoid conflicting
1281                        # with italics/bold.
1282                        url = url.replace('*', self._escape_table['*']) \
1283                                 .replace('_', self._escape_table['_'])
1284                        title = self.titles.get(link_id)
1285                        if title:
1286                            title = _xml_escape_attr(title) \
1287                                .replace('*', self._escape_table['*']) \
1288                                .replace('_', self._escape_table['_'])
1289                            title_str = ' title="%s"' % title
1290                        else:
1291                            title_str = ''
1292                        if is_img:
1293                            img_class_str = self._html_class_str_from_tag("img")
1294                            result = '<img src="%s" alt="%s"%s%s%s' \
1295                                % (url.replace('"', '&quot;'),
1296                                   link_text.replace('"', '&quot;'),
1297                                   title_str, img_class_str, self.empty_element_suffix)
1298                            if "smarty-pants" in self.extras:
1299                                result = result.replace('"', self._escape_table['"'])
1300                            curr_pos = start_idx + len(result)
1301                            text = text[:start_idx] + result + text[match.end():]
1302                        elif start_idx >= anchor_allowed_pos:
1303                            result = '<a href="%s"%s>%s</a>' \
1304                                % (url, title_str, link_text)
1305                            result_head = '<a href="%s"%s>' % (url, title_str)
1306                            result = '%s%s</a>' % (result_head, link_text)
1307                            if "smarty-pants" in self.extras:
1308                                result = result.replace('"', self._escape_table['"'])
1309                            # <img> allowed from curr_pos on, <a> from
1310                            # anchor_allowed_pos on.
1311                            curr_pos = start_idx + len(result_head)
1312                            anchor_allowed_pos = start_idx + len(result)
1313                            text = text[:start_idx] + result + text[match.end():]
1314                        else:
1315                            # Anchor not allowed here.
1316                            curr_pos = start_idx + 1
1317                    else:
1318                        # This id isn't defined, leave the markup alone.
1319                        curr_pos = match.end()
1320                    continue
1321
1322            # Otherwise, it isn't markup.
1323            curr_pos = start_idx + 1
1324
1325        return text
1326
1327    def header_id_from_text(self, text, prefix, n):
1328        """Generate a header id attribute value from the given header
1329        HTML content.
1330
1331        This is only called if the "header-ids" extra is enabled.
1332        Subclasses may override this for different header ids.
1333
1334        @param text {str} The text of the header tag
1335        @param prefix {str} The requested prefix for header ids. This is the
1336            value of the "header-ids" extra key, if any. Otherwise, None.
1337        @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
1338        @returns {str} The value for the header tag's "id" attribute. Return
1339            None to not have an id attribute and to exclude this header from
1340            the TOC (if the "toc" extra is specified).
1341        """
1342        header_id = _slugify(text)
1343        if prefix and isinstance(prefix, base_string_type):
1344            header_id = prefix + '-' + header_id
1345        if header_id in self._count_from_header_id:
1346            self._count_from_header_id[header_id] += 1
1347            header_id += '-%s' % self._count_from_header_id[header_id]
1348        else:
1349            self._count_from_header_id[header_id] = 1
1350        return header_id
1351
1352    _toc = None
1353    def _toc_add_entry(self, level, id, name):
1354        if self._toc is None:
1355            self._toc = []
1356        self._toc.append((level, id, self._unescape_special_chars(name)))
1357
    # Header pattern template.  First alternative: Setext header (text
    # underlined with '=' or '-'; groups 1-3).  Second alternative: atx
    # header ('#'-prefixed; groups 4-6).  NB: the comments *inside* the
    # pattern mislabel the group numbers -- see _h_sub() for actual usage.
    # The %s slot takes the quantifier for the space after the #'s: '*'
    # (optional) normally, '+' (required) for the "tag-friendly" extra,
    # presumably so '#hashtag'-style text is not treated as a header.
    _h_re_base = r'''
        (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
        |
        (^(\#{1,6})  # \1 = string of #'s
        [ \t]%s
        (.+?)       # \2 = Header text
        [ \t]*
        (?<!\\)     # ensure not an escaped trailing '#'
        \#*         # optional closing #'s (not counted)
        \n+
        )
        '''

    _h_re = re.compile(_h_re_base % '*', re.X | re.M)
    _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
1373
1374    def _h_sub(self, match):
1375        if match.group(1) is not None:
1376            # Setext header
1377            n = {"=": 1, "-": 2}[match.group(3)[0]]
1378            header_group = match.group(2)
1379        else:
1380            # atx header
1381            n = len(match.group(5))
1382            header_group = match.group(6)
1383
1384        demote_headers = self.extras.get("demote-headers")
1385        if demote_headers:
1386            n = min(n + demote_headers, 6)
1387        header_id_attr = ""
1388        if "header-ids" in self.extras:
1389            header_id = self.header_id_from_text(header_group,
1390                self.extras["header-ids"], n)
1391            if header_id:
1392                header_id_attr = ' id="%s"' % header_id
1393        html = self._run_span_gamut(header_group)
1394        if "toc" in self.extras and header_id:
1395            self._toc_add_entry(n, header_id, html)
1396        return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1397
1398    def _do_headers(self, text):
1399        # Setext-style headers:
1400        #     Header 1
1401        #     ========
1402        #
1403        #     Header 2
1404        #     --------
1405
1406        # atx-style headers:
1407        #   # Header 1
1408        #   ## Header 2
1409        #   ## Header 2 with closing hashes ##
1410        #   ...
1411        #   ###### Header 6
1412
1413        if 'tag-friendly' in self.extras:
1414            return self._h_re_tag_friendly.sub(self._h_sub, text)
1415        return self._h_re.sub(self._h_sub, text)
1416
    # Building blocks for the list regexes below: unordered-list bullets
    # are '*', '+' or '-'; ordered-list markers are '<digits>.'.
    _marker_ul_chars  = '*+-'
    _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
    _marker_ul = '(?:[%s])' % _marker_ul_chars
    _marker_ol = r'(?:\d+\.)'
1421
1422    def _list_sub(self, match):
1423        lst = match.group(1)
1424        lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1425        result = self._process_list_items(lst)
1426        if self.list_level:
1427            return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1428        else:
1429            return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
1430
    def _do_lists(self, text):
        """Form HTML ordered (numbered) and unordered (bulleted) lists.

        Repeatedly finds the earliest ul/ol match in *text* and replaces
        it via _list_sub(); the search resumes after each replacement so
        matches never overlap.
        """

        # Iterate over each *non-overlapping* list match.
        pos = 0
        while True:
            # Find the *first* hit for either list style (ul or ol). We
            # match ul and ol separately to avoid adjacent lists of different
            # types running into each other (see issue #16).
            hits = []
            for marker_pat in (self._marker_ul, self._marker_ol):
                less_than_tab = self.tab_width - 1
                whole_list = r'''
                    (                   # \1 = whole list
                      (                 # \2
                        [ ]{0,%d}
                        (%s)            # \3 = first list item marker
                        [ \t]+
                        (?!\ *\3\ )     # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
                      )
                      (?:.+?)
                      (                 # \4
                          \Z
                        |
                          \n{2,}
                          (?=\S)
                          (?!           # Negative lookahead for another list item marker
                            [ \t]*
                            %s[ \t]+
                          )
                      )
                    )
                ''' % (less_than_tab, marker_pat, marker_pat)
                if self.list_level:  # sub-list
                    list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
                else:
                    list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
                                         re.X | re.M | re.S)
                match = list_re.search(text, pos)
                if match:
                    hits.append((match.start(), match))
            if not hits:
                break
            # Earliest hit first (hits are (start, match) tuples).
            hits.sort()
            match = hits[0][1]
            start, end = match.span()
            middle = self._list_sub(match)
            text = text[:start] + middle + text[end:]
            pos = start + len(middle) # start pos for next attempted match

        return text
1482
    # Matches one list item within a list block: optional leading blank
    # line, the item's indent and marker, its text, and trailing EOLs.
    # The lookahead requires what follows to be end-of-input or another
    # item at the same indent.  The %s slots are filled with
    # `_marker_any` (defined earlier in this class).
    _list_item_re = re.compile(r'''
        (\n)?                   # leading line = \1
        (^[ \t]*)               # leading whitespace = \2
        (?P<marker>%s) [ \t]+   # list marker = \3
        ((?:.+?)                # list item text = \4
         (\n{1,2}))             # eols = \5
        (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
        ''' % (_marker_any, _marker_any),
        re.M | re.X | re.S)
1492
    # Cross-call state for _list_item_sub: True when the previously
    # emitted <li> ended with a blank line, which forces the following
    # item into block ("loose") mode as well.
    _last_li_endswith_two_eols = False
    def _list_item_sub(self, match):
        # re.sub callback: render one matched list item as "<li>...</li>\n".
        item = match.group(4)
        leading_line = match.group(1)
        if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
            # "Loose" item (preceded by or containing a blank line):
            # run full block-level processing on its content.
            item = self._run_block_gamut(self._outdent(item))
        else:
            # "Tight" item: recurse for nested sub-lists first, then
            # apply only span-level processing.
            item = self._do_lists(self._outdent(item))
            if item.endswith('\n'):
                item = item[:-1]
            item = self._run_span_gamut(item)
        # Remember whether this item ended with a blank line (group 5 is
        # one or two newlines) for the next item's loose/tight decision.
        self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
        return "<li>%s</li>\n" % item
1507
1508    def _process_list_items(self, list_str):
1509        # Process the contents of a single ordered or unordered list,
1510        # splitting it into individual list items.
1511
1512        # The $g_list_level global keeps track of when we're inside a list.
1513        # Each time we enter a list, we increment it; when we leave a list,
1514        # we decrement. If it's zero, we're not in a list anymore.
1515        #
1516        # We do this because when we're not inside a list, we want to treat
1517        # something like this:
1518        #
1519        #       I recommend upgrading to version
1520        #       8. Oops, now this line is treated
1521        #       as a sub-list.
1522        #
1523        # As a single paragraph, despite the fact that the second line starts
1524        # with a digit-period-space sequence.
1525        #
1526        # Whereas when we're inside a list (or sub-list), that line will be
1527        # treated as the start of a sub-list. What a kludge, huh? This is
1528        # an aspect of Markdown's syntax that's hard to parse perfectly
1529        # without resorting to mind-reading. Perhaps the solution is to
1530        # change the syntax rules such that sub-lists must start with a
1531        # starting cardinal number; e.g. "1." or "a.".
1532        self.list_level += 1
1533        self._last_li_endswith_two_eols = False
1534        list_str = list_str.rstrip('\n') + '\n'
1535        list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1536        self.list_level -= 1
1537        return list_str
1538
1539    def _get_pygments_lexer(self, lexer_name):
1540        try:
1541            from pygments import lexers, util
1542        except ImportError:
1543            return None
1544        try:
1545            return lexers.get_lexer_by_name(lexer_name)
1546        except util.ClassNotFound:
1547            return None
1548
1549    def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
1550        import pygments
1551        import pygments.formatters
1552
1553        class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1554            def _wrap_code(self, inner):
1555                """A function for use in a Pygments Formatter which
1556                wraps in <code> tags.
1557                """
1558                yield 0, "<code>"
1559                for tup in inner:
1560                    yield tup
1561                yield 0, "</code>"
1562
1563            def wrap(self, source, outfile):
1564                """Return the source with a code, pre, and div."""
1565                return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1566
1567        formatter_opts.setdefault("cssclass", "codehilite")
1568        formatter = HtmlCodeFormatter(**formatter_opts)
1569        return pygments.highlight(codeblock, lexer, formatter)
1570
    def _code_block_sub(self, match, is_fenced_code_block=False):
        """re.sub callback rendering one matched code block as HTML.

        For a fenced block, group 1 is the optional language name and
        group 2 the code (with one trailing newline to drop).  For an
        indented block, group 1 is the indented code.  When a lexer
        name is known (fence language, or a leading ":::lang" line with
        the deprecated "code-color" extra) and Pygments supplies a
        matching lexer, the block is syntax-highlighted; otherwise it
        is entity-escaped and wrapped in <pre><code>.
        """
        lexer_name = None
        if is_fenced_code_block:
            lexer_name = match.group(1)
            if lexer_name:
                formatter_opts = self.extras['fenced-code-blocks'] or {}
            codeblock = match.group(2)
            codeblock = codeblock[:-1]  # drop one trailing newline
        else:
            codeblock = match.group(1)
            codeblock = self._outdent(codeblock)
            codeblock = self._detab(codeblock)
            codeblock = codeblock.lstrip('\n')  # trim leading newlines
            codeblock = codeblock.rstrip()      # trim trailing whitespace

            # Note: "code-color" extra is DEPRECATED.
            if "code-color" in self.extras and codeblock.startswith(":::"):
                # A ":::lang" first line names the lexer.
                lexer_name, rest = codeblock.split('\n', 1)
                lexer_name = lexer_name[3:].strip()
                codeblock = rest.lstrip("\n")   # Remove lexer declaration line.
                formatter_opts = self.extras['code-color'] or {}

        if lexer_name:
            def unhash_code( codeblock ):
                # Restore hashed raw-HTML spans and undo entity-encoding
                # so the highlighter sees the original source text.
                for key, sanitized in list(self.html_spans.items()):
                    codeblock = codeblock.replace(key, sanitized)
                replacements = [
                    ("&amp;", "&"),
                    ("&lt;", "<"),
                    ("&gt;", ">")
                ]
                for old, new in replacements:
                    codeblock = codeblock.replace(old, new)
                return codeblock
            lexer = self._get_pygments_lexer(lexer_name)
            if lexer:
                codeblock = unhash_code( codeblock )
                colored = self._color_with_pygments(codeblock, lexer,
                                                    **formatter_opts)
                return "\n\n%s\n\n" % colored

        # Fallback: plain escaped code block, with optional classes from
        # the "html-classes" extra.
        codeblock = self._encode_code(codeblock)
        pre_class_str = self._html_class_str_from_tag("pre")
        code_class_str = self._html_class_str_from_tag("code")
        return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
            pre_class_str, code_class_str, codeblock)
1617
1618    def _html_class_str_from_tag(self, tag):
1619        """Get the appropriate ' class="..."' string (note the leading
1620        space), if any, for the given tag.
1621        """
1622        if "html-classes" not in self.extras:
1623            return ""
1624        try:
1625            html_classes_from_tag = self.extras["html-classes"]
1626        except TypeError:
1627            return ""
1628        else:
1629            if tag in html_classes_from_tag:
1630                return ' class="%s"' % html_classes_from_tag[tag]
1631        return ""
1632
    def _do_code_blocks(self, text):
        """Process Markdown `<pre><code>` blocks."""
        # Compiled per call because the indent width depends on
        # self.tab_width.
        code_block_re = re.compile(r'''
            (?:\n\n|\A\n?)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{%d} | \t)  # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,%d}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
            # Lookahead to make sure this block isn't already in a code block.
            # Needed when syntax highlighting is being used.
            (?![^<]*\</code\>)
            ''' % (self.tab_width, self.tab_width),
            re.M | re.X)
        return code_block_re.sub(self._code_block_sub, text)
1650
    # A ```-fenced code block: an opening fence with an optional
    # language word, the content, then a closing fence on its own line.
    # NOTE(review): the closing fence requires a trailing "\n", so a
    # fence at the very end of input without a final newline won't
    # match -- confirm that earlier normalization always appends one.
    _fenced_code_block_re = re.compile(r'''
        (?:\n\n|\A\n?)
        ^```([\w+-]+)?[ \t]*\n      # opening fence, $1 = optional lang
        (.*?)                       # $2 = code block content
        ^```[ \t]*\n                # closing fence
        ''', re.M | re.X | re.S)
1657
1658    def _fenced_code_block_sub(self, match):
1659        return self._code_block_sub(match, is_fenced_code_block=True);
1660
1661    def _do_fenced_code_blocks(self, text):
1662        """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
1663        return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
1664
    # Rules for a code span:
    # - backslash escapes are not interpreted in a code span
    # - to include one backtick or a run of backticks the delimiters must
    #   be a longer run of backticks
    # - cannot start or end a code span with a backtick; pad with a
    #   space and that space will be removed in the emitted HTML
    # See `test/tm-cases/escapes.text` for a number of edge-case
    # examples.
    # (The `#` comments inside the pattern below are part of the
    # verbose regex itself.)
    _code_span_re = re.compile(r'''
            (?<!\\)
            (`+)        # \1 = Opening run of `
            (?!`)       # See Note A test/tm-cases/escapes.text
            (.+?)       # \2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        ''', re.X | re.S)
1682
1683    def _code_span_sub(self, match):
1684        c = match.group(2).strip(" \t")
1685        c = self._encode_code(c)
1686        return "<code>%s</code>" % c
1687
1688    def _do_code_spans(self, text):
1689        #   *   Backtick quotes are used for <code></code> spans.
1690        #
1691        #   *   You can use multiple backticks as the delimiters if you want to
1692        #       include literal backticks in the code span. So, this input:
1693        #
1694        #         Just type ``foo `bar` baz`` at the prompt.
1695        #
1696        #       Will translate to:
1697        #
1698        #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1699        #
1700        #       There's no arbitrary limit to the number of backticks you
1701        #       can use as delimters. If you need three consecutive backticks
1702        #       in your code, use four for delimiters, etc.
1703        #
1704        #   *   You can use spaces to get literal backticks at the edges:
1705        #
1706        #         ... type `` `bar` `` ...
1707        #
1708        #       Turns to:
1709        #
1710        #         ... type <code>`bar`</code> ...
1711        return self._code_span_re.sub(self._code_span_sub, text)
1712
1713    def _encode_code(self, text):
1714        """Encode/escape certain characters inside Markdown code runs.
1715        The point is that in code, these characters are literals,
1716        and lose their special Markdown meanings.
1717        """
1718        replacements = [
1719            # Encode all ampersands; HTML entities are not
1720            # entities within a Markdown code span.
1721            ('&', '&amp;'),
1722            # Do the angle bracket song and dance:
1723            ('<', '&lt;'),
1724            ('>', '&gt;'),
1725        ]
1726        for before, after in replacements:
1727            text = text.replace(before, after)
1728        hashed = _hash_text(text)
1729        self._escape_table[text] = hashed
1730        return hashed
1731
1732    _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
1733    def _do_strike(self, text):
1734        text = self._strike_re.sub(r"<strike>\1</strike>", text)
1735        return text
1736
1737    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1738    _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1739    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1740    _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1741    def _do_italics_and_bold(self, text):
1742        # <strong> must go first:
1743        if "code-friendly" in self.extras:
1744            text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
1745            text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
1746        else:
1747            text = self._strong_re.sub(r"<strong>\2</strong>", text)
1748            text = self._em_re.sub(r"<em>\2</em>", text)
1749        return text
1750
1751    # "smarty-pants" extra: Very liberal in interpreting a single prime as an
1752    # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
1753    # "twixt" can be written without an initial apostrophe. This is fine because
1754    # using scare quotes (single quotation marks) is rare.
1755    _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
1756    _contractions = ["tis", "twas", "twer", "neath", "o", "n",
1757        "round", "bout", "twixt", "nuff", "fraid", "sup"]
1758    def _do_smart_contractions(self, text):
1759        text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
1760        for c in self._contractions:
1761            text = text.replace("'%s" % c, "&#8217;%s" % c)
1762            text = text.replace("'%s" % c.capitalize(),
1763                "&#8217;%s" % c.capitalize())
1764        return text
1765
1766    # Substitute double-quotes before single-quotes.
1767    _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
1768    _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
1769    _closing_single_quote_re = re.compile(r"(?<=\S)'")
1770    _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
1771    def _do_smart_punctuation(self, text):
1772        """Fancifies 'single quotes', "double quotes", and apostrophes.
1773        Converts --, ---, and ... into en dashes, em dashes, and ellipses.
1774
1775        Inspiration is: <http://daringfireball.net/projects/smartypants/>
1776        See "test/tm-cases/smarty_pants.text" for a full discussion of the
1777        support here and
1778        <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
1779        discussion of some diversion from the original SmartyPants.
1780        """
1781        if "'" in text: # guard for perf
1782            text = self._do_smart_contractions(text)
1783            text = self._opening_single_quote_re.sub("&#8216;", text)
1784            text = self._closing_single_quote_re.sub("&#8217;", text)
1785
1786        if '"' in text: # guard for perf
1787            text = self._opening_double_quote_re.sub("&#8220;", text)
1788            text = self._closing_double_quote_re.sub("&#8221;", text)
1789
1790        text = text.replace("---", "&#8212;")
1791        text = text.replace("--", "&#8211;")
1792        text = text.replace("...", "&#8230;")
1793        text = text.replace(" . . . ", "&#8230;")
1794        text = text.replace(". . .", "&#8230;")
1795        return text
1796
    # One or more groups of consecutive '>'-prefixed lines; the %s slot
    # is filled by the "spoiler" variant to also admit '>!' lines.
    _block_quote_base = r'''
        (                           # Wrap whole match in \1
          (
            ^[ \t]*>%s[ \t]?        # '>' at the start of a line
              .+\n                  # rest of the first line
            (.+\n)*                 # subsequent consecutive lines
            \n*                     # blanks
          )+
        )
    '''
    _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
    _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
    # Strip one level of '>' (or '>!') quoting from each line.
    _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
    _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M);
    # Matches only when *every* line of the quote is a '>!' spoiler line.
    _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
    # A rendered <pre>...</pre> chunk (used to undo blockquote indenting).
    _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1813    def _dedent_two_spaces_sub(self, match):
1814        return re.sub(r'(?m)^  ', '', match.group(1))
1815
    def _block_quote_sub(self, match):
        """re.sub callback rendering one '>'-quoted block as <blockquote>.

        With the "spoiler" extra, a quote whose every line starts with
        '>!' becomes <blockquote class="spoiler">.
        """
        bq = match.group(1)
        is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
        # trim one level of quoting
        if is_spoiler:
            bq = self._bq_one_level_re_spoiler.sub('', bq)
        else:
            bq = self._bq_one_level_re.sub('', bq)
        # trim whitespace-only lines
        bq = self._ws_only_line_re.sub('', bq)
        bq = self._run_block_gamut(bq)          # recurse

        # Indent the rendered body two spaces for readable output.
        bq = re.sub('(?m)^', '  ', bq)
        # These leading spaces screw with <pre> content, so we need to fix that:
        bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)

        if is_spoiler:
            return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
        else:
            return '<blockquote>\n%s\n</blockquote>\n\n' % bq
1836
1837    def _do_block_quotes(self, text):
1838        if '>' not in text:
1839            return text
1840        if 'spoiler' in self.extras:
1841            return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
1842        else:
1843            return self._block_quote_re.sub(self._block_quote_sub, text)
1844
    def _form_paragraphs(self, text):
        """Wrap the remaining text chunks in <p> tags.

        Hashed HTML blocks are restored verbatim.  With the
        "cuddled-lists" extra, a list glued to the end of a paragraph
        is split off and converted separately (issue 33).
        """
        # Strip leading and trailing lines:
        text = text.strip('\n')

        # Wrap <p> tags.
        grafs = []
        for i, graf in enumerate(re.split(r"\n{2,}", text)):
            if graf in self.html_blocks:
                # Unhashify HTML blocks
                grafs.append(self.html_blocks[graf])
            else:
                cuddled_list = None
                if "cuddled-lists" in self.extras:
                    # Need to put back trailing '\n' for `_list_item_re`
                    # match at the end of the paragraph.
                    li = self._list_item_re.search(graf + '\n')
                    # Two of the same list marker in this paragraph: a likely
                    # candidate for a list cuddled to preceding paragraph
                    # text (issue 33). Note the `[-1]` is a quick way to
                    # consider numeric bullets (e.g. "1." and "2.") to be
                    # equal.
                    if (li and len(li.group(2)) <= 3 and li.group("next_marker")
                        and li.group("marker")[-1] == li.group("next_marker")[-1]):
                        start = li.start()
                        cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
                        assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
                        graf = graf[:start]

                # Wrap <p> tags.
                graf = self._run_span_gamut(graf)
                grafs.append("<p>" + graf.lstrip(" \t") + "</p>")

                if cuddled_list:
                    grafs.append(cuddled_list)

        return "\n\n".join(grafs)
1881
1882    def _add_footnotes(self, text):
1883        if self.footnotes:
1884            footer = [
1885                '<div class="footnotes">',
1886                '<hr' + self.empty_element_suffix,
1887                '<ol>',
1888            ]
1889            for i, id in enumerate(self.footnote_ids):
1890                if i != 0:
1891                    footer.append('')
1892                footer.append('<li id="fn-%s">' % id)
1893                footer.append(self._run_block_gamut(self.footnotes[id]))
1894                backlink = ('<a href="#fnref-%s" '
1895                    'class="footnoteBackLink" '
1896                    'title="Jump back to footnote %d in the text.">'
1897                    '&#8617;</a>' % (id, i+1))
1898                if footer[-1].endswith("</p>"):
1899                    footer[-1] = footer[-1][:-len("</p>")] \
1900                        + '&#160;' + backlink + "</p>"
1901                else:
1902                    footer.append("\n<p>%s</p>" % backlink)
1903                footer.append('</li>')
1904            footer.append('</ol>')
1905            footer.append('</div>')
1906            return text + '\n\n' + '\n'.join(footer)
1907        else:
1908            return text
1909
1910    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1911    #   http://bumppo.net/projects/amputator/
1912    _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1913    _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1914    _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)
1915
1916    def _encode_amps_and_angles(self, text):
1917        # Smart processing for ampersands and angle brackets that need
1918        # to be encoded.
1919        text = self._ampersand_re.sub('&amp;', text)
1920
1921        # Encode naked <'s
1922        text = self._naked_lt_re.sub('&lt;', text)
1923
1924        # Encode naked >'s
1925        # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1926        # Markdown) don't do this.
1927        text = self._naked_gt_re.sub('&gt;', text)
1928        return text
1929
1930    def _encode_backslash_escapes(self, text):
1931        for ch, escape in list(self._escape_table.items()):
1932            text = text.replace("\\"+ch, escape)
1933        return text
1934
1935    _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1936    def _auto_link_sub(self, match):
1937        g1 = match.group(1)
1938        return '<a href="%s">%s</a>' % (g1, g1)
1939
    # <addr@example.com> or <mailto:addr@example.com> autolink.
    _auto_email_link_re = re.compile(r"""
          <
           (?:mailto:)?
          (
              [-.\w]+
              \@
              [-\w]+(\.[-\w]+)*\.[a-z]+
          )
          >
        """, re.I | re.X | re.U)
    def _auto_email_link_sub(self, match):
        # Swap any hashed special characters back into the address, then
        # emit it as an obfuscated mailto link.
        return self._encode_email_address(
            self._unescape_special_chars(match.group(1)))
1953
1954    def _do_auto_links(self, text):
1955        text = self._auto_link_re.sub(self._auto_link_sub, text)
1956        text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1957        return text
1958
1959    def _encode_email_address(self, addr):
1960        #  Input: an email address, e.g. "foo@example.com"
1961        #
1962        #  Output: the email address as a mailto link, with each character
1963        #      of the address encoded as either a decimal or hex entity, in
1964        #      the hopes of foiling most address harvesting spam bots. E.g.:
1965        #
1966        #    <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1967        #       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1968        #       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1969        #
1970        #  Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1971        #  mailing list: <http://tinyurl.com/yu7ue>
1972        chars = [_xml_encode_email_char_at_random(ch)
1973                 for ch in "mailto:" + addr]
1974        # Strip the mailto: from the visible part.
1975        addr = '<a href="%s">%s</a>' \
1976               % (''.join(chars), ''.join(chars[7:]))
1977        return addr
1978
    def _do_link_patterns(self, text):
        """Caveat emptor: there isn't much guarding against link
        patterns being formed inside other standard Markdown links, e.g.
        inside a [link def][like this].

        Dev Notes: *Could* consider prefixing regexes with a negative
        lookbehind assertion to attempt to guard against this.
        """
        link_from_hash = {}
        for regex, repl in self.link_patterns:
            # repl may be a template string (expanded against the match)
            # or a callable taking the match.
            replacements = []
            for match in regex.finditer(text):
                if hasattr(repl, "__call__"):
                    href = repl(match)
                else:
                    href = match.expand(repl)
                replacements.append((match.span(), href))
            # Substitute right-to-left so earlier spans remain valid.
            for (start, end), href in reversed(replacements):
                escaped_href = (
                    href.replace('"', '&quot;')  # b/c of attr quote
                        # To avoid markdown <em> and <strong>:
                        .replace('*', self._escape_table['*'])
                        .replace('_', self._escape_table['_']))
                link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
                hash = _hash_text(link)
                link_from_hash[hash] = link
                text = text[:start] + hash + text[end:]
        # Restore the hashed links after all patterns have been applied,
        # so one pattern cannot match inside another's replacement.
        for hash, link in list(link_from_hash.items()):
            text = text.replace(hash, link)
        return text
2009
2010    def _unescape_special_chars(self, text):
2011        # Swap back in all the special characters we've hidden.
2012        for ch, hash in list(self._escape_table.items()):
2013            text = text.replace(hash, ch)
2014        return text
2015
2016    def _outdent(self, text):
2017        # Remove one level of line-leading tabs or spaces
2018        return self._outdent_re.sub('', text)
2019
2020
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # Extras enabled by default for this class (overrides Markdown's).
    extras = ["footnotes", "code-color"]
2034
2035
2036#---- internal support functions
2037
class UnicodeWithAttrs(unicode):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.

    NOTE(review): `unicode` is not a builtin on Python 3; this relies on
    a compatibility alias (presumably ``unicode = str``) defined
    elsewhere in this module -- confirm.
    """
    # Optional metadata attached by conversion; None by default.
    metadata = None
    # List of (level, id, name) header tuples; None until set by conversion.
    _toc = None
    def toc_html(self):
        """Return the HTML for the current TOC.

        This expects the `_toc` attribute to have been set on this instance.
        """
        if self._toc is None:
            return None

        def indent():
            return '  ' * (len(h_stack) - 1)
        lines = []
        h_stack = [0]   # stack of header-level numbers
        for level, id, name in self._toc:
            if level > h_stack[-1]:
                # Deeper header: open a nested <ul>.
                lines.append("%s<ul>" % indent())
                h_stack.append(level)
            elif level == h_stack[-1]:
                # Sibling header: close the previous <li> first.
                lines[-1] += "</li>"
            else:
                # Shallower header: close lists back up to its level.
                while level < h_stack[-1]:
                    h_stack.pop()
                    if not lines[-1].endswith("</li>"):
                        lines[-1] += "</li>"
                    lines.append("%s</ul></li>" % indent())
            lines.append('%s<li><a href="#%s">%s</a>' % (
                indent(), id, name))
        # Close any lists still open at the end of the TOC.
        while len(h_stack) > 1:
            h_stack.pop()
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul>" % indent())
        return '\n'.join(lines) + '\n'
    toc_html = property(toc_html)
2078
2079## {{{ http://code.activestate.com/recipes/577257/ (r1)
2080_slugify_strip_re = re.compile(r'[^\w\s-]')
2081_slugify_hyphenate_re = re.compile(r'[-\s]+')
2082def _slugify(value):
2083    """
2084    Normalizes string, converts to lowercase, removes non-alpha characters,
2085    and converts spaces to hyphens.
2086
2087    From Django's "django/template/defaultfilters.py".
2088    """
2089    import unicodedata
2090    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
2091    value = _slugify_strip_re.sub('', value).strip().lower()
2092    return _slugify_hyphenate_re.sub('-', value)
2093## end of http://code.activestate.com/recipes/577257/ }}}
2094
2095
2096# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
2097def _curry(*args, **kwargs):
2098    function, args = args[0], args[1:]
2099    def result(*rest, **kwrest):
2100        combined = kwargs.copy()
2101        combined.update(kwrest)
2102        return function(*args + rest, **combined)
2103    return result
2104
2105# Recipe: regex_from_encoded_pattern (1.0)
2106def _regex_from_encoded_pattern(s):
2107    """'foo'    -> re.compile(re.escape('foo'))
2108       '/foo/'  -> re.compile('foo')
2109       '/foo/i' -> re.compile('foo', re.I)
2110    """
2111    if s.startswith('/') and s.rfind('/') != 0:
2112        # Parse it: /PATTERN/FLAGS
2113        idx = s.rfind('/')
2114        pattern, flags_str = s[1:idx], s[idx+1:]
2115        flag_from_char = {
2116            "i": re.IGNORECASE,
2117            "l": re.LOCALE,
2118            "s": re.DOTALL,
2119            "m": re.MULTILINE,
2120            "u": re.UNICODE,
2121        }
2122        flags = 0
2123        for char in flags_str:
2124            try:
2125                flags |= flag_from_char[char]
2126            except KeyError:
2127                raise ValueError("unsupported regex flag: '%s' in '%s' "
2128                                 "(must be one of '%s')"
2129                                 % (char, s, ''.join(list(flag_from_char.keys()))))
2130        return re.compile(s[1:idx], flags)
2131    else: # not an encoded regex
2132        return re.compile(re.escape(s))
2133
2134# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    # Pass 1: find the smallest indent (the "margin") over all
    # non-whitespace-only lines.
    margin = None
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                # Advance to the next tab stop.
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue # skip all-whitespace lines
            else:
                break
        else:
            continue # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    # Pass 2: strip `margin` columns of leading whitespace from each
    # line, in place.
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    # Whitespace-only line: drop everything before the EOL.
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # A tab straddled the margin boundary: keep the
                    # overshoot as literal spaces.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
2203
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    # Keep line endings (True, not the opaque `1` flag) so the dedented
    # lines can be rejoined verbatim.
    lines = text.splitlines(True)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
2218
2219
2220class _memoized(object):
2221   """Decorator that caches a function's return value each time it is called.
2222   If called later with the same arguments, the cached value is returned, and
2223   not re-evaluated.
2224
2225   http://wiki.python.org/moin/PythonDecoratorLibrary
2226   """
2227   def __init__(self, func):
2228      self.func = func
2229      self.cache = {}
2230   def __call__(self, *args):
2231      try:
2232         return self.cache[args]
2233      except KeyError:
2234         self.cache[args] = value = self.func(*args)
2235         return value
2236      except TypeError:
2237         # uncachable -- for instance, passing a list as an argument.
2238         # Better to not cache than to blow up entirely.
2239         return self.func(*args)
2240   def __repr__(self):
2241      """Return the function's docstring."""
2242      return self.func.__doc__
2243
2244
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    # Build the verbose-mode pattern, allowing up to tab_width-1 columns
    # of leading indentation before the tag.
    pattern = r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1)
    return re.compile(pattern, re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
2265
def _hr_tag_re_from_tab_width(tab_width):
    """Regex matching a standalone <hr> tag on its own line (per tab width)."""
    # Build the verbose-mode pattern, allowing up to tab_width-1 columns
    # of leading indentation before the tag.
    pattern = r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1)
    return re.compile(pattern, re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2284
2285
2286def _xml_escape_attr(attr, skip_single_quote=True):
2287    """Escape the given string for use in an HTML/XML tag attribute.
2288
2289    By default this doesn't bother with escaping `'` to `&#39;`, presuming that
2290    the tag attribute is surrounded by double quotes.
2291    """
2292    escaped = (attr
2293        .replace('&', '&amp;')
2294        .replace('"', '&quot;')
2295        .replace('<', '&lt;')
2296        .replace('>', '&gt;'))
2297    if not skip_single_quote:
2298        escaped = escaped.replace("'", "&#39;")
2299    return escaped
2300
2301
2302def _xml_encode_email_char_at_random(ch):
2303    r = random()
2304    # Roughly 10% raw, 45% hex, 45% dec.
2305    # '@' *must* be encoded. I [John Gruber] insist.
2306    # Issue 26: '_' must be encoded.
2307    if r > 0.9 and ch not in "@_":
2308        return ch
2309    elif r < 0.45:
2310        # The [1:] is to drop leading '0': 0x63 -> x63
2311        return '&#%s;' % hex(ord(ch))[1:]
2312    else:
2313        return '&#%s;' % ord(ch)
2314
2315
2316
2317#---- mainline
2318
2319class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2320    """An optparse formatter that does NOT reflow the description."""
2321    def format_description(self, description):
2322        return description or ""
2323
2324def _test():
2325    import doctest
2326    doctest.testmod()
2327
def main(argv=None):
    """Command-line entry point for markdown2.

    Parses options, then converts each given path (or stdin, when no paths
    are given or a path is '-') from Markdown to HTML on stdout.
    Returns None, except for --self-test which returns _test()'s result.
    """
    if argv is None:
        argv = sys.argv
    # Only configure logging if the embedding application hasn't already.
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    # Parse "-x name[=arg]" values (comma/semicolon/colon/space separated)
    # into the {extra_name: arg_or_None} dict that markdown() expects.
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        # Numeric extra arguments are passed through as ints.
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    # Load "(pattern) (href)" pairs from the link-patterns file, skipping
    # blank lines and '#'-comment lines.
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    # Location of the reference Perl implementation (used by --compare).
    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    if not paths:
        # No path arguments: read Markdown from stdin.
        paths = ['-']
    for path in paths:
        if path == '-':
            text = sys.stdin.read()
        else:
            fp = codecs.open(path, 'r', opts.encoding)
            text = fp.read()
            fp.close()
        # With --compare, first pipe the input through Markdown.pl and
        # print its output for side-by-side inspection.
        if opts.compare:
            from subprocess import Popen, PIPE
            print("==== Markdown.pl ====")
            p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
            p.stdin.write(text.encode('utf-8'))
            p.stdin.close()
            perl_html = p.stdout.read().decode('utf-8')
            if py3:
                sys.stdout.write(perl_html)
            else:
                sys.stdout.write(perl_html.encode(
                    sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
            print("==== markdown2.py ====")
        html = markdown(text,
            html4tags=opts.html4tags,
            safe_mode=opts.safe_mode,
            extras=extras, link_patterns=link_patterns,
            use_file_vars=opts.use_file_vars)
        if py3:
            sys.stdout.write(html)
        else:
            # Python 2: encode for the console, falling back to char refs.
            sys.stdout.write(html.encode(
                sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if extras and "toc" in extras:
            log.debug("toc_html: " +
                html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        # With --compare, report whether the two outputs match, normalizing
        # both first when the test support module is available.
        if opts.compare:
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))
2456
2457
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    sys.exit( main(sys.argv) )
# Note: See TracBrowser for help on using the repository browser.