source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/contrib/markdown/markdown2.py

main
Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago

Historial Limpio

  • Property mode set to 100755
File size: 96.7 KB
Line 
1#!/usr/bin/env python
2# Copyright (c) 2012 Trent Mick.
3# Copyright (c) 2007-2008 ActiveState Corp.
4# License: MIT (http://www.opensource.org/licenses/mit-license.php)
5
6from __future__ import generators
7from __future__ import print_function
8
9r"""A fast and complete Python implementation of Markdown.
10
11[from http://daringfireball.net/projects/markdown/]
12> Markdown is a text-to-HTML filter; it translates an easy-to-read /
13> easy-to-write structured text format into HTML.  Markdown's text
14> format is most similar to that of plain text email, and supports
15> features such as headers, *emphasis*, code blocks, blockquotes, and
16> links.
17>
18> Markdown's syntax is designed not as a generic markup language, but
19> specifically to serve as a front-end to (X)HTML. You can use span-level
20> HTML tags anywhere in a Markdown document, and you can use block level
21> HTML tags (like <div> and <table> as well).
22
23Module usage:
24
25    >>> import markdown2
26    >>> markdown2.markdown("*boo!*")  # or use `html = markdown_path(PATH)`
27    u'<p><em>boo!</em></p>\n'
28
29    >>> markdowner = Markdown()
30    >>> markdowner.convert("*boo!*")
31    u'<p><em>boo!</em></p>\n'
32    >>> markdowner.convert("**boom!**")
33    u'<p><strong>boom!</strong></p>\n'
34
35This implementation of Markdown implements the full "core" syntax plus a
36number of extras (e.g., code syntax coloring, footnotes) as described on
37<https://github.com/trentm/python-markdown2/wiki/Extras>.
38"""
39
# Long-form help text for the command-line driver (optparse is imported
# below; presumably used as the -h/--help description by the CLI entry
# point, which is outside this chunk -- TODO confirm).
cmdln_desc = """A fast and complete Python implementation of Markdown, a
text-to-HTML conversion tool for web writers.

Supported extra syntax options (see -x|--extras option below and
see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):

* code-friendly: Disable _ and __ for em and strong.
* cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
* fenced-code-blocks: Allows a code block to not have to be indented
  by fencing it with '```' on a line before and after. Based on
  <http://github.github.com/github-flavored-markdown/> with support for
  syntax highlighting.
* footnotes: Support footnotes as in use on daringfireball.net and
  implemented in other Markdown processors (tho not in Markdown.pl v1.0.1).
* header-ids: Adds "id" attributes to headers. The id value is a slug of
  the header text.
* html-classes: Takes a dict mapping html tag names (lowercase) to a
  string to use for a "class" tag attribute. Currently only supports "img",
  "table", "pre" and "code" tags. Add an issue if you require this for other
  tags.
* markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
  have markdown processing be done on its contents. Similar to
  <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
  some limitations.
* metadata: Extract metadata from a leading '---'-fenced block.
  See <https://github.com/trentm/python-markdown2/issues/77> for details.
* nofollow: Add `rel="nofollow"` to add `<a>` tags with an href. See
  <http://en.wikipedia.org/wiki/Nofollow>.
* pyshell: Treats unindented Python interactive shell sessions as <code>
  blocks.
* link-patterns: Auto-link given regex patterns in text (e.g. bug number
  references, revision number references).
* smarty-pants: Replaces ' and " with curly quotation marks or curly
  apostrophes.  Replaces --, ---, ..., and . . . with en dashes, em dashes,
  and ellipses.
* spoiler: A special kind of blockquote commonly hidden behind a
  click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
* toc: The returned HTML string gets a new "toc_html" attribute which is
  a Table of Contents for the document. (experimental)
* xml: Passes one-liner processing instructions and namespaced XML tags.
* tables: Tables using the same format as GFM
  <https://help.github.com/articles/github-flavored-markdown#tables> and
  PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
* wiki-tables: Google Code Wiki-style tables. See
  <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
"""
86
87# Dev Notes:
88# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
89#   not yet sure if there implications with this. Compare 'pydoc sre'
90#   and 'perldoc perlre'.
91
# Package version kept as a tuple for programmatic comparison; the dotted
# string form is derived from it so the two can never drift apart.
__version_info__ = (2, 3, 1)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"
95
96import sys
97import re
98import logging
99try:
100    from hashlib import md5
101except ImportError:
102    from md5 import md5
103import optparse
104from random import random, randint
105import codecs
106
107
#---- Python version compat

# `reversed` became a builtin in Python 2.4; provide a generator
# fallback for older interpreters (it only needs to support sequences).
if sys.version_info[:2] < (2,4):
    def reversed(sequence):
        for i in sequence[::-1]:
            yield i

# Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
if sys.version_info[0] <= 2:
    py3 = False
    try:
        bytes
    except NameError:
        # Very old Python 2 without the `bytes` alias for `str`.
        bytes = str
    # `basestring` only exists on Python 2.
    base_string_type = basestring
elif sys.version_info[0] >= 3:
    py3 = True
    unicode = str
    base_string_type = str
127
128
129
130#---- globals
131
DEBUG = False
log = logging.getLogger("markdown")

DEFAULT_TAB_WIDTH = 4


# Per-process random salt mixed into every placeholder hash so crafted
# input cannot collide with the hash keys of real blocks.
#
# BUG FIX: this was `bytes(randint(0, 1000000))`, which on Python 3 does
# NOT produce the digits of the number -- `bytes(n)` creates `n` NUL
# bytes, i.e. a zero-filled buffer of random length up to ~1MB. Encoding
# the decimal string gives the intended short salt on both Python 2
# (where `bytes is str`) and Python 3.
SECRET_SALT = str(randint(0, 1000000)).encode("utf-8")

def _hash_text(s):
    """Return a salted placeholder key for `s`, e.g. 'md5-<32 hex chars>'."""
    return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()

# Table of hash values for escaped characters:
g_escape_table = dict([(ch, _hash_text(ch))
    for ch in '\\`*_{}[]()>#+-.!'])
145
146
147
148#---- exceptions
149
class MarkdownError(Exception):
    """Exception type for this module (raised by processing code
    outside this chunk -- no raise sites are visible here)."""
    pass
152
153
154
155#---- public api
156
def markdown_path(path, encoding="utf-8",
                  html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
                  safe_mode=None, extras=None, link_patterns=None,
                  use_file_vars=False):
    """Read the file at `path` (decoded with `encoding`) and convert its
    contents from Markdown to HTML.

    All other arguments are passed straight through to the `Markdown`
    constructor; see it for their meaning. Returns the converted HTML.
    """
    # `with` guarantees the handle is closed even if read() raises
    # (the original leaked the file object on a read error).
    with codecs.open(path, 'r', encoding) as fp:
        text = fp.read()
    return Markdown(html4tags=html4tags, tab_width=tab_width,
                    safe_mode=safe_mode, extras=extras,
                    link_patterns=link_patterns,
                    use_file_vars=use_file_vars).convert(text)
168
def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
             safe_mode=None, extras=None, link_patterns=None,
             use_file_vars=False):
    """One-shot convenience: build a `Markdown` converter from the given
    options and return its `convert(text)` result.
    """
    converter = Markdown(html4tags=html4tags, tab_width=tab_width,
                         safe_mode=safe_mode, extras=extras,
                         link_patterns=link_patterns,
                         use_file_vars=use_file_vars)
    return converter.convert(text)
176
class Markdown(object):
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    # Per-conversion state, (re)initialized by reset():
    urls = None         # link-definition id -> URL (see _extract_link_def_sub)
    titles = None       # link-definition id -> optional title
    html_blocks = None  # placeholder hash -> raw block-level HTML
    html_spans = None   # placeholder hash -> raw span HTML (safe mode only)
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    list_level = 0

    # Lines containing only spaces/tabs; convert() blanks these out so
    # later regexes can match blank-line runs with a plain `\n+`.
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
197
198    def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
199                 extras=None, link_patterns=None, use_file_vars=False):
200        if html4tags:
201            self.empty_element_suffix = ">"
202        else:
203            self.empty_element_suffix = " />"
204        self.tab_width = tab_width
205
206        # For compatibility with earlier markdown2.py and with
207        # markdown.py's safe_mode being a boolean,
208        #   safe_mode == True -> "replace"
209        if safe_mode is True:
210            self.safe_mode = "replace"
211        else:
212            self.safe_mode = safe_mode
213
214        # Massaging and building the "extras" info.
215        if self.extras is None:
216            self.extras = {}
217        elif not isinstance(self.extras, dict):
218            self.extras = dict([(e, None) for e in self.extras])
219        if extras:
220            if not isinstance(extras, dict):
221                extras = dict([(e, None) for e in extras])
222            self.extras.update(extras)
223        assert isinstance(self.extras, dict)
224        if "toc" in self.extras and not "header-ids" in self.extras:
225            self.extras["header-ids"] = None   # "toc" implies "header-ids"
226        self._instance_extras = self.extras.copy()
227
228        self.link_patterns = link_patterns
229        self.use_file_vars = use_file_vars
230        self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
231
232        self._escape_table = g_escape_table.copy()
233        if "smarty-pants" in self.extras:
234            self._escape_table['"'] = _hash_text('"')
235            self._escape_table["'"] = _hash_text("'")
236
237    def reset(self):
238        self.urls = {}
239        self.titles = {}
240        self.html_blocks = {}
241        self.html_spans = {}
242        self.list_level = 0
243        self.extras = self._instance_extras.copy()
244        if "footnotes" in self.extras:
245            self.footnotes = {}
246            self.footnote_ids = []
247        if "header-ids" in self.extras:
248            self._count_from_header_id = {} # no `defaultdict` in Python 2.4
249        if "metadata" in self.extras:
250            self.metadata = {}
251
    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    # Captures the tag name and everything through `href=` so convert()
    # can splice ` rel="nofollow"` in between for the "nofollow" extra.
    _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
255
    def convert(self, text):
        """Convert the given Markdown `text` to HTML.

        Returns a `UnicodeWithAttrs` (a unicode/str subclass) so that
        extras can attach attributes to the result: "toc" attaches the
        table of contents, "metadata" attaches the extracted metadata
        dict.
        """
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # strip metadata from head and extract
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        text = self.preprocess(text)

        # In non-safe mode fenced code blocks are handled up front;
        # in safe mode they are handled only after HTML spans/blocks
        # have been hashed (see below).
        if "fenced-code-blocks" in self.extras and not self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras and self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        text += "\n"

        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            # `self._toc` is presumably populated during header
            # processing (defined outside this chunk) -- TODO confirm.
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv
353
354    def postprocess(self, text):
355        """A hook for subclasses to do some postprocessing of the html, if
356        desired. This is called before unescaping of special chars and
357        unhashing of raw HTML spans.
358        """
359        return text
360
361    def preprocess(self, text):
362        """A hook for subclasses to do some preprocessing of the Markdown, if
363        desired. This is called after basic formatting of the text, but prior
364        to any extras, safe mode, etc. processing.
365        """
366        return text
367
    # Is metadata if the content starts with '---'-fenced `key: value`
    # pairs. E.g. (indented for presentation):
    #   ---
    #   foo: bar
    #   another-var: blah blah
    #   ---
    # Group 1 captures the `key: value` lines between the two fences.
    _metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
375
376    def _extract_metadata(self, text):
377        # fast test
378        if not text.startswith("---"):
379            return text
380        match = self._metadata_pat.match(text)
381        if not match:
382            return text
383
384        tail = text[len(match.group(0)):]
385        metadata_str = match.group(1).strip()
386        for line in metadata_str.split('\n'):
387            key, value = line.split(':', 1)
388            self.metadata[key.strip()] = value.strip()
389
390        return tail
391
392
    # Matches an emacs one-liner such as `-*- mode: markdown -*-`,
    # capturing the text between the `-*-` markers as group 1.
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #    PREFIX Local Variables: SUFFIX
    #    PREFIX mode: Tcl SUFFIX
    #    PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    # - `\1` in the content group backreferences the captured `prefix`,
    #   so the block only matches through a repeated-prefix "End:" line.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
408
409    def _get_emacs_vars(self, text):
410        """Return a dictionary of emacs-style local variables.
411
412        Parsing is done loosely according to this spec (and according to
413        some in-practice deviations from this):
414        http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
415        """
416        emacs_vars = {}
417        SIZE = pow(2, 13) # 8kB
418
419        # Search near the start for a '-*-'-style one-liner of variables.
420        head = text[:SIZE]
421        if "-*-" in head:
422            match = self._emacs_oneliner_vars_pat.search(head)
423            if match:
424                emacs_vars_str = match.group(1)
425                assert '\n' not in emacs_vars_str
426                emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
427                                  if s.strip()]
428                if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
429                    # While not in the spec, this form is allowed by emacs:
430                    #   -*- Tcl -*-
431                    # where the implied "variable" is "mode". This form
432                    # is only allowed if there are no other variables.
433                    emacs_vars["mode"] = emacs_var_strs[0].strip()
434                else:
435                    for emacs_var_str in emacs_var_strs:
436                        try:
437                            variable, value = emacs_var_str.strip().split(':', 1)
438                        except ValueError:
439                            log.debug("emacs variables error: malformed -*- "
440                                      "line: %r", emacs_var_str)
441                            continue
442                        # Lowercase the variable name because Emacs allows "Mode"
443                        # or "mode" or "MoDe", etc.
444                        emacs_vars[variable.lower()] = value.strip()
445
446        tail = text[-SIZE:]
447        if "Local Variables" in tail:
448            match = self._emacs_local_vars_pat.search(tail)
449            if match:
450                prefix = match.group("prefix")
451                suffix = match.group("suffix")
452                lines = match.group("content").splitlines(0)
453                #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
454                #      % (prefix, suffix, match.group("content"), lines)
455
456                # Validate the Local Variables block: proper prefix and suffix
457                # usage.
458                for i, line in enumerate(lines):
459                    if not line.startswith(prefix):
460                        log.debug("emacs variables error: line '%s' "
461                                  "does not use proper prefix '%s'"
462                                  % (line, prefix))
463                        return {}
464                    # Don't validate suffix on last line. Emacs doesn't care,
465                    # neither should we.
466                    if i != len(lines)-1 and not line.endswith(suffix):
467                        log.debug("emacs variables error: line '%s' "
468                                  "does not use proper suffix '%s'"
469                                  % (line, suffix))
470                        return {}
471
472                # Parse out one emacs var per line.
473                continued_for = None
474                for line in lines[:-1]: # no var on the last line ("PREFIX End:")
475                    if prefix: line = line[len(prefix):] # strip prefix
476                    if suffix: line = line[:-len(suffix)] # strip suffix
477                    line = line.strip()
478                    if continued_for:
479                        variable = continued_for
480                        if line.endswith('\\'):
481                            line = line[:-1].rstrip()
482                        else:
483                            continued_for = None
484                        emacs_vars[variable] += ' ' + line
485                    else:
486                        try:
487                            variable, value = line.split(':', 1)
488                        except ValueError:
489                            log.debug("local variables error: missing colon "
490                                      "in local variables entry: '%s'" % line)
491                            continue
492                        # Do NOT lowercase the variable name, because Emacs only
493                        # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
494                        value = value.strip()
495                        if value.endswith('\\'):
496                            value = value[:-1].rstrip()
497                            continued_for = variable
498                        else:
499                            continued_for = None
500                        emacs_vars[variable] = value
501
502        # Unquote values.
503        for var, val in list(emacs_vars.items()):
504            if len(val) > 1 and (val.startswith('"') and val.endswith('"')
505               or val.startswith('"') and val.endswith('"')):
506                emacs_vars[var] = val[1:-1]
507
508        return emacs_vars
509
    # Cribbed from a post by Bart Lateur:
    # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
    # One match per tab; `.` does not cross newlines, and because subn()
    # consumes matches sequentially, group 1 is the text since the last
    # tab (or line start).
    _detab_re = re.compile(r'(.*?)\t', re.M)
    def _detab_sub(self, match):
        # Replace the tab with enough spaces to reach the next tab stop
        # (replacement lengths are multiples of tab_width, so column
        # arithmetic stays consistent across successive matches).
        g1 = match.group(1)
        return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
516    def _detab(self, text):
517        r"""Remove (leading?) tabs from a file.
518
519            >>> m = Markdown()
520            >>> m._detab("\tfoo")
521            '    foo'
522            >>> m._detab("  \tfoo")
523            '    foo'
524            >>> m._detab("\t  foo")
525            '      foo'
526            >>> m._detab("  foo")
527            '  foo'
528            >>> m._detab("  foo\n\tbar\tblam")
529            '  foo\n    bar blam'
530        """
531        if '\t' not in text:
532            return text
533        return self._detab_re.subn(self._detab_sub, text)[0]
534
    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b.  This way html5 tags are easy to keep track of.
    # Note: starts with '|' so it can be appended directly onto the
    # alternation strings below.
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    # "Strict" form: the closing tag must sit at the start of a line
    # (it directly follows full `.*\n` lines), so same-name nested tags
    # must be indented in order not to terminate the match early.
    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    # "Liberal" form: like the strict one, but the end tag need not be
    # at the start of a line (note the `.*` before `</\2>`).
    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    # Matches a `markdown="1"` / `markdown='1'` attribute (used by the
    # "markdown-in-html" extra).
    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
573    def _hash_html_block_sub(self, match, raw=False):
574        html = match.group(1)
575        if raw and self.safe_mode:
576            html = self._sanitize_html(html)
577        elif 'markdown-in-html' in self.extras and 'markdown=' in html:
578            first_line = html.split('\n', 1)[0]
579            m = self._html_markdown_attr_re.search(first_line)
580            if m:
581                lines = html.split('\n')
582                middle = '\n'.join(lines[1:-1])
583                last_line = lines[-1]
584                first_line = first_line[:m.start()] + first_line[m.end():]
585                f_key = _hash_text(first_line)
586                self.html_blocks[f_key] = first_line
587                l_key = _hash_text(last_line)
588                self.html_blocks[l_key] = last_line
589                return ''.join(["\n\n", f_key,
590                    "\n\n", middle, "\n\n",
591                    l_key, "\n\n"])
592        key = _hash_text(html)
593        self.html_blocks[key] = html
594        return "\n\n" + key + "\n\n"
595
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        """
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    # No more comment openers: done.
                    break
                try:
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    # Unterminated comment: stop scanning.
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        # NOTE(review): this `break` aborts the entire
                        # comment scan, so any later standalone comments in
                        # the document are left unhashed -- confirm this
                        # mirrors Markdown.pl rather than being an
                        # accidental early exit (a `continue` would keep
                        # scanning).
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be following by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    # Not a standalone comment; skip it and keep scanning.
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text
703
704    def _strip_link_definitions(self, text):
705        # Strips link definitions from text, stores the URLs and titles in
706        # hash references.
707        less_than_tab = self.tab_width - 1
708
709        # Link defs are in the form:
710        #   [id]: url "optional title"
711        _link_def_re = re.compile(r"""
712            ^[ ]{0,%d}\[(.+)\]: # id = \1
713              [ \t]*
714              \n?               # maybe *one* newline
715              [ \t]*
716            <?(.+?)>?           # url = \2
717              [ \t]*
718            (?:
719                \n?             # maybe one newline
720                [ \t]*
721                (?<=\s)         # lookbehind for whitespace
722                ['"(]
723                ([^\n]*)        # title = \3
724                ['")]
725                [ \t]*
726            )?  # title is optional
727            (?:\n+|\Z)
728            """ % less_than_tab, re.X | re.M | re.U)
729        return _link_def_re.sub(self._extract_link_def_sub, text)
730
731    def _extract_link_def_sub(self, match):
732        id, url, title = match.groups()
733        key = id.lower()    # Link IDs are case-insensitive
734        self.urls[key] = self._encode_amps_and_angles(url)
735        if title:
736            self.titles[key] = title
737        return ""
738
739    def _extract_footnote_def_sub(self, match):
740        id, text = match.groups()
741        text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
742        normed_id = re.sub(r'\W', '-', id)
743        # Ensure footnote text ends with a couple newlines (for some
744        # block gamut matches).
745        self.footnotes[normed_id] = text + "\n\n"
746        return ""
747
748    def _strip_footnote_definitions(self, text):
749        """A footnote definition looks like this:
750
751            [^note-id]: Text of the note.
752
753                May include one or more indented paragraphs.
754
755        Where,
756        - The 'note-id' can be pretty much anything, though typically it
757          is the number of the footnote.
758        - The first paragraph may start on the next line, like so:
759
760            [^note-id]:
761                Text of the note.
762        """
763        less_than_tab = self.tab_width - 1
764        footnote_def_re = re.compile(r'''
765            ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
766            [ \t]*
767            (                       # footnote text = \2
768              # First line need not start with the spaces.
769              (?:\s*.*\n+)
770              (?:
771                (?:[ ]{%d} | \t)  # Subsequent lines must be indented.
772                .*\n+
773              )*
774            )
775            # Lookahead for non-space at line-start, or end of doc.
776            (?:(?=^[ ]{0,%d}\S)|\Z)
777            ''' % (less_than_tab, self.tab_width, self.tab_width),
778            re.X | re.M)
779        return footnote_def_re.sub(self._extract_footnote_def_sub, text)
780
781    _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
782
    def _run_block_gamut(self, text):
        """Apply all block-level transformations (headers, hr's, lists,
        code blocks, blockquotes, optional tables) to *text*.

        NB: the ordering of the steps below matters; paragraphs are
        formed last, after newly-generated block HTML has been hashed
        out of the way.
        """
        # These are all the transformations that form block-level
        # tags like paragraphs, headers, and list items.

        if "fenced-code-blocks" in self.extras:
            text = self._do_fenced_code_blocks(text)

        text = self._do_headers(text)

        # Do Horizontal Rules:
        # On the number of spaces in horizontal rules: The spec is fuzzy: "If
        # you wish, you may use spaces between the hyphens or asterisks."
        # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
        # hr chars to one or two. We'll reproduce that limit here.
        hr = "\n<hr"+self.empty_element_suffix+"\n"
        text = re.sub(self._hr_re, hr, text)

        text = self._do_lists(text)

        # Optional table/pyshell extras run between lists and code blocks.
        if "pyshell" in self.extras:
            text = self._prepare_pyshell_blocks(text)
        if "wiki-tables" in self.extras:
            text = self._do_wiki_tables(text)
        if "tables" in self.extras:
            text = self._do_tables(text)

        text = self._do_code_blocks(text)

        text = self._do_block_quotes(text)

        # We already ran _HashHTMLBlocks() before, in Markdown(), but that
        # was to escape raw HTML in the original Markdown source. This time,
        # we're escaping the markup we've just created, so that we don't wrap
        # <p> tags around block-level tags.
        text = self._hash_html_blocks(text)

        text = self._form_paragraphs(text)

        return text
822
823    def _pyshell_block_sub(self, match):
824        lines = match.group(0).splitlines(0)
825        _dedentlines(lines)
826        indent = ' ' * self.tab_width
827        s = ('\n' # separate from possible cuddled paragraph
828             + indent + ('\n'+indent).join(lines)
829             + '\n\n')
830        return s
831
832    def _prepare_pyshell_blocks(self, text):
833        """Ensure that Python interactive shell sessions are put in
834        code blocks -- even if not properly indented.
835        """
836        if ">>>" not in text:
837            return text
838
839        less_than_tab = self.tab_width - 1
840        _pyshell_block_re = re.compile(r"""
841            ^([ ]{0,%d})>>>[ ].*\n   # first line
842            ^(\1.*\S+.*\n)*         # any number of subsequent lines
843            ^\n                     # ends with a blank line
844            """ % less_than_tab, re.M | re.X)
845
846        return _pyshell_block_re.sub(self._pyshell_block_sub, text)
847
848    def _table_sub(self, match):
849        trim_space_re = '^[ \t\n]+|[ \t\n]+$'
850        trim_bar_re = '^\||\|$'
851
852        head, underline, body = match.groups()
853
854        # Determine aligns for columns.
855        cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)).split('|')]
856        align_from_col_idx = {}
857        for col_idx, col in enumerate(cols):
858            if col[0] == ':' and col[-1] == ':':
859                align_from_col_idx[col_idx] = ' align="center"'
860            elif col[0] == ':':
861                align_from_col_idx[col_idx] = ' align="left"'
862            elif col[-1] == ':':
863                align_from_col_idx[col_idx] = ' align="right"'
864
865        # thead
866        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead>', '<tr>']
867        cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)).split('|')]
868        for col_idx, col in enumerate(cols):
869            hlines.append('  <th%s>%s</th>' % (
870                align_from_col_idx.get(col_idx, ''),
871                self._run_span_gamut(col)
872            ))
873        hlines.append('</tr>')
874        hlines.append('</thead>')
875
876        # tbody
877        hlines.append('<tbody>')
878        for line in body.strip('\n').split('\n'):
879            hlines.append('<tr>')
880            cols = [cell.strip() for cell in re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)).split('|')]
881            for col_idx, col in enumerate(cols):
882                hlines.append('  <td%s>%s</td>' % (
883                    align_from_col_idx.get(col_idx, ''),
884                    self._run_span_gamut(col)
885                ))
886            hlines.append('</tr>')
887        hlines.append('</tbody>')
888        hlines.append('</table>')
889
890        return '\n'.join(hlines) + '\n'
891
892    def _do_tables(self, text):
893        """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
894        https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
895        """
896        less_than_tab = self.tab_width - 1
897        table_re = re.compile(r'''
898                (?:(?<=\n\n)|\A\n?)             # leading blank line
899
900                ^[ ]{0,%d}                      # allowed whitespace
901                (.*[|].*)  \n                   # $1: header row (at least one pipe)
902
903                ^[ ]{0,%d}                      # allowed whitespace
904                (                               # $2: underline row
905                    # underline row with leading bar
906                    (?:  \|\ *:?-+:?\ *  )+  \|?  \n
907                    |
908                    # or, underline row without leading bar
909                    (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )?  \n
910                )
911
912                (                               # $3: data rows
913                    (?:
914                        ^[ ]{0,%d}(?!\ )         # ensure line begins with 0 to less_than_tab spaces
915                        .*\|.*  \n
916                    )+
917                )
918            ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
919        return table_re.sub(self._table_sub, text)
920
921    def _wiki_table_sub(self, match):
922        ttext = match.group(0).strip()
923        #print 'wiki table: %r' % match.group(0)
924        rows = []
925        for line in ttext.splitlines(0):
926            line = line.strip()[2:-2].strip()
927            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
928            rows.append(row)
929        #pprint(rows)
930        hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<tbody>']
931        for row in rows:
932            hrow = ['<tr>']
933            for cell in row:
934                hrow.append('<td>')
935                hrow.append(self._run_span_gamut(cell))
936                hrow.append('</td>')
937            hrow.append('</tr>')
938            hlines.append(''.join(hrow))
939        hlines += ['</tbody>', '</table>']
940        return '\n'.join(hlines) + '\n'
941
942    def _do_wiki_tables(self, text):
943        # Optimization.
944        if "||" not in text:
945            return text
946
947        less_than_tab = self.tab_width - 1
948        wiki_table_re = re.compile(r'''
949            (?:(?<=\n\n)|\A\n?)            # leading blank line
950            ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n  # first line
951            (^\1\|\|.+?\|\|\n)*        # any number of subsequent lines
952            ''' % less_than_tab, re.M | re.X)
953        return wiki_table_re.sub(self._wiki_table_sub, text)
954
    def _run_span_gamut(self, text):
        """Apply all span-level transformations (code spans, links,
        emphasis, hard breaks, ...) to *text*.

        NB: the ordering of the steps below matters; see the inline
        comments (e.g. auto-links must run after inline links).
        """
        # These are all the transformations that occur *within* block-level
        # tags like paragraphs, headers, and list items.

        text = self._do_code_spans(text)

        text = self._escape_special_chars(text)

        # Process anchor and image tags.
        text = self._do_links(text)

        # Make links out of things like `<http://example.com/>`
        # Must come after _do_links(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        text = self._do_auto_links(text)

        if "link-patterns" in self.extras:
            text = self._do_link_patterns(text)

        text = self._encode_amps_and_angles(text)

        if "strike" in self.extras:
            text = self._do_strike(text)

        text = self._do_italics_and_bold(text)

        if "smarty-pants" in self.extras:
            text = self._do_smart_punctuation(text)

        # Do hard breaks:
        # "break-on-newline" turns *every* line end into a <br>; otherwise
        # only lines ending in two-or-more spaces get one (core Markdown).
        if "break-on-newline" in self.extras:
            text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
        else:
            text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)

        return text
991
    # "Sorta" because auto-links are identified as "tag" tokens.
    # The whole pattern is one capturing group, so re.split() with it
    # yields a list that alternates between non-markup text and HTML-ish
    # tokens (tags, auto-links, comments, processing instructions),
    # starting with non-markup text; see _escape_special_chars() and
    # _hash_html_spans().
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)
1009
1010    def _escape_special_chars(self, text):
1011        # Python markdown note: the HTML tokenization here differs from
1012        # that in Markdown.pl, hence the behaviour for subtle cases can
1013        # differ (I believe the tokenizer here does a better job because
1014        # it isn't susceptible to unmatched '<' and '>' in HTML tags).
1015        # Note, however, that '>' is not allowed in an auto-link URL
1016        # here.
1017        escaped = []
1018        is_html_markup = False
1019        for token in self._sorta_html_tokenize_re.split(text):
1020            if is_html_markup:
1021                # Within tags/HTML-comments/auto-links, encode * and _
1022                # so they don't conflict with their use in Markdown for
1023                # italics and strong.  We're replacing each such
1024                # character with its corresponding MD5 checksum value;
1025                # this is likely overkill, but it should prevent us from
1026                # colliding with the escape values by accident.
1027                escaped.append(token.replace('*', self._escape_table['*'])
1028                                    .replace('_', self._escape_table['_']))
1029            else:
1030                escaped.append(self._encode_backslash_escapes(token))
1031            is_html_markup = not is_html_markup
1032        return ''.join(escaped)
1033
1034    def _hash_html_spans(self, text):
1035        # Used for safe_mode.
1036
1037        def _is_auto_link(s):
1038            if ':' in s and self._auto_link_re.match(s):
1039                return True
1040            elif '@' in s and self._auto_email_link_re.match(s):
1041                return True
1042            return False
1043
1044        tokens = []
1045        is_html_markup = False
1046        for token in self._sorta_html_tokenize_re.split(text):
1047            if is_html_markup and not _is_auto_link(token):
1048                sanitized = self._sanitize_html(token)
1049                key = _hash_text(sanitized)
1050                self.html_spans[key] = sanitized
1051                tokens.append(key)
1052            else:
1053                tokens.append(token)
1054            is_html_markup = not is_html_markup
1055        return ''.join(tokens)
1056
1057    def _unhash_html_spans(self, text):
1058        for key, sanitized in list(self.html_spans.items()):
1059            text = text.replace(key, sanitized)
1060        return text
1061
1062    def _sanitize_html(self, s):
1063        if self.safe_mode == "replace":
1064            return self.html_removed_text
1065        elif self.safe_mode == "escape":
1066            replacements = [
1067                ('&', '&amp;'),
1068                ('<', '&lt;'),
1069                ('>', '&gt;'),
1070            ]
1071            for before, after in replacements:
1072                s = s.replace(before, after)
1073            return s
1074        else:
1075            raise MarkdownError("invalid value for 'safe_mode': %r (must be "
1076                                "'escape' or 'replace')" % self.safe_mode)
1077
    # Tail of an inline link: the optional quoted title plus the closing
    # ')', e.g. the ` "Title")` part of `(url "Title")`.  Used by
    # _extract_url_and_title().
    _inline_link_title = re.compile(r'''
            (                   # \1
              [ \t]+
              (['"])            # quote char = \2
              (?P<title>.*?)
              \2
            )?                  # title is optional
          \)$
        ''', re.X | re.S)
    _tail_of_reference_link_re = re.compile(r'''
          # Match tail of: [text][id]
          [ ]?          # one optional space
          (?:\n[ ]*)?   # one optional newline followed by spaces
          \[
            (?P<id>.*?)
          \]
        ''', re.X | re.S)

    # A (possibly empty) run of whitespace; see _find_non_whitespace().
    _whitespace = re.compile(r'\s*')

    # Strips one pair of surrounding angle brackets: '<url>...' -> 'url'.
    _strip_anglebrackets = re.compile(r'<(.*)>.*')
1099
1100    def _find_non_whitespace(self, text, start):
1101        """Returns the index of the first non-whitespace character in text
1102        after (and including) start
1103        """
1104        match = self._whitespace.match(text, start)
1105        return match.end()
1106
1107    def _find_balanced(self, text, start, open_c, close_c):
1108        """Returns the index where the open_c and close_c characters balance
1109        out - the same number of open_c and close_c are encountered - or the
1110        end of string if it's reached before the balance point is found.
1111        """
1112        i = start
1113        l = len(text)
1114        count = 1
1115        while count > 0 and i < l:
1116            if text[i] == open_c:
1117                count += 1
1118            elif text[i] == close_c:
1119                count -= 1
1120            i += 1
1121        return i
1122
    def _extract_url_and_title(self, text, start):
        """Extract the url and (optional) title from the tail of a link.

        *start* must index the opening parenthesis of the link tail.
        Returns (url, title, end_idx) -- title may be None, end_idx is
        just past the closing paren -- or (None, None, None) if no valid
        link tail is found.
        """
        # text[start] equals the opening parenthesis
        idx = self._find_non_whitespace(text, start+1)
        if idx == len(text):
            return None, None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            # Skip past a '<...>'-wrapped URL before balancing parens.
            end_idx = self._find_balanced(text, end_idx+1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        # Look for the optional quoted title plus closing ')' within the
        # balanced span.
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None, None
        # The URL is everything before the (optional) title.
        url, title = text[idx:match.start()], match.group("title")
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, title, end_idx
1141
1142    def _do_links(self, text):
1143        """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
1144
1145        This is a combination of Markdown.pl's _DoAnchors() and
1146        _DoImages(). They are done together because that simplified the
1147        approach. It was necessary to use a different approach than
1148        Markdown.pl because of the lack of atomic matching support in
1149        Python's regex engine used in $g_nested_brackets.
1150        """
1151        MAX_LINK_TEXT_SENTINEL = 3000  # markdown2 issue 24
1152
1153        # `anchor_allowed_pos` is used to support img links inside
1154        # anchors, but not anchors inside anchors. An anchor's start
1155        # pos must be `>= anchor_allowed_pos`.
1156        anchor_allowed_pos = 0
1157
1158        curr_pos = 0
1159        while True: # Handle the next link.
1160            # The next '[' is the start of:
1161            # - an inline anchor:   [text](url "title")
1162            # - a reference anchor: [text][id]
1163            # - an inline img:      ![text](url "title")
1164            # - a reference img:    ![text][id]
1165            # - a footnote ref:     [^id]
1166            #   (Only if 'footnotes' extra enabled)
1167            # - a footnote defn:    [^id]: ...
1168            #   (Only if 'footnotes' extra enabled) These have already
1169            #   been stripped in _strip_footnote_definitions() so no
1170            #   need to watch for them.
1171            # - a link definition:  [id]: url "title"
1172            #   These have already been stripped in
1173            #   _strip_link_definitions() so no need to watch for them.
1174            # - not markup:         [...anything else...
1175            try:
1176                start_idx = text.index('[', curr_pos)
1177            except ValueError:
1178                break
1179            text_length = len(text)
1180
1181            # Find the matching closing ']'.
1182            # Markdown.pl allows *matching* brackets in link text so we
1183            # will here too. Markdown.pl *doesn't* currently allow
1184            # matching brackets in img alt text -- we'll differ in that
1185            # regard.
1186            bracket_depth = 0
1187            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
1188                                            text_length)):
1189                ch = text[p]
1190                if ch == ']':
1191                    bracket_depth -= 1
1192                    if bracket_depth < 0:
1193                        break
1194                elif ch == '[':
1195                    bracket_depth += 1
1196            else:
1197                # Closing bracket not found within sentinel length.
1198                # This isn't markup.
1199                curr_pos = start_idx + 1
1200                continue
1201            link_text = text[start_idx+1:p]
1202
1203            # Possibly a footnote ref?
1204            if "footnotes" in self.extras and link_text.startswith("^"):
1205                normed_id = re.sub(r'\W', '-', link_text[1:])
1206                if normed_id in self.footnotes:
1207                    self.footnote_ids.append(normed_id)
1208                    result = '<sup class="footnote-ref" id="fnref-%s">' \
1209                             '<a href="#fn-%s">%s</a></sup>' \
1210                             % (normed_id, normed_id, len(self.footnote_ids))
1211                    text = text[:start_idx] + result + text[p+1:]
1212                else:
1213                    # This id isn't defined, leave the markup alone.
1214                    curr_pos = p+1
1215                continue
1216
1217            # Now determine what this is by the remainder.
1218            p += 1
1219            if p == text_length:
1220                return text
1221
1222            # Inline anchor or img?
1223            if text[p] == '(': # attempt at perf improvement
1224                url, title, url_end_idx = self._extract_url_and_title(text, p)
1225                if url is not None:
1226                    # Handle an inline anchor or img.
1227                    is_img = start_idx > 0 and text[start_idx-1] == "!"
1228                    if is_img:
1229                        start_idx -= 1
1230
1231                    # We've got to encode these to avoid conflicting
1232                    # with italics/bold.
1233                    url = url.replace('*', self._escape_table['*']) \
1234                             .replace('_', self._escape_table['_'])
1235                    if title:
1236                        title_str = ' title="%s"' % (
1237                            _xml_escape_attr(title)
1238                                .replace('*', self._escape_table['*'])
1239                                .replace('_', self._escape_table['_']))
1240                    else:
1241                        title_str = ''
1242                    if is_img:
1243                        img_class_str = self._html_class_str_from_tag("img")
1244                        result = '<img src="%s" alt="%s"%s%s%s' \
1245                            % (url.replace('"', '&quot;'),
1246                               _xml_escape_attr(link_text),
1247                               title_str, img_class_str, self.empty_element_suffix)
1248                        if "smarty-pants" in self.extras:
1249                            result = result.replace('"', self._escape_table['"'])
1250                        curr_pos = start_idx + len(result)
1251                        text = text[:start_idx] + result + text[url_end_idx:]
1252                    elif start_idx >= anchor_allowed_pos:
1253                        result_head = '<a href="%s"%s>' % (url, title_str)
1254                        result = '%s%s</a>' % (result_head, link_text)
1255                        if "smarty-pants" in self.extras:
1256                            result = result.replace('"', self._escape_table['"'])
1257                        # <img> allowed from curr_pos on, <a> from
1258                        # anchor_allowed_pos on.
1259                        curr_pos = start_idx + len(result_head)
1260                        anchor_allowed_pos = start_idx + len(result)
1261                        text = text[:start_idx] + result + text[url_end_idx:]
1262                    else:
1263                        # Anchor not allowed here.
1264                        curr_pos = start_idx + 1
1265                    continue
1266
1267            # Reference anchor or img?
1268            else:
1269                match = self._tail_of_reference_link_re.match(text, p)
1270                if match:
1271                    # Handle a reference-style anchor or img.
1272                    is_img = start_idx > 0 and text[start_idx-1] == "!"
1273                    if is_img:
1274                        start_idx -= 1
1275                    link_id = match.group("id").lower()
1276                    if not link_id:
1277                        link_id = link_text.lower()  # for links like [this][]
1278                    if link_id in self.urls:
1279                        url = self.urls[link_id]
1280                        # We've got to encode these to avoid conflicting
1281                        # with italics/bold.
1282                        url = url.replace('*', self._escape_table['*']) \
1283                                 .replace('_', self._escape_table['_'])
1284                        title = self.titles.get(link_id)
1285                        if title:
1286                            title = _xml_escape_attr(title) \
1287                                .replace('*', self._escape_table['*']) \
1288                                .replace('_', self._escape_table['_'])
1289                            title_str = ' title="%s"' % title
1290                        else:
1291                            title_str = ''
1292                        if is_img:
1293                            img_class_str = self._html_class_str_from_tag("img")
1294                            result = '<img src="%s" alt="%s"%s%s%s' \
1295                                % (url.replace('"', '&quot;'),
1296                                   link_text.replace('"', '&quot;'),
1297                                   title_str, img_class_str, self.empty_element_suffix)
1298                            if "smarty-pants" in self.extras:
1299                                result = result.replace('"', self._escape_table['"'])
1300                            curr_pos = start_idx + len(result)
1301                            text = text[:start_idx] + result + text[match.end():]
1302                        elif start_idx >= anchor_allowed_pos:
1303                            result = '<a href="%s"%s>%s</a>' \
1304                                % (url, title_str, link_text)
1305                            result_head = '<a href="%s"%s>' % (url, title_str)
1306                            result = '%s%s</a>' % (result_head, link_text)
1307                            if "smarty-pants" in self.extras:
1308                                result = result.replace('"', self._escape_table['"'])
1309                            # <img> allowed from curr_pos on, <a> from
1310                            # anchor_allowed_pos on.
1311                            curr_pos = start_idx + len(result_head)
1312                            anchor_allowed_pos = start_idx + len(result)
1313                            text = text[:start_idx] + result + text[match.end():]
1314                        else:
1315                            # Anchor not allowed here.
1316                            curr_pos = start_idx + 1
1317                    else:
1318                        # This id isn't defined, leave the markup alone.
1319                        curr_pos = match.end()
1320                    continue
1321
1322            # Otherwise, it isn't markup.
1323            curr_pos = start_idx + 1
1324
1325        return text
1326
1327    def header_id_from_text(self, text, prefix, n):
1328        """Generate a header id attribute value from the given header
1329        HTML content.
1330
1331        This is only called if the "header-ids" extra is enabled.
1332        Subclasses may override this for different header ids.
1333
1334        @param text {str} The text of the header tag
1335        @param prefix {str} The requested prefix for header ids. This is the
1336            value of the "header-ids" extra key, if any. Otherwise, None.
1337        @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
1338        @returns {str} The value for the header tag's "id" attribute. Return
1339            None to not have an id attribute and to exclude this header from
1340            the TOC (if the "toc" extra is specified).
1341        """
1342        header_id = _slugify(text)
1343        if prefix and isinstance(prefix, base_string_type):
1344            header_id = prefix + '-' + header_id
1345        if header_id in self._count_from_header_id:
1346            self._count_from_header_id[header_id] += 1
1347            header_id += '-%s' % self._count_from_header_id[header_id]
1348        else:
1349            self._count_from_header_id[header_id] = 1
1350        return header_id
1351
1352    _toc = None
1353    def _toc_add_entry(self, level, id, name):
1354        if self._toc is None:
1355            self._toc = []
1356        self._toc.append((level, id, self._unescape_special_chars(name)))
1357
    # Header pattern template.  First alternative: Setext header (text
    # underlined with '=' or '-'; groups 1-3).  Second alternative: atx
    # header ('#'-prefixed; groups 4-6).  NB: the comments *inside* the
    # pattern mislabel the group numbers -- see _h_sub() for actual usage.
    # The %s slot takes the quantifier for the space after the #'s: '*'
    # (optional) normally, '+' (required) for the "tag-friendly" extra,
    # presumably so '#hashtag'-style text is not treated as a header.
    _h_re_base = r'''
        (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
        |
        (^(\#{1,6})  # \1 = string of #'s
        [ \t]%s
        (.+?)       # \2 = Header text
        [ \t]*
        (?<!\\)     # ensure not an escaped trailing '#'
        \#*         # optional closing #'s (not counted)
        \n+
        )
        '''

    _h_re = re.compile(_h_re_base % '*', re.X | re.M)
    _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
1373
1374    def _h_sub(self, match):
1375        if match.group(1) is not None:
1376            # Setext header
1377            n = {"=": 1, "-": 2}[match.group(3)[0]]
1378            header_group = match.group(2)
1379        else:
1380            # atx header
1381            n = len(match.group(5))
1382            header_group = match.group(6)
1383
1384        demote_headers = self.extras.get("demote-headers")
1385        if demote_headers:
1386            n = min(n + demote_headers, 6)
1387        header_id_attr = ""
1388        if "header-ids" in self.extras:
1389            header_id = self.header_id_from_text(header_group,
1390                self.extras["header-ids"], n)
1391            if header_id:
1392                header_id_attr = ' id="%s"' % header_id
1393        html = self._run_span_gamut(header_group)
1394        if "toc" in self.extras and header_id:
1395            self._toc_add_entry(n, header_id, html)
1396        return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1397
1398    def _do_headers(self, text):
1399        # Setext-style headers:
1400        #     Header 1
1401        #     ========
1402        #
1403        #     Header 2
1404        #     --------
1405
1406        # atx-style headers:
1407        #   # Header 1
1408        #   ## Header 2
1409        #   ## Header 2 with closing hashes ##
1410        #   ...
1411        #   ###### Header 6
1412
1413        if 'tag-friendly' in self.extras:
1414            return self._h_re_tag_friendly.sub(self._h_sub, text)
1415        return self._h_re.sub(self._h_sub, text)
1416
    # Building blocks for the list regexes below: unordered-list bullets
    # are '*', '+' or '-'; ordered-list markers are '<digits>.'.
    _marker_ul_chars  = '*+-'
    _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
    _marker_ul = '(?:[%s])' % _marker_ul_chars
    _marker_ol = r'(?:\d+\.)'
1421
1422    def _list_sub(self, match):
1423        lst = match.group(1)
1424        lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1425        result = self._process_list_items(lst)
1426        if self.list_level:
1427            return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1428        else:
1429            return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
1430
    def _do_lists(self, text):
        """Form HTML ordered (numbered) and unordered (bulleted) lists.

        Repeatedly finds the earliest ul/ol match in *text* and replaces
        it via _list_sub(); the search resumes after each replacement so
        matches never overlap.
        """

        # Iterate over each *non-overlapping* list match.
        pos = 0
        while True:
            # Find the *first* hit for either list style (ul or ol). We
            # match ul and ol separately to avoid adjacent lists of different
            # types running into each other (see issue #16).
            hits = []
            for marker_pat in (self._marker_ul, self._marker_ol):
                less_than_tab = self.tab_width - 1
                whole_list = r'''
                    (                   # \1 = whole list
                      (                 # \2
                        [ ]{0,%d}
                        (%s)            # \3 = first list item marker
                        [ \t]+
                        (?!\ *\3\ )     # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
                      )
                      (?:.+?)
                      (                 # \4
                          \Z
                        |
                          \n{2,}
                          (?=\S)
                          (?!           # Negative lookahead for another list item marker
                            [ \t]*
                            %s[ \t]+
                          )
                      )
                    )
                ''' % (less_than_tab, marker_pat, marker_pat)
                if self.list_level:  # sub-list
                    list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
                else:
                    list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
                                         re.X | re.M | re.S)
                match = list_re.search(text, pos)
                if match:
                    hits.append((match.start(), match))
            if not hits:
                break
            # Earliest hit first (hits are (start, match) tuples).
            hits.sort()
            match = hits[0][1]
            start, end = match.span()
            middle = self._list_sub(match)
            text = text[:start] + middle + text[end:]
            pos = start + len(middle) # start pos for next attempted match

        return text
1482
    # Matches one list item within a list block: optional leading blank
    # line, the item's indent and marker, its text, and trailing EOLs.
    # The lookahead requires what follows to be end-of-input or another
    # item at the same indent.  The %s slots are filled with
    # `_marker_any` (defined earlier in this class).
    _list_item_re = re.compile(r'''
        (\n)?                   # leading line = \1
        (^[ \t]*)               # leading whitespace = \2
        (?P<marker>%s) [ \t]+   # list marker = \3
        ((?:.+?)                # list item text = \4
         (\n{1,2}))             # eols = \5
        (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
        ''' % (_marker_any, _marker_any),
        re.M | re.X | re.S)
1492
    # Cross-call state for _list_item_sub: True when the previously
    # emitted <li> ended with a blank line, which forces the following
    # item into block ("loose") mode as well.
    _last_li_endswith_two_eols = False
    def _list_item_sub(self, match):
        # re.sub callback: render one matched list item as "<li>...</li>\n".
        item = match.group(4)
        leading_line = match.group(1)
        if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
            # "Loose" item (preceded by or containing a blank line):
            # run full block-level processing on its content.
            item = self._run_block_gamut(self._outdent(item))
        else:
            # "Tight" item: recurse for nested sub-lists first, then
            # apply only span-level processing.
            item = self._do_lists(self._outdent(item))
            if item.endswith('\n'):
                item = item[:-1]
            item = self._run_span_gamut(item)
        # Remember whether this item ended with a blank line (group 5 is
        # one or two newlines) for the next item's loose/tight decision.
        self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
        return "<li>%s</li>\n" % item
1507
1508    def _process_list_items(self, list_str):
1509        # Process the contents of a single ordered or unordered list,
1510        # splitting it into individual list items.
1511
1512        # The $g_list_level global keeps track of when we're inside a list.
1513        # Each time we enter a list, we increment it; when we leave a list,
1514        # we decrement. If it's zero, we're not in a list anymore.
1515        #
1516        # We do this because when we're not inside a list, we want to treat
1517        # something like this:
1518        #
1519        #       I recommend upgrading to version
1520        #       8. Oops, now this line is treated
1521        #       as a sub-list.
1522        #
1523        # As a single paragraph, despite the fact that the second line starts
1524        # with a digit-period-space sequence.
1525        #
1526        # Whereas when we're inside a list (or sub-list), that line will be
1527        # treated as the start of a sub-list. What a kludge, huh? This is
1528        # an aspect of Markdown's syntax that's hard to parse perfectly
1529        # without resorting to mind-reading. Perhaps the solution is to
1530        # change the syntax rules such that sub-lists must start with a
1531        # starting cardinal number; e.g. "1." or "a.".
1532        self.list_level += 1
1533        self._last_li_endswith_two_eols = False
1534        list_str = list_str.rstrip('\n') + '\n'
1535        list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1536        self.list_level -= 1
1537        return list_str
1538
1539    def _get_pygments_lexer(self, lexer_name):
1540        try:
1541            from pygments import lexers, util
1542        except ImportError:
1543            return None
1544        try:
1545            return lexers.get_lexer_by_name(lexer_name)
1546        except util.ClassNotFound:
1547            return None
1548
1549    def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
1550        import pygments
1551        import pygments.formatters
1552
1553        class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1554            def _wrap_code(self, inner):
1555                """A function for use in a Pygments Formatter which
1556                wraps in <code> tags.
1557                """
1558                yield 0, "<code>"
1559                for tup in inner:
1560                    yield tup
1561                yield 0, "</code>"
1562
1563            def wrap(self, source, outfile):
1564                """Return the source with a code, pre, and div."""
1565                return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1566
1567        formatter_opts.setdefault("cssclass", "codehilite")
1568        formatter = HtmlCodeFormatter(**formatter_opts)
1569        return pygments.highlight(codeblock, lexer, formatter)
1570
    def _code_block_sub(self, match, is_fenced_code_block=False):
        """re.sub callback rendering one matched code block as HTML.

        For a fenced block, group 1 is the optional language name and
        group 2 the code (with one trailing newline to drop).  For an
        indented block, group 1 is the indented code.  When a lexer
        name is known (fence language, or a leading ":::lang" line with
        the deprecated "code-color" extra) and Pygments supplies a
        matching lexer, the block is syntax-highlighted; otherwise it
        is entity-escaped and wrapped in <pre><code>.
        """
        lexer_name = None
        if is_fenced_code_block:
            lexer_name = match.group(1)
            if lexer_name:
                formatter_opts = self.extras['fenced-code-blocks'] or {}
            codeblock = match.group(2)
            codeblock = codeblock[:-1]  # drop one trailing newline
        else:
            codeblock = match.group(1)
            codeblock = self._outdent(codeblock)
            codeblock = self._detab(codeblock)
            codeblock = codeblock.lstrip('\n')  # trim leading newlines
            codeblock = codeblock.rstrip()      # trim trailing whitespace

            # Note: "code-color" extra is DEPRECATED.
            if "code-color" in self.extras and codeblock.startswith(":::"):
                # A ":::lang" first line names the lexer.
                lexer_name, rest = codeblock.split('\n', 1)
                lexer_name = lexer_name[3:].strip()
                codeblock = rest.lstrip("\n")   # Remove lexer declaration line.
                formatter_opts = self.extras['code-color'] or {}

        if lexer_name:
            def unhash_code( codeblock ):
                # Restore hashed raw-HTML spans and undo entity-encoding
                # so the highlighter sees the original source text.
                for key, sanitized in list(self.html_spans.items()):
                    codeblock = codeblock.replace(key, sanitized)
                replacements = [
                    ("&amp;", "&"),
                    ("&lt;", "<"),
                    ("&gt;", ">")
                ]
                for old, new in replacements:
                    codeblock = codeblock.replace(old, new)
                return codeblock
            lexer = self._get_pygments_lexer(lexer_name)
            if lexer:
                codeblock = unhash_code( codeblock )
                colored = self._color_with_pygments(codeblock, lexer,
                                                    **formatter_opts)
                return "\n\n%s\n\n" % colored

        # Fallback: plain escaped code block, with optional classes from
        # the "html-classes" extra.
        codeblock = self._encode_code(codeblock)
        pre_class_str = self._html_class_str_from_tag("pre")
        code_class_str = self._html_class_str_from_tag("code")
        return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
            pre_class_str, code_class_str, codeblock)
1617
1618    def _html_class_str_from_tag(self, tag):
1619        """Get the appropriate ' class="..."' string (note the leading
1620        space), if any, for the given tag.
1621        """
1622        if "html-classes" not in self.extras:
1623            return ""
1624        try:
1625            html_classes_from_tag = self.extras["html-classes"]
1626        except TypeError:
1627            return ""
1628        else:
1629            if tag in html_classes_from_tag:
1630                return ' class="%s"' % html_classes_from_tag[tag]
1631        return ""
1632
    def _do_code_blocks(self, text):
        """Process Markdown `<pre><code>` blocks."""
        # Compiled per call because the indent width depends on
        # self.tab_width.
        code_block_re = re.compile(r'''
            (?:\n\n|\A\n?)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{%d} | \t)  # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,%d}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
            # Lookahead to make sure this block isn't already in a code block.
            # Needed when syntax highlighting is being used.
            (?![^<]*\</code\>)
            ''' % (self.tab_width, self.tab_width),
            re.M | re.X)
        return code_block_re.sub(self._code_block_sub, text)
1650
    # A ```-fenced code block: an opening fence with an optional
    # language word, the content, then a closing fence on its own line.
    # NOTE(review): the closing fence requires a trailing "\n", so a
    # fence at the very end of input without a final newline won't
    # match -- confirm that earlier normalization always appends one.
    _fenced_code_block_re = re.compile(r'''
        (?:\n\n|\A\n?)
        ^```([\w+-]+)?[ \t]*\n      # opening fence, $1 = optional lang
        (.*?)                       # $2 = code block content
        ^```[ \t]*\n                # closing fence
        ''', re.M | re.X | re.S)
1657
1658    def _fenced_code_block_sub(self, match):
1659        return self._code_block_sub(match, is_fenced_code_block=True);
1660
1661    def _do_fenced_code_blocks(self, text):
1662        """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
1663        return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
1664
    # Rules for a code span:
    # - backslash escapes are not interpreted in a code span
    # - to include one backtick or a run of backticks the delimiters must
    #   be a longer run of backticks
    # - cannot start or end a code span with a backtick; pad with a
    #   space and that space will be removed in the emitted HTML
    # See `test/tm-cases/escapes.text` for a number of edge-case
    # examples.
    # (The `#` comments inside the pattern below are part of the
    # verbose regex itself.)
    _code_span_re = re.compile(r'''
            (?<!\\)
            (`+)        # \1 = Opening run of `
            (?!`)       # See Note A test/tm-cases/escapes.text
            (.+?)       # \2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        ''', re.X | re.S)
1682
1683    def _code_span_sub(self, match):
1684        c = match.group(2).strip(" \t")
1685        c = self._encode_code(c)
1686        return "<code>%s</code>" % c
1687
1688    def _do_code_spans(self, text):
1689        #   *   Backtick quotes are used for <code></code> spans.
1690        #
1691        #   *   You can use multiple backticks as the delimiters if you want to
1692        #       include literal backticks in the code span. So, this input:
1693        #
1694        #         Just type ``foo `bar` baz`` at the prompt.
1695        #
1696        #       Will translate to:
1697        #
1698        #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1699        #
1700        #       There's no arbitrary limit to the number of backticks you
1701        #       can use as delimters. If you need three consecutive backticks
1702        #       in your code, use four for delimiters, etc.
1703        #
1704        #   *   You can use spaces to get literal backticks at the edges:
1705        #
1706        #         ... type `` `bar` `` ...
1707        #
1708        #       Turns to:
1709        #
1710        #         ... type <code>`bar`</code> ...
1711        return self._code_span_re.sub(self._code_span_sub, text)
1712
1713    def _encode_code(self, text):
1714        """Encode/escape certain characters inside Markdown code runs.
1715        The point is that in code, these characters are literals,
1716        and lose their special Markdown meanings.
1717        """
1718        replacements = [
1719            # Encode all ampersands; HTML entities are not
1720            # entities within a Markdown code span.
1721            ('&', '&amp;'),
1722            # Do the angle bracket song and dance:
1723            ('<', '&lt;'),
1724            ('>', '&gt;'),
1725        ]
1726        for before, after in replacements:
1727            text = text.replace(before, after)
1728        hashed = _hash_text(text)
1729        self._escape_table[text] = hashed
1730        return hashed
1731
1732    _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
1733    def _do_strike(self, text):
1734        text = self._strike_re.sub(r"<strike>\1</strike>", text)
1735        return text
1736
1737    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1738    _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1739    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1740    _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1741    def _do_italics_and_bold(self, text):
1742        # <strong> must go first:
1743        if "code-friendly" in self.extras:
1744            text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
1745            text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
1746        else:
1747            text = self._strong_re.sub(r"<strong>\2</strong>", text)
1748            text = self._em_re.sub(r"<em>\2</em>", text)
1749        return text
1750
1751    # "smarty-pants" extra: Very liberal in interpreting a single prime as an
1752    # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
1753    # "twixt" can be written without an initial apostrophe. This is fine because
1754    # using scare quotes (single quotation marks) is rare.
1755    _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
1756    _contractions = ["tis", "twas", "twer", "neath", "o", "n",
1757        "round", "bout", "twixt", "nuff", "fraid", "sup"]
1758    def _do_smart_contractions(self, text):
1759        text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
1760        for c in self._contractions:
1761            text = text.replace("'%s" % c, "&#8217;%s" % c)
1762            text = text.replace("'%s" % c.capitalize(),
1763                "&#8217;%s" % c.capitalize())
1764        return text
1765
1766    # Substitute double-quotes before single-quotes.
1767    _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
1768    _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
1769    _closing_single_quote_re = re.compile(r"(?<=\S)'")
1770    _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
1771    def _do_smart_punctuation(self, text):
1772        """Fancifies 'single quotes', "double quotes", and apostrophes.
1773        Converts --, ---, and ... into en dashes, em dashes, and ellipses.
1774
1775        Inspiration is: <http://daringfireball.net/projects/smartypants/>
1776        See "test/tm-cases/smarty_pants.text" for a full discussion of the
1777        support here and
1778        <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
1779        discussion of some diversion from the original SmartyPants.
1780        """
1781        if "'" in text: # guard for perf
1782            text = self._do_smart_contractions(text)
1783            text = self._opening_single_quote_re.sub("&#8216;", text)
1784            text = self._closing_single_quote_re.sub("&#8217;", text)
1785
1786        if '"' in text: # guard for perf
1787            text = self._opening_double_quote_re.sub("&#8220;", text)
1788            text = self._closing_double_quote_re.sub("&#8221;", text)
1789
1790        text = text.replace("---", "&#8212;")
1791        text = text.replace("--", "&#8211;")
1792        text = text.replace("...", "&#8230;")
1793        text = text.replace(" . . . ", "&#8230;")
1794        text = text.replace(". . .", "&#8230;")
1795        return text
1796
    # One or more groups of consecutive '>'-prefixed lines; the %s slot
    # is filled by the "spoiler" variant to also admit '>!' lines.
    _block_quote_base = r'''
        (                           # Wrap whole match in \1
          (
            ^[ \t]*>%s[ \t]?        # '>' at the start of a line
              .+\n                  # rest of the first line
            (.+\n)*                 # subsequent consecutive lines
            \n*                     # blanks
          )+
        )
    '''
    _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
    _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
    # Strip one level of '>' (or '>!') quoting from each line.
    _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
    _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M);
    # Matches only when *every* line of the quote is a '>!' spoiler line.
    _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
    # A rendered <pre>...</pre> chunk (used to undo blockquote indenting).
    _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1813    def _dedent_two_spaces_sub(self, match):
1814        return re.sub(r'(?m)^  ', '', match.group(1))
1815
    def _block_quote_sub(self, match):
        """re.sub callback rendering one '>'-quoted block as <blockquote>.

        With the "spoiler" extra, a quote whose every line starts with
        '>!' becomes <blockquote class="spoiler">.
        """
        bq = match.group(1)
        is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
        # trim one level of quoting
        if is_spoiler:
            bq = self._bq_one_level_re_spoiler.sub('', bq)
        else:
            bq = self._bq_one_level_re.sub('', bq)
        # trim whitespace-only lines
        bq = self._ws_only_line_re.sub('', bq)
        bq = self._run_block_gamut(bq)          # recurse

        # Indent the rendered body two spaces for readable output.
        bq = re.sub('(?m)^', '  ', bq)
        # These leading spaces screw with <pre> content, so we need to fix that:
        bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)

        if is_spoiler:
            return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
        else:
            return '<blockquote>\n%s\n</blockquote>\n\n' % bq
1836
1837    def _do_block_quotes(self, text):
1838        if '>' not in text:
1839            return text
1840        if 'spoiler' in self.extras:
1841            return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
1842        else:
1843            return self._block_quote_re.sub(self._block_quote_sub, text)
1844
    def _form_paragraphs(self, text):
        """Wrap the remaining text chunks in <p> tags.

        Hashed HTML blocks are restored verbatim.  With the
        "cuddled-lists" extra, a list glued to the end of a paragraph
        is split off and converted separately (issue 33).
        """
        # Strip leading and trailing lines:
        text = text.strip('\n')

        # Wrap <p> tags.
        grafs = []
        for i, graf in enumerate(re.split(r"\n{2,}", text)):
            if graf in self.html_blocks:
                # Unhashify HTML blocks
                grafs.append(self.html_blocks[graf])
            else:
                cuddled_list = None
                if "cuddled-lists" in self.extras:
                    # Need to put back trailing '\n' for `_list_item_re`
                    # match at the end of the paragraph.
                    li = self._list_item_re.search(graf + '\n')
                    # Two of the same list marker in this paragraph: a likely
                    # candidate for a list cuddled to preceding paragraph
                    # text (issue 33). Note the `[-1]` is a quick way to
                    # consider numeric bullets (e.g. "1." and "2.") to be
                    # equal.
                    if (li and len(li.group(2)) <= 3 and li.group("next_marker")
                        and li.group("marker")[-1] == li.group("next_marker")[-1]):
                        start = li.start()
                        cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
                        assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
                        graf = graf[:start]

                # Wrap <p> tags.
                graf = self._run_span_gamut(graf)
                grafs.append("<p>" + graf.lstrip(" \t") + "</p>")

                if cuddled_list:
                    grafs.append(cuddled_list)

        return "\n\n".join(grafs)
1881
1882    def _add_footnotes(self, text):
1883        if self.footnotes:
1884            footer = [
1885                '<div class="footnotes">',
1886                '<hr' + self.empty_element_suffix,
1887                '<ol>',
1888            ]
1889            for i, id in enumerate(self.footnote_ids):
1890                if i != 0:
1891                    footer.append('')
1892                footer.append('<li id="fn-%s">' % id)
1893                footer.append(self._run_block_gamut(self.footnotes[id]))
1894                backlink = ('<a href="#fnref-%s" '
1895                    'class="footnoteBackLink" '
1896                    'title="Jump back to footnote %d in the text.">'
1897                    '&#8617;</a>' % (id, i+1))
1898                if footer[-1].endswith("</p>"):
1899                    footer[-1] = footer[-1][:-len("</p>")] \
1900                        + '&#160;' + backlink + "</p>"
1901                else:
1902                    footer.append("\n<p>%s</p>" % backlink)
1903                footer.append('</li>')
1904            footer.append('</ol>')
1905            footer.append('</div>')
1906            return text + '\n\n' + '\n'.join(footer)
1907        else:
1908            return text
1909
1910    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1911    #   http://bumppo.net/projects/amputator/
1912    _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1913    _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1914    _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)
1915
1916    def _encode_amps_and_angles(self, text):
1917        # Smart processing for ampersands and angle brackets that need
1918        # to be encoded.
1919        text = self._ampersand_re.sub('&amp;', text)
1920
1921        # Encode naked <'s
1922        text = self._naked_lt_re.sub('&lt;', text)
1923
1924        # Encode naked >'s
1925        # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1926        # Markdown) don't do this.
1927        text = self._naked_gt_re.sub('&gt;', text)
1928        return text
1929
1930    def _encode_backslash_escapes(self, text):
1931        for ch, escape in list(self._escape_table.items()):
1932            text = text.replace("\\"+ch, escape)
1933        return text
1934
1935    _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1936    def _auto_link_sub(self, match):
1937        g1 = match.group(1)
1938        return '<a href="%s">%s</a>' % (g1, g1)
1939
    # <addr@example.com> or <mailto:addr@example.com> autolink.
    _auto_email_link_re = re.compile(r"""
          <
           (?:mailto:)?
          (
              [-.\w]+
              \@
              [-\w]+(\.[-\w]+)*\.[a-z]+
          )
          >
        """, re.I | re.X | re.U)
    def _auto_email_link_sub(self, match):
        # Swap any hashed special characters back into the address, then
        # emit it as an obfuscated mailto link.
        return self._encode_email_address(
            self._unescape_special_chars(match.group(1)))
1953
1954    def _do_auto_links(self, text):
1955        text = self._auto_link_re.sub(self._auto_link_sub, text)
1956        text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1957        return text
1958
1959    def _encode_email_address(self, addr):
1960        #  Input: an email address, e.g. "foo@example.com"
1961        #
1962        #  Output: the email address as a mailto link, with each character
1963        #      of the address encoded as either a decimal or hex entity, in
1964        #      the hopes of foiling most address harvesting spam bots. E.g.:
1965        #
1966        #    <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1967        #       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1968        #       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1969        #
1970        #  Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1971        #  mailing list: <http://tinyurl.com/yu7ue>
1972        chars = [_xml_encode_email_char_at_random(ch)
1973                 for ch in "mailto:" + addr]
1974        # Strip the mailto: from the visible part.
1975        addr = '<a href="%s">%s</a>' \
1976               % (''.join(chars), ''.join(chars[7:]))
1977        return addr
1978
    def _do_link_patterns(self, text):
        """Caveat emptor: there isn't much guarding against link
        patterns being formed inside other standard Markdown links, e.g.
        inside a [link def][like this].

        Dev Notes: *Could* consider prefixing regexes with a negative
        lookbehind assertion to attempt to guard against this.
        """
        link_from_hash = {}
        for regex, repl in self.link_patterns:
            # repl may be a template string (expanded against the match)
            # or a callable taking the match.
            replacements = []
            for match in regex.finditer(text):
                if hasattr(repl, "__call__"):
                    href = repl(match)
                else:
                    href = match.expand(repl)
                replacements.append((match.span(), href))
            # Substitute right-to-left so earlier spans remain valid.
            for (start, end), href in reversed(replacements):
                escaped_href = (
                    href.replace('"', '&quot;')  # b/c of attr quote
                        # To avoid markdown <em> and <strong>:
                        .replace('*', self._escape_table['*'])
                        .replace('_', self._escape_table['_']))
                link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
                hash = _hash_text(link)
                link_from_hash[hash] = link
                text = text[:start] + hash + text[end:]
        # Restore the hashed links after all patterns have been applied,
        # so one pattern cannot match inside another's replacement.
        for hash, link in list(link_from_hash.items()):
            text = text.replace(hash, link)
        return text
2009
2010    def _unescape_special_chars(self, text):
2011        # Swap back in all the special characters we've hidden.
2012        for ch, hash in list(self._escape_table.items()):
2013            text = text.replace(hash, ch)
2014        return text
2015
2016    def _outdent(self, text):
2017        # Remove one level of line-leading tabs or spaces
2018        return self._outdent_re.sub('', text)
2019
2020
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # Extras enabled by default for this class (overrides Markdown's).
    extras = ["footnotes", "code-color"]
2034
2035
2036#---- internal support functions
2037
class UnicodeWithAttrs(unicode):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.

    NOTE(review): `unicode` is not a builtin on Python 3; this relies on
    a compatibility alias (presumably ``unicode = str``) defined
    elsewhere in this module -- confirm.
    """
    # Optional metadata attached by conversion; None by default.
    metadata = None
    # List of (level, id, name) header tuples; None until set by conversion.
    _toc = None
    def toc_html(self):
        """Return the HTML for the current TOC.

        This expects the `_toc` attribute to have been set on this instance.
        """
        if self._toc is None:
            return None

        def indent():
            return '  ' * (len(h_stack) - 1)
        lines = []
        h_stack = [0]   # stack of header-level numbers
        for level, id, name in self._toc:
            if level > h_stack[-1]:
                # Deeper header: open a nested <ul>.
                lines.append("%s<ul>" % indent())
                h_stack.append(level)
            elif level == h_stack[-1]:
                # Sibling header: close the previous <li> first.
                lines[-1] += "</li>"
            else:
                # Shallower header: close lists back up to its level.
                while level < h_stack[-1]:
                    h_stack.pop()
                    if not lines[-1].endswith("</li>"):
                        lines[-1] += "</li>"
                    lines.append("%s</ul></li>" % indent())
            lines.append('%s<li><a href="#%s">%s</a>' % (
                indent(), id, name))
        # Close any lists still open at the end of the TOC.
        while len(h_stack) > 1:
            h_stack.pop()
            if not lines[-1].endswith("</li>"):
                lines[-1] += "</li>"
            lines.append("%s</ul>" % indent())
        return '\n'.join(lines) + '\n'
    toc_html = property(toc_html)
2078
2079## {{{ http://code.activestate.com/recipes/577257/ (r1)
2080_slugify_strip_re = re.compile(r'[^\w\s-]')
2081_slugify_hyphenate_re = re.compile(r'[-\s]+')
2082def _slugify(value):
2083    """
2084    Normalizes string, converts to lowercase, removes non-alpha characters,
2085    and converts spaces to hyphens.
2086
2087    From Django's "django/template/defaultfilters.py".
2088    """
2089    import unicodedata
2090    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
2091    value = _slugify_strip_re.sub('', value).strip().lower()
2092    return _slugify_hyphenate_re.sub('-', value)
2093## end of http://code.activestate.com/recipes/577257/ }}}
2094
2095
2096# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
2097def _curry(*args, **kwargs):
2098    function, args = args[0], args[1:]
2099    def result(*rest, **kwrest):
2100        combined = kwargs.copy()
2101        combined.update(kwrest)
2102        return function(*args + rest, **combined)
2103    return result
2104
2105# Recipe: regex_from_encoded_pattern (1.0)
2106def _regex_from_encoded_pattern(s):
2107    """'foo'    -> re.compile(re.escape('foo'))
2108       '/foo/'  -> re.compile('foo')
2109       '/foo/i' -> re.compile('foo', re.I)
2110    """
2111    if s.startswith('/') and s.rfind('/') != 0:
2112        # Parse it: /PATTERN/FLAGS
2113        idx = s.rfind('/')
2114        pattern, flags_str = s[1:idx], s[idx+1:]
2115        flag_from_char = {
2116            "i": re.IGNORECASE,
2117            "l": re.LOCALE,
2118            "s": re.DOTALL,
2119            "m": re.MULTILINE,
2120            "u": re.UNICODE,
2121        }
2122        flags = 0
2123        for char in flags_str:
2124            try:
2125                flags |= flag_from_char[char]
2126            except KeyError:
2127                raise ValueError("unsupported regex flag: '%s' in '%s' "
2128                                 "(must be one of '%s')"
2129                                 % (char, s, ''.join(list(flag_from_char.keys()))))
2130        return re.compile(s[1:idx], flags)
2131    else: # not an encoded regex
2132        return re.compile(re.escape(s))
2133
2134# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    # Pass 1: find the smallest indent (the "margin") over all
    # non-whitespace-only lines.
    margin = None
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                # Advance to the next tab stop.
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue # skip all-whitespace lines
            else:
                break
        else:
            continue # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    # Pass 2: strip `margin` columns of leading whitespace from each
    # line, in place.
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    # Whitespace-only line: drop everything before the EOL.
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # A tab straddled the margin boundary: keep the
                    # overshoot as literal spaces.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
2203
def _dedent(text, tabsize=8, skip_first_line=False):
    """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text

        "text" is the text to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.

    textwrap.dedent(s), but don't expand tabs to spaces
    """
    # Keep line endings (True, not the opaque `1` flag) so the dedented
    # lines can be rejoined verbatim.
    lines = text.splitlines(True)
    _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(lines)
2218
2219
2220class _memoized(object):
2221   """Decorator that caches a function's return value each time it is called.
2222   If called later with the same arguments, the cached value is returned, and
2223   not re-evaluated.
2224
2225   http://wiki.python.org/moin/PythonDecoratorLibrary
2226   """
2227   def __init__(self, func):
2228      self.func = func
2229      self.cache = {}
2230   def __call__(self, *args):
2231      try:
2232         return self.cache[args]
2233      except KeyError:
2234         self.cache[args] = value = self.func(*args)
2235         return value
2236      except TypeError:
2237         # uncachable -- for instance, passing a list as an argument.
2238         # Better to not cache than to blow up entirely.
2239         return self.func(*args)
2240   def __repr__(self):
2241      """Return the function's docstring."""
2242      return self.func.__doc__
2243
2244
def _xml_oneliner_re_from_tab_width(tab_width):
    """Standalone XML processing instruction regex."""
    # Build the verbose-mode pattern, allowing up to tab_width-1 columns
    # of leading indentation before the tag.
    pattern = r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1)
    return re.compile(pattern, re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
2265
def _hr_tag_re_from_tab_width(tab_width):
    """Regex matching a standalone <hr> tag on its own line (per tab width)."""
    # Build the verbose-mode pattern, allowing up to tab_width-1 columns
    # of leading indentation before the tag.
    pattern = r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1)
    return re.compile(pattern, re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2284
2285
2286def _xml_escape_attr(attr, skip_single_quote=True):
2287    """Escape the given string for use in an HTML/XML tag attribute.
2288
2289    By default this doesn't bother with escaping `'` to `&#39;`, presuming that
2290    the tag attribute is surrounded by double quotes.
2291    """
2292    escaped = (attr
2293        .replace('&', '&amp;')
2294        .replace('"', '&quot;')
2295        .replace('<', '&lt;')
2296        .replace('>', '&gt;'))
2297    if not skip_single_quote:
2298        escaped = escaped.replace("'", "&#39;")
2299    return escaped
2300
2301
2302def _xml_encode_email_char_at_random(ch):
2303    r = random()
2304    # Roughly 10% raw, 45% hex, 45% dec.
2305    # '@' *must* be encoded. I [John Gruber] insist.
2306    # Issue 26: '_' must be encoded.
2307    if r > 0.9 and ch not in "@_":
2308        return ch
2309    elif r < 0.45:
2310        # The [1:] is to drop leading '0': 0x63 -> x63
2311        return '&#%s;' % hex(ord(ch))[1:]
2312    else:
2313        return '&#%s;' % ord(ch)
2314
2315
2316
2317#---- mainline
2318
2319class _NoReflowFormatter(optparse.IndentedHelpFormatter):
2320    """An optparse formatter that does NOT reflow the description."""
2321    def format_description(self, description):
2322        return description or ""
2323
2324def _test():
2325    import doctest
2326    doctest.testmod()
2327
def main(argv=None):
    """Command-line entry point for markdown2.

    Parses options, then converts each given path (or stdin, when no paths
    are given or a path is '-') from Markdown to HTML on stdout.
    Returns None, except for --self-test which returns _test()'s result.
    """
    if argv is None:
        argv = sys.argv
    # Only configure logging if the embedding application hasn't already.
    if not logging.root.handlers:
        logging.basicConfig()

    usage = "usage: %prog [PATHS...]"
    version = "%prog "+__version__
    parser = optparse.OptionParser(prog="markdown2", usage=usage,
        version=version, description=cmdln_desc,
        formatter=_NoReflowFormatter())
    parser.add_option("-v", "--verbose", dest="log_level",
                      action="store_const", const=logging.DEBUG,
                      help="more verbose output")
    parser.add_option("--encoding",
                      help="specify encoding of text content")
    parser.add_option("--html4tags", action="store_true", default=False,
                      help="use HTML 4 style for empty element tags")
    parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
                      help="sanitize literal HTML: 'escape' escapes "
                           "HTML meta chars, 'replace' replaces with an "
                           "[HTML_REMOVED] note")
    parser.add_option("-x", "--extras", action="append",
                      help="Turn on specific extra features (not part of "
                           "the core Markdown spec). See above.")
    parser.add_option("--use-file-vars",
                      help="Look for and use Emacs-style 'markdown-extras' "
                           "file var to turn on extras. See "
                           "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_option("--link-patterns-file",
                      help="path to a link pattern file")
    parser.add_option("--self-test", action="store_true",
                      help="run internal self-tests (some doctests)")
    parser.add_option("--compare", action="store_true",
                      help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts, paths = parser.parse_args()
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    # Parse "-x name[=arg]" values (comma/semicolon/colon/space separated)
    # into the {extra_name: arg_or_None} dict that markdown() expects.
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        # Numeric extra arguments are passed through as ints.
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    # Load "(pattern) (href)" pairs from the link-patterns file, skipping
    # blank lines and '#'-comment lines.
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip(): continue
                if line.lstrip().startswith("#"): continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    # Location of the reference Perl implementation (used by --compare).
    from os.path import join, dirname, abspath, exists
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    if not paths:
        # No path arguments: read Markdown from stdin.
        paths = ['-']
    for path in paths:
        if path == '-':
            text = sys.stdin.read()
        else:
            fp = codecs.open(path, 'r', opts.encoding)
            text = fp.read()
            fp.close()
        # With --compare, first pipe the input through Markdown.pl and
        # print its output for side-by-side inspection.
        if opts.compare:
            from subprocess import Popen, PIPE
            print("==== Markdown.pl ====")
            p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
            p.stdin.write(text.encode('utf-8'))
            p.stdin.close()
            perl_html = p.stdout.read().decode('utf-8')
            if py3:
                sys.stdout.write(perl_html)
            else:
                sys.stdout.write(perl_html.encode(
                    sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
            print("==== markdown2.py ====")
        html = markdown(text,
            html4tags=opts.html4tags,
            safe_mode=opts.safe_mode,
            extras=extras, link_patterns=link_patterns,
            use_file_vars=opts.use_file_vars)
        if py3:
            sys.stdout.write(html)
        else:
            # Python 2: encode for the console, falling back to char refs.
            sys.stdout.write(html.encode(
                sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        if extras and "toc" in extras:
            log.debug("toc_html: " +
                html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
        # With --compare, report whether the two outputs match, normalizing
        # both first when the test support module is available.
        if opts.compare:
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))
2456
2457
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    sys.exit( main(sys.argv) )
# Note: See TracBrowser for help on using the repository browser.