# Copyright (c) 2007-2008 ActiveState Corp.
# License: MIT (http://www.opensource.org/licenses/mit-license.php)
#
-# I used version 1.0.1.12, but deleted:
+# I used version 1.0.1.16, but deleted:
# * file-vars (emacs-style settings inside the file)
# * Standardize line endings
# * call to _do_links()
# * logging
# * allow "= Header =" in addition to "# Header #"
-#
+#
import os, sys, re, codecs
try:
from hashlib import md5
except ImportError:
from md5 import md5
-from random import random
+from random import random, randint
DEFAULT_TAB_WIDTH = 4
-# Table of hash values for escaped characters:
-def _escape_hash(s):
- # Lame attempt to avoid possible collision with someone actually
- # using the MD5 hexdigest of one of these chars in there text.
- # Other ideas: random.random(), uuid.uuid()
+
+try:
+ import uuid
+except ImportError:
+ SECRET_SALT = str(randint(0, 1000000))
+else:
+ SECRET_SALT = str(uuid.uuid4())
+def _hash_ascii(s):
#return md5(s).hexdigest() # Markdown.pl effectively does this.
- return 'md5-'+md5(s).hexdigest()
-g_escape_table = dict([(ch, _escape_hash(ch)) for ch in '\\`*_{}[]()>#+-.!'])
+ return 'md5-' + md5(SECRET_SALT + s).hexdigest()
+def _hash_text(s):
+ return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()
+
+# Table of hash values for escaped characters:
+g_escape_table = dict([(ch, _hash_ascii(ch))
+ for ch in '\\`*_{}[]()>#+-.!'])
self.tab_width = tab_width
# For compatibility with earlier markdown2.py and with
- # markdown.py's safe_mode being a boolean,
+ # markdown.py's safe_mode being a boolean,
# safe_mode == True -> "replace"
if safe_mode is True:
self.safe_mode = "replace"
text = self._run_block_gamut(text)
- text = self._unescape_special_chars(text)
-
if "footnotes" in self.extras:
text = self._add_footnotes(text)
+ text = self._unescape_special_chars(text)
+
if self.safe_mode:
text = self._unhash_html_spans(text)
text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)
# Special case just for <hr />. It was easier to make a special
- # case than to make the other regex more complicated.
+ # case than to make the other regex more complicated.
if "<hr" in text:
_hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
text = _hr_tag_re.sub(hash_html_block_sub, text)
if "xml" in self.extras:
# Treat XML processing instructions and namespaced one-liner
# tags as if they were block HTML tags. E.g., if standalone
- # (i.e. are their own paragraph), the following do not get
+ # (i.e. are their own paragraph), the following do not get
# wrapped in a <p> tag:
# <?foo bar?>
#
# Strips link definitions from text, stores the URLs and titles in
# hash references.
less_than_tab = self.tab_width - 1
-
+
# Link defs are in the form:
# [id]: url "optional title"
_link_def_re = re.compile(r"""
- The 'note-id' can be pretty much anything, though typically it
is the number of the footnote.
- The first paragraph may start on the next line, like so:
-
+
[^note-id]:
Text of the note.
"""
+ indent + ('\n'+indent).join(lines)
+ '\n\n')
return s
-
+
def _prepare_pyshell_blocks(self, text):
"""Ensure that Python interactive shell sessions are put in
code blocks -- even if not properly indented.
def _run_span_gamut(self, text):
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
-
+
text = self._do_code_spans(text)
-
+
text = self._escape_special_chars(text)
-
+
# Process anchor and image tags.
#text = self._do_links(text)
-
+
# Make links out of things like `<http://example.com/>`
# Must come after _do_links(), because you can use < and >
# delimiters in inline links like [this](<url>).
if "link-patterns" in self.extras:
text = self._do_link_patterns(text)
-
+
text = self._encode_amps_and_angles(text)
-
+
text = self._do_italics_and_bold(text)
-
+
# Do hard breaks:
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
-
+
return text
# "Sorta" because auto-links are identified as "tag" tokens.
_sorta_html_tokenize_re = re.compile(r"""
(
# tag
- </?
+ </?
(?:\w+) # tag name
(?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes
\s*/?>
<\?.*?\?> # processing instruction
)
""", re.X)
-
+
def _escape_special_chars(self, text):
# Python markdown note: the HTML tokenization here differs from
# that in Markdown.pl, hence the behaviour for subtle cases can
Markdown.pl because of the lack of atomic matching support in
Python's regex engine used in $g_nested_brackets.
"""
- MAX_LINK_TEXT_SENTINEL = 300
+ MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
# `anchor_allowed_pos` is used to support img links inside
# anchors, but not anchors inside anchors. An anchor's start
# matching brackets in img alt text -- we'll differ in that
# regard.
bracket_depth = 0
- for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
+ for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
text_length)):
ch = text[p]
if ch == ']':
title_str = ''
if is_img:
result = '<img src="%s" alt="%s"%s%s' \
- % (url, link_text.replace('"', '"'),
+ % (url.replace('"', '"'),
+ link_text.replace('"', '"'),
title_str, self.empty_element_suffix)
curr_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
title_str = ''
if is_img:
result = '<img src="%s" alt="%s"%s%s' \
- % (url, link_text.replace('"', '"'),
+ % (url.replace('"', '"'),
+ link_text.replace('"', '"'),
title_str, self.empty_element_suffix)
curr_pos = start_idx + len(result)
text = text[:start_idx] + result + text[match.end():]
# Otherwise, it isn't markup.
curr_pos = start_idx + 1
- return text
+ return text
_setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
# Setext-style headers:
# Header 1
# ========
- #
+ #
# Header 2
# --------
text = self._setext_h_re.sub(self._setext_h_sub, text)
)
)
''' % (less_than_tab, marker_pat, marker_pat)
-
+
# We use a different prefix before nested lists than top-level lists.
# See extended comment in _process_list_items().
#
text = list_re.sub(self._list_sub, text)
return text
-
+
_list_item_re = re.compile(r'''
(\n)? # leading line = \1
(^[ \t]*) # leading whitespace = \2
- (%s) [ \t]+ # list marker = \3
+ (?P<marker>%s) [ \t]+ # list marker = \3
((?:.+?) # list item text = \4
(\n{1,2})) # eols = \5
- (?= \n* (\Z | \2 (%s) [ \t]+))
+ (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
''' % (_marker_any, _marker_any),
re.M | re.X | re.S)
def _process_list_items(self, list_str):
# Process the contents of a single ordered or unordered list,
# splitting it into individual list items.
-
+
# The $g_list_level global keeps track of when we're inside a list.
# Each time we enter a list, we increment it; when we leave a list,
# we decrement. If it's zero, we're not in a list anymore.
"""
yield 0, "<code>"
for tup in inner:
- yield tup
+ yield tup
yield 0, "</code>"
def wrap(self, source, outfile):
def _do_code_spans(self, text):
# * Backtick quotes are used for <code></code> spans.
- #
+ #
# * You can use multiple backticks as the delimiters if you want to
# include literal backticks in the code span. So, this input:
- #
+ #
# Just type ``foo `bar` baz`` at the prompt.
- #
+ #
# Will translate to:
- #
+ #
# <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
- #
+ #
# There's no arbitrary limit to the number of backticks you
# can use as delimiters. If you need three consecutive backticks
# in your code, use four for delimiters, etc.
#
# * You can use spaces to get literal backticks at the edges:
- #
+ #
# ... type `` `bar` `` ...
- #
+ #
# Turns to:
- #
+ #
# ... type <code>`bar`</code> ...
return self._code_span_re.sub(self._code_span_sub, text)
text = self._strong_re.sub(r"<strong>\2</strong>", text)
text = self._em_re.sub(r"<em>\2</em>", text)
return text
-
+
_block_quote_re = re.compile(r'''
( # Wrap whole match in \1
text = text.strip('\n')
# Wrap <p> tags.
- grafs = re.split(r"\n{2,}", text)
- for i, graf in enumerate(grafs):
+ grafs = []
+ for i, graf in enumerate(re.split(r"\n{2,}", text)):
if graf in self.html_blocks:
# Unhashify HTML blocks
- grafs[i] = self.html_blocks[graf]
+ grafs.append(self.html_blocks[graf])
else:
+ cuddled_list = None
+ if "cuddled-lists" in self.extras:
+ # Need to put back trailing '\n' for `_list_item_re`
+ # match at the end of the paragraph.
+ li = self._list_item_re.search(graf + '\n')
+ # Two of the same list marker in this paragraph: a likely
+ # candidate for a list cuddled to preceding paragraph
+ # text (issue 33). Note the `[-1]` is a quick way to
+ # consider numeric bullets (e.g. "1." and "2.") to be
+ # equal.
+ if (li and li.group("next_marker")
+ and li.group("marker")[-1] == li.group("next_marker")[-1]):
+ start = li.start()
+ cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
+ assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
+ graf = graf[:start]
+
# Wrap <p> tags.
graf = self._run_span_gamut(graf)
- grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>"
+ grafs.append("<p>" + graf.lstrip(" \t") + "</p>")
+
+ if cuddled_list:
+ grafs.append(cuddled_list)
return "\n\n".join(grafs)
# Smart processing for ampersands and angle brackets that need
# to be encoded.
text = self._ampersand_re.sub('&', text)
-
+
# Encode naked <'s
text = self._naked_lt_re.sub('<', text)
addr = '<a href="%s">%s</a>' \
% (''.join(chars), ''.join(chars[7:]))
return addr
-
+
def _do_link_patterns(self, text):
"""Caveat emptor: there isn't much guarding against link
patterns being formed inside other standard Markdown links, e.g.
.replace('*', g_escape_table['*'])
.replace('_', g_escape_table['_']))
link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
- hash = md5(link).hexdigest()
+ hash = _hash_text(link)
link_from_hash[hash] = link
text = text[:start] + hash + text[end:]
for hash, link in link_from_hash.items():
text = text.replace(hash, link)
return text
-
+
def _unescape_special_chars(self, text):
# Swap back in all the special characters we've hidden.
for ch, hash in g_escape_table.items():
# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
-
+
"lines" is a list of lines to dedent.
"tabsize" is the tab width to use for indent width calculations.
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
-
+
Same as dedent() except operates on a sequence of lines. Note: the
lines list is modified **in-place**.
"""
DEBUG = False
- if DEBUG:
+ if DEBUG:
print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
% (tabsize, skip_first_line)
indents = []
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
-
+
textwrap.dedent(s), but don't expand tabs to spaces
"""
lines = text.splitlines(1)
[ ]{0,%d}
<(hr) # start tag = \2
\b # word break
- ([^<>])*? #
+ ([^<>])*? #
/?> # the matching end tag
[ \t]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
r = random()
# Roughly 10% raw, 45% hex, 45% dec.
# '@' *must* be encoded. I [John Gruber] insist.
- if r > 0.9 and ch != "@":
+ # Issue 26: '_' must be encoded.
+ if r > 0.9 and ch not in "@_":
return ch
elif r < 0.45:
# The [1:] is to drop leading '0': 0x63 -> x63
else:
return '&#%s;' % ord(ch)
-def _hash_text(text):
- return 'md5:'+md5(text.encode("utf-8")).hexdigest()
file = params.file
if file.rel_path.endswith(".md"):
file.render = "html"
- f = file.read_keywords()
- return f.read()
+ file.read()
_markdown = None