webber: update bundled markdown2 to 1.0.1.16 and remove trailing whitespace

[webber.git] / plugins / read_markdown.py
diff --git a/plugins/read_markdown.py b/plugins/read_markdown.py

index 06ed2adcaeafa92db18d91cfe241158a335ba67e..fead97eb97a1bfdbda01bf22cbc4e95fae94ae3d 100644 (file)
--- a/plugins/read_markdown.py
+++ b/plugins/read_markdown.py
@@ -5,20 +5,20 @@ from webber import *
  # Copyright (c) 2007-2008 ActiveState Corp.
  # License: MIT (http://www.opensource.org/licenses/mit-license.php)
  #
-# I used version 1.0.1.12, but deleted:
+# I used version 1.0.1.16, but deleted:
  #      * file-vars (emacs-style settings inside the file)
  #      * Standardize line endings
  #      * call to _do_links()
  #      * logging
  #      * allow "= Header =" in addition to "# Header #"
-#      
+#
  
  import os, sys, re, codecs
  try:
      from hashlib import md5
  except ImportError:
      from md5 import md5
-from random import random
+from random import random, randint
  
  
  
@@ -42,14 +42,22 @@ DEBUG = False
  
  DEFAULT_TAB_WIDTH = 4
  
-# Table of hash values for escaped characters:
-def _escape_hash(s):
-    # Lame attempt to avoid possible collision with someone actually
-    # using the MD5 hexdigest of one of these chars in there text.
-    # Other ideas: random.random(), uuid.uuid()
+
+try:
+    import uuid
+except ImportError:
+    SECRET_SALT = str(randint(0, 1000000))
+else:
+    SECRET_SALT = str(uuid.uuid4())
+def _hash_ascii(s):
      #return md5(s).hexdigest()   # Markdown.pl effectively does this.
-    return 'md5-'+md5(s).hexdigest()
-g_escape_table = dict([(ch, _escape_hash(ch)) for ch in '\\`*_{}[]()>#+-.!'])
+    return 'md5-' + md5(SECRET_SALT + s).hexdigest()
+def _hash_text(s):
+    return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()
+
+# Table of hash values for escaped characters:
+g_escape_table = dict([(ch, _hash_ascii(ch))
+                       for ch in '\\`*_{}[]()>#+-.!'])
  
  
  
@@ -98,7 +106,7 @@ class Markdown(object):
          self.tab_width = tab_width
  
          # For compatibility with earlier markdown2.py and with
-        # markdown.py's safe_mode being a boolean, 
+        # markdown.py's safe_mode being a boolean,
          #   safe_mode == True -> "replace"
          if safe_mode is True:
              self.safe_mode = "replace"
@@ -179,11 +187,11 @@ class Markdown(object):
  
          text = self._run_block_gamut(text)
  
-        text = self._unescape_special_chars(text)
-
          if "footnotes" in self.extras:
              text = self._add_footnotes(text)
  
+        text = self._unescape_special_chars(text)
+
          if self.safe_mode:
              text = self._unhash_html_spans(text)
  
@@ -286,7 +294,7 @@ class Markdown(object):
          text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)
  
          # Special case just for <hr />. It was easier to make a special
-        # case than to make the other regex more complicated.   
+        # case than to make the other regex more complicated.
          if "<hr" in text:
              _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
              text = _hr_tag_re.sub(hash_html_block_sub, text)
@@ -349,7 +357,7 @@ class Markdown(object):
          if "xml" in self.extras:
              # Treat XML processing instructions and namespaced one-liner
              # tags as if they were block HTML tags. E.g., if standalone
-            # (i.e. are their own paragraph), the following do not get 
+            # (i.e. are their own paragraph), the following do not get
              # wrapped in a <p> tag:
              #    <?foo bar?>
              #
@@ -363,7 +371,7 @@ class Markdown(object):
          # Strips link definitions from text, stores the URLs and titles in
          # hash references.
          less_than_tab = self.tab_width - 1
-    
+
          # Link defs are in the form:
          #   [id]: url "optional title"
          _link_def_re = re.compile(r"""
@@ -414,7 +422,7 @@ class Markdown(object):
          - The 'note-id' can be pretty much anything, though typically it
            is the number of the footnote.
          - The first paragraph may start on the next line, like so:
-            
+
              [^note-id]:
                  Text of the note.
          """
@@ -481,7 +489,7 @@ class Markdown(object):
               + indent + ('\n'+indent).join(lines)
               + '\n\n')
          return s
-        
+
      def _prepare_pyshell_blocks(self, text):
          """Ensure that Python interactive shell sessions are put in
          code blocks -- even if not properly indented.
@@ -501,14 +509,14 @@ class Markdown(object):
      def _run_span_gamut(self, text):
          # These are all the transformations that occur *within* block-level
          # tags like paragraphs, headers, and list items.
-    
+
          text = self._do_code_spans(text)
-    
+
          text = self._escape_special_chars(text)
-    
+
          # Process anchor and image tags.
          #text = self._do_links(text)
-    
+
          # Make links out of things like `<http://example.com/>`
          # Must come after _do_links(), because you can use < and >
          # delimiters in inline links like [this](<url>).
@@ -516,21 +524,21 @@ class Markdown(object):
  
          if "link-patterns" in self.extras:
              text = self._do_link_patterns(text)
-    
+
          text = self._encode_amps_and_angles(text)
-    
+
          text = self._do_italics_and_bold(text)
-    
+
          # Do hard breaks:
          text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
-    
+
          return text
  
      # "Sorta" because auto-links are identified as "tag" tokens.
      _sorta_html_tokenize_re = re.compile(r"""
          (
              # tag
-            </?         
+            </?
              (?:\w+)                                     # tag name
              (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
              \s*/?>
@@ -543,7 +551,7 @@ class Markdown(object):
              <\?.*?\?>       # processing instruction
          )
          """, re.X)
-    
+
      def _escape_special_chars(self, text):
          # Python markdown note: the HTML tokenization here differs from
          # that in Markdown.pl, hence the behaviour for subtle cases can
@@ -647,7 +655,7 @@ class Markdown(object):
          Markdown.pl because of the lack of atomic matching support in
          Python's regex engine used in $g_nested_brackets.
          """
-        MAX_LINK_TEXT_SENTINEL = 300
+        MAX_LINK_TEXT_SENTINEL = 3000  # markdown2 issue 24
  
          # `anchor_allowed_pos` is used to support img links inside
          # anchors, but not anchors inside anchors. An anchor's start
@@ -683,7 +691,7 @@ class Markdown(object):
              # matching brackets in img alt text -- we'll differ in that
              # regard.
              bracket_depth = 0
-            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, 
+            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
                                              text_length)):
                  ch = text[p]
                  if ch == ']':
@@ -743,7 +751,8 @@ class Markdown(object):
                          title_str = ''
                      if is_img:
                          result = '<img src="%s" alt="%s"%s%s' \
-                            % (url, link_text.replace('"', '&quot;'),
+                            % (url.replace('"', '&quot;'),
+                               link_text.replace('"', '&quot;'),
                                 title_str, self.empty_element_suffix)
                          curr_pos = start_idx + len(result)
                          text = text[:start_idx] + result + text[match.end():]
@@ -786,7 +795,8 @@ class Markdown(object):
                              title_str = ''
                          if is_img:
                              result = '<img src="%s" alt="%s"%s%s' \
-                                % (url, link_text.replace('"', '&quot;'),
+                                % (url.replace('"', '&quot;'),
+                                   link_text.replace('"', '&quot;'),
                                     title_str, self.empty_element_suffix)
                              curr_pos = start_idx + len(result)
                              text = text[:start_idx] + result + text[match.end():]
@@ -811,7 +821,7 @@ class Markdown(object):
              # Otherwise, it isn't markup.
              curr_pos = start_idx + 1
  
-        return text 
+        return text
  
  
      _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
@@ -844,7 +854,7 @@ class Markdown(object):
          # Setext-style headers:
          #     Header 1
          #     ========
-        #  
+        #
          #     Header 2
          #     --------
          text = self._setext_h_re.sub(self._setext_h_sub, text)
@@ -900,7 +910,7 @@ class Markdown(object):
                    )
                  )
              ''' % (less_than_tab, marker_pat, marker_pat)
-        
+
              # We use a different prefix before nested lists than top-level lists.
              # See extended comment in _process_list_items().
              #
@@ -928,14 +938,14 @@ class Markdown(object):
                  text = list_re.sub(self._list_sub, text)
  
          return text
-    
+
      _list_item_re = re.compile(r'''
          (\n)?               # leading line = \1
          (^[ \t]*)           # leading whitespace = \2
-        (%s) [ \t]+         # list marker = \3
+        (?P<marker>%s) [ \t]+   # list marker = \3
          ((?:.+?)            # list item text = \4
           (\n{1,2}))         # eols = \5
-        (?= \n* (\Z | \2 (%s) [ \t]+))
+        (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
          ''' % (_marker_any, _marker_any),
          re.M | re.X | re.S)
  
@@ -958,7 +968,7 @@ class Markdown(object):
      def _process_list_items(self, list_str):
          # Process the contents of a single ordered or unordered list,
          # splitting it into individual list items.
-    
+
          # The $g_list_level global keeps track of when we're inside a list.
          # Each time we enter a list, we increment it; when we leave a list,
          # we decrement. If it's zero, we're not in a list anymore.
@@ -1007,7 +1017,7 @@ class Markdown(object):
                  """
                  yield 0, "<code>"
                  for tup in inner:
-                    yield tup 
+                    yield tup
                  yield 0, "</code>"
  
              def wrap(self, source, outfile):
@@ -1080,26 +1090,26 @@ class Markdown(object):
  
      def _do_code_spans(self, text):
          #   *   Backtick quotes are used for <code></code> spans.
-        # 
+        #
          #   *   You can use multiple backticks as the delimiters if you want to
          #       include literal backticks in the code span. So, this input:
-        #     
+        #
          #         Just type ``foo `bar` baz`` at the prompt.
-        #     
+        #
          #       Will translate to:
-        #     
+        #
          #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
-        #     
+        #
          #       There's no arbitrary limit to the number of backticks you
          #       can use as delimters. If you need three consecutive backticks
          #       in your code, use four for delimiters, etc.
          #
          #   *   You can use spaces to get literal backticks at the edges:
-        #     
+        #
          #         ... type `` `bar` `` ...
-        #     
+        #
          #       Turns to:
-        #     
+        #
          #         ... type <code>`bar`</code> ...
          return self._code_span_re.sub(self._code_span_sub, text)
  
@@ -1141,7 +1151,7 @@ class Markdown(object):
              text = self._strong_re.sub(r"<strong>\2</strong>", text)
              text = self._em_re.sub(r"<em>\2</em>", text)
          return text
-    
+
  
      _block_quote_re = re.compile(r'''
          (                           # Wrap whole match in \1
@@ -1181,15 +1191,35 @@ class Markdown(object):
          text = text.strip('\n')
  
          # Wrap <p> tags.
-        grafs = re.split(r"\n{2,}", text)
-        for i, graf in enumerate(grafs):
+        grafs = []
+        for i, graf in enumerate(re.split(r"\n{2,}", text)):
              if graf in self.html_blocks:
                  # Unhashify HTML blocks
-                grafs[i] = self.html_blocks[graf]
+                grafs.append(self.html_blocks[graf])
              else:
+                cuddled_list = None
+                if "cuddled-lists" in self.extras:
+                    # Need to put back trailing '\n' for `_list_item_re`
+                    # match at the end of the paragraph.
+                    li = self._list_item_re.search(graf + '\n')
+                    # Two of the same list marker in this paragraph: a likely
+                    # candidate for a list cuddled to preceding paragraph
+                    # text (issue 33). Note the `[-1]` is a quick way to
+                    # consider numeric bullets (e.g. "1." and "2.") to be
+                    # equal.
+                    if (li and li.group("next_marker")
+                        and li.group("marker")[-1] == li.group("next_marker")[-1]):
+                        start = li.start()
+                        cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
+                        assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
+                        graf = graf[:start]
+
                  # Wrap <p> tags.
                  graf = self._run_span_gamut(graf)
-                grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>"
+                grafs.append("<p>" + graf.lstrip(" \t") + "</p>")
+
+                if cuddled_list:
+                    grafs.append(cuddled_list)
  
          return "\n\n".join(grafs)
  
@@ -1231,7 +1261,7 @@ class Markdown(object):
          # Smart processing for ampersands and angle brackets that need
          # to be encoded.
          text = self._ampersand_re.sub('&amp;', text)
-    
+
          # Encode naked <'s
          text = self._naked_lt_re.sub('&lt;', text)
  
@@ -1289,7 +1319,7 @@ class Markdown(object):
          addr = '<a href="%s">%s</a>' \
                 % (''.join(chars), ''.join(chars[7:]))
          return addr
-    
+
      def _do_link_patterns(self, text):
          """Caveat emptor: there isn't much guarding against link
          patterns being formed inside other standard Markdown links, e.g.
@@ -1314,13 +1344,13 @@ class Markdown(object):
                          .replace('*', g_escape_table['*'])
                          .replace('_', g_escape_table['_']))
                  link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
-                hash = md5(link).hexdigest()
+                hash = _hash_text(link)
                  link_from_hash[hash] = link
                  text = text[:start] + hash + text[end:]
          for hash, link in link_from_hash.items():
              text = text.replace(hash, link)
          return text
-    
+
      def _unescape_special_chars(self, text):
          # Swap back in all the special characters we've hidden.
          for ch, hash in g_escape_table.items():
@@ -1390,18 +1420,18 @@ def _regex_from_encoded_pattern(s):
  # Recipe: dedent (0.1.2)
  def _dedentlines(lines, tabsize=8, skip_first_line=False):
      """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
-    
+
          "lines" is a list of lines to dedent.
          "tabsize" is the tab width to use for indent width calculations.
          "skip_first_line" is a boolean indicating if the first line should
              be skipped for calculating the indent width and for dedenting.
              This is sometimes useful for docstrings and similar.
-    
+
      Same as dedent() except operates on a sequence of lines. Note: the
      lines list is modified **in-place**.
      """
      DEBUG = False
-    if DEBUG: 
+    if DEBUG:
          print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
                % (tabsize, skip_first_line)
      indents = []
@@ -1466,7 +1496,7 @@ def _dedent(text, tabsize=8, skip_first_line=False):
          "skip_first_line" is a boolean indicating if the first line should
              be skipped for calculating the indent width and for dedenting.
              This is sometimes useful for docstrings and similar.
-    
+
      textwrap.dedent(s), but don't expand tabs to spaces
      """
      lines = text.splitlines(1)
@@ -1531,7 +1561,7 @@ def _hr_tag_re_from_tab_width(tab_width):
              [ ]{0,%d}
              <(hr)               # start tag = \2
              \b                  # word break
-            ([^<>])*?           # 
+            ([^<>])*?           #
              /?>                 # the matching end tag
              [ \t]*
              (?=\n{2,}|\Z)       # followed by a blank line or end of document
@@ -1544,7 +1574,8 @@ def _xml_encode_email_char_at_random(ch):
      r = random()
      # Roughly 10% raw, 45% hex, 45% dec.
      # '@' *must* be encoded. I [John Gruber] insist.
-    if r > 0.9 and ch != "@":
+    # Issue 26: '_' must be encoded.
+    if r > 0.9 and ch not in "@_":
          return ch
      elif r < 0.45:
          # The [1:] is to drop leading '0': 0x63 -> x63
@@ -1552,8 +1583,6 @@ def _xml_encode_email_char_at_random(ch):
      else:
          return '&#%s;' % ord(ch)
  
-def _hash_text(text):
-    return 'md5:'+md5(text.encode("utf-8")).hexdigest()