From: R. Steve McKown Date: Tue, 9 Aug 2011 02:40:55 +0000 (-0600) Subject: Manually merge from latest webber X-Git-Url: https://oss.titaniummirror.com/gitweb?p=oss-web.git;a=commitdiff_plain;h=712216965d66183bf3b78d342c0bd4f2627bb431 Manually merge from latest webber * webber a42d5b9b523b56e34234685de6b58cf0e6274b34 * Use webber's read_markdown and dispense with local version that acted as a shim to 'plain vanilla' markdown2. * Uncomment markdown _do_links() Use markdown _do_links() --- diff --git a/Makefile b/Makefile index 9d767ab..6773984 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,13 @@ all: webber.conf profile: ./webber --profile +lint: + pylint \ + --include-ids=y \ + --reports=n \ + --disable-msg=W0312,C0103 \ + webber.py plugins + clean: rm -f *.pyc plugins/*.pyc diff --git a/plugins/google_sitemap.py b/plugins/google_sitemap.py index a13bc85..e039e21 100644 --- a/plugins/google_sitemap.py +++ b/plugins/google_sitemap.py @@ -36,8 +36,14 @@ def write_initial(params): def sitemap_scan(params): global f file = params.file + + # Ignore non-pages if not file.has_key("linktitle"): return + # Ignore hidden pages + if file.has_key("hide") and file.hide: + return + if f is None: write_initial(params) @@ -56,7 +62,7 @@ def sitemap_scan(params): #print file.sitemap_priority, file.sitemap_changefreq, file.rel_path f.write('\n') - f.write(' http://%s/%s\n' % (file.main_url, file.rel_path)) + f.write(' http://%s/%s\n' % (file.main_url, file.out_path)) f.write(' %s\n' % time.strftime( "%Y-%m-%d", time.localtime(file.mtime)) ) f.write(' %s\n' % file.sitemap_changefreq) f.write(' %s\n' % file.sitemap_priority) diff --git a/plugins/hierarchy.py b/plugins/hierarchy.py index 8c65003..9157556 100644 --- a/plugins/hierarchy.py +++ b/plugins/hierarchy.py @@ -5,33 +5,36 @@ import re reSPLIT = re.compile(r',\s*') +# This hashes store webber.File instances _childs = {} _parent = {} -def memorize_links(title, links): +def memorize_links(thisfile, links): global _childs if not 
links: return order = 100 for link in reSPLIT.split(links): - #print title, link - if not _childs.has_key(title): - _childs[title] = [] - _childs[title].append( (order,link)) + linked = get_file_for(link) + if not _childs.has_key(thisfile): + _childs[thisfile] = [] + _childs[thisfile].append( (order, linked)) order += 100 - _parent[link] = title + #print "memorize_links:", thisfile, "->", linked + _parent[linked] = thisfile -def memorize_parent(title, parent, order=100): - #print "memorize_parent:", title, parent - #print " parent:", _parent - #print " childs:", _childs - #print "order:", title, order +def memorize_parent(thisfile, parent, order=100): + # Convert titles or linktitle to entries of webber.File + if not isinstance(parent, webber.File): + parent = get_file_for(parent) + if not _childs.has_key(parent): _childs[parent] = [] - _childs[parent].append( (order, title) ) - _parent[title] = parent - + _childs[parent].append( (order, thisfile) ) + #print "memorize_parent:", thisfile, "->", parent + _parent[thisfile] = parent + # # The "scan" plugins should scan for meta-data, mostly for links. @@ -43,14 +46,19 @@ def memorize_parent(title, parent, order=100): @set_hook("scan") def scan(params): file = params["file"] + + # Ignore hidden pages + if file.has_key("hide") and file.hide: + return + if file.has_key("links"): - memorize_links(file.linktitle, file.links) + memorize_links(file, file.links) if file.has_key("parent"): if file.has_key("order"): order = int(file.order) else: order = 100 - memorize_parent(file.linktitle, file.parent, order) + memorize_parent(file, file.parent, order) @set_hook("scan_done") @@ -59,14 +67,30 @@ def scan_done(params): in ascending order.""" for c in _childs: - _childs[c].sort() - return + # Sort by linktitle + _childs[c].sort(key = lambda x: x[1].linktitle) + # And now sort by priority. Since Python 2.2 and upwards has stable-sort, + # this effectively makes a two-dimensional sort. 
+ _childs[c].sort(key = lambda x: x[0]) + + visited = {} + visited[get_file_for("Home")] = True + for f in _parent: + visited[f] = True + for f in files: + file = files[f] + if not file.has_key("linktitle"): + continue + if not visited.has_key(file ): + warning("orphan file '%s'" % f) + return print "_parent:" for c in _parent: print " ", c, _parent[c] print "_childs:" - for c in _childs: print " ", c,_childs[c] + for c in _childs: + print " ", c,_childs[c] @set_function("get_breadcrumbs") @@ -75,33 +99,30 @@ def get_breadcrumbs(orig_page=None): orig_page = get_current_file() res = [(orig_page, get_link_from(orig_page, orig_page))] page = orig_page - #print "orig_page:", orig_page - while _parent.has_key(page.linktitle): - page = get_file_for(_parent[page.linktitle]) + while _parent.has_key(page): + page = _parent[page] link = get_link_from(orig_page, page) - #print " page, link:", page, link res.insert(0, (page, link)) - #print res return res @set_function("get_sidemenu") def get_sidemenu(root="Home", level=1): + """Returns (level, part_of_path, is_current, page, link) tuples, where + page is a class File object and link is a relative link from the current + page to page.""" page = get_current_file() if not isinstance(root, webber.File): root = get_file_for(root) - bread = get_breadcrumbs() - #print "Menu for:", page - #print "Bread:", bread - res = [(0, 1, int(root==page), root, get_link_from(page, root))] + bread = get_breadcrumbs() + def do_menu(pg, level): - #print "pg, has_key:", pg, _childs.has_key(pg) - if _childs.has_key(pg.linktitle): - for p in _childs[pg.linktitle]: - subpage = get_file_for(p[1]) + if _childs.has_key(pg): + for p in _childs[pg]: + subpage = p[1] in_bread = False for b in bread: if b[0] == subpage: @@ -109,7 +130,6 @@ def get_sidemenu(root="Home", level=1): break go_deeper = in_bread or (subpage==page) - #print "subpage:", subpage, "in bread:", in_bread, "go deeper:", go_deeper link = get_link_from(page, subpage) res.append((level, 
in_bread, int(subpage==page), subpage, link)) if go_deeper: @@ -118,50 +138,60 @@ def get_sidemenu(root="Home", level=1): # TODO: make this configurable, e.g. cfg.rootpage, otherwise a page # that is outside of the menu won't show a menu do_menu(root, level) + + # print "-" * 77 + # import pprint + # pprint.pprint(res) + # print "-" * 77 return res -@set_function("get_sitemap") -def get_sitemap(root="Home", show_orphans=False, level=1): +@set_function("get_hierarchical_sitemap") +def get_hierarchical_sitemap(root="Home"): + page = get_current_file() + if not isinstance(root, webber.File): + root = get_file_for(root) + + def do_menu(pg): + res = [] + if _childs.has_key(pg): + for p in _childs[pg]: + subpage = p[1] + res.append( do_menu(subpage) ) + return (pg, get_link_from(root, pg), res) + + res = do_menu(root) + + #import pprint + #pprint.pprint(res, indent=4) + return res + + +@set_function("get_linear_sitemap") +def get_linear_sitemap(root="Home", level=1): page = get_current_file() if not isinstance(root, webber.File): root = get_file_for(root) res = [(0, root, get_link_from(page, root))] - visited = {root: None} def do_menu(pg, level): #print "pg:", pg - #, _childs.has_key(pg.linktitle) - if _childs.has_key(pg.linktitle): - for p in _childs[pg.linktitle]: - subpage = get_file_for(p[1]) + #, _childs.has_key(pg.title) + if _childs.has_key(pg): + for p in _childs[pg]: + subpage = p[1] #print "subpage:", subpage link = get_link_from(page, subpage) res.append((level, subpage, link)) - visited[subpage] = None do_menu(subpage, level+1) do_menu(root, level) - #print "visited:", visited - if show_orphans: - for f in files: - #print f - file = files[f] - if not file.has_key("linktitle"): - continue - try: - if file in visited: - #print "found", file.linktitle - continue - except KeyError: - continue - #print "not found:", file.linktitle - res.append( (0, file, get_link_from(page, file.linktitle))) - #for t in res: print t + #import pprint + #pprint.pprint(res) return 
res @@ -176,11 +206,11 @@ def get_recently(page=None, max_items=10): orig_page = page def addPage(res, page): + #print "page:", page res.append( (page, get_link_from(orig_page, page)) ) - if _childs.has_key(page.linktitle): - for c in _childs[page.linktitle]: - if len(res) < max_items: - addPage(res, get_file_for(c[1])) + if _childs.has_key(page): + for c in _childs[page]: + addPage(res, c[1]) addPage(res, orig_page) res.sort(cmp = lambda x,y: cmp(y[0].mtime, x[0].mtime)) - return res + return res[:max_items] diff --git a/plugins/link.py b/plugins/link.py index d8c4214..8a3404a 100644 --- a/plugins/link.py +++ b/plugins/link.py @@ -1,24 +1,27 @@ # -*- coding: iso-8859-1 -*- from webber import * -import os, re +import os, re, urlparse + +_file_links = {} # To understand this beast, read /usr/share/doc/python2.5-doc/html/lib/module-re.html :-) reLink = re.compile(r''' - \[\[ # Begin of link - (?=[^!]) # Don't fire for macros + \[\[ # Begin of link + (?=[^!]) # Don't fire for macros (?: - ([^\]\|]+) # 1: link text - \| # followed by '|' - )? # optional - ([^\n\r\]#]+) # 2: page to link to + ([^\]\|]+) # 1: link text + \| # followed by '|' + )? # optional + ([^\n\r\]#]+) # 2: page to link to ( - \# # '#', beginning of anchor - [^\s\]]+ # 3: anchor text, doesn't contain spaces or ']' - )? # optional - \]\] # end of link + \# # '#', beginning of anchor + [^\s\]]+ # 3: anchor text, doesn't contain spaces or ']' + )? 
# optional + \]\] # end of link ''', re.VERBOSE) + def do_link(m): """Used in re.sub() to substitute link with HTML""" text = m.group(1) or "" @@ -41,7 +44,13 @@ def do_link(m): break if not text: text = link - # TODO: validate link + # validate local files + components = urlparse.urlparse(link) + if components.scheme in ("", "file"): + file = get_current_file() + fname = os.path.join(file.direc, components.path) + fname = os.path.normpath(fname) + _file_links[fname] = file.rel_path return '%s' % (link, anchor, text) @@ -55,12 +64,13 @@ def test_link(): m = reLink.search(s) if m: print "link:", s - print " name:", m.group(1) - print " link:", m.group(2) - print " anchor:", m.group(3) + print " name:", m.group(1) + print " link:", m.group(2) + print " anchor:", m.group(3) else: print "No link:", s + def test_sub(): for s in ( 'Before [[!macro]] after', @@ -72,12 +82,19 @@ def test_sub(): res = reLink.sub(do_link, s) print "", res -#test_link() -#test_sub() +@set_hook("linkify") +def linkify(params): + params.file.contents = reLink.sub(do_link, params.file.contents) +@set_hook("finish") +def check_links(params): + """Checks all links that are stored in _file_links to warn if the + file doesn't exist""" -@set_hook("linkify") -def linkify(params): - return reLink.sub(do_link, params.file.contents) + for s in _file_links: + #print "check:", s, cfg.out_dir + out_file = os.path.join(cfg.out_dir, s) + if not os.path.exists(out_file): + warning("%s: invalid link to '%s'" % (_file_links[s], s)) diff --git a/plugins/markdown2.py b/plugins/markdown2.py index cffec41..f3d8df4 100755 --- a/plugins/markdown2.py +++ b/plugins/markdown2.py @@ -1023,12 +1023,12 @@ class Markdown(object): % (n, self._run_span_gamut(match.group(1)), n) _atx_h_re = re.compile(r''' - ^(\#{1,6}) # \1 = string of #'s + ^([\#=]{1,6}) # \1 = string of #'s [ \t]* (.+?) 
# \2 = Header text [ \t]* (?#+-.!']) + + + +#---- exceptions + +class MarkdownError(Exception): + pass + + + +#---- public api + +def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH, + safe_mode=None, extras=None, link_patterns=None): + return Markdown(html4tags=html4tags, tab_width=tab_width, + safe_mode=safe_mode, extras=extras, + link_patterns=link_patterns).convert(text) + +class Markdown(object): + # The dict of "extras" to enable in processing -- a mapping of + # extra name to argument for the extra. Most extras do not have an + # argument, in which case the value is None. + # + # This can be set via (a) subclassing and (b) the constructor + # "extras" argument. + extras = None + + urls = None + titles = None + html_blocks = None + html_spans = None + html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py + + # Used to track when we're inside an ordered or unordered list + # (see _ProcessListItems() for details): + list_level = 0 + + _ws_only_line_re = re.compile(r"^[ \t]+$", re.M) + + def __init__(self, html4tags=False, tab_width=4, safe_mode=None, + extras=None, link_patterns=None): + if html4tags: + self.empty_element_suffix = ">" + else: + self.empty_element_suffix = " />" + self.tab_width = tab_width + + # For compatibility with earlier markdown2.py and with + # markdown.py's safe_mode being a boolean, + # safe_mode == True -> "replace" + if safe_mode is True: + self.safe_mode = "replace" + else: + self.safe_mode = safe_mode + + if self.extras is None: + self.extras = {} + elif not isinstance(self.extras, dict): + self.extras = dict([(e, None) for e in self.extras]) + if extras: + if not isinstance(extras, dict): + extras = dict([(e, None) for e in extras]) + self.extras.update(extras) + assert isinstance(self.extras, dict) + self._instance_extras = self.extras.copy() + self.link_patterns = link_patterns + self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M) + + def reset(self): + self.urls = {} + self.titles = {} + 
self.html_blocks = {} + self.html_spans = {} + self.list_level = 0 + self.extras = self._instance_extras.copy() + self.encoding = 'utf-8' + if "footnotes" in self.extras: + self.footnotes = {} + self.footnote_ids = [] + + def convert(self, text, encoding=None): + """Convert the given text.""" + # Main function. The order in which other subs are called here is + # essential. Link and image substitutions need to happen before + # _EscapeSpecialChars(), so that any *'s or _'s in the + # and tags get encoded. + + # Clear the global hashes. If we don't clear these, you get conflicts + # from other articles when generating a page which contains more than + # one article (e.g. an index page that shows the N most recent + # articles): + self.reset() + if encoding: + self.encoding = encoding + + if not isinstance(text, unicode): + text = unicode(text, self.encoding) + + # Standardize line endings: + #text = re.sub("\r\n|\r", "\n", text) + + # Make sure $text ends with a couple of newlines: + text += "\n\n" + + # Convert all tabs to spaces. + text = self._detab(text) + + # Strip any lines consisting only of spaces and tabs. + # This makes subsequent regexen easier to write, because we can + # match consecutive blank lines with /\n+/ instead of something + # contorted like /[ \t]*\n+/ . + text = self._ws_only_line_re.sub("", text) + + if self.safe_mode: + text = self._hash_html_spans(text) + + # Turn block-level HTML blocks into hash entries + text = self._hash_html_blocks(text, raw=True) + + # Strip link definitions, store in hashes. 
+ if "footnotes" in self.extras: + # Must do footnotes first because an unlucky footnote defn + # looks like a link defn: + # [^4]: this "looks like a link defn" + text = self._strip_footnote_definitions(text) + text = self._strip_link_definitions(text) + + text = self._run_block_gamut(text) + + if "footnotes" in self.extras: + text = self._add_footnotes(text) + + text = self._unescape_special_chars(text) + + if self.safe_mode: + text = self._unhash_html_spans(text) + + text += "\n" + return text + + # Cribbed from a post by Bart Lateur: + # + _detab_re = re.compile(r'(.*?)\t', re.M) + def _detab_sub(self, match): + g1 = match.group(1) + return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width)) + def _detab(self, text): + r"""Remove (leading?) tabs from a file. + + >>> m = Markdown() + >>> m._detab("\tfoo") + ' foo' + >>> m._detab(" \tfoo") + ' foo' + >>> m._detab("\t foo") + ' foo' + >>> m._detab(" foo") + ' foo' + >>> m._detab(" foo\n\tbar\tblam") + ' foo\n bar blam' + """ + if '\t' not in text: + return text + return self._detab_re.subn(self._detab_sub, text)[0] + + _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del' + _strict_tag_block_re = re.compile(r""" + ( # save in \1 + ^ # start of line (with re.M) + <(%s) # start tag = \2 + \b # word break + (.*\n)*? # any number of lines, minimally matching + # the matching end tag + [ \t]* # trailing spaces/tabs + (?=\n+|\Z) # followed by a newline or end of document + ) + """ % _block_tags_a, + re.X | re.M) + + _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math' + _liberal_tag_block_re = re.compile(r""" + ( # save in \1 + ^ # start of line (with re.M) + <(%s) # start tag = \2 + \b # word break + (.*\n)*? 
# any number of lines, minimally matching + .* # the matching end tag + [ \t]* # trailing spaces/tabs + (?=\n+|\Z) # followed by a newline or end of document + ) + """ % _block_tags_b, + re.X | re.M) + + def _hash_html_block_sub(self, match, raw=False): + html = match.group(1) + if raw and self.safe_mode: + html = self._sanitize_html(html) + key = _hash_text(html) + self.html_blocks[key] = html + return "\n\n" + key + "\n\n" + + def _hash_html_blocks(self, text, raw=False): + """Hashify HTML blocks + + We only want to do this for block-level HTML tags, such as headers, + lists, and tables. That's because we still want to wrap

s around + "paragraphs" that are wrapped in non-block-level tags, such as anchors, + phrase emphasis, and spans. The list of tags we're looking for is + hard-coded. + + @param raw {boolean} indicates if these are raw HTML blocks in + the original source. It makes a difference in "safe" mode. + """ + if '<' not in text: + return text + + # Pass `raw` value into our calls to self._hash_html_block_sub. + hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw) + + # First, look for nested blocks, e.g.: + #

+ #
+ # tags for inner block must be indented. + #
+ #
+ # + # The outermost tags must start at the left margin for this to match, and + # the inner nested divs must be indented. + # We need to do this before the next, more liberal match, because the next + # match will start at the first `
` and stop at the first `
`. + text = self._strict_tag_block_re.sub(hash_html_block_sub, text) + + # Now match more liberally, simply from `\n` to `\n` + text = self._liberal_tag_block_re.sub(hash_html_block_sub, text) + + # Special case just for
. It was easier to make a special + # case than to make the other regex more complicated. + if "", start_idx) + 3 + except ValueError, ex: + break + + # Start position for next comment block search. + start = end_idx + + # Validate whitespace before comment. + if start_idx: + # - Up to `tab_width - 1` spaces before start_idx. + for i in range(self.tab_width - 1): + if text[start_idx - 1] != ' ': + break + start_idx -= 1 + if start_idx == 0: + break + # - Must be preceded by 2 newlines or hit the start of + # the document. + if start_idx == 0: + pass + elif start_idx == 1 and text[0] == '\n': + start_idx = 0 # to match minute detail of Markdown.pl regex + elif text[start_idx-2:start_idx] == '\n\n': + pass + else: + break + + # Validate whitespace after comment. + # - Any number of spaces and tabs. + while end_idx < len(text): + if text[end_idx] not in ' \t': + break + end_idx += 1 + # - Must be following by 2 newlines or hit end of text. + if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'): + continue + + # Escape and hash (must match `_hash_html_block_sub`). + html = text[start_idx:end_idx] + if raw and self.safe_mode: + html = self._sanitize_html(html) + key = _hash_text(html) + self.html_blocks[key] = html + text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:] + + if "xml" in self.extras: + # Treat XML processing instructions and namespaced one-liner + # tags as if they were block HTML tags. E.g., if standalone + # (i.e. are their own paragraph), the following do not get + # wrapped in a

tag: + # + # + # + _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width) + text = _xml_oneliner_re.sub(hash_html_block_sub, text) + + return text + + def _strip_link_definitions(self, text): + # Strips link definitions from text, stores the URLs and titles in + # hash references. + less_than_tab = self.tab_width - 1 + + # Link defs are in the form: + # [id]: url "optional title" + _link_def_re = re.compile(r""" + ^[ ]{0,%d}\[(.+)\]: # id = \1 + [ \t]* + \n? # maybe *one* newline + [ \t]* + ? # url = \2 + [ \t]* + (?: + \n? # maybe one newline + [ \t]* + (?<=\s) # lookbehind for whitespace + ['"(] + ([^\n]*) # title = \3 + ['")] + [ \t]* + )? # title is optional + (?:\n+|\Z) + """ % less_than_tab, re.X | re.M | re.U) + return _link_def_re.sub(self._extract_link_def_sub, text) + + def _extract_link_def_sub(self, match): + id, url, title = match.groups() + key = id.lower() # Link IDs are case-insensitive + self.urls[key] = self._encode_amps_and_angles(url) + if title: + self.titles[key] = title.replace('"', '"') + return "" + + def _extract_footnote_def_sub(self, match): + id, text = match.groups() + text = _dedent(text, skip_first_line=not text.startswith('\n')).strip() + normed_id = re.sub(r'\W', '-', id) + # Ensure footnote text ends with a couple newlines (for some + # block gamut matches). + self.footnotes[normed_id] = text + "\n\n" + return "" + + def _strip_footnote_definitions(self, text): + """A footnote definition looks like this: + + [^note-id]: Text of the note. + + May include one or more indented paragraphs. + + Where, + - The 'note-id' can be pretty much anything, though typically it + is the number of the footnote. + - The first paragraph may start on the next line, like so: + + [^note-id]: + Text of the note. + """ + less_than_tab = self.tab_width - 1 + footnote_def_re = re.compile(r''' + ^[ ]{0,%d}\[\^(.+)\]: # id = \1 + [ \t]* + ( # footnote text = \2 + # First line need not start with the spaces. 
+ (?:\s*.*\n+) + (?: + (?:[ ]{%d} | \t) # Subsequent lines must be indented. + .*\n+ + )* + ) + # Lookahead for non-space at line-start, or end of doc. + (?:(?=^[ ]{0,%d}\S)|\Z) + ''' % (less_than_tab, self.tab_width, self.tab_width), + re.X | re.M) + return footnote_def_re.sub(self._extract_footnote_def_sub, text) + + + _hr_res = [ + re.compile(r"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", re.M), + re.compile(r"^[ ]{0,2}([ ]?\-[ ]?){3,}[ \t]*$", re.M), + re.compile(r"^[ ]{0,2}([ ]?\_[ ]?){3,}[ \t]*$", re.M), + ] + + def _run_block_gamut(self, text): + # These are all the transformations that form block-level + # tags like paragraphs, headers, and list items. + + text = self._do_headers(text) + + # Do Horizontal Rules: + hr = "\n tags around block-level tags. + text = self._hash_html_blocks(text) + + text = self._form_paragraphs(text) + + return text + + def _pyshell_block_sub(self, match): + lines = match.group(0).splitlines(0) + _dedentlines(lines) + indent = ' ' * self.tab_width + s = ('\n' # separate from possible cuddled paragraph + + indent + ('\n'+indent).join(lines) + + '\n\n') + return s + + def _prepare_pyshell_blocks(self, text): + """Ensure that Python interactive shell sessions are put in + code blocks -- even if not properly indented. + """ + if ">>>" not in text: + return text + + less_than_tab = self.tab_width - 1 + _pyshell_block_re = re.compile(r""" + ^([ ]{0,%d})>>>[ ].*\n # first line + ^(\1.*\S+.*\n)* # any number of subsequent lines + ^\n # ends with a blank line + """ % less_than_tab, re.M | re.X) + + return _pyshell_block_re.sub(self._pyshell_block_sub, text) + + def _run_span_gamut(self, text): + # These are all the transformations that occur *within* block-level + # tags like paragraphs, headers, and list items. + + text = self._do_code_spans(text) + + text = self._escape_special_chars(text) + + # Process anchor and image tags. 
+ text = self._do_links(text) + + # Make links out of things like `` + # Must come after _do_links(), because you can use < and > + # delimiters in inline links like [this](). + text = self._do_auto_links(text) + + if "link-patterns" in self.extras: + text = self._do_link_patterns(text) + + text = self._encode_amps_and_angles(text) + + text = self._do_italics_and_bold(text) + + # Do hard breaks: + text = re.sub(r" {2,}\n", " + | + # auto-link (e.g., ) + <\w+[^>]*> + | + # comment + | + <\?.*?\?> # processing instruction + ) + """, re.X) + + def _escape_special_chars(self, text): + # Python markdown note: the HTML tokenization here differs from + # that in Markdown.pl, hence the behaviour for subtle cases can + # differ (I believe the tokenizer here does a better job because + # it isn't susceptible to unmatched '<' and '>' in HTML tags). + # Note, however, that '>' is not allowed in an auto-link URL + # here. + escaped = [] + is_html_markup = False + for token in self._sorta_html_tokenize_re.split(text): + if is_html_markup: + # Within tags/HTML-comments/auto-links, encode * and _ + # so they don't conflict with their use in Markdown for + # italics and strong. We're replacing each such + # character with its corresponding MD5 checksum value; + # this is likely overkill, but it should prevent us from + # colliding with the escape values by accident. + escaped.append(token.replace('*', g_escape_table['*']) + .replace('_', g_escape_table['_'])) + else: + escaped.append(self._encode_backslash_escapes(token)) + is_html_markup = not is_html_markup + return ''.join(escaped) + + def _hash_html_spans(self, text): + # Used for safe_mode. 
+ + def _is_auto_link(s): + if ':' in s and self._auto_link_re.match(s): + return True + elif '@' in s and self._auto_email_link_re.match(s): + return True + return False + + tokens = [] + is_html_markup = False + for token in self._sorta_html_tokenize_re.split(text): + if is_html_markup and not _is_auto_link(token): + sanitized = self._sanitize_html(token) + key = _hash_text(sanitized) + self.html_spans[key] = sanitized + tokens.append(key) + else: + tokens.append(token) + is_html_markup = not is_html_markup + return ''.join(tokens) + + def _unhash_html_spans(self, text): + for key, sanitized in self.html_spans.items(): + text = text.replace(key, sanitized) + return text + + def _sanitize_html(self, s): + if self.safe_mode == "replace": + return self.html_removed_text + elif self.safe_mode == "escape": + replacements = [ + ('&', '&'), + ('<', '<'), + ('>', '>'), + ] + for before, after in replacements: + s = s.replace(before, after) + return s + else: + raise MarkdownError("invalid value for 'safe_mode': %r (must be " + "'escape' or 'replace')" % self.safe_mode) + + _tail_of_inline_link_re = re.compile(r''' + # Match tail of: [text](/url/) or [text](/url/ "title") + \( # literal paren + [ \t]* + (?P # \1 + <.*?> + | + .*? + ) + [ \t]* + ( # \2 + (['"]) # quote char = \3 + (?P.*?) + \3 # matching quote + )? # title is optional + \) + ''', re.X | re.S) + _tail_of_reference_link_re = re.compile(r''' + # Match tail of: [text][id] + [ ]? # one optional space + (?:\n[ ]*)? # one optional newline followed by spaces + \[ + (?P<id>.*?) + \] + ''', re.X | re.S) + + def _do_links(self, text): + """Turn Markdown link shortcuts into XHTML <a> and <img> tags. + + This is a combination of Markdown.pl's _DoAnchors() and + _DoImages(). They are done together because that simplified the + approach. It was necessary to use a different approach than + Markdown.pl because of the lack of atomic matching support in + Python's regex engine used in $g_nested_brackets. 
+ """ + MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24 + + # `anchor_allowed_pos` is used to support img links inside + # anchors, but not anchors inside anchors. An anchor's start + # pos must be `>= anchor_allowed_pos`. + anchor_allowed_pos = 0 + + curr_pos = 0 + while True: # Handle the next link. + # The next '[' is the start of: + # - an inline anchor: [text](url "title") + # - a reference anchor: [text][id] + # - an inline img: ![text](url "title") + # - a reference img: ![text][id] + # - a footnote ref: [^id] + # (Only if 'footnotes' extra enabled) + # - a footnote defn: [^id]: ... + # (Only if 'footnotes' extra enabled) These have already + # been stripped in _strip_footnote_definitions() so no + # need to watch for them. + # - a link definition: [id]: url "title" + # These have already been stripped in + # _strip_link_definitions() so no need to watch for them. + # - not markup: [...anything else... + try: + start_idx = text.index('[', curr_pos) + except ValueError: + break + text_length = len(text) + + # Find the matching closing ']'. + # Markdown.pl allows *matching* brackets in link text so we + # will here too. Markdown.pl *doesn't* currently allow + # matching brackets in img alt text -- we'll differ in that + # regard. + bracket_depth = 0 + for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, + text_length)): + ch = text[p] + if ch == ']': + bracket_depth -= 1 + if bracket_depth < 0: + break + elif ch == '[': + bracket_depth += 1 + else: + # Closing bracket not found within sentinel length. + # This isn't markup. + curr_pos = start_idx + 1 + continue + link_text = text[start_idx+1:p] + + # Possibly a footnote ref? 
+ if "footnotes" in self.extras and link_text.startswith("^"): + normed_id = re.sub(r'\W', '-', link_text[1:]) + if normed_id in self.footnotes: + self.footnote_ids.append(normed_id) + result = '<sup class="footnote-ref" id="fnref-%s">' \ + '<a href="#fn-%s">%s</a></sup>' \ + % (normed_id, normed_id, len(self.footnote_ids)) + text = text[:start_idx] + result + text[p+1:] + else: + # This id isn't defined, leave the markup alone. + curr_pos = p+1 + continue + + # Now determine what this is by the remainder. + p += 1 + if p == text_length: + return text + + # Inline anchor or img? + if text[p] == '(': # attempt at perf improvement + match = self._tail_of_inline_link_re.match(text, p) + if match: + # Handle an inline anchor or img. + is_img = start_idx > 0 and text[start_idx-1] == "!" + if is_img: + start_idx -= 1 + + url, title = match.group("url"), match.group("title") + if url and url[0] == '<': + url = url[1:-1] # '<url>' -> 'url' + # We've got to encode these to avoid conflicting + # with italics/bold. + url = url.replace('*', g_escape_table['*']) \ + .replace('_', g_escape_table['_']) + if title: + title_str = ' title="%s"' \ + % title.replace('*', g_escape_table['*']) \ + .replace('_', g_escape_table['_']) \ + .replace('"', '"') + else: + title_str = '' + if is_img: + result = '<img src="%s" alt="%s"%s%s' \ + % (url.replace('"', '"'), + link_text.replace('"', '"'), + title_str, self.empty_element_suffix) + curr_pos = start_idx + len(result) + text = text[:start_idx] + result + text[match.end():] + elif start_idx >= anchor_allowed_pos: + result_head = '<a href="%s"%s>' % (url, title_str) + result = '%s%s</a>' % (result_head, link_text) + # <img> allowed from curr_pos on, <a> from + # anchor_allowed_pos on. + curr_pos = start_idx + len(result_head) + anchor_allowed_pos = start_idx + len(result) + text = text[:start_idx] + result + text[match.end():] + else: + # Anchor not allowed here. + curr_pos = start_idx + 1 + continue + + # Reference anchor or img? 
+ else: + match = self._tail_of_reference_link_re.match(text, p) + if match: + # Handle a reference-style anchor or img. + is_img = start_idx > 0 and text[start_idx-1] == "!" + if is_img: + start_idx -= 1 + link_id = match.group("id").lower() + if not link_id: + link_id = link_text.lower() # for links like [this][] + if link_id in self.urls: + url = self.urls[link_id] + # We've got to encode these to avoid conflicting + # with italics/bold. + url = url.replace('*', g_escape_table['*']) \ + .replace('_', g_escape_table['_']) + title = self.titles.get(link_id) + if title: + title = title.replace('*', g_escape_table['*']) \ + .replace('_', g_escape_table['_']) + title_str = ' title="%s"' % title + else: + title_str = '' + if is_img: + result = '<img src="%s" alt="%s"%s%s' \ + % (url.replace('"', '"'), + link_text.replace('"', '"'), + title_str, self.empty_element_suffix) + curr_pos = start_idx + len(result) + text = text[:start_idx] + result + text[match.end():] + elif start_idx >= anchor_allowed_pos: + result = '<a href="%s"%s>%s</a>' \ + % (url, title_str, link_text) + result_head = '<a href="%s"%s>' % (url, title_str) + result = '%s%s</a>' % (result_head, link_text) + # <img> allowed from curr_pos on, <a> from + # anchor_allowed_pos on. + curr_pos = start_idx + len(result_head) + anchor_allowed_pos = start_idx + len(result) + text = text[:start_idx] + result + text[match.end():] + else: + # Anchor not allowed here. + curr_pos = start_idx + 1 + else: + # This id isn't defined, leave the markup alone. + curr_pos = match.end() + continue + + # Otherwise, it isn't markup. 
+ curr_pos = start_idx + 1 + + return text + + + _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M) + def _setext_h_sub(self, match): + n = {"=": 1, "-": 2}[match.group(2)[0]] + demote_headers = self.extras.get("demote-headers") + if demote_headers: + n = min(n + demote_headers, 6) + return "<h%d>%s</h%d>\n\n" \ + % (n, self._run_span_gamut(match.group(1)), n) + + _atx_h_re = re.compile(r''' + ^([\#=]{1,6}) # \1 = string of #'s + [ \t]* + (.+?) # \2 = Header text + [ \t]* + (?<!\\) # ensure not an escaped trailing '#' + [\#=]* # optional closing #'s (not counted) + \n+ + ''', re.X | re.M) + def _atx_h_sub(self, match): + n = len(match.group(1)) + demote_headers = self.extras.get("demote-headers") + if demote_headers: + n = min(n + demote_headers, 6) + return "<h%d>%s</h%d>\n\n" \ + % (n, self._run_span_gamut(match.group(2)), n) + + def _do_headers(self, text): + # Setext-style headers: + # Header 1 + # ======== + # + # Header 2 + # -------- + text = self._setext_h_re.sub(self._setext_h_sub, text) + + # atx-style headers: + # # Header 1 + # ## Header 2 + # ## Header 2 with closing hashes ## + # ... + # ###### Header 6 + text = self._atx_h_re.sub(self._atx_h_sub, text) + + return text + + + _marker_ul_chars = '*+-' + _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars + _marker_ul = '(?:[%s])' % _marker_ul_chars + _marker_ol = r'(?:\d+\.)' + + def _list_sub(self, match): + lst = match.group(1) + lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol" + result = self._process_list_items(lst) + if self.list_level: + return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type) + else: + return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type) + + def _do_lists(self, text): + # Form HTML ordered (numbered) and unordered (bulleted) lists. 
+ + for marker_pat in (self._marker_ul, self._marker_ol): + # Re-usable pattern to match any entire ul or ol list: + less_than_tab = self.tab_width - 1 + whole_list = r''' + ( # \1 = whole list + ( # \2 + [ ]{0,%d} + (%s) # \3 = first list item marker + [ \t]+ + ) + (?:.+?) + ( # \4 + \Z + | + \n{2,} + (?=\S) + (?! # Negative lookahead for another list item marker + [ \t]* + %s[ \t]+ + ) + ) + ) + ''' % (less_than_tab, marker_pat, marker_pat) + + # We use a different prefix before nested lists than top-level lists. + # See extended comment in _process_list_items(). + # + # Note: There's a bit of duplication here. My original implementation + # created a scalar regex pattern as the conditional result of the test on + # $g_list_level, and then only ran the $text =~ s{...}{...}egmx + # substitution once, using the scalar as the pattern. This worked, + # everywhere except when running under MT on my hosting account at Pair + # Networks. There, this caused all rebuilds to be killed by the reaper (or + # perhaps they crashed, but that seems incredibly unlikely given that the + # same script on the same server ran fine *except* under MT. I've spent + # more time trying to figure out why this is happening than I'd like to + # admit. My only guess, backed up by the fact that this workaround works, + # is that Perl optimizes the substition when it can figure out that the + # pattern will never change, and when this optimization isn't on, we run + # afoul of the reaper. Thus, the slightly redundant code to that uses two + # static s/// patterns rather than one conditional pattern. + + if self.list_level: + sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S) + text = sub_list_re.sub(self._list_sub, text) + else: + list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list, + re.X | re.M | re.S) + text = list_re.sub(self._list_sub, text) + + return text + + _list_item_re = re.compile(r''' + (\n)? 
# leading line = \1 + (^[ \t]*) # leading whitespace = \2 + (?P<marker>%s) [ \t]+ # list marker = \3 + ((?:.+?) # list item text = \4 + (\n{1,2})) # eols = \5 + (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+)) + ''' % (_marker_any, _marker_any), + re.M | re.X | re.S) + + _last_li_endswith_two_eols = False + def _list_item_sub(self, match): + item = match.group(4) + leading_line = match.group(1) + leading_space = match.group(2) + if leading_line or "\n\n" in item or self._last_li_endswith_two_eols: + item = self._run_block_gamut(self._outdent(item)) + else: + # Recursion for sub-lists: + item = self._do_lists(self._outdent(item)) + if item.endswith('\n'): + item = item[:-1] + item = self._run_span_gamut(item) + self._last_li_endswith_two_eols = (len(match.group(5)) == 2) + return "<li>%s</li>\n" % item + + def _process_list_items(self, list_str): + # Process the contents of a single ordered or unordered list, + # splitting it into individual list items. + + # The $g_list_level global keeps track of when we're inside a list. + # Each time we enter a list, we increment it; when we leave a list, + # we decrement. If it's zero, we're not in a list anymore. + # + # We do this because when we're not inside a list, we want to treat + # something like this: + # + # I recommend upgrading to version + # 8. Oops, now this line is treated + # as a sub-list. + # + # As a single paragraph, despite the fact that the second line starts + # with a digit-period-space sequence. + # + # Whereas when we're inside a list (or sub-list), that line will be + # treated as the start of a sub-list. What a kludge, huh? This is + # an aspect of Markdown's syntax that's hard to parse perfectly + # without resorting to mind-reading. Perhaps the solution is to + # change the syntax rules such that sub-lists must start with a + # starting cardinal number; e.g. "1." or "a.". 
+ self.list_level += 1 + self._last_li_endswith_two_eols = False + list_str = list_str.rstrip('\n') + '\n' + list_str = self._list_item_re.sub(self._list_item_sub, list_str) + self.list_level -= 1 + return list_str + + def _get_pygments_lexer(self, lexer_name): + try: + from pygments import lexers, util + except ImportError: + return None + try: + return lexers.get_lexer_by_name(lexer_name) + except util.ClassNotFound: + return None + + def _color_with_pygments(self, codeblock, lexer, **formatter_opts): + import pygments + import pygments.formatters + + class HtmlCodeFormatter(pygments.formatters.HtmlFormatter): + def _wrap_code(self, inner): + """A function for use in a Pygments Formatter which + wraps in <code> tags. + """ + yield 0, "<code>" + for tup in inner: + yield tup + yield 0, "</code>" + + def wrap(self, source, outfile): + """Return the source with a code, pre, and div.""" + return self._wrap_div(self._wrap_pre(self._wrap_code(source))) + + formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts) + return pygments.highlight(codeblock, lexer, formatter) + + def _code_block_sub(self, match): + codeblock = match.group(1) + codeblock = self._outdent(codeblock) + codeblock = self._detab(codeblock) + codeblock = codeblock.lstrip('\n') # trim leading newlines + codeblock = codeblock.rstrip() # trim trailing whitespace + + if "code-color" in self.extras and codeblock.startswith(":::"): + lexer_name, rest = codeblock.split('\n', 1) + lexer_name = lexer_name[3:].strip() + lexer = self._get_pygments_lexer(lexer_name) + codeblock = rest.lstrip("\n") # Remove lexer declaration line. 
+ if lexer: + formatter_opts = self.extras['code-color'] or {} + colored = self._color_with_pygments(codeblock, lexer, + **formatter_opts) + return "\n\n%s\n\n" % colored + + codeblock = self._encode_code(codeblock) + return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock + + def _do_code_blocks(self, text): + """Process Markdown `<pre><code>` blocks.""" + code_block_re = re.compile(r''' + (?:\n\n|\A) + ( # $1 = the code block -- one or more lines, starting with a space/tab + (?: + (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces + .*\n+ + )+ + ) + ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc + ''' % (self.tab_width, self.tab_width), + re.M | re.X) + + return code_block_re.sub(self._code_block_sub, text) + + + # Rules for a code span: + # - backslash escapes are not interpreted in a code span + # - to include one or or a run of more backticks the delimiters must + # be a longer run of backticks + # - cannot start or end a code span with a backtick; pad with a + # space and that space will be removed in the emitted HTML + # See `test/tm-cases/escapes.text` for a number of edge-case + # examples. + _code_span_re = re.compile(r''' + (?<!\\) + (`+) # \1 = Opening run of ` + (?!`) # See Note A test/tm-cases/escapes.text + (.+?) # \2 = The code block + (?<!`) + \1 # Matching closer + (?!`) + ''', re.X | re.S) + + def _code_span_sub(self, match): + c = match.group(2).strip(" \t") + c = self._encode_code(c) + return "<code>%s</code>" % c + + def _do_code_spans(self, text): + # * Backtick quotes are used for <code></code> spans. + # + # * You can use multiple backticks as the delimiters if you want to + # include literal backticks in the code span. So, this input: + # + # Just type ``foo `bar` baz`` at the prompt. + # + # Will translate to: + # + # <p>Just type <code>foo `bar` baz</code> at the prompt.</p> + # + # There's no arbitrary limit to the number of backticks you + # can use as delimters. 
If you need three consecutive backticks + # in your code, use four for delimiters, etc. + # + # * You can use spaces to get literal backticks at the edges: + # + # ... type `` `bar` `` ... + # + # Turns to: + # + # ... type <code>`bar`</code> ... + return self._code_span_re.sub(self._code_span_sub, text) + + def _encode_code(self, text): + """Encode/escape certain characters inside Markdown code runs. + The point is that in code, these characters are literals, + and lose their special Markdown meanings. + """ + replacements = [ + # Encode all ampersands; HTML entities are not + # entities within a Markdown code span. + ('&', '&amp;'), + # Do the angle bracket song and dance: + ('<', '&lt;'), + ('>', '&gt;'), + # Now, escape characters that are magic in Markdown: + ('*', g_escape_table['*']), + ('_', g_escape_table['_']), + ('{', g_escape_table['{']), + ('}', g_escape_table['}']), + ('[', g_escape_table['[']), + (']', g_escape_table[']']), + ('\\', g_escape_table['\\']), + ] + for before, after in replacements: + text = text.replace(before, after) + return text + + _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) + _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) + _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) + _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S) + def _do_italics_and_bold(self, text): + # <strong> must go first: + if "code-friendly" in self.extras: + text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text) + text = self._code_friendly_em_re.sub(r"<em>\1</em>", text) + else: + text = self._strong_re.sub(r"<strong>\2</strong>", text) + text = self._em_re.sub(r"<em>\2</em>", text) + return text + + + _block_quote_re = re.compile(r''' + ( # Wrap whole match in \1 + ( + ^[ \t]*>[ \t]?
# '>' at the start of a line + .+\n # rest of the first line + (.+\n)* # subsequent consecutive lines + \n* # blanks + )+ + ) + ''', re.M | re.X) + _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M); + + _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S) + def _dedent_two_spaces_sub(self, match): + return re.sub(r'(?m)^ ', '', match.group(1)) + + def _block_quote_sub(self, match): + bq = match.group(1) + bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting + bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines + bq = self._run_block_gamut(bq) # recurse + + bq = re.sub('(?m)^', ' ', bq) + # These leading spaces screw with <pre> content, so we need to fix that: + bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq) + + return "<blockquote>\n%s\n</blockquote>\n\n" % bq + + def _do_block_quotes(self, text): + if '>' not in text: + return text + return self._block_quote_re.sub(self._block_quote_sub, text) + + def _form_paragraphs(self, text): + # Strip leading and trailing lines: + text = text.strip('\n') + + # Wrap <p> tags. + grafs = [] + for i, graf in enumerate(re.split(r"\n{2,}", text)): + if graf in self.html_blocks: + # Unhashify HTML blocks + grafs.append(self.html_blocks[graf]) + else: + cuddled_list = None + if "cuddled-lists" in self.extras: + # Need to put back trailing '\n' for `_list_item_re` + # match at the end of the paragraph. + li = self._list_item_re.search(graf + '\n') + # Two of the same list marker in this paragraph: a likely + # candidate for a list cuddled to preceding paragraph + # text (issue 33). Note the `[-1]` is a quick way to + # consider numeric bullets (e.g. "1." and "2.") to be + # equal. + if (li and li.group("next_marker") + and li.group("marker")[-1] == li.group("next_marker")[-1]): + start = li.start() + cuddled_list = self._do_lists(graf[start:]).rstrip("\n") + assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>") + graf = graf[:start] + + # Wrap <p> tags. 
+ graf = self._run_span_gamut(graf) + grafs.append("<p>" + graf.lstrip(" \t") + "</p>") + + if cuddled_list: + grafs.append(cuddled_list) + + return "\n\n".join(grafs) + + def _add_footnotes(self, text): + if self.footnotes: + footer = [ + '<div class="footnotes">', + '<hr' + self.empty_element_suffix, + '<ol>', + ] + for i, id in enumerate(self.footnote_ids): + if i != 0: + footer.append('') + footer.append('<li id="fn-%s">' % id) + footer.append(self._run_block_gamut(self.footnotes[id])) + backlink = ('<a href="#fnref-%s" ' + 'class="footnoteBackLink" ' + 'title="Jump back to footnote %d in the text.">' + '↩</a>' % (id, i+1)) + if footer[-1].endswith("</p>"): + footer[-1] = footer[-1][:-len("</p>")] \ + + ' ' + backlink + "</p>" + else: + footer.append("\n<p>%s</p>" % backlink) + footer.append('</li>') + footer.append('</ol>') + footer.append('</div>') + return text + '\n\n' + '\n'.join(footer) + else: + return text + + # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: + # http://bumppo.net/projects/amputator/ + _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)') + _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I) + _naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I) + + def _encode_amps_and_angles(self, text): + # Smart processing for ampersands and angle brackets that need + # to be encoded. + text = self._ampersand_re.sub('&', text) + + # Encode naked <'s + text = self._naked_lt_re.sub('<', text) + + # Encode naked >'s + # Note: Other markdown implementations (e.g. Markdown.pl, PHP + # Markdown) don't do this. 
+ text = self._naked_gt_re.sub('>', text) + return text + + def _encode_backslash_escapes(self, text): + for ch, escape in g_escape_table.items(): + text = text.replace("\\"+ch, escape) + return text + + _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I) + def _auto_link_sub(self, match): + g1 = match.group(1) + return '<a href="%s">%s</a>' % (g1, g1) + + _auto_email_link_re = re.compile(r""" + < + (?:mailto:)? + ( + [-.\w]+ + \@ + [-\w]+(\.[-\w]+)*\.[a-z]+ + ) + > + """, re.I | re.X | re.U) + def _auto_email_link_sub(self, match): + return self._encode_email_address( + self._unescape_special_chars(match.group(1))) + + def _do_auto_links(self, text): + text = self._auto_link_re.sub(self._auto_link_sub, text) + text = self._auto_email_link_re.sub(self._auto_email_link_sub, text) + return text + + def _encode_email_address(self, addr): + # Input: an email address, e.g. "foo@example.com" + # + # Output: the email address as a mailto link, with each character + # of the address encoded as either a decimal or hex entity, in + # the hopes of foiling most address harvesting spam bots. E.g.: + # + # <a href="mailto:foo@e + # xample.com">foo + # @example.com</a> + # + # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk + # mailing list: <http://tinyurl.com/yu7ue> + chars = [_xml_encode_email_char_at_random(ch) + for ch in "mailto:" + addr] + # Strip the mailto: from the visible part. + addr = '<a href="%s">%s</a>' \ + % (''.join(chars), ''.join(chars[7:])) + return addr + + def _do_link_patterns(self, text): + """Caveat emptor: there isn't much guarding against link + patterns being formed inside other standard Markdown links, e.g. + inside a [link def][like this]. + + Dev Notes: *Could* consider prefixing regexes with a negative + lookbehind assertion to attempt to guard against this. 
+ """ + link_from_hash = {} + for regex, repl in self.link_patterns: + replacements = [] + for match in regex.finditer(text): + if hasattr(repl, "__call__"): + href = repl(match) + else: + href = match.expand(repl) + replacements.append((match.span(), href)) + for (start, end), href in reversed(replacements): + escaped_href = ( + href.replace('"', '"') # b/c of attr quote + # To avoid markdown <em> and <strong>: + .replace('*', g_escape_table['*']) + .replace('_', g_escape_table['_'])) + link = '<a href="%s">%s</a>' % (escaped_href, text[start:end]) + hash = _hash_text(link) + link_from_hash[hash] = link + text = text[:start] + hash + text[end:] + for hash, link in link_from_hash.items(): + text = text.replace(hash, link) + return text + + def _unescape_special_chars(self, text): + # Swap back in all the special characters we've hidden. + for ch, hash in g_escape_table.items(): + text = text.replace(hash, ch) + return text + + def _outdent(self, text): + # Remove one level of line-leading tabs or spaces + return self._outdent_re.sub('', text) + + +class MarkdownWithExtras(Markdown): + """A markdowner class that enables most extras: + + - footnotes + - code-color (only has effect if 'pygments' Python module on path) + + These are not included: + - pyshell (specific to Python-related documenting) + - code-friendly (because it *disables* part of the syntax) + - link-patterns (because you need to specify some actual + link-patterns anyway) + """ + extras = ["footnotes", "code-color"] + + +#---- internal support functions + +# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 +def _curry(*args, **kwargs): + function, args = args[0], args[1:] + def result(*rest, **kwrest): + combined = kwargs.copy() + combined.update(kwrest) + return function(*args + rest, **combined) + return result + +# Recipe: regex_from_encoded_pattern (1.0) +def _regex_from_encoded_pattern(s): + """'foo' -> re.compile(re.escape('foo')) + '/foo/' -> re.compile('foo') + '/foo/i' -> 
re.compile('foo', re.I) + """ + if s.startswith('/') and s.rfind('/') != 0: + # Parse it: /PATTERN/FLAGS + idx = s.rfind('/') + pattern, flags_str = s[1:idx], s[idx+1:] + flag_from_char = { + "i": re.IGNORECASE, + "l": re.LOCALE, + "s": re.DOTALL, + "m": re.MULTILINE, + "u": re.UNICODE, + } + flags = 0 + for char in flags_str: + try: + flags |= flag_from_char[char] + except KeyError: + raise ValueError("unsupported regex flag: '%s' in '%s' " + "(must be one of '%s')" + % (char, s, ''.join(flag_from_char.keys()))) + return re.compile(s[1:idx], flags) + else: # not an encoded regex + return re.compile(re.escape(s)) + +# Recipe: dedent (0.1.2) +def _dedentlines(lines, tabsize=8, skip_first_line=False): + """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines + + "lines" is a list of lines to dedent. + "tabsize" is the tab width to use for indent width calculations. + "skip_first_line" is a boolean indicating if the first line should + be skipped for calculating the indent width and for dedenting. + This is sometimes useful for docstrings and similar. + + Same as dedent() except operates on a sequence of lines. Note: the + lines list is modified **in-place**. 
+ """ + DEBUG = False + if DEBUG: + print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ + % (tabsize, skip_first_line) + indents = [] + margin = None + for i, line in enumerate(lines): + if i == 0 and skip_first_line: continue + indent = 0 + for ch in line: + if ch == ' ': + indent += 1 + elif ch == '\t': + indent += tabsize - (indent % tabsize) + elif ch in '\r\n': + continue # skip all-whitespace lines + else: + break + else: + continue # skip all-whitespace lines + if DEBUG: print "dedent: indent=%d: %r" % (indent, line) + if margin is None: + margin = indent + else: + margin = min(margin, indent) + if DEBUG: print "dedent: margin=%r" % margin + + if margin is not None and margin > 0: + for i, line in enumerate(lines): + if i == 0 and skip_first_line: continue + removed = 0 + for j, ch in enumerate(line): + if ch == ' ': + removed += 1 + elif ch == '\t': + removed += tabsize - (removed % tabsize) + elif ch in '\r\n': + if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line + lines[i] = lines[i][j:] + break + else: + raise ValueError("unexpected non-whitespace char %r in " + "line %r while removing %d-space margin" + % (ch, line, margin)) + if DEBUG: + print "dedent: %r: %r -> removed %d/%d"\ + % (line, ch, removed, margin) + if removed == margin: + lines[i] = lines[i][j+1:] + break + elif removed > margin: + lines[i] = ' '*(removed-margin) + lines[i][j+1:] + break + else: + if removed: + lines[i] = lines[i][removed:] + return lines + +def _dedent(text, tabsize=8, skip_first_line=False): + """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text + + "text" is the text to dedent. + "tabsize" is the tab width to use for indent width calculations. + "skip_first_line" is a boolean indicating if the first line should + be skipped for calculating the indent width and for dedenting. + This is sometimes useful for docstrings and similar. 
+ + textwrap.dedent(s), but don't expand tabs to spaces + """ + lines = text.splitlines(1) + _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) + return ''.join(lines) + + +class _memoized(object): + """Decorator that caches a function's return value each time it is called. + If called later with the same arguments, the cached value is returned, and + not re-evaluated. + + http://wiki.python.org/moin/PythonDecoratorLibrary + """ + def __init__(self, func): + self.func = func + self.cache = {} + def __call__(self, *args): + try: + return self.cache[args] + except KeyError: + self.cache[args] = value = self.func(*args) + return value + except TypeError: + # uncachable -- for instance, passing a list as an argument. + # Better to not cache than to blow up entirely. + return self.func(*args) + def __repr__(self): + """Return the function's docstring.""" + return self.func.__doc__ + + +def _xml_oneliner_re_from_tab_width(tab_width): + """Standalone XML processing instruction regex.""" + return re.compile(r""" + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? # the beginning of the doc + ) + ( # save in $1 + [ ]{0,%d} + (?: + <\?\w+\b\s+.*?\?> # XML processing instruction + | + <\w+:\w+\b\s+.*?/> # namespaced single tag + ) + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + """ % (tab_width - 1), re.X) +_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width) + +def _hr_tag_re_from_tab_width(tab_width): + return re.compile(r""" + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? # the beginning of the doc + ) + ( # save in \1 + [ ]{0,%d} + <(hr) # start tag = \2 + \b # word break + ([^<>])*? 
# + /?> # the matching end tag + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + """ % (tab_width - 1), re.X) +_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width) + + +def _xml_encode_email_char_at_random(ch): + r = random() + # Roughly 10% raw, 45% hex, 45% dec. + # '@' *must* be encoded. I [John Gruber] insist. + # Issue 26: '_' must be encoded. + if r > 0.9 and ch not in "@_": + return ch + elif r < 0.45: + # The [1:] is to drop leading '0': 0x63 -> x63 + return '&#%s;' % hex(ord(ch))[1:] + else: + return '&#%s;' % ord(ch) + + + + +text = """\ +Dies ist ein Text. + +--- + +* Test +* Mu +* Blah +""" + +#markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH, +# safe_mode=None, extras=None, link_patterns=None): +#html = markdown(text, html4tags=False) +# +#print html + + @set_hook("read") def read(params): @@ -32,7 +1629,7 @@ def htmlize(params): "xml":True, "demote-headers":1, "code-color":{}}) - html = _markdown.convert(params.file.contents) + html = _markdown.convert(params.file.contents, params.file.input_encoding) #print type(html) #print html return html diff --git a/plugins/read_rst.py b/plugins/read_rst.py index a9968ac..cb80be8 100644 --- a/plugins/read_rst.py +++ b/plugins/read_rst.py @@ -13,10 +13,10 @@ def read(params): class WebHTMLTranslator(html4css1.HTMLTranslator): - doctype = "" + doctype = "" content_type = "<!--%s-->" generator = "<!--%s-->" - + def __init__(self, document): html4css1.HTMLTranslator.__init__(self, document) self.head_prefix = [] @@ -24,13 +24,13 @@ class WebHTMLTranslator(html4css1.HTMLTranslator): self.stylesheet = [] self.body_suffix = [] self.section_level = 1 - + def visit_system_message(self, node): pass def visit_document (self, node): pass - + def depart_document (self, node): pass diff --git a/plugins/rss_feed.py b/plugins/rss_feed.py new file mode 100644 index 0000000..6104bf5 --- /dev/null +++ b/plugins/rss_feed.py @@ -0,0 +1,112 @@ +# -*- coding: iso-8859-1 -*- +from 
webber import * +import os, datetime, re +try: + import PyRSS2Gen +except ImportError: + print "rss_feed needs the python module PyRSS2Gen" + raise + +items = [] +max_age = 0 + + +@set_hook("checkconfig") +def checkconfig(params): + if not cfg.has_key("rss_file"): + log('no "rss_file:" configured, using "feed.rss":', 4) + cfg.rss_file = "feed.rss" + if not cfg.has_key("rss_max_items"): + cfg.rss_max_items = 0 + if cfg.has_key("rss_max_age_days"): + import time + global max_age + max_age = int(time.time()) - int(cfg.rss_max_age_days)*86400 + + +# Helper class needed for datetime.datetime to generate GMT timestamps +ZERO = datetime.timedelta(0) +class UTC(datetime.tzinfo): + """UTC""" + + def utcoffset(self, dt): + return ZERO + + def tzname(self, dt): + return "UTC" + + def dst(self, dt): + return ZERO +utc = UTC() + + +@set_hook("scan") +def sitemap_scan(params): + global items + + file = params.file + # Only consider new stuff + if max_age and file["mtime"] < max_age: + return + # Ignore non-pages + if not file.has_key("linktitle"): + return + # Ignore hidden pages + if file.has_key("hide") and file.hide: + return + + if file.has_key("change"): + change = file["change"] + else: + change = "" + + full_url = "http://%s/%s" % (cfg.main_url, file.out_path) + item = PyRSS2Gen.RSSItem( + title = file["title"], + link = full_url, + guid = PyRSS2Gen.Guid("%s %s" % (full_url, file["mtime"]), isPermaLink=0), + description = change, + pubDate = file["mtime"] + ) + items.append(item) + + +@set_hook("finish") +def finish(params): + global items + # Sort items by pubDate, which still holds the mtime + items.sort(key=lambda i: i.pubDate, reverse=True) + + # Limit to requested number + count = int(cfg.rss_max_items) + if count: + items = items[:count] + + # convert mtime to real pupDate + for i in items: + i.pubDate = datetime.datetime.fromtimestamp(i.pubDate, utc) + + rss = PyRSS2Gen.RSS2( + title = cfg.subtitle, + link = "http://%s" % cfg.main_url, + description = cfg.subtitle, 
+ lastBuildDate = datetime.datetime.now(utc), + items = items, + ) + # Step one of self-reference + # (see http://feedvalidator.org/docs/warning/MissingAtomSelfLink.html) + rss.rss_attrs["xmlns:atom"] = "http://www.w3.org/2005/Atom" + + try: + os.makedirs(cfg.out_dir) + except: + pass + f = open(os.path.join(cfg.out_dir, cfg.rss_file), "w") + # Ugly XML beautification + s = rss.to_xml() + s = re.sub("<(?!/)", "\n<", s) + s = s.replace("\n\n", "\n") + # Step two of self-reference + s = s.replace('<channel>', '<channel>\n<atom:link href="http://%s/%s" rel="self" type="application/rss+xml" />' % (cfg.main_url, cfg.rss_file)) + f.write(s[1:]) + f.write("\n") diff --git a/plugins/skeleton.py b/plugins/skeleton.py index 8d4305f..8ab9b57 100644 --- a/plugins/skeleton.py +++ b/plugins/skeleton.py @@ -37,7 +37,7 @@ def checkconfig(params): # param is empty # @set_hook("start") -def finish(params): +def start(params): if cfg.test_verbose: print "in skeleton.start" @@ -142,7 +142,9 @@ def finish(params): -# TODO: Description missing +# +# This is a sample macro. Macros are called with [!name]. +# @set_macro("sample") def sample_macro(params): if cfg.test_verbose: @@ -151,7 +153,9 @@ def sample_macro(params): -# TODO: Description missing +# +# This is a sample function. Functions are called with ${name(args)}. 
+# +@set_function("func") def sample_func(): if cfg.test_verbose: diff --git a/plugins/toc.py b/plugins/toc.py new file mode 100644 index 0000000..41d31dc --- /dev/null +++ b/plugins/toc.py @@ -0,0 +1,99 @@ +# -*- coding: iso-8859-1 -*- +from webber import * +import htmlentitydefs, re + + +reHeader = re.compile(r'<h(\d)(.*)>(.*)</h\1>', re.IGNORECASE | re.MULTILINE) +_toc = [] +_labels = {} +_first = -1 + +toc_min_lines = 30 + + +@set_hook("checkconfig") +def checkconfig(params): + if cfg.has_key("toc_min_lines"): + global toc_min_lines + toc_min_lines = int(cfg.toc_min_lines) + + +def slugify(text, separator): + """ + This function converts a normal text string into a string, that + can be safely used for HTML links and anchors. + + Based on http://snipplr.com/view/26266/create-slugs-in-python/ + """ + + ret = "" + for c in text.lower(): + try: + ret += htmlentitydefs.codepoint2name[ord(c)] + except: + ret += c + ret = re.sub("([a-zA-Z])(uml|acute|grave|circ|tilde|cedil)", r"\1", ret) + ret = re.sub("\W", " ", ret) + ret = re.sub(" +", separator, ret) + return ret.strip() + + +def repl(m): + """ + Function used for re.sub() to find all header elements (h1, h2, ...). + Data from those elements (level, headline) are stored in the global + array `toc`. + + This function also modifies the text by adding a anchor to the + header.
+ """ + global _toc + global _first + label = slugify(m.group(3), "_") + if _labels.has_key(label): + n = 0 + while True: + l = "%s_%d" % (label, n) + if not _labels.has_key(l): + label = l + break + n += 1 + + level = int(m.group(1)) + if _first == -1: + _first = level + _toc.append( (level - _first, m.group(3), label) ) + _labels[label] = 1 + return '<h%s%s>%s<a name="%s"> </a></h%s>' % ( + m.group(1), + m.group(2), + m.group(3), + label, + m.group(1)) + + + +@set_hook("linkify") +def linkify(params): + global _toc + global _labels + global _first + _toc = [] + _labels = {} + _first = -1 + + # Ignore hidden pages + if params.file.has_key("hide") and params.file.hide: + return + + # Very small pages don't need a table-of-contents + if params.file.contents.count("\n") < toc_min_lines: + return + + params.file.contents = reHeader.sub(repl, params.file.contents) + + + +@set_function("get_toc") +def get_toc(): + return _toc diff --git a/webber.py b/webber.py index f3e0f63..ecd0351 100644 --- a/webber.py +++ b/webber.py @@ -11,21 +11,21 @@ from config import Holder __all__ = [ # Globals - "cfg", # configuration from webber.ini - "directories", # global hash of directories, by rel_path - "files", # global hash of files, by rel_path - "functions", # all exported template functions + "cfg", # configuration from webber.ini + "directories", # global hash of directories, by rel_path + "files", # global hash of files, by rel_path + "functions", # all exported template functions # Functions - "set_hook", # decorator for hook-functions - "set_macro", # define macro + "set_hook", # decorator for hook-functions + "set_macro", # define macro "set_function", # define functions for the template "get_file_for", "get_link_from", "get_current_file", # because mako-called functions cannot access the - # current File object + # current File object "get_program_directory", - "log", # misc logging functions + "log", # misc logging functions "info", "warning", "error", @@ -50,7 +50,18 @@ 
class Directory(Holder): def __init__(self, **kw): Holder.__init__(self, **kw) - directories[kw["rel_path"]] = self + kw["rel_path"] = self + if self.rel_path == "": + self.rel_path = "." + directories[self.rel_path] = self + try: + self.load(os.path.join(self.abs_path, "directory.conf")) + #print self + except IOError: + pass + + def __repr__(self): + return "<Directory %s>" % self.rel_path files = {} @@ -63,6 +74,7 @@ class File(Holder): Holder.__init__(self, **kw) files[kw["rel_path"]] = self self.render = None + self.contents = None mtime = os.stat(self.path)[stat.ST_MTIME] self.mtime = mtime self.ctime = mtime @@ -80,7 +92,7 @@ class File(Holder): if read_keywords: s = s.strip() #print "kwd:", s - if s==terminate_line: + if s == terminate_line: read_keywords = False continue @@ -115,16 +127,19 @@ class File(Holder): # Warn about long titles / long linktitles if len(self.linktitle) > 20: - log('%s: define a shorter "linktitle: xxx"') + log('%s: define a shorter linktitle' % self.rel_path) self.contents = "".join(txt) + def __repr__(self): + return "<File %s>" % self.rel_path + _get_file_for_cache = {} def get_file_for(name): """webber.files is an hash of File objects, but keyed on the real file name. 
This function returns a File object for a specific linktitle.""" - + try: return _get_file_for_cache[name] except: @@ -138,6 +153,10 @@ def get_file_for(name): #print " via linktitle:", s _get_file_for_cache[name] = f return f + if f.title == name: + #print " via title:", s + _get_file_for_cache[name] = f + return f except: pass # Allow exact match as well @@ -206,8 +225,8 @@ def relpath(base_path, target): def get_link_from(source, dest): - #print "get_link_from", source, dest - #print source + if dest is None: + raise KeyError if not isinstance(source, File): source = get_file_for(source) if not source: @@ -216,7 +235,7 @@ def get_link_from(source, dest): if not isinstance(dest, File): dest = get_file_for(dest) if not dest: - print "NO DEST" + warning("unknown link from %s to %s" % (source.rel_path, dest)) return "." rel_path = relpath(directories[source.direc].abs_path, directories[dest.direc].abs_path) try: @@ -228,7 +247,7 @@ def get_link_from(source, dest): if rel_path.startswith("./"): rel_path = rel_path[2:] #print " from path:", source.out_path - #print " to path: ", out_path + #print " to path: ", out_path #print " rel path: ", rel_path return rel_path @@ -254,14 +273,14 @@ def get_program_directory(): # # Logging # -# 1 Error -# 2 Warning -# 3 Info -# 4 Log +# 1 Error +# 2 Warning +# 3 Info +# 4 Log # 5... 
Debug # def log(s, level=4): - if level>4: + if level > 4: indent = " " * (level-4) else: indent = "" @@ -287,43 +306,43 @@ def info(s): # IkiWiki does something like this: # At startup: -# getopt modify ARGV -# checkconfig check configuration -# refresh allow plugins to build source files +# getopt modify ARGV +# checkconfig check configuration +# refresh allow plugins to build source files # While scanning files: -# needsbuild detect if page needs to be rebuild -# filter arbitrary changes -# scan collect metadata +# needsbuild detect if page needs to be rebuild +# filter arbitrary changes +# scan collect metadata # While rendering files: -# filter arbitrary changes -# preprocess execute macros -# linkify change wikilinks into links -# htmlize turns text into html -# sanitize sanitize html -# templatefile allows changing of the template on a per-file basis -# pagetemplate fill template with page -# format similar to sanitize, but act on whole page body +# filter arbitrary changes +# preprocess execute macros +# linkify change wikilinks into links +# htmlize turns text into html +# sanitize sanitize html +# templatefile allows changing of the template on a per-file basis +# pagetemplate fill template with page +# format similar to sanitize, but act on whole page body # At the end: -# savestate plugins can save their state +# savestate plugins can save their state # # # We do something like this: # # At startup: -# addoptions allow plugins to add command-line options -# checkconfig check configuration -# start +# addoptions allow plugins to add command-line options +# checkconfig check configuration +# start # While reading files: -# read ask any reader (plugins!) to read the file -# filter ask anybody to filter the contents +# read ask any reader (plugins!) 
to read the file +# filter ask anybody to filter the contents # While scanning files: -# scan called per file, let plugins act on file data -# scan_done Allows post-processing of scanned data +# scan called per file, let plugins act on file data +# scan_done Allows post-processing of scanned data # While rendering files: -# htmlize turns text into html-part -# linkify convert link macros to HTML -# pagetemplate ask template engine (plugin!) to generate HTML out -# of template and body part +# htmlize turns text into html-part +# linkify convert link macros to HTML +# pagetemplate ask template engine (plugin!) to generate HTML out +# of template and body part # At the end: # finish # @@ -336,13 +355,11 @@ hooks = {} def load_plugins(): """Loads all plugins in the plugins directory.""" sys.path.append(os.path.join(get_program_directory(), "plugins")) + if cfg.has_key("plugin_dirs"): + for s in cfg.plugin_dirs: + sys.path.append(s) for s in cfg.plugins: - #print "import:", s - #try: exec "import %s" % s - #except: - # print "Could not import plugin '%s'" % s - # sys.exit(1) def set_hook(name, last=False): @@ -434,16 +451,18 @@ def iso_to_time(val): try: t = time.strptime(val, "%Y-%m-%d") except ValueError: - warning("%s: wrong ISO format in '%s'" % (self.rel_path, s)) + warning("wrong ISO format in '%s'" % val) return int(time.mktime(t)) @set_function("format_date") -def format_date(timestamp): - return time.strftime(cfg.date_format, time.localtime(timestamp)) +def format_date(timestamp, format=None): + if not format: + format = cfg.date_format + return time.strftime(format, time.localtime(timestamp)) @set_function("get_time") -def get_time(): - return format_date(time.time()) +def get_time(format=None): + return format_date(time.time(), format) @set_function("get_current_file") def get_current_file(): @@ -451,7 +470,7 @@ def get_current_file(): @set_function("get_path_to_root") def get_path_to_root(): - rel_path = relpath(directories[current_file.direc].abs_path, 
directories[''].abs_path) + rel_path = relpath(directories[current_file.direc].abs_path, directories['.'].abs_path) rel_path = os.path.join(rel_path, os.path.split("")[1]) if rel_path[-1] == "/": rel_path = rel_path[:-1] @@ -485,9 +504,9 @@ def read_file(direc, file): return_holder=False) if not contents: return + file.contents = contents log("filtering file %s" % file.rel_path, level=6) - file.contents = contents res = run_hooks("filter", direc=direc, file=file) @@ -519,7 +538,7 @@ def walk_tree(dirpath): full_path = os.path.join(dirpath, s) ok = True if os.path.isdir(full_path): - for e in cfg.exclude_dir: + for e in cfg.exclude_dirs: if fnmatch.fnmatchcase(s, e): log("ignoring directory %s" % s, level=7) ok = False @@ -551,7 +570,7 @@ def walk_tree(dirpath): ) file.inheritFrom(direc) read_file(direc, file) - + walk(dirpath) @@ -562,14 +581,14 @@ def walk_tree(dirpath): # reMacro = re.compile(r''' - \[\[\! # Begin of macro + \[\[\! # Begin of macro \s* - ([^\s\]]+) # Macro name + ([^\s\]]+) # Macro name (?: - \s+ # optional space - ([^\]]+) # optional argumens + \s+ # optional space + ([^\]]+) # optional argumens )? - \]\] # End of macro + \]\] # End of macro ''', re.VERBOSE) reMacroArgs = re.compile(r''' ([-_\w]+) # parameter name @@ -578,9 +597,9 @@ reMacroArgs = re.compile(r''' = \s* (?: - "([^"]*)" # single-quoted + "([^"]*)" # single-quoted | - (\S+) # unquoted + (\S+) # unquoted ) )? 
''', re.VERBOSE) @@ -608,21 +627,27 @@ def run_macros(file, contents): s = reMacro.sub(do_macro, contents) #print s return s - + def scan_files(): info("Scanning files ...") for s in files: file = files[s] - try: - # Just check if the file has contents - contents = file.contents - except: + if not file.has_key("contents"): continue +# try: +# # Just check if the file has contents +# contents = file.contents +# except: +# continue direc = directories[file.direc] + # "calculate" output file name + if file.render and file.render == "html": + file.out_path = os.path.splitext(s)[0] + ".html" + run_hooks("scan", direc=direc, file=file) @@ -668,9 +693,6 @@ def render_files(): continue file.contents = contents - # Output-Filename "berechnen" - file.out_path = os.path.splitext(fname_in)[0] + ".html" - for fname_in in files: file = files[fname_in] current_file = file @@ -679,18 +701,16 @@ def render_files(): continue direc = directories[file.direc] - contents = run_hooks("linkify", + run_hooks("linkify", direc=direc, file=file, - return_holder=False) + return_holder=True) #print "contents after 'linkify':", contents - if not contents: + if not file.contents: continue - file.contents = contents - # TODO: einige Fragmente sollen u.U. in eine andere - # Webseite eingebaut werden und sollten daher nicht in - # ein HTML-File landen + # TODO: make it possible to render also "fragments", e.g. + # parts that don't end up immediately in a the HTML file. contents = run_hooks("pagetemplate", direc=direc, file=file, @@ -708,12 +728,13 @@ def render_files(): except OSError: pass - # TODO: evtl. überprüfen, ob contents == f.read(), dann nicht schreiben + # TODO: check if contents == f.read(). In this case we don't + # need to save. Probably overkill. log("writing file %s" % fname_out, level=6) f = open(fname_out, "w") f.write(contents) f.close() - # TODO: Time-Stamps setzen? 
+ os.utime(fname_out, (file.mtime, file.mtime)) #print file.mtime, file.get("ctime","?") #print direc.keys() @@ -746,7 +767,7 @@ def addoptions(params): return parser - + @set_hook("checkconfig", last=True) def checkconfig(params): # Ensure absolute paths that end in '/'. @@ -775,7 +796,12 @@ def main(): # link contents of webber.ini into cfg and set some defaults, # then let plugins fixup things in cfg.* cfg.inheritFrom(options) - cfg.setDefault("exclude_dir", ["plugins"]) + cfg.setDefault("exclude_dirs", []) + cfg.setDefault("exclude_files", ["webber.conf", "directory.conf", "*.tmpl"]) + cfg.setDefault("copy_files", []) + cfg.setDefault("input_encoding", "iso-8859-1") + cfg.setDefault("output_encoding", "iso-8859-1") + cfg.setDefault("template", "default") run_hooks("checkconfig") run_hooks("start")