Mirror of https://github.com/XRPLF/xrpl-dev-portal.git (synced 2025-11-20 11:45:50 +00:00)
rename parse_pages to Dactyl; add experimental style checking
tool/dactyl_build.py | 544 lines added (new executable file)
@@ -0,0 +1,544 @@
#!/usr/bin/env python3

################################################################################
# Dactyl - a tool for heroic epics of documentation
#
# Generates a website from Markdown and Jinja templates, with filtering
# along the way.
################################################################################

DEFAULT_CONFIG_FILE = "dactyl-config.yml"

import os
import re
import yaml
import argparse
import logging

# Necessary to copy static files to the output dir
from distutils.dir_util import copy_tree

# Used to import filters.
from importlib import import_module

# Necessary for prince
import subprocess

# Used to fetch markdown sources from GitHub repos
import requests

# Various content and template processing stuff
from jinja2 import Environment, FileSystemLoader
from markdown import markdown
from bs4 import BeautifulSoup

# Watchdog stuff
import time
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler

# The log level is configurable at runtime (see __main__ below)
logger = logging.getLogger()

filters = {}
def load_config(config_file=DEFAULT_CONFIG_FILE):
    """Reload config from a YAML file."""
    global config, filters
    logger.info("loading config file %s..." % config_file)
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    assert(config["targets"])
    assert(config["pages"])
    assert(config["pdf_template"])
    assert(config["default_template"])
    assert(config["content_path"])
    assert(config["out_path"])
    assert(config["temporary_files_path"])
    assert(config["template_static_path"])
    assert(config["content_static_path"])
    if "prince_executable" not in config or not config["prince_executable"]:
        config["prince_executable"] = "prince"  # A reasonable default

    # Figure out which filters we need and import them
    filternames = set()
    for target in config["targets"]:
        if "filters" in target:
            filternames.update(target["filters"])
    for page in config["pages"]:
        if "filters" in page:
            filternames.update(page["filters"])
    for filter_name in filternames:
        filters[filter_name] = import_module("filter_" + filter_name)


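# For illustration, a minimal config file this loader would accept. The keys
# are the ones this script reads (most are asserted above); the values here
# are hypothetical:
#
#     content_path: content
#     out_path: out
#     temporary_files_path: /tmp/dactyl
#     template_path: tool
#     template_static_path: tool/assets
#     content_static_path: content/static
#     default_template: template-doc.html
#     pdf_template: template-pdf.html
#     targets:
#       - name: local
#     pages:
#       - name: Overview
#         category: Introduction
#         md: overview.md
#         html: overview.html
#         targets: ["local"]
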
def substitute_links_for_target(soup, target):
    """Replaces local HTML links with appropriate substitutions
    for the given target"""
    target = get_target(target)

    logger.info("... modifying links for target: %s" % target["name"])
    # We actually want to get all pages, even the ones that aren't built as
    # part of this target, in case those pages have replacement links.
    pages = get_pages()

    links = soup.find_all("a", href=re.compile(r"^[^.]+\.html"))
    for link in links:
        for page in pages:
            if target["name"] in page:
                # There's a replacement link for this env
                local_url = page["html"]
                target_url = page[target["name"]]
                if link["href"][:len(local_url)] == local_url:
                    link["href"] = link["href"].replace(local_url,
                                                        target_url)

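# For example, a page entry like this (hypothetical) maps its local file name
# to a full URL under a target named "ripple.com":
#
#     - name: RippleAPI
#       html: rippleapi.html
#       ripple.com: https://ripple.com/build/rippleapi/
#
# When building that target, links pointing to rippleapi.html get rewritten
# to the full URL instead.
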
def get_target(target):
    """Get a target by name, or return the default target object.
    We can't use default args in function defs because the default is
    set at runtime based on config"""
    if target is None:
        return config["targets"][0]

    if isinstance(target, str):
        try:
            return next(t for t in config["targets"] if t["name"] == target)
        except StopIteration:
            logger.critical("Unknown target: %s" % target)
            exit(1)

    if "name" in target:
        # Eh, it's probably a target, just return it
        return target

def get_filters_for_page(page, target=None):
    ffp = set()
    target = get_target(target)
    if "filters" in target:
        ffp.update(target["filters"])
    if "filters" in page:
        ffp.update(page["filters"])
    return ffp

def parse_markdown(page, target=None, pages=None):
    """Take a markdown string and output HTML for that content"""
    target = get_target(target)
    logging.info("Preparing page %s" % page["name"])

    # Preprocess Markdown using this Jinja environment
    pp_env = setup_pp_env()

    # We'll apply these filters to the page
    page_filters = get_filters_for_page(page, target)

    md = get_markdown_for_page(page["md"], pp_env=pp_env, target=target)

    # Apply markdown-based filters here
    for filter_name in page_filters:
        if "filter_markdown" in dir(filters[filter_name]):
            logging.info("... applying markdown filter %s" % filter_name)
            md = filters[filter_name].filter_markdown(md)

    # Actually parse the markdown
    logger.info("... parsing markdown...")
    html = markdown(md, extensions=["markdown.extensions.extra",
                                    "markdown.extensions.toc"])

    # Apply raw-HTML-string-based filters here
    for filter_name in page_filters:
        if "filter_html" in dir(filters[filter_name]):
            logging.info("... applying HTML filter %s" % filter_name)
            html = filters[filter_name].filter_html(html)

    # Some filters would rather operate on a soup than a string.
    # May as well parse once and re-serialize once.
    soup = BeautifulSoup(html, "html.parser")

    # Apply soup-based filters here
    for filter_name in page_filters:
        if "filter_soup" in dir(filters[filter_name]):
            logging.info("... applying soup filter %s" % filter_name)
            filters[filter_name].filter_soup(soup)
            # ^ the soup filters apply to the same object, passed by reference

    # Replace links for any non-default target
    if target["name"] != config["targets"][0]["name"]:
        substitute_links_for_target(soup, target)

    logging.info("... re-rendering HTML from soup...")
    html2 = str(soup)
    return html2


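# A minimal sketch of what a filter module can look like, going by the hooks
# checked above: a file named filter_<name>.py on the import path, defining
# any of filter_markdown, filter_html, and filter_soup. This example module
# and its behavior are hypothetical.
#
#     # filter_example.py
#     def filter_markdown(md):
#         """Takes the preprocessed markdown string, returns a new string."""
#         return md.replace("TODO", "**TODO**")
#
#     def filter_soup(soup):
#         """Modifies the BeautifulSoup tree in place; returns nothing."""
#         for table in soup.find_all("table"):
#             table["class"] = "table"
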
def githubify_markdown(md, target=None, pages=None):
    """Github-friendly markdown has absolute links, no md in divs"""
    MARKDOWN_LINK_REGEX = re.compile(
        r"(\[([^\]]+)\]\(([^:)]+)\)|\[([^\]]+)\]:\s*(\S+)$)", re.MULTILINE)

    target = get_target(target)
    if not pages:
        pages = get_pages(target["name"])

    class MDLink:
        """A markdown link, either a reference link or inline link"""
        def __init__(self, fullmatch, label, url, label2, url2):
            self.fullmatch = fullmatch
            if label:
                self.label = label
                self.url = url
                self.is_reflink = False
            elif label2:
                self.label = label2
                self.url = url2
                self.is_reflink = True

        def to_markdown(self):
            """Re-represent self as a link in markdown syntax"""
            s = "[" + self.label + "]"
            if self.is_reflink:
                s += ": " + self.url
            else:
                s += "(" + self.url + ")"
            return s

    links = [MDLink(*m) for m in MARKDOWN_LINK_REGEX.findall(md)]

    for link in links:
        for page in pages:
            if target["name"] in page:
                # There's a replacement link for this page
                local_url = page["html"]
                target_url = page[target["name"]]
                if link.url[:len(local_url)] == local_url:
                    link.url = link.url.replace(local_url, target_url)
        md = md.replace(link.fullmatch, link.to_markdown())

    return md


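# The regex above matches both markdown link styles, for example:
#
#     [Ledger](concept-ledger.html)     <- inline link
#     [Ledger]: concept-ledger.html     <- reference-style link definition
#
# The inline branch excludes ":" from the URL, so absolute inline links
# (e.g. https://...) are left untouched.
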
def get_pages(target=None):
    """Read pages from config and return an object, optionally filtered
    to just the pages that this target cares about"""

    target = get_target(target)
    pages = config["pages"]

    if target["name"]:
        # Filter out pages that aren't part of this target
        def should_include(page, target_name):
            # If no target list is specified, include the page in all targets
            if "targets" not in page:
                return True
            if target_name in page["targets"]:
                return True
            else:
                return False
        pages = [page for page in pages
                 if should_include(page, target["name"])]
    return pages


def get_categories(pages):
    """Produce an ordered, de-duplicated list of categories from
    the page list"""
    categories = []
    for page in pages:
        if "category" in page and page["category"] not in categories:
            categories.append(page["category"])
    logger.info("categories: %s" % categories)
    return categories


def read_markdown_local(filename, pp_env, target=None):
    """Read in a markdown file and pre-process any templating lang in it,
    returning the parsed contents."""
    target = get_target(target)
    pages = get_pages(target)
    logging.info("reading markdown from file: %s" % filename)
    md_raw = pp_env.get_template(filename)
    return md_raw.render(target=target, pages=pages)


def read_markdown_remote(url):
    """Fetch a remote markdown file and return its contents"""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        raise requests.RequestException("Status code for page was not 200")


def get_markdown_for_page(md_where, pp_env=None, target=None):
    """Read/fetch and pre-process a markdown file"""
    target = get_target(target)
    if md_where.startswith("http:") or md_where.startswith("https:"):
        return read_markdown_remote(md_where)
    else:
        return read_markdown_local(md_where, pp_env, target)


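# This means a page's "md" field can be either a path relative to
# content_path or a full URL, for example (hypothetical values):
#
#     md: overview.md
#     md: https://raw.githubusercontent.com/example/repo/master/README.md
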
def copy_static_files(template_static=True, content_static=True, out_path=None):
    """Copy static files to the output directory."""
    if out_path is None:
        out_path = config["out_path"]

    if template_static:
        template_static_src = config["template_static_path"]
        template_static_dst = os.path.join(out_path,
                                           os.path.basename(template_static_src))
        copy_tree(template_static_src, template_static_dst)

    if content_static:
        content_static_src = config["content_static_path"]
        content_static_dst = os.path.join(out_path,
                                          os.path.basename(content_static_src))
        copy_tree(content_static_src, content_static_dst)

def setup_pp_env():
    pp_env = Environment(loader=FileSystemLoader(config["content_path"]))
    # Example: if we want to add custom functions to the md files
    # pp_env.globals['foo'] = lambda x: "FOO %s" % x
    return pp_env


def setup_html_env():
    env = Environment(loader=FileSystemLoader(config["template_path"]))
    env.lstrip_blocks = True
    env.trim_blocks = True
    return env

def render_pages(target=None, for_pdf=False, bypass_errors=False):
    """Parse and render all pages in target, writing files to out_path."""
    target = get_target(target)
    pages = get_pages(target)
    categories = get_categories(pages)

    # Insert generated HTML into templates using this Jinja environment
    env = setup_html_env()

    if for_pdf:
        logging.info("reading pdf template...")
        default_template = env.get_template(config["pdf_template"])
    else:
        logging.info("reading default template...")
        default_template = env.get_template(config["default_template"])

    for currentpage in pages:
        if "md" in currentpage:
            # Read and parse the markdown
            try:
                html_content = parse_markdown(currentpage, target=target,
                                              pages=pages)
            except Exception as e:
                if bypass_errors:
                    logging.warning(("Skipping page %s " +
                                     "due to error fetching contents: %s") %
                                    (currentpage["name"], e))
                    continue
                else:
                    exit("Error when fetching page %s: %s" %
                         (currentpage["name"], e))
        else:
            html_content = ""

        if "template" in currentpage:
            # Use a template other than the default one
            template = env.get_template(currentpage["template"])

            # Do link substitution for the "doc_page" param
            if "doc_page" in currentpage:
                doc_page = next(p for p in pages
                                if p["html"] == currentpage["doc_page"])
                if target["name"] in doc_page:
                    currentpage["doc_page"] = doc_page[target["name"]]

            out_html = template.render(currentpage=currentpage,
                                       categories=categories,
                                       pages=pages,
                                       content=html_content,
                                       target=target)
        else:
            out_html = default_template.render(currentpage=currentpage,
                                               categories=categories,
                                               pages=pages,
                                               content=html_content,
                                               target=target)

        # Experimental: replace links in full HTML, not just content
        soup = BeautifulSoup(out_html, "html.parser")
        if target["name"] != config["targets"][0]["name"]:
            substitute_links_for_target(soup, target)
        out_html = str(soup)

        if for_pdf:
            out_path = config["temporary_files_path"]
        else:
            out_path = config["out_path"]
        fileout = os.path.join(out_path, currentpage["html"])
        if not os.path.isdir(out_path):
            logging.info("creating build folder %s" % out_path)
            os.makedirs(out_path)
        with open(fileout, "w") as f:
            logging.info("writing to file: %s..." % fileout)
            f.write(out_html)


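# A hypothetical page entry exercising the "template" and "doc_page" branches
# above -- a cover page rendered with its own template, pointing at a doc
# page whose URL may differ per target:
#
#     - name: PDF cover
#       html: cover.html
#       template: template-cover.html
#       doc_page: overview.html
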
def watch(pdf_file, target):
    """Look for changed files and re-generate HTML (and optionally
    PDF) whenever there's an update. Runs until interrupted."""
    target = get_target(target)

    class UpdaterHandler(PatternMatchingEventHandler):
        """Updates to pattern-matched files mean we should re-render."""
        def on_any_event(self, event):
            logging.info("got event!")
            # bypass_errors=True because Watch shouldn't
            # just die if a file is temporarily not found
            if pdf_file:
                make_pdf(pdf_file, target=target, bypass_errors=True)
            else:
                render_pages(target, bypass_errors=True)
            logging.info("done rendering")

    patterns = ["*template-*.html",
                "*.md",
                "*code_samples/*"]

    event_handler = UpdaterHandler(patterns=patterns)
    observer = Observer()
    observer.schedule(event_handler, config["template_path"], recursive=True)
    observer.schedule(event_handler, config["content_path"], recursive=True)
    observer.start()
    # The above starts an observing thread,
    # so the main thread can just wait
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()


def make_pdf(outfile, target=None, bypass_errors=False):
    """Use prince to convert several HTML files into a PDF"""
    logging.info("rendering PDF-able versions of pages...")
    target = get_target(target)
    render_pages(target=target, for_pdf=True, bypass_errors=bypass_errors)

    temp_files_path = config["temporary_files_path"]

    # Prince will need the static files, so copy them over
    copy_static_files(out_path=temp_files_path)

    # Make sure the path we're going to write the PDF to exists
    if not os.path.isdir(config["out_path"]):
        logging.info("creating build folder %s" % config["out_path"])
        os.makedirs(config["out_path"])

    # Start preparing the prince command
    args = [config["prince_executable"], '-o', outfile]
    # Each HTML output file in the target is another arg to prince
    pages = get_pages(target)
    args += [os.path.join(temp_files_path, p["html"]) for p in pages]

    logger.info("generating PDF: running %s..." % " ".join(args))
    prince_resp = subprocess.check_output(args, universal_newlines=True)
    print(prince_resp)


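# The assembled command looks something like this (hypothetical file names):
#
#     prince -o out/docs.pdf /tmp/dactyl/index.html /tmp/dactyl/overview.html
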
def githubify(md_file_name, target=None):
    """Wrapper - make the markdown resemble GitHub flavor"""
    target = get_target(target)

    # filein = os.path.join(config["content_path"], md_file_name)
    # logging.info("opening source md file %s" % filein)
    # with open(filein, "r") as f:
    #     md = f.read()
    pages = get_pages()
    logging.info("getting markdown for page %s" % md_file_name)
    md = get_markdown_for_page(md_file_name,
                               pp_env=setup_pp_env(),
                               target=target)

    logging.info("githubifying markdown...")
    rendered_md = githubify_markdown(md, target=target, pages=pages)

    if not os.path.isdir(config["out_path"]):
        logging.info("creating build folder %s" % config["out_path"])
        os.makedirs(config["out_path"])

    fileout = os.path.join(config["out_path"], md_file_name)
    logging.info("writing generated file to path: %s" % fileout)
    with open(fileout, "w") as f:
        f.write(rendered_md)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Generate static site from markdown and templates.')
    parser.add_argument("--watch", "-w", action="store_true",
                        help="Watch for changes and re-generate output. "
                             "This runs until force-quit.")
    parser.add_argument("--pdf", type=str,
                        help="Output a PDF to this file. Requires Prince.")
    parser.add_argument("--githubify", "-g", type=str,
                        help="Output md prepared for GitHub")
    parser.add_argument("--target", "-t", type=str,
                        help="Build for the specified target.")
    parser.add_argument("--out_dir", "-o", type=str,
                        help="Output to this folder (overrides config file)")
    parser.add_argument("--quiet", "-q", action="store_true",
                        help="Suppress status messages")
    parser.add_argument("--bypass_errors", "-b", action="store_true",
                        help="Continue building if some contents not found")
    parser.add_argument("--config", "-c", type=str,
                        help="Specify path to an alternate config file.")
    parser.add_argument("--copy_static", "-s", action="store_true",
                        help="Copy static files to the out dir",
                        default=False)
    cli_args = parser.parse_args()

    if not cli_args.quiet:
        logging.basicConfig(level=logging.INFO)

    if cli_args.config:
        load_config(cli_args.config)
    else:
        load_config()

    if cli_args.out_dir:
        config["out_path"] = cli_args.out_dir

    if cli_args.githubify:
        githubify(cli_args.githubify, cli_args.target)
        if cli_args.copy_static:
            copy_static_files(template_static=False, content_static=True)
        exit(0)

    if cli_args.pdf:
        if cli_args.pdf[-4:] != ".pdf":
            exit("PDF filename must end in .pdf")
        logging.info("making a pdf...")
        pdf_path = os.path.join(config["out_path"], cli_args.pdf)
        make_pdf(pdf_path, target=cli_args.target,
                 bypass_errors=cli_args.bypass_errors)
        logging.info("pdf done")

    else:
        logging.info("rendering pages...")
        render_pages(target=cli_args.target,
                     bypass_errors=cli_args.bypass_errors)
        logging.info("done rendering")

        if cli_args.copy_static:
            logging.info("copying static pages...")
            copy_static_files()

    if cli_args.watch:
        logging.info("watching for changes...")
        if cli_args.pdf:
            pdf_path = os.path.join(config["out_path"], cli_args.pdf)
            watch(pdf_path, cli_args.target)
        else:
            watch(None, cli_args.target)
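
# Example invocations, using only the flags defined above (target and file
# names are hypothetical):
#
#     ./dactyl_build.py -t ripple.com          # build HTML for one target
#     ./dactyl_build.py --pdf apidocs.pdf      # render a PDF via Prince
#     ./dactyl_build.py -w                     # rebuild whenever files change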