#!/usr/bin/env python3 ################################################################################ # Dactyl - a tool for heroic epics of documentation # # Generates a website from Markdown and Jinja templates, with filtering # along the way. ################################################################################ DEFAULT_CONFIG_FILE = "dactyl-config.yml" import os import re import yaml import argparse import logging import traceback # Necessary to copy static files to the output dir from distutils.dir_util import copy_tree # Used to import filters. from importlib import import_module # Necessary for prince import subprocess # Used to fetch markdown sources from GitHub repos import requests # Various content and template processing stuff from jinja2 import Environment, FileSystemLoader, TemplateError from markdown import markdown from bs4 import BeautifulSoup # Watchdog stuff import time from watchdog.observers import Observer from watchdog.events import PatternMatchingEventHandler # The log level is configurable at runtime (see __main__ below) logger = logging.getLogger() # These fields are special, and pages don't inherit them directly RESERVED_KEYS_TARGET = [ "name", "display_name", "filters", "image_subs", ] ADHOC_TARGET = "__ADHOC__" DEFAULT_PDF_FILE = "__DEFAULT_FILENAME__" NO_PDF = "__NO_PDF__" filters = {} def load_config(config_file=DEFAULT_CONFIG_FILE): """Reload config from a YAML file.""" global config, filters logger.info("loading config file %s..." % config_file) with open(config_file, "r") as f: config = yaml.load(f) assert(config["targets"]) assert(config["pages"]) assert(config["pdf_template"]) assert(config["default_template"]) assert(config["content_path"]) assert(config["out_path"]) assert(config["temporary_files_path"]) assert(config["template_static_path"]) assert(config["content_static_path"]) if "prince_executable" not in config or not config["prince_executable"]: config["prince_executable"] = "prince" # A reasonable default if "default_filters" not in config: config["default_filters"] = [] if "skip_preprocessor" not in config: config["skip_preprocessor"] = False # Warn if any pages aren't part of a target for page in config["pages"]: if "targets" not in page: if "name" in page: logging.warn("Page %s is not part of any targets." % page["name"]) else: logging.warn("Page %s is not part of any targets." % page) if "md" in page and "name" not in page: logging.debug("Guessing page name for page %s" % page) page_path = os.path.join(config["content_path"], page["md"]) page["name"] = guess_title_from_md_file(page_path) # Figure out which filters we need and import them filternames = set(config["default_filters"]) for target in config["targets"]: if "filters" in target: filternames.update(target["filters"]) for page in config["pages"]: if "filters" in page: filternames.update(page["filters"]) for filter_name in filternames: filters[filter_name] = import_module("filter_"+filter_name) def default_pdf_name(target): target = get_target(target) if {"product","version","guide"} <= set(target.keys()): p_name = slugify(target["product"]) v_num = slugify(target["version"]) g_name = slugify(target["guide"]) return p_name+"-"+v_num+"-"+g_name+".pdf" elif "display_name" in target: return slugify(target["display_name"])+".pdf" else: return slugify(target["name"])+".pdf" unacceptable_chars = re.compile(r"[^A-Za-z0-9._ ]+") whitespace_regex = re.compile(r"\s+") def slugify(s): s = re.sub(unacceptable_chars, "", s) s = re.sub(whitespace_regex, "_", s) if not s: s = "_" return s def substitute_links_for_target(soup, target): """Replaces local-html-links with appropriate substitutions for the given target, and images likewise""" target = get_target(target) logger.info("... modifying links for target: %s" % target["name"]) # We actually want to get all pages, even the ones that aren't built as # part of this target, in case those pages have replacement links. pages = get_pages() links = soup.find_all("a", href=re.compile(r"^[^.]+\.html")) for link in links: for page in pages: if target["name"] in page: #There's a replacement link for this env local_url = page["html"] target_url = page[target["name"]] if link["href"][:len(local_url)] == local_url: link["href"] = link["href"].replace(local_url, target_url) if "image_subs" in target: images = soup.find_all("img") for img in images: local_path = img["src"] if local_path in target["image_subs"]: logger.info("... replacing image path '%s' with '%s'" % (local_path, target["image_subs"][local_path])) img["src"] = target["image_subs"][local_path] image_links = soup.find_all("a", href=re.compile(r"^[^.]+\.(png|jpg|jpeg|gif|svg)")) for img_link in image_links: local_path = img_link["href"] if local_path in target["image_subs"]: logger.info("... replacing image link '%s' with '%s'" % (local_path, target["image_subs"][local_path])) img_link["href"] = target["image_subs"][local_path] def substitute_parameter_links(link_parameter, currentpage, target): """Some templates have links in page parameters. Do link substitution for the target on one of those parameters.""" target = get_target(target) # We actually want to get all pages, even the ones that aren't built as # part of this target, in case those pages have replacement links. pages = get_pages() if link_parameter in currentpage: linked_page = next(p for p in pages if p["html"] == currentpage[link_parameter]) if target["name"] in linked_page: #there's a link substitution available currentpage[link_parameter] = linked_page[target["name"]] ## We could warn here, but it would frequently be a false alarm # else: # logging.warning("No substitution for %s[%s] for this target" % # (currentpage["html"],link_parameter)) def get_target(target): """Get a target by name, or return the default target object. We can't use default args in function defs because the default is set at runtime based on config""" if target == None: return config["targets"][0] if type(target) == str: try: return next(t for t in config["targets"] if t["name"] == target) except StopIteration: logger.critical("Unknown target: %s" % target) exit(1) if "name" in target: # Eh, it's probably a target, just return it return target def make_adhoc_target(inpages, no_cover): t = { "name": ADHOC_TARGET, "display_name": "(Untitled)", } if not no_cover: indexpage = next(p for p in config["pages"] if p["html"] == "index.html") indexpage["targets"].append(ADHOC_TARGET) if len(inpages) == 1: t["display_name"] = guess_title_from_md_file(inpages[0]) for inpage in inpages: # Figure out the actual filename and location of this infile # and set the content source dir appropriately in_dir, in_file = os.path.split(inpage) config["content_path"] = in_dir # Figure out what html file to output to ENDS_IN_MD = re.compile("\.md$", re.I) if re.search(ENDS_IN_MD, in_file): out_html_file = re.sub(ENDS_IN_MD, ".html", in_file) else: out_html_file = in_file+".html" # Try to come up with a reasonable page title page_title = guess_title_from_md_file(inpage) new_page = { "name": page_title, "md": in_file, "html": out_html_file, "targets": [ADHOC_TARGET], "category": "Pages", "pp_env": in_dir, } config["pages"].append(new_page) config["targets"].append(t) return t def guess_title_from_md_file(filepath): with open(filepath, "r") as f: line1 = f.readline() line2 = f.readline() # look for headers in the "followed by ----- or ===== format" ALT_HEADER_REGEX = re.compile("^[=-]{3,}$") if ALT_HEADER_REGEX.match(line2): possible_header = line1 if possible_header.strip(): return possible_header.strip() # look for headers in the "## abc ## format" HEADER_REGEX = re.compile("^#+\s*(.+[^#\s])\s*#*$") m = HEADER_REGEX.match(line1) if m: possible_header = m.group(1) if possible_header.strip(): return possible_header.strip() #basically if the first line's not a markdown header, we give up and use # the filename instead return os.path.basename(filepath) def get_filters_for_page(page, target=None): ffp = set(config["default_filters"]) target = get_target(target) if "filters" in target: ffp.update(target["filters"]) if "filters" in page: ffp.update(page["filters"]) return ffp def parse_markdown(page, target=None, pages=None, bypass_errors=False): """Take a markdown string and output HTML for that content""" target = get_target(target) logging.info("Preparing page %s" % page["name"]) # Preprocess Markdown using this Jinja environment pp_env = setup_pp_env(page) # We'll apply these filters to the page page_filters = get_filters_for_page(page, target) md = get_markdown_for_page(page["md"], pp_env=pp_env, target=target, bypass_errors=bypass_errors, currentpage=page) # Apply markdown-based filters here for filter_name in page_filters: if "filter_markdown" in dir(filters[filter_name]): logging.info("... applying markdown filter %s" % filter_name) md = filters[filter_name].filter_markdown(md, target=target, page=page) # Actually parse the markdown logger.info("... parsing markdown...") html = markdown(md, extensions=["markdown.extensions.extra", "markdown.extensions.toc"], lazy_ol=False) # Apply raw-HTML-string-based filters here for filter_name in page_filters: if "filter_html" in dir(filters[filter_name]): logging.info("... applying HTML filter %s" % filter_name) html = filters[filter_name].filter_html(html, target=target, page=page) # Some filters would rather operate on a soup than a string. # May as well parse once and re-serialize once. soup = BeautifulSoup(html, "html.parser") # Apply soup-based filters here for filter_name in page_filters: if "filter_soup" in dir(filters[filter_name]): logging.info("... applying soup filter %s" % filter_name) filters[filter_name].filter_soup(soup, target=target, page=page) # ^ the soup filters apply to the same object, passed by reference # Replace links for any non-default target if target["name"] != config["targets"][0]["name"]: substitute_links_for_target(soup, target) logging.info("... re-rendering HTML from soup...") html2 = str(soup) return html2 def githubify_markdown(md, target=None, pages=None): """Github-friendly markdown has absolute links, no md in divs""" MARKDOWN_LINK_REGEX = re.compile( r"(\[([^\]]+)\]\(([^:)]+)\)|\[([^\]]+)\]:\s*(\S+)$)", re.MULTILINE) target = get_target(target) if not pages: pages = get_pages(target["name"]) class MDLink: """A markdown link, either a reference link or inline link""" def __init__(self, fullmatch, label, url, label2, url2): self.fullmatch = fullmatch if label: self.label = label self.url = url self.is_reflink = False elif label2: self.label = label2 self.url = url2 self.is_reflink = True def to_markdown(self): """Re-represent self as a link in markdown syntax""" s = "[" + self.label + "]" if self.is_reflink: s += ": " + self.url else: s += "(" + self.url + ")" return s links = [MDLink(*m) for m in MARKDOWN_LINK_REGEX.findall(md)] for link in links: for page in pages: if target["name"] in page: #There's a replacement link for this local_url = page["html"] target_url = page[target["name"]] if link.url[:len(local_url)] == local_url: link.url = link.url.replace(local_url, target_url) md = md.replace(link.fullmatch, link.to_markdown()) return md def get_pages(target=None): """Read pages from config and return an object, optionally filtered to just the pages that this target cares about""" target = get_target(target) pages = config["pages"] if target["name"]: #filter pages that aren't part of this target def should_include(page, target_name): if "targets" not in page: return False if target_name in page["targets"]: return True else: return False pages = [page for page in pages if should_include(page, target["name"])] # Pages should inherit non-reserved keys from the target for p in pages: for key,val in target.items(): if key in RESERVED_KEYS_TARGET: continue elif key not in p: p[key] = val return pages def get_categories(pages): """Produce an ordered, de-duplicated list of categories from the page list""" categories = [] for page in pages: if "category" in page and page["category"] not in categories: categories.append(page["category"]) logger.info("categories: %s" % categories) return categories def read_markdown_local(filename, pp_env, target=None, bypass_errors=False, currentpage={}): """Read in a markdown file and pre-process any templating lang in it, returning the parsed contents.""" target = get_target(target) pages = get_pages(target) logging.info("reading markdown from file: %s" % filename) if config["skip_preprocessor"]: fpath = pp_env.loader.searchpath[0] with open(os.path.join(fpath,filename), "r") as f: md_out = f.read() else: try: md_raw = pp_env.get_template(filename) md_out = md_raw.render(target=target, pages=pages, currentpage=currentpage) except TemplateError as e: traceback.print_tb(e.__traceback__) if bypass_errors: logging.warn("Error pre-processing page %s; trying to load it raw" % filename) fpath = pp_env.loader.searchpath[0] with open(os.path.join(fpath,filename), "r") as f: md_out = f.read() else: exit("Error pre-processing page %s: %s" % (filename, e)) return md_out def read_markdown_remote(url): """Fetch a remote markdown file and return its contents""" response = requests.get(url) if response.status_code == 200: return response.text else: raise requests.RequestException("Status code for page was not 200") def get_markdown_for_page(md_where, pp_env=None, target=None, bypass_errors=False, currentpage={}): """Read/Fetch and pre-process markdown file""" target = get_target(target) if "http:" in md_where or "https:" in md_where: try: mdr = read_markdown_remote(md_where) except requests.RequestException as e: if bypass_errors: mdr = "" else: traceback.print_tb(e.__traceback__) exit("Error fetching page %s: %s" % (md_where, e)) return mdr else: return read_markdown_local(md_where, pp_env, target, bypass_errors, currentpage=currentpage) def copy_static_files(template_static=True, content_static=True, out_path=None): """Copy static files to the output directory.""" if out_path == None: out_path = config["out_path"] if template_static: template_static_src = config["template_static_path"] template_static_dst = os.path.join(out_path, os.path.basename(template_static_src)) copy_tree(template_static_src, template_static_dst) if content_static: content_static_src = config["content_static_path"] content_static_dst = os.path.join(out_path, os.path.basename(content_static_src)) copy_tree(content_static_src, content_static_dst) def setup_pp_env(page=None): if not page or "pp_dir" not in page: pp_env = Environment(loader=FileSystemLoader(config["content_path"])) else: pp_env = Environment(loader=FileSystemLoader(page["pp_dir"])) #Example: if we want to add custom functions to the md files #pp_env.globals['foo'] = lambda x: "FOO %s"%x return pp_env def setup_html_env(): env = Environment(loader=FileSystemLoader(config["template_path"])) env.lstrip_blocks = True env.trim_blocks = True return env def toc_from_headers(html_string): """make a table of contents from headers""" soup = BeautifulSoup(html_string, "html.parser") headers = soup.find_all(name=re.compile("h[1-3]"), id=True) toc_s = "" for h in headers: if h.name == "h1": toc_level = "level-1" elif h.name == "h2": toc_level = "level-2" else: toc_level = "level-3" new_a = soup.new_tag("a", href="#"+h["id"]) if h.string: new_a.string = h.string else: new_a.string = " ".join(h.strings) new_li = soup.new_tag("li") new_li["class"] = toc_level new_li.append(new_a) toc_s += str(new_li)+"\n" return str(toc_s) def render_pages(target=None, for_pdf=False, bypass_errors=False): """Parse and render all pages in target, writing files to out_path.""" target = get_target(target) pages = get_pages(target) categories = get_categories(pages) # Insert generated HTML into templates using this Jinja environment env = setup_html_env() if for_pdf: if "pdf_template" in target: logging.debug("reading pdf template %s from target..." % target["pdf_template"]) default_template = env.get_template(target["pdf_template"]) else: logging.debug("reading default pdf template %s..." % config["pdf_template"]) default_template = env.get_template(config["pdf_template"]) else: if "template" in target: logging.debug("reading HTML template %s from target..." % target["template"]) default_template = env.get_template(target["template"]) else: logging.debug("reading default HTML template %s..." % config["default_template"]) default_template = env.get_template(config["default_template"]) for currentpage in pages: if "md" in currentpage: # Read and parse the markdown try: html_content = parse_markdown(currentpage, target=target, pages=pages, bypass_errors=bypass_errors) except Exception as e: if bypass_errors: traceback.print_tb(e.__traceback__) logging.warning( ("Skipping page %s " + "due to error fetching contents: %s") % (currentpage["name"], e) ) continue else: traceback.print_tb(e.__traceback__) exit("Error when fetching page %s: %s" % (currentpage["name"], e) ) else: html_content = "" # default to a table-of-contents sidebar... if "sidebar" not in currentpage: currentpage["sidebar"] = "toc" if currentpage["sidebar"] == "toc": sidebar_content = toc_from_headers(html_content) else: sidebar_content = None # Prepare some parameters for rendering substitute_parameter_links("doc_page", currentpage, target) current_time = time.strftime("%B %d, %Y") # Figure out which template to use if "template" in currentpage and not for_pdf: logging.info("using template %s from page" % currentpage["template"]) use_template = env.get_template(currentpage["template"]) elif "pdf_template" in currentpage and for_pdf: logging.info("using pdf_template %s from page" % currentpage["pdf_template"]) use_template = env.get_template(currentpage["pdf_template"]) else: use_template = default_template # Render the content into the appropriate template out_html = use_template.render(currentpage=currentpage, categories=categories, pages=pages, content=html_content, target=target, current_time=current_time, sidebar_content=sidebar_content) if for_pdf: out_path = config["temporary_files_path"] else: out_path = config["out_path"] fileout = os.path.join(out_path, currentpage["html"]) if not os.path.isdir(out_path): logging.info("creating build folder %s" % out_path) os.makedirs(out_path) with open(fileout, "w") as f: logging.info("writing to file: %s..." % fileout) f.write(out_html) def watch(pdf_file, target): """Look for changed files and re-generate HTML (and optionally PDF whenever there's an update. Runs until interrupted.""" target = get_target(target) class UpdaterHandler(PatternMatchingEventHandler): """Updates to pattern-matched files means rendering.""" def on_any_event(self, event): logging.info("got event!") # bypass_errors=True because Watch shouldn't # just die if a file is temporarily not found if pdf_file: make_pdf(pdf_file, target=target, bypass_errors=True) else: render_pages(target, bypass_errors=True) logging.info("done rendering") patterns = ["*template-*.html", "*.md", "*code_samples/*"] event_handler = UpdaterHandler(patterns=patterns) observer = Observer() observer.schedule(event_handler, config["template_path"], recursive=True) observer.schedule(event_handler, config["content_path"], recursive=True) observer.start() # The above starts an observing thread, # so the main thread can just wait try: while True: time.sleep(1) except KeyboardInterrupt: observer.stop() observer.join() def make_pdf(outfile, target=None, bypass_errors=False): """Use prince to convert several HTML files into a PDF""" logging.info("rendering PDF-able versions of pages...") target = get_target(target) render_pages(target=target, for_pdf=True, bypass_errors=bypass_errors) temp_files_path = config["temporary_files_path"] # Prince will need the static files, so copy them over copy_static_files(out_path=temp_files_path) # Make sure the path we're going to write the PDF to exists if not os.path.isdir(config["out_path"]): logging.info("creating build folder %s" % config["out_path"]) os.makedirs(config["out_path"]) # Start preparing the prince command args = [config["prince_executable"], '--javascript', '-o', outfile] # Each HTML output file in the target is another arg to prince pages = get_pages(target) args += [os.path.join(temp_files_path, p["html"]) for p in pages] logger.info("generating PDF: running %s..." % " ".join(args)) prince_resp = subprocess.check_output(args, universal_newlines=True) print(prince_resp) def githubify(md_file_name, target=None): """Wrapper - make the markdown resemble GitHub flavor""" target = get_target(target) pages = get_pages() logging.info("getting markdown for page %s" % md_file_name) md = get_markdown_for_page(md_file_name, pp_env=setup_pp_env(), target=target) logging.info("githubifying markdown...") rendered_md = githubify_markdown(md, target=target, pages=pages) if not os.path.isdir(config["out_path"]): logging.info("creating build folder %s" % config["out_path"]) os.makedirs(config["out_path"]) fileout = os.path.join(config["out_path"], md_file_name) logging.info("writing generated file to path: %s"%fileout) with open(fileout, "w") as f: f.write(rendered_md) if __name__ == "__main__": parser = argparse.ArgumentParser( description='Generate static site from markdown and templates.') parser.add_argument("--watch", "-w", action="store_true", help="Watch for changes and re-generate output. "+\ "This runs until force-quit.") parser.add_argument("--pdf", nargs="?", type=str, const=DEFAULT_PDF_FILE, default=NO_PDF, help="Output a PDF to this file. Requires Prince.") parser.add_argument("--githubify", "-g", type=str, help="Output md prepared for GitHub") parser.add_argument("--target", "-t", type=str, help="Build for the specified target.") parser.add_argument("--out_dir", "-o", type=str, help="Output to this folder (overrides config file)") parser.add_argument("--quiet", "-q", action="store_true", help="Suppress status messages") parser.add_argument("--bypass_errors", "-b", action="store_true", help="Continue building if some contents not found") parser.add_argument("--config", "-c", type=str, help="Specify path to an alternate config file.") parser.add_argument("--copy_static", "-s", action="store_true", help="Copy static files to the out dir", default=False) parser.add_argument("--pages", type=str, help="Build markdown page(s) "+\ "that aren't described in the config.", nargs="+") parser.add_argument("--no_cover", "-n", action="store_true", help="(with --pages only) Don't automatically add a "+\ "cover page / index.html file.") parser.add_argument("--skip_preprocessor", action="store_true", default=False, help="Don't pre-process Jinja syntax in markdown files") parser.add_argument("--title", type=str, help="Override target display "+\ "name. Useful when passing multiple args to --pages.") parser.add_argument("--list_targets_only", "-l", action="store_true", help="Don't build anything, just display list of "+ "known targets from the config file.") cli_args = parser.parse_args() if not cli_args.quiet: logging.basicConfig(level=logging.INFO) if cli_args.config: load_config(cli_args.config) else: load_config() if cli_args.list_targets_only: for t in config["targets"]: if "display_name" in t: display_name = t["display_name"] elif {"product","version","guide"} <= set(t.keys()): display_name = " ".join([t["product"],t["version"],t["guide"]]) else: display_name = "" print("%s\t\t%s" % (t["name"], display_name)) #print(" ".join([t["name"] for t in config["targets"]])) exit(0) if cli_args.out_dir: config["out_path"] = cli_args.out_dir config["skip_preprocessor"] = cli_args.skip_preprocessor if cli_args.pages: make_adhoc_target(cli_args.pages, cli_args.no_cover) cli_args.target = ADHOC_TARGET if cli_args.title: target = get_target(cli_args.target) target["display_name"] = cli_args.title if cli_args.githubify: githubify(cli_args.githubify, cli_args.target) if cli_args.copy_static: copy_static(template_static=False, content_static=True) exit(0) if cli_args.pdf != NO_PDF: if cli_args.pdf == DEFAULT_PDF_FILE: pdf_path = os.path.join(config["out_path"], default_pdf_name(cli_args.target)) elif cli_args.pdf[-4:] != ".pdf": exit("PDF filename must end in .pdf") else: pdf_path = os.path.join(config["out_path"], cli_args.pdf) logging.info("making a pdf...") make_pdf(pdf_path, target=cli_args.target, bypass_errors=cli_args.bypass_errors) logging.info("pdf done") else: logging.info("rendering pages...") render_pages(target=cli_args.target, bypass_errors=cli_args.bypass_errors) logging.info("done rendering") if cli_args.copy_static: logging.info("copying static pages...") copy_static_files() if cli_args.watch: logging.info("watching for changes...") if cli_args.pdf: pdf_path = os.path.join(config["out_path"], cli_args.pdf) watch(pdf_path, cli_args.target) else: watch(None, cli_args.target)