From ef8d7736806e49d14810a54ac15c808a81c703fd Mon Sep 17 00:00:00 2001 From: mDuo13 Date: Tue, 21 Feb 2017 17:05:34 -0800 Subject: [PATCH] Move to stand-alone Dactyl package --- README.md | 122 +--- circle.yml | 6 +- tool/dactyl-config.yml | 15 +- tool/dactyl_build.py | 834 -------------------------- tool/dactyl_link_checker.py | 288 --------- tool/dactyl_style_checker.py | 170 ------ tool/filter_add_version.py | 31 - tool/filter_badges.py | 44 -- tool/filter_buttonize.py | 18 - tool/filter_callouts.py | 27 - tool/filter_markdown_in_divs.py | 25 - tool/filter_multicode_tabs.py | 61 -- tool/filter_remove_doctoc.py | 20 - tool/filter_standardize_header_ids.py | 17 - tool/filter_xrefs.py | 142 ----- tool/packages.txt | 5 - 16 files changed, 21 insertions(+), 1804 deletions(-) delete mode 100755 tool/dactyl_build.py delete mode 100755 tool/dactyl_link_checker.py delete mode 100755 tool/dactyl_style_checker.py delete mode 100644 tool/filter_add_version.py delete mode 100644 tool/filter_badges.py delete mode 100644 tool/filter_buttonize.py delete mode 100644 tool/filter_callouts.py delete mode 100644 tool/filter_markdown_in_divs.py delete mode 100644 tool/filter_multicode_tabs.py delete mode 100644 tool/filter_remove_doctoc.py delete mode 100644 tool/filter_standardize_header_ids.py delete mode 100644 tool/filter_xrefs.py delete mode 100644 tool/packages.txt diff --git a/README.md b/README.md index 28e8ddf72a..126f549b8a 100644 --- a/README.md +++ b/README.md @@ -7,126 +7,28 @@ The [Ripple Developer Portal](https://dev.ripple.com) is the authoritative sourc Repository Layout ----------------- -The HTML pages in this portal are generated from the markdown files in the [content/](content/) folder. Always edit the markdown files, not the HTML files. +The HTML pages in this portal are generated from the markdown files in the [content/](content/) folder. Always edit the markdown files, not the HTML files. The [assets/](assets/) folder contains static files used by the site's templates. The [img](img/) folder contains images used in the docs. -The [tool/](tool/) folder contains a tool, called **Dactyl**, for generating the HTML files in the top level. The `dactyl_build.py` script uses the templates and the a YAML config file to generate HTML output. The `dactyl_link_checker.py` script checks the generated HTML content for broken links. The `dactyl_style_checker.py` script (experimental) checks the content for style guide violations. +The HTML files are generated using Ripple's documentation tool, called [**Dactyl**](https://github.com/ripple/dactyl). After you've done the [Dactyl Setup](#dactyl-setup), you can build the docs from the `tool/` folder: + +``` +cd tool +dactyl_build +``` + +Dactyl also provides link checking (the `dactyl_link_checker` script) and style checking (`dactyl_style_checker`), which you can run from the `tool/` folder. + +The list of which files are built, and metadata about those files, is in the `tool/dactyl-config.yml` file. The `tool/` folder also contains the templates and style-checker rules used by Dactyl. -In general, Dactyl assumes you are running it with `tool/` as the current working directory, and the included config uses the top level of this repository as the output directory. Dactyl Setup ------------ Dactyl uses Python 3 and a number of modules. 
First, make sure you have Python 3 installed in your local operating system, then use [PIP](https://pip.pypa.io/en/stable/) to install the dependencies: -`pip3 install -r tool/packages.txt` +`pip3 install dactyl` -Building --------- - -The default configuration is [`dactyl-config.yml`](tool/dactyl-config.yml). You can specify an alternate config file with the `-c` or `--config` parameter: - -`./dactyl_build.py -c alt-config.yml` -`./dactyl_link_checker.py -c alt-config.yml` - -This script does the following: - -1. Chooses a target based on the commandline `-t` parameter. If not specified, uses the first target listed in the config file. If building for PDF, add the `--pdf your_output_file.pdf` parameter (The output filename must end in `.pdf`!) -2. Reads the list of **pages** from the config. -3. For all pages that have a filename as the `md` parameter in the config, it reads the file from the configured **content_path** and "preprocesses" any [Jinja2 Templating Language](http://jinja.pocoo.org/docs/dev/templates/) statements in those files. The markdown files can use this opportunity to include other files, or include or exclude content based on the `target` and `pages` parameters. -4. For all pages that have a URL as the `md` parameter, it fetches the file via HTTP(S). No pre-processing occurs on such contents. -5. For all the retrieved and preprocessed markdown files, it parses them using Python's markdown library, with extras enabled to approximate GitHub-flavored markdown. -6. For each page, it parses the **template** configured for the page using Jinja2, falling back to the **default_template** (or **pdf_template**). If it produced HTML from a Markdown source, it passes that HTML as the `content` parameter to the template. It also passes in several other arguments from the config file, including definition of the current page as the `currentpage` parameter. -7. It applies several post-processing steps on the generated HTML. Additional [filters](#filters) can be configured as plugins to run on a per-page or per-target basis. Dactyl always performs link substitution by target. -8. If building for PDF: It outputs the resulting HTML files to the configured **temporary_files_path**, using filenames specified in the **html** parameter of each page in the config file. It also copies the **template_static_path** and **content_static_path** folders to the temporary folder. Then it runs [Prince](http://www.princexml.com/) to generate a PDF. It writes the generated PDF to the **out_path**. -9. Otherwise: It outputs the resulting HTML files to the **out_path**. This does not generate a working copy of the site unless the necessary static files are also available at the **out_path**. (This is true by default, since the default output directory is the top level of the repository.) You can have it copy the configured **template_static_path** (containing files referenced by the templates) and the **content_static_path** (containing files referenced by the content) to the output directory using the `--copy_static` or `-s` parameter. - - -Githubify Mode --------------- - -Alternate usage: `-g` produces a GitHub-flavored Markdown version of a single file. This parameter takes one argument, which should be a markdown file in the **content_path**. The tool uses Jinja2 to "preprocess" the file, as above, but stops after assembling GitHub-flavored Markdown and writes the output to the same filename in the **out_path**. 
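-
-For example, `./dactyl_build.py -g overview.md` would preprocess `overview.md` and write the GitHub-flavored version of it to the **out_path**. (The filename `overview.md` is a placeholder here; use any markdown file from your **content_path**.)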
- -**Note:** The tool never erases files from the **out_path** or the **temporary_files_path**. You may want to do that yourself, especially if you remove files from your config or rename them. - -Ad-Hoc Targets --------------- - -If you want to build output without editing the config file, you can use the `--pages` option, following that with a list of input markdown files. (The `--pages` option is incompatible with the `-t` option.) In this case, Dactyl creates an "ad-hoc" target for the page(s) specified. It includes the `index.html` file (PDF cover in PDF mode) in the ad-hoc target unless you specify `--no_cover` in the command. - -For each page, it picks an output filename based on the input filename. It tries to guesses a sensible page title (to use in sidebars, dropdowns, table of contents, and other page navigation) from the first line of the file, falling back to the filename as the page title if the first line isn't a Markdown-formatted header. - -Example usage: - -``` -./dactyl_build.py --pages ~/Ripple/*.md -o /tmp/dactyl_out/ --pdf scraps.pdf -``` - -Multiple Targets ----------------- - -You can define multiple **targets** in the config file with arbitrary key-value parameters. The two parameters that the tool cares about by default are **name** (used to identify the target on the commandline and in the pages section of the config) and **filters** (which lists filter plugins to apply, if provided). - -By default, the tool builds the first target in the list. Every page in the `pages` array is included in every target unless the page definition includes an explicit list of **targets** to build. (Each member in the list should be the **name** of a target.) - -The tool can perform automatic substitution of links in the resulting HTML (or Markdown, when using [githubify](#githubify-mode)). For each parameter in the page definition that matches the name of a target, it replaces links to the `html` file with the contents of the target-name-parameter. Anchors from the original link carry over. This allows you to link to other pages using the filenames from the local version of the site, but replace them with different links for a live site. (As seen in the default configuration, Ripple.com has very different URLs for many pages.) - -Filters -------- - -Dactyl can apply various filters on document content, which is useful for handling compatibility with alternate Markdown flavors, among other things. The **filters** option, at the target or page level of the config, should contain an array of filters to apply. (Filters at the target level apply to all pages in that target; filters at the page level apply to that page only.) Each filter is implemented by a Python script in the Dactyl directory named `filter_{filter_name}.py`, based on the {filter_name} specified in the config. - -Filters can apply at any or all of three steps: Raw markdown, raw HTML, or BeautifulSoup, as follows: - -Raw markdown filters implement a `filter_markdown(md)` function, which inputs and outputs a string of markdown text. - -Raw HTML filters implement a `filter_html(html)` function, which inputs and outputs a string of HTML text. - -BeautifulSoup filters implement a `filter_soup(soup)` method, which takes a BeautifulSoup4 representation of the parsed HTML content as input. Because the input is passed by reference, the function modifies the soup in place instead of returning it. 
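-
-For illustration, a minimal filter module might look like the following sketch. (The `filter_shout` module here is a hypothetical example meant to show the plugin structure; it is not one of the filters that ships with Dactyl.)
-
-```
-# filter_shout.py - hypothetical example filter, not part of Dactyl
-import re
-
-def filter_markdown(md, target=None, page=None, config=None):
-    """Raw markdown step: uppercase any line that starts with '!!'."""
-    return re.sub(r"^!!(.*)$", lambda m: m.group(1).upper(), md, flags=re.M)
-
-def filter_soup(soup, target=None, page=None, config=None):
-    """BeautifulSoup step: add a CSS class to every table.
-    Modifies the soup in place, so it returns nothing."""
-    for table in soup.find_all("table"):
-        table["class"] = table.get("class", []) + ["striped"]
-```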
-
-Dactyl comes with the following filters:
-
- * `remove_doctoc` - Remove DOCTOC-generated tables of contents
- * `multicode_tabs` - Lets you group multiple code samples to appear in tabs (HTML only)
- * `standardize_header_ids` - Modify the `id` fields of generated header (<h#>) elements to use dashes instead of underscores. (This is for compatibility with previously-used doc tools.)
- * `buttonize` - Adds the `button` class to links whose text ends in >
- * `markdown_in_divs` - Automatically add the `markdown="1"` attribute to <div> elements so that their contents get parsed as Markdown. (No longer used by the Dev Portal, but useful for compatibility with Markdown flavors that do this automatically.)
- * `add_version` - Adds an "Updated for \[Version\]" link to the page. Only works if the page is remotely-sourced from a git tag on GitHub.
-
-Multicode Tabs
---------------
-
-The `multicode_tabs` filter lets you group multiple related code samples to appear in tabs in the HTML version. It has no meaningful effect when building for PDF.
-
-The syntax for multicode tabs is as follows:
-
-~~~
-(whatever content comes before the multi-code block)
-
-<!-- MULTICODE_BLOCK_START -->
-
-*Tab 1 Name*
-
-```
-Tab 1 code sample
-```
-
-*Tab 2 Name*
-
-```
-Tab 2 code sample
-```
-
-... (repeat for N tabs) ...
-
-<!-- MULTICODE_BLOCK_END -->
-
-(whatever content comes after the multi-code block)
-~~~
-
-This syntax is designed to "gracefully degrade" to a sensible syntax in cases (like PDF) where the [javascript to make the tabs work](assets/js/multicodetab.js) is either unavailable or undesirable.
-
 Contributing
 ------------
diff --git a/circle.yml b/circle.yml
index dbde82b4de..b4798dacbd 100644
--- a/circle.yml
+++ b/circle.yml
@@ -4,8 +4,8 @@ dependencies:
   pre:
     - pyenv global 3.4.0
   override:
-    - pip3 install -r packages.txt
+    - pip3 install dactyl
 test:
   override:
-    - ./dactyl_build.py
-    - ./dactyl_link_checker.py
+    - dactyl_build
+    - dactyl_link_checker
diff --git a/tool/dactyl-config.yml b/tool/dactyl-config.yml
index 0cdac1777f..ee186c8667 100644
--- a/tool/dactyl-config.yml
+++ b/tool/dactyl-config.yml
@@ -29,6 +29,12 @@ default_filters:
     - callouts
     - badges
 
+cover_page:
+  name: Overview
+  html: index.html
+  sidebar: false
+  template: template-index.html
+
 targets:
   # First member is the default that gets built when target not specified
   - name: local
@@ -60,15 +66,6 @@ targets:
     display_name: rippled Setup Guide
 
 pages:
-# The Intro page is not directly replicated on ripple.com at this time
-  - name: Overview
-    html: index.html
-    sidebar: false
-    template: template-index.html
-    targets:
-      - local
-      - ripple.com
-
 # References are exhaustive lists of commands and options
   - name: RippleAPI
     category: References
diff --git a/tool/dactyl_build.py b/tool/dactyl_build.py
deleted file mode 100755
index ef1aaccc9a..0000000000
--- a/tool/dactyl_build.py
+++ /dev/null
@@ -1,834 +0,0 @@
-#!/usr/bin/env python3
-
-################################################################################
-# Dactyl - a tool for heroic epics of documentation
-#
-# Generates a website from Markdown and Jinja templates, with filtering
-# along the way.
-################################################################################
-
-DEFAULT_CONFIG_FILE = "dactyl-config.yml"
-
-import os
-import re
-import yaml
-import argparse
-import logging
-import traceback
-
-# Necessary to copy static files to the output dir
-from distutils.dir_util import copy_tree
-
-# Used to import filters.
-from importlib import import_module - -# Necessary for prince -import subprocess - -# Used to fetch markdown sources from GitHub repos -import requests - -# Various content and template processing stuff -from jinja2 import Environment, FileSystemLoader, TemplateError -from markdown import markdown -from bs4 import BeautifulSoup - -# Watchdog stuff -import time -from watchdog.observers import Observer -from watchdog.events import PatternMatchingEventHandler - -# The log level is configurable at runtime (see __main__ below) -logger = logging.getLogger() - -# These fields are special, and pages don't inherit them directly -RESERVED_KEYS_TARGET = [ - "name", - "display_name", - "filters", - "image_subs", - "pages", -] -ADHOC_TARGET = "__ADHOC__" -DEFAULT_PDF_FILE = "__DEFAULT_FILENAME__" -NO_PDF = "__NO_PDF__" - -filters = {} -def load_config(config_file=DEFAULT_CONFIG_FILE): - """Reload config from a YAML file.""" - global config, filters - logger.info("loading config file %s..." % config_file) - with open(config_file, "r") as f: - config = yaml.load(f) - assert(config["targets"]) - assert(config["pages"]) - assert(config["pdf_template"]) - assert(config["default_template"]) - assert(config["content_path"]) - assert(config["out_path"]) - assert(config["temporary_files_path"]) - assert(config["template_static_path"]) - assert(config["content_static_path"]) - if "prince_executable" not in config or not config["prince_executable"]: - config["prince_executable"] = "prince" # A reasonable default - if "default_filters" not in config: - config["default_filters"] = [] - if "skip_preprocessor" not in config: - config["skip_preprocessor"] = False - if "pdf_filename_fields" not in config: - config["pdf_filename_fields"] = "display_name" - if "pdf_filename_separator" not in config: - config["pdf_filename_separator"] = "-" - - - # Warn if any pages aren't part of a target - for page in config["pages"]: - if "targets" not in page: - if "name" in page: - logging.warn("Page %s is not part of any targets." % - page["name"]) - else: - logging.warn("Page %s is not part of any targets." 
% page) - if "md" in page and "name" not in page: - logging.debug("Guessing page name for page %s" % page) - page_path = os.path.join(config["content_path"], page["md"]) - page["name"] = guess_title_from_md_file(page_path) - - # Figure out which filters we need and import them - filternames = set(config["default_filters"]) - for target in config["targets"]: - if "filters" in target: - filternames.update(target["filters"]) - for page in config["pages"]: - if "filters" in page: - filternames.update(page["filters"]) - for filter_name in filternames: - filters[filter_name] = import_module("filter_"+filter_name) - -def default_pdf_name(target): - target = get_target(target) - filename_segments = [] - for fieldname in config["pdf_filename_fields"]: - if fieldname in target.keys(): - filename_segments.append(slugify(target[fieldname])) - - if filename_segments: - return config["pdf_filename_separator"].join(filename_segments) + ".pdf" - else: - return slugify(target["name"])+".pdf" - -# old default_pdf_name(target) - # if {"product","version","guide"} <= set(target.keys()): - # p_name = slugify(target["product"]) - # v_num = slugify(target["version"]) - # g_name = slugify(target["guide"]) - # return p_name+"-"+v_num+"-"+g_name+".pdf" - # elif "display_name" in target: - # return slugify(target["display_name"])+".pdf" - # else: - # return slugify(target["name"])+".pdf" - -# Note: this regex means non-ascii characters get stripped from filenames, -# which is not preferable when making non-English filenames. -unacceptable_chars = re.compile(r"[^A-Za-z0-9._ ]+") -whitespace_regex = re.compile(r"\s+") -def slugify(s): - s = re.sub(unacceptable_chars, "", s) - s = re.sub(whitespace_regex, "_", s) - if not s: - s = "_" - return s - -def substitute_links_for_target(soup, target): - """Replaces local-html-links with appropriate substitutions - for the given target, and images likewise""" - target = get_target(target) - - logger.info("... modifying links for target: %s" % target["name"]) - # We actually want to get all pages, even the ones that aren't built as - # part of this target, in case those pages have replacement links. - pages = get_pages() - - links = soup.find_all("a", href=re.compile(r"^[^.]+\.html")) - for link in links: - for page in pages: - if target["name"] in page: - #There's a replacement link for this env - local_url = page["html"] - target_url = page[target["name"]] - if link["href"][:len(local_url)] == local_url: - link["href"] = link["href"].replace(local_url, - target_url) - - if "image_subs" in target: - images = soup.find_all("img") - for img in images: - local_path = img["src"] - if local_path in target["image_subs"]: - logger.info("... replacing image path '%s' with '%s'" % - (local_path, target["image_subs"][local_path])) - img["src"] = target["image_subs"][local_path] - - image_links = soup.find_all("a", - href=re.compile(r"^[^.]+\.(png|jpg|jpeg|gif|svg)")) - for img_link in image_links: - local_path = img_link["href"] - if local_path in target["image_subs"]: - logger.info("... replacing image link '%s' with '%s'" % - (local_path, target["image_subs"][local_path])) - img_link["href"] = target["image_subs"][local_path] - - -def substitute_parameter_links(link_parameter, currentpage, target): - """Some templates have links in page parameters. 
Do link substitution for - the target on one of those parameters.""" - target = get_target(target) - # We actually want to get all pages, even the ones that aren't built as - # part of this target, in case those pages have replacement links. - pages = get_pages() - - if link_parameter in currentpage: - linked_page = next(p for p in pages - if p["html"] == currentpage[link_parameter]) - if target["name"] in linked_page: - #there's a link substitution available - currentpage[link_parameter] = linked_page[target["name"]] - ## We could warn here, but it would frequently be a false alarm - # else: - # logging.warning("No substitution for %s[%s] for this target" % - # (currentpage["html"],link_parameter)) - -def get_target(target): - """Get a target by name, or return the default target object. - We can't use default args in function defs because the default is - set at runtime based on config""" - if target == None: - return config["targets"][0] - - if type(target) == str: - try: - return next(t for t in config["targets"] if t["name"] == target) - except StopIteration: - logger.critical("Unknown target: %s" % target) - exit(1) - - if "name" in target: - # Eh, it's probably a target, just return it - return target - -def make_adhoc_target(inpages, no_cover): - t = { - "name": ADHOC_TARGET, - "display_name": "(Untitled)", - } - - if not no_cover: - indexpage = next(p for p in config["pages"] - if p["html"] == "index.html") - indexpage["targets"].append(ADHOC_TARGET) - - if len(inpages) == 1: - t["display_name"] = guess_title_from_md_file(inpages[0]) - - for inpage in inpages: - # Figure out the actual filename and location of this infile - # and set the content source dir appropriately - in_dir, in_file = os.path.split(inpage) - config["content_path"] = in_dir - - # Figure out what html file to output to - ENDS_IN_MD = re.compile("\.md$", re.I) - if re.search(ENDS_IN_MD, in_file): - out_html_file = re.sub(ENDS_IN_MD, ".html", in_file) - else: - out_html_file = in_file+".html" - - # Try to come up with a reasonable page title - page_title = guess_title_from_md_file(inpage) - - new_page = { - "name": page_title, - "md": in_file, - "html": out_html_file, - "targets": [ADHOC_TARGET], - "category": "Pages", - "pp_env": in_dir, - } - config["pages"].append(new_page) - - config["targets"].append(t) - - return t - -def guess_title_from_md_file(filepath): - with open(filepath, "r") as f: - line1 = f.readline() - line2 = f.readline() - - # look for headers in the "followed by ----- or ===== format" - ALT_HEADER_REGEX = re.compile("^[=-]{3,}$") - if ALT_HEADER_REGEX.match(line2): - possible_header = line1 - if possible_header.strip(): - return possible_header.strip() - - # look for headers in the "## abc ## format" - HEADER_REGEX = re.compile("^#+\s*(.+[^#\s])\s*#*$") - m = HEADER_REGEX.match(line1) - if m: - possible_header = m.group(1) - if possible_header.strip(): - return possible_header.strip() - - #basically if the first line's not a markdown header, we give up and use - # the filename instead - return os.path.basename(filepath) - -def get_filters_for_page(page, target=None): - ffp = set(config["default_filters"]) - target = get_target(target) - if "filters" in target: - ffp.update(target["filters"]) - if "filters" in page: - ffp.update(page["filters"]) - return ffp - -def parse_markdown(page, target=None, pages=None, bypass_errors=False): - """Take a markdown string and output HTML for that content""" - target = get_target(target) - logging.info("Preparing page %s" % page["name"]) - - # Preprocess 
Markdown using this Jinja environment - pp_env = setup_pp_env(page) - - # We'll apply these filters to the page - page_filters = get_filters_for_page(page, target) - - md = get_markdown_for_page(page["md"], pp_env=pp_env, target=target, - bypass_errors=bypass_errors, currentpage=page) - - # Apply markdown-based filters here - for filter_name in page_filters: - if "filter_markdown" in dir(filters[filter_name]): - logging.info("... applying markdown filter %s" % filter_name) - md = filters[filter_name].filter_markdown(md, target=target, - page=page, config=config) - - # Actually parse the markdown - logger.info("... parsing markdown...") - html = markdown(md, extensions=["markdown.extensions.extra", - "markdown.extensions.toc"], - lazy_ol=False) - - # Apply raw-HTML-string-based filters here - for filter_name in page_filters: - if "filter_html" in dir(filters[filter_name]): - logging.info("... applying HTML filter %s" % filter_name) - html = filters[filter_name].filter_html(html, target=target, - page=page, config=config) - - # Some filters would rather operate on a soup than a string. - # May as well parse once and re-serialize once. - soup = BeautifulSoup(html, "html.parser") - - # Apply soup-based filters here - for filter_name in page_filters: - if "filter_soup" in dir(filters[filter_name]): - logging.info("... applying soup filter %s" % filter_name) - filters[filter_name].filter_soup(soup, target=target, - page=page, config=config) - # ^ the soup filters apply to the same object, passed by reference - - # Replace links for any non-default target - if target["name"] != config["targets"][0]["name"]: - substitute_links_for_target(soup, target) - - logging.info("... re-rendering HTML from soup...") - html2 = str(soup) - return html2 - - -def githubify_markdown(md, target=None, pages=None): - """Github-friendly markdown has absolute links, no md in divs""" - MARKDOWN_LINK_REGEX = re.compile( - r"(\[([^\]]+)\]\(([^:)]+)\)|\[([^\]]+)\]:\s*(\S+)$)", re.MULTILINE) - - target = get_target(target) - if not pages: - pages = get_pages(target["name"]) - - class MDLink: - """A markdown link, either a reference link or inline link""" - def __init__(self, fullmatch, label, url, label2, url2): - self.fullmatch = fullmatch - if label: - self.label = label - self.url = url - self.is_reflink = False - elif label2: - self.label = label2 - self.url = url2 - self.is_reflink = True - - def to_markdown(self): - """Re-represent self as a link in markdown syntax""" - s = "[" + self.label + "]" - if self.is_reflink: - s += ": " + self.url - else: - s += "(" + self.url + ")" - return s - - links = [MDLink(*m) for m in MARKDOWN_LINK_REGEX.findall(md)] - - for link in links: - for page in pages: - if target["name"] in page: - #There's a replacement link for this - local_url = page["html"] - target_url = page[target["name"]] - if link.url[:len(local_url)] == local_url: - link.url = link.url.replace(local_url, target_url) - md = md.replace(link.fullmatch, link.to_markdown()) - - return md - - -def get_pages(target=None): - """Read pages from config and return an object, optionally filtered - to just the pages that this target cares about""" - - target = get_target(target) - pages = config["pages"] - - if target["name"]: - #filter pages that aren't part of this target - def should_include(page, target_name): - if "targets" not in page: - return False - if target_name in page["targets"]: - return True - else: - return False - pages = [page for page in pages - if should_include(page, target["name"])] - - # Pages should 
inherit non-reserved keys from the target - for p in pages: - for key,val in target.items(): - if key in RESERVED_KEYS_TARGET: - continue - elif key not in p: - p[key] = val - return pages - - -def get_categories(pages): - """Produce an ordered, de-duplicated list of categories from - the page list""" - categories = [] - for page in pages: - if "category" in page and page["category"] not in categories: - categories.append(page["category"]) - logger.info("categories: %s" % categories) - return categories - - -def read_markdown_local(filename, pp_env, target=None, bypass_errors=False, currentpage={}): - """Read in a markdown file and pre-process any templating lang in it, - returning the parsed contents.""" - target = get_target(target) - pages = get_pages(target) - logging.info("reading markdown from file: %s" % filename) - - if config["skip_preprocessor"]: - fpath = pp_env.loader.searchpath[0] - with open(os.path.join(fpath,filename), "r") as f: - md_out = f.read() - else: - try: - md_raw = pp_env.get_template(filename) - md_out = md_raw.render(target=target, pages=pages, currentpage=currentpage) - except TemplateError as e: - traceback.print_tb(e.__traceback__) - if bypass_errors: - logging.warn("Error pre-processing page %s; trying to load it raw" - % filename) - fpath = pp_env.loader.searchpath[0] - with open(os.path.join(fpath,filename), "r") as f: - md_out = f.read() - else: - exit("Error pre-processing page %s: %s" % (filename, e)) - return md_out - - -def read_markdown_remote(url): - """Fetch a remote markdown file and return its contents""" - response = requests.get(url) - if response.status_code == 200: - return response.text - else: - raise requests.RequestException("Status code for page was not 200") - - -def get_markdown_for_page(md_where, pp_env=None, target=None, bypass_errors=False, currentpage={}): - """Read/Fetch and pre-process markdown file""" - target = get_target(target) - if "http:" in md_where or "https:" in md_where: - try: - mdr = read_markdown_remote(md_where) - except requests.RequestException as e: - if bypass_errors: - mdr = "" - else: - traceback.print_tb(e.__traceback__) - exit("Error fetching page %s: %s" % (md_where, e)) - return mdr - else: - return read_markdown_local(md_where, pp_env, target, bypass_errors, currentpage=currentpage) - - -def copy_static_files(template_static=True, content_static=True, out_path=None): - """Copy static files to the output directory.""" - if out_path == None: - out_path = config["out_path"] - - - if template_static: - template_static_src = config["template_static_path"] - template_static_dst = os.path.join(out_path, - os.path.basename(template_static_src)) - copy_tree(template_static_src, template_static_dst) - - if content_static: - content_static_src = config["content_static_path"] - content_static_dst = os.path.join(out_path, - os.path.basename(content_static_src)) - copy_tree(content_static_src, content_static_dst) - -def setup_pp_env(page=None): - if not page or "pp_dir" not in page: - pp_env = Environment(loader=FileSystemLoader(config["content_path"])) - else: - pp_env = Environment(loader=FileSystemLoader(page["pp_dir"])) - #Example: if we want to add custom functions to the md files - #pp_env.globals['foo'] = lambda x: "FOO %s"%x - return pp_env - -def setup_html_env(): - env = Environment(loader=FileSystemLoader(config["template_path"])) - env.lstrip_blocks = True - env.trim_blocks = True - return env - -def toc_from_headers(html_string): - """make a table of contents from headers""" - soup = 
BeautifulSoup(html_string, "html.parser") - headers = soup.find_all(name=re.compile("h[1-3]"), id=True) - toc_s = "" - for h in headers: - if h.name == "h1": - toc_level = "level-1" - elif h.name == "h2": - toc_level = "level-2" - else: - toc_level = "level-3" - - new_a = soup.new_tag("a", href="#"+h["id"]) - if h.string: - new_a.string = h.string - else: - new_a.string = " ".join(h.strings) - new_li = soup.new_tag("li") - new_li["class"] = toc_level - new_li.append(new_a) - - toc_s += str(new_li)+"\n" - - return str(toc_s) - -def render_pages(target=None, for_pdf=False, bypass_errors=False): - """Parse and render all pages in target, writing files to out_path.""" - target = get_target(target) - pages = get_pages(target) - categories = get_categories(pages) - - # Insert generated HTML into templates using this Jinja environment - env = setup_html_env() - - if for_pdf: - if "pdf_template" in target: - logging.debug("reading pdf template %s from target..." % target["pdf_template"]) - default_template = env.get_template(target["pdf_template"]) - else: - logging.debug("reading default pdf template %s..." % config["pdf_template"]) - default_template = env.get_template(config["pdf_template"]) - else: - if "template" in target: - logging.debug("reading HTML template %s from target..." % target["template"]) - default_template = env.get_template(target["template"]) - else: - logging.debug("reading default HTML template %s..." % config["default_template"]) - default_template = env.get_template(config["default_template"]) - - for currentpage in pages: - if "md" in currentpage: - # Read and parse the markdown - - try: - html_content = parse_markdown(currentpage, target=target, - pages=pages, bypass_errors=bypass_errors) - - except Exception as e: - if bypass_errors: - traceback.print_tb(e.__traceback__) - logging.warning( ("Skipping page %s " + - "due to error fetching contents: %s") % - (currentpage["name"], e) ) - continue - else: - traceback.print_tb(e.__traceback__) - exit("Error when fetching page %s: %s" % - (currentpage["name"], e) ) - else: - html_content = "" - - # default to a table-of-contents sidebar... - if "sidebar" not in currentpage: - currentpage["sidebar"] = "toc" - if currentpage["sidebar"] == "toc": - sidebar_content = toc_from_headers(html_content) - else: - sidebar_content = None - - # Prepare some parameters for rendering - substitute_parameter_links("doc_page", currentpage, target) - current_time = time.strftime("%B %d, %Y") - - # Figure out which template to use - if "template" in currentpage and not for_pdf: - logging.info("using template %s from page" % currentpage["template"]) - use_template = env.get_template(currentpage["template"]) - elif "pdf_template" in currentpage and for_pdf: - logging.info("using pdf_template %s from page" % currentpage["pdf_template"]) - use_template = env.get_template(currentpage["pdf_template"]) - else: - use_template = default_template - - # Render the content into the appropriate template - out_html = use_template.render(currentpage=currentpage, - categories=categories, - pages=pages, - content=html_content, - target=target, - current_time=current_time, - sidebar_content=sidebar_content) - - - if for_pdf: - out_path = config["temporary_files_path"] - else: - out_path = config["out_path"] - fileout = os.path.join(out_path, currentpage["html"]) - if not os.path.isdir(out_path): - logging.info("creating build folder %s" % out_path) - os.makedirs(out_path) - with open(fileout, "w") as f: - logging.info("writing to file: %s..." 
% fileout) - f.write(out_html) - - -def watch(pdf_file, target): - """Look for changed files and re-generate HTML (and optionally - PDF whenever there's an update. Runs until interrupted.""" - target = get_target(target) - - class UpdaterHandler(PatternMatchingEventHandler): - """Updates to pattern-matched files means rendering.""" - def on_any_event(self, event): - logging.info("got event!") - # bypass_errors=True because Watch shouldn't - # just die if a file is temporarily not found - if pdf_file: - make_pdf(pdf_file, target=target, bypass_errors=True) - else: - render_pages(target, bypass_errors=True) - logging.info("done rendering") - - patterns = ["*template-*.html", - "*.md", - "*code_samples/*"] - - event_handler = UpdaterHandler(patterns=patterns) - observer = Observer() - observer.schedule(event_handler, config["template_path"], recursive=True) - observer.schedule(event_handler, config["content_path"], recursive=True) - observer.start() - # The above starts an observing thread, - # so the main thread can just wait - try: - while True: - time.sleep(1) - except KeyboardInterrupt: - observer.stop() - observer.join() - - -def make_pdf(outfile, target=None, bypass_errors=False): - """Use prince to convert several HTML files into a PDF""" - logging.info("rendering PDF-able versions of pages...") - target = get_target(target) - render_pages(target=target, for_pdf=True, bypass_errors=bypass_errors) - - temp_files_path = config["temporary_files_path"] - - # Prince will need the static files, so copy them over - copy_static_files(out_path=temp_files_path) - - # Make sure the path we're going to write the PDF to exists - if not os.path.isdir(config["out_path"]): - logging.info("creating build folder %s" % config["out_path"]) - os.makedirs(config["out_path"]) - - # Start preparing the prince command - args = [config["prince_executable"], '--javascript', '-o', outfile] - # Each HTML output file in the target is another arg to prince - pages = get_pages(target) - args += [os.path.join(temp_files_path, p["html"]) for p in pages] - - logger.info("generating PDF: running %s..." % " ".join(args)) - prince_resp = subprocess.check_output(args, universal_newlines=True) - print(prince_resp) - - -def githubify(md_file_name, target=None): - """Wrapper - make the markdown resemble GitHub flavor""" - target = get_target(target) - - pages = get_pages() - logging.info("getting markdown for page %s" % md_file_name) - md = get_markdown_for_page(md_file_name, - pp_env=setup_pp_env(), - target=target) - - logging.info("githubifying markdown...") - rendered_md = githubify_markdown(md, target=target, pages=pages) - - if not os.path.isdir(config["out_path"]): - logging.info("creating build folder %s" % config["out_path"]) - os.makedirs(config["out_path"]) - - fileout = os.path.join(config["out_path"], md_file_name) - logging.info("writing generated file to path: %s"%fileout) - with open(fileout, "w") as f: - f.write(rendered_md) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Generate static site from markdown and templates.') - parser.add_argument("--watch", "-w", action="store_true", - help="Watch for changes and re-generate output. "+\ - "This runs until force-quit.") - parser.add_argument("--pdf", nargs="?", type=str, - const=DEFAULT_PDF_FILE, default=NO_PDF, - help="Output a PDF to this file. 
Requires Prince.") - parser.add_argument("--githubify", "-g", type=str, - help="Output md prepared for GitHub") - parser.add_argument("--target", "-t", type=str, - help="Build for the specified target.") - parser.add_argument("--out_dir", "-o", type=str, - help="Output to this folder (overrides config file)") - parser.add_argument("--quiet", "-q", action="store_true", - help="Suppress status messages") - parser.add_argument("--bypass_errors", "-b", action="store_true", - help="Continue building if some contents not found") - parser.add_argument("--config", "-c", type=str, - help="Specify path to an alternate config file.") - parser.add_argument("--copy_static", "-s", action="store_true", - help="Copy static files to the out dir", - default=False) - parser.add_argument("--pages", type=str, help="Build markdown page(s) "+\ - "that aren't described in the config.", nargs="+") - parser.add_argument("--no_cover", "-n", action="store_true", - help="(with --pages only) Don't automatically add a "+\ - "cover page / index.html file.") - parser.add_argument("--skip_preprocessor", action="store_true", default=False, - help="Don't pre-process Jinja syntax in markdown files") - parser.add_argument("--title", type=str, help="Override target display "+\ - "name. Useful when passing multiple args to --pages.") - parser.add_argument("--list_targets_only", "-l", action="store_true", - help="Don't build anything, just display list of "+ - "known targets from the config file.") - cli_args = parser.parse_args() - - if not cli_args.quiet: - logging.basicConfig(level=logging.INFO) - - if cli_args.config: - load_config(cli_args.config) - else: - load_config() - - if cli_args.list_targets_only: - for t in config["targets"]: - if "display_name" in t: - display_name = t["display_name"] - elif {"product","version","guide"} <= set(t.keys()): - display_name = " ".join([t["product"],t["version"],t["guide"]]) - else: - display_name = "" - print("%s\t\t%s" % (t["name"], display_name)) - - #print(" ".join([t["name"] for t in config["targets"]])) - exit(0) - - if cli_args.out_dir: - config["out_path"] = cli_args.out_dir - - config["skip_preprocessor"] = cli_args.skip_preprocessor - - if cli_args.pages: - make_adhoc_target(cli_args.pages, cli_args.no_cover) - cli_args.target = ADHOC_TARGET - - if cli_args.title: - target = get_target(cli_args.target) - target["display_name"] = cli_args.title - - if cli_args.githubify: - githubify(cli_args.githubify, cli_args.target) - if cli_args.copy_static: - copy_static(template_static=False, content_static=True) - exit(0) - - if cli_args.pdf != NO_PDF: - if cli_args.pdf == DEFAULT_PDF_FILE: - pdf_path = os.path.join(config["out_path"], - default_pdf_name(cli_args.target)) - elif cli_args.pdf[-4:] != ".pdf": - exit("PDF filename must end in .pdf") - else: - pdf_path = os.path.join(config["out_path"], cli_args.pdf) - logging.info("making a pdf...") - make_pdf(pdf_path, target=cli_args.target, - bypass_errors=cli_args.bypass_errors) - logging.info("pdf done") - - else: - logging.info("rendering pages...") - render_pages(target=cli_args.target, - bypass_errors=cli_args.bypass_errors) - logging.info("done rendering") - - if cli_args.copy_static: - logging.info("copying static pages...") - copy_static_files() - - if cli_args.watch: - logging.info("watching for changes...") - if cli_args.pdf: - pdf_path = os.path.join(config["out_path"], cli_args.pdf) - watch(pdf_path, cli_args.target) - else: - watch(None, cli_args.target) diff --git a/tool/dactyl_link_checker.py b/tool/dactyl_link_checker.py 
deleted file mode 100755 index 668fdc47eb..0000000000 --- a/tool/dactyl_link_checker.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python3 -import requests -import os -import yaml -import argparse -import logging -import re -from bs4 import BeautifulSoup -from time import time, sleep - -DEFAULT_CONFIG_FILE = "dactyl-config.yml" -TIMEOUT_SECS = 9.1 -CHECK_IN_INTERVAL = 30 -FINAL_RETRY_DELAY = 4 * CHECK_IN_INTERVAL - -soupsCache = {} -def getSoup(fullPath): - if fullPath in soupsCache.keys(): - soup = soupsCache[fullPath] - else: - with open(fullPath, 'r') as f: - soup = BeautifulSoup(f.read(), "html.parser") - soupsCache[fullPath] = soup - return soup - -def check_for_unparsed_reference_links(soup): - #unmatched_reflink_regex = re.compile(r"\[[^\]]+\]\[(\w| )*\]") - unmatched_reflink_regex = re.compile(r"(\[[^\]]+)?\]\[(\w| )*\]") - unparsed_links = [] - for s in soup.strings: - m = re.search(unmatched_reflink_regex, s) - if m: - unparsed_links.append(m.group(0)) - return unparsed_links - -def check_remote_url(endpoint, fullPath, broken_links, externalCache, isImg=False): - if isImg: - linkword = "image" - else: - linkword = "link" - if endpoint in [v for k,v in broken_links]: - # We already confirmed this was broken, so just add another instance - logging.warning("Broken %s %s appears again in %s" % (linkword, endpoint, fullPath)) - broken_links.append( (fullPath, endpoint) ) - return False - if endpoint in externalCache: - logging.debug("Skipping cached %s %s" % (linkword, endpoint)) - return True - if endpoint in config["known_broken_links"]: - logging.warning("Skipping known broken %s %s in %s" % (linkword, endpoint, fullPath)) - return True - - logging.info("Testing remote %s URL %s"%(linkword, endpoint)) - try: - code = requests.head(endpoint, timeout=TIMEOUT_SECS).status_code - except Exception as e: - logging.warning("Error occurred: %s" % e) - code = 500 - if code == 405 or code == 404: - #HEAD didn't work, maybe GET will? - try: - code = requests.get(endpoint, timeout=TIMEOUT_SECS).status_code - except Exception as e: - logging.warning("Error occurred: %s" % e) - code = 500 - - if code < 200 or code >= 400: - logging.warning("Broken remote %s in %s to %s"%(linkword, fullPath, endpoint)) - broken_links.append( (fullPath, endpoint) ) - return False - else: - logging.info("...success.") - externalCache.append(endpoint) - return True - -def checkLinks(offline=False): - externalCache = [] - broken_links = [] - num_links_checked = 0 - last_checkin = time() - for dirpath, dirnames, filenames in os.walk(config["out_path"]): - if time() - last_checkin > CHECK_IN_INTERVAL: - last_checkin = time() - print("... still working (dirpath: %s) ..." % dirpath) - if os.path.abspath(dirpath) == os.path.abspath(config["template_path"]): - # don't try to parse and linkcheck the templates - continue - for fname in filenames: - if time() - last_checkin > CHECK_IN_INTERVAL: - last_checkin = time() - print("... still working (file: %s) ..." % fname) - fullPath = os.path.join(dirpath, fname) - if "/node_modules/" in fullPath or ".git" in fullPath: - logging.debug("skipping ignored dir: %s" % fullPath) - continue - if fullPath.endswith(".html"): - soup = getSoup(fullPath) - unparsed_links = check_for_unparsed_reference_links(soup) - if unparsed_links: - logging.warning("Found %d unparsed Markdown reference links: %s" % - (len(unparsed_links), "\n... 
".join(unparsed_links))) - [broken_links.append( (fullPath, u) ) for u in unparsed_links] - links = soup.find_all('a') - for link in links: - if time() - last_checkin > CHECK_IN_INTERVAL: - last_checkin = time() - print("... still working (link: %s) ..." % link) - if "href" not in link.attrs: - #probably an type anchor, skip - continue - - endpoint = link['href'] - if not endpoint.strip(): - logging.warning("Empty link in %s" % fullPath) - broken_links.append( (fullPath, endpoint) ) - num_links_checked += 1 - - elif endpoint == "#": - continue - - elif "mailto:" in endpoint: - logging.info("Skipping email link in %s to %s"%(fullPath, endpoint)) - continue - - elif "://" in endpoint: - if offline: - logging.info("Offline - Skipping remote URL %s"%(endpoint)) - continue - - num_links_checked += 1 - check_remote_url(endpoint, fullPath, broken_links, externalCache) - - - elif '#' in endpoint: - if fname in config["ignore_anchors_in"]: - logging.info("Ignoring anchor %s in dynamic page %s"%(endpoint,fname)) - continue - logging.info("Testing local link %s from %s"%(endpoint, fullPath)) - num_links_checked += 1 - filename,anchor = endpoint.split("#",1) - if filename == "": - fullTargetPath = fullPath - else: - fullTargetPath = os.path.join(dirpath, filename) - if not os.path.exists(fullTargetPath): - logging.warning("Broken local link in %s to %s"%(fullPath, endpoint)) - broken_links.append( (fullPath, endpoint) ) - - elif filename in config["ignore_anchors_in"]: - #Some pages are populated dynamically, so BeatifulSoup wouldn't - # be able to find anchors in them anyway - logging.info("Skipping anchor link in %s to dynamic page %s" % - (fullPath, endpoint)) - continue - - elif fullTargetPath != "../": - num_links_checked += 1 - targetSoup = getSoup(fullTargetPath) - if not targetSoup.find(id=anchor) and not targetSoup.find( - "a",attrs={"name":anchor}): - logging.warning("Broken anchor link in %s to %s"%(fullPath, endpoint)) - broken_links.append( (fullPath, endpoint) ) - else: - logging.info("...anchor found.") - continue - - elif endpoint[0] == '/': - #can't really test links out of the local field - logging.info("Skipping absolute link in %s to %s"%(fullPath, endpoint)) - continue - - else: - num_links_checked += 1 - if not os.path.exists(os.path.join(dirpath, endpoint)): - logging.warning("Broken local link in %s to %s"%(fullPath, endpoint)) - broken_links.append( (fullPath, endpoint) ) - - #Now check images - imgs = soup.find_all('img') - for img in imgs: - num_links_checked += 1 - if "src" not in img.attrs or not img["src"].strip(): - logging.warning("Broken image with no src in %s" % fullPath) - broken_links.append( (fullPath, img["src"]) ) - continue - - src = img["src"] - if "://" in src: - if offline: - logging.info("Offline - Skipping remote image %s"%(endpoint)) - continue - - check_remote_url(src, fullPath, broken_links, externalCache, isImg=True) - - else: - logging.info("Checking local image %s in %s" % (src, fullPath)) - if os.path.exists(os.path.join(dirpath, src)): - logging.info("...success") - else: - logging.warning("Broken local image %s in %s" % (src, fullPath)) - broken_links.append( (fullPath, src) ) - return broken_links, num_links_checked - -def final_retry_links(broken_links): - """Give the broken remote links a little while to recover in case they're just flaps""" - broken_remote_links = [ (page,link) for page,link in broken_links - if re.match(r"^https?://", link) ] - if not broken_remote_links: - logging.info("(no http/https broken links to retry)") - return - - 
logging.info("Waiting %d seconds to retry broken %d remote links..." - % (FINAL_RETRY_DELAY, len(broken_remote_links))) - start_wait = time() - elapsed = 0 - while elapsed < FINAL_RETRY_DELAY: - sleep(CHECK_IN_INTERVAL) - print("...") - elapsed = time() - start_wait - - retry_cache = [] - retry_broken = [] - for page, link in broken_remote_links: - link_works = check_remote_url(link, page, retry_broken, retry_cache) - if link_works: - logging.info("Link %s in page %s is back online" % (link, page)) - broken_links.remove( (page,link) ) - else: - logging.info("Link %s in page %s is still down." % (link, page)) - -def load_config(config_file=DEFAULT_CONFIG_FILE): - """Reload config from a YAML file.""" - global config - logging.info("loading config file %s..." % config_file) - with open(config_file, "r") as f: - config = yaml.load(f) - assert(config["out_path"]) - assert(type(config["known_broken_links"]) == list) - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Check files in this repository for broken links.') - parser.add_argument("-o", "--offline", action="store_true", - help="Check local anchors only") - parser.add_argument("-s", "--strict", action="store_true", - help="Exit with error even on known problems") - parser.add_argument("--config", "-c", type=str, - help="Specify path to an alternate config file.") - parser.add_argument("-n", "--no_final_retry", action="store_true", - help="Don't wait and retry failed remote links at the end.") - parser.add_argument("--quiet", "-q", action="store_true", - help="Reduce output to just failures and final report") - args = parser.parse_args() - - if not args.quiet: - logging.basicConfig(level=logging.INFO) - - if args.config: - load_config(args.config) - else: - load_config() - - broken_links, num_links_checked = checkLinks(args.offline) - - if not args.no_final_retry and not args.offline: - final_retry_links(broken_links) - #^ sleeps for FINAL_RETRY_DELAY and then retries remote links - # Automatically removes from broken_links if they work now - - print("---------------------------------------") - print("Link check report. %d links checked."%num_links_checked) - - if not args.strict: - unknown_broken_links = [ (page,link) for page,link in broken_links - if link not in config["known_broken_links"] ] - - if not broken_links: - print("Success! No broken links found.") - else: - print("%d broken links found:"%(len(broken_links))) - [print("File:",fname,"Link:",link) for fname,link in broken_links] - - if args.strict or unknown_broken_links: - exit(1) - - print("Success - all broken links are known problems.") diff --git a/tool/dactyl_style_checker.py b/tool/dactyl_style_checker.py deleted file mode 100755 index 06a305d5f4..0000000000 --- a/tool/dactyl_style_checker.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -############################################################################### -## Dactyl Style Police ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 2016 ## -## ## -## Reads the markdown files to try and enforce elements of good style. 
## -############################################################################### - -import logging -import argparse -#import nltk -import re -import collections -import yaml - -from bs4 import BeautifulSoup -from bs4 import Comment -from bs4 import NavigableString - -import dactyl_build - -DEFAULT_CONFIG_FILE = "dactyl-config.yml" -OVERRIDE_COMMENT_REGEX = r" *STYLE_OVERRIDE: *([\w, -]+)" - -logger = logging.getLogger() - -def load_config(config_file=DEFAULT_CONFIG_FILE): - global config - dactyl_build.load_config(config_file) - config = dactyl_build.config - - if "word_substitutions_file" in config: - with open(config["word_substitutions_file"], "r") as f: - config["disallowed_words"] = yaml.load(f) - else: - logging.warning("No 'word_substitutions_file' found in config.") - - if "phrase_substitutions_file" in config: - with open(config["phrase_substitutions_file"], "r") as f: - config["disallowed_phrases"] = yaml.load(f) - else: - logging.warning("No 'phrase_substitutions_file' found in config.") - -def tokenize(passage): - words = re.split(r"[\s,.;()!'\"]+", passage) - return [w for w in words if w] - -def depunctuate(passage): - punctuation = re.compile(r"[,.;()!'\"]") - return re.sub(punctuation, "", passage) - -def check_all_pages(target=None): - """Reads all pages for a target and checks them for style.""" - target = dactyl_build.get_target(target) - pages = dactyl_build.get_pages(target) - - pp_env = dactyl_build.setup_pp_env() - - print("Style Checker - checking all pages in target %s" % target["name"]) - - style_issues = [] - for page in pages: - if "md" not in page: - # Not a doc page, move on - continue - logging.info("Checking page %s" % page["name"]) - page_issues = [] - html = dactyl_build.parse_markdown(page, pages=pages, target=target) - soup = BeautifulSoup(html, "html.parser") - - overrides = get_overrides(soup) - - content_elements = ["p","li","a","em","strong","th","td", - "h1","h2","h3","h4","h5","h6"] - for el in soup.descendants: - if (type(el) == NavigableString and - el.parent.name in content_elements and - str(el).strip()): - passage = str(el).strip() - passage_issues = check_passage(passage, overrides) - if passage_issues: - page_issues += passage_issues - #print("'%s' (%s)" % (el, el.parent.name)) - # for el in soup.find_all(content_elements): - # for passage in el.stripped_strings: - # passage_issues = check_passage(passage, overrides) - # if passage_issues: - # page_issues += passage_issues - - if page_issues: - style_issues.append( (page["name"], page_issues) ) - - return style_issues - -def get_overrides(soup): - overrides = [] - comments = soup.find_all(string=lambda text:isinstance(text,Comment)) - for comment in comments: - m = re.match(OVERRIDE_COMMENT_REGEX, comment) - if m: - new_overrides = m.group(1).split(",") - new_overrides = [o.strip() for o in new_overrides] - logging.info("Overrides found: %s" % new_overrides) - overrides += new_overrides - return overrides - -def check_passage(passage, overrides): - """Checks an individual string of text for style issues.""" - issues = [] - logging.debug("Checking passage %s" % passage) - #tokens = nltk.word_tokenize(passage) - tokens = tokenize(passage) - for t in tokens: - if t.lower() in config["disallowed_words"]: - if t.lower() in overrides: - logging.info("Unplain word violation %s overridden" % t) - continue - issues.append( ("Unplain Word", t.lower()) ) - - for phrase,sub in config["disallowed_phrases"].items(): - if phrase.lower() in depunctuate(passage): - if phrase.lower() in overrides: - 
logging.info("Unplain phrase violation %s overridden" % t) - continue - #logging.warn("Unplain phrase: %s; suggest %s instead" % (phrase, sub)) - issues.append( ("Unplain Phrase", phrase.lower()) ) - - return issues - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Check content files for style issues.") - parser.add_argument("--config", "-c", type=str, - help="Specify path to an alternate config file.") - parser.add_argument("--verbose", "-v", action="store_true", - help="Show status messages") - parser.add_argument("--target", "-t", type=str, - help="Check the specified target.") - cli_args = parser.parse_args() - - if cli_args.verbose: - logging.basicConfig(level=logging.INFO) - - if cli_args.config: - load_config(cli_args.config) - else: - load_config() - - issues = check_all_pages(target=cli_args.target) - if issues: - num_issues = sum(len(p[1]) for p in issues) - print("Found %d issues:" % num_issues) - for pagename,issuelist in issues: - print("Page: %s" % pagename) - c = collections.Counter(issuelist) - for i, count_i in c.items(): - if i[0]=="Unplain Phrase": - print(" Discouraged phrase: %s (%d instances); suggest '%s' instead." % - ( i[1], count_i, config["disallowed_phrases"][i[1].lower()] )) - elif i[0]=="Unplain Word": - print(" Discouraged word: %s (%d instances); suggest '%s' instead." % - ( i[1], count_i, config["disallowed_words"][i[1].lower()] )) - else: - print(" %s: %s (%d instances)" % (i[0], i[1], count_i)) - exit(1) - else: - print("Style check passed with flying colors!") - exit(0) diff --git a/tool/filter_add_version.py b/tool/filter_add_version.py deleted file mode 100644 index 289a182f33..0000000000 --- a/tool/filter_add_version.py +++ /dev/null @@ -1,31 +0,0 @@ -################################################################################ -## Add version to markdown filter ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 2016 ## -## ## -## Adds a message to the beginning of a file with a version number, based on ## -## the URL of the remotely-fetched markdown. ## -################################################################################ -import re -import logging - -def filter_markdown(md, target=None, page=None, config=None): - """Finds the version number and adds it to the start of the page.""" - version_regex = r"https://raw.githubusercontent.com/([A-Za-z0-9_.-]+)/([A-Za-z0-9_.-]+)/([A-Za-z0-9_-]+\.[A-Za-z0-9_.-]+)/.+\.md" - - try: - version_match = re.match(version_regex, page["md"]) - except (TypeError, KeyError): - logging.warning("couldn't get MD path from page %s" % page) - return md - - try: - github_owner = version_match.group(1) - github_project = version_match.group(2) - vnum = version_match.group(3) - url = "https://github.com/%s/%s/releases/%s" % (github_owner, github_project, vnum) - md = ("
<a href='%s'>Updated for version %s</a>\n\n
"%(url, vnum))+md - except AttributeError: - logging.warning("version regex didn't match: %s" % version_match) - - return md diff --git a/tool/filter_badges.py b/tool/filter_badges.py deleted file mode 100644 index 589da446c2..0000000000 --- a/tool/filter_badges.py +++ /dev/null @@ -1,44 +0,0 @@ -################################################################################ -## Badges filter ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 2016 ## -## ## -## Looks for links with the title text "BADGE" and makes them into badges. ## -## The alt text must be in the form of : and ## -## the left half can't contain a colon. ## -################################################################################ -import re -import logging -from urllib.parse import quote as urlescape - -BADGE_REGEX = re.compile("BADGE_(BRIGHTGREEN|GREEN|YELLOWGREEN|YELLOW|ORANGE|RED|LIGHTGREY|BLUE|[0-9A-Fa-f]{6})") - -def filter_soup(soup, target=None, page=None, config=None): - """replace underscores with dashes in h1,h2,etc. for backwards compatibility""" - - badge_links = soup.find_all(name="a", title=BADGE_REGEX) - - for b in badge_links: - badge_label = b.string - if not badge_label: - badge_label = "".join(b.strings) - if not badge_label: - logging.warning("Badge link with no string: %s" % b) - continue - if ":" not in badge_label: - logging.warning("Badge link specified with no ':' in link: %s" % b.string) - continue - - badge_color = BADGE_REGEX.match(b["title"]).group(1).lower() - badge_left, badge_right = [urlescape(s.strip()).replace("-","--") - for s in badge_label.split(":", 1)] - badge_url = "https://img.shields.io/badge/%s-%s-%s.svg" % ( - badge_left, badge_right, badge_color) - - img = soup.new_tag("img", src=badge_url, alt=badge_label) - img["class"]="dactyl_badge" - b.clear() - b.append(img) - b["title"] = badge_label - if not b["href"]: - del b["href"] diff --git a/tool/filter_buttonize.py b/tool/filter_buttonize.py deleted file mode 100644 index edda8d69a5..0000000000 --- a/tool/filter_buttonize.py +++ /dev/null @@ -1,18 +0,0 @@ -################################################################################ -## Buttonize links ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 2016 ## -## ## -## Looks for links ending in >, and adds a "button" class to those links so ## -## they can be styled like buttons in the page. ## -################################################################################ -import re - -def filter_soup(soup, target=None, page=None, config=None): - """make links ending in > render like buttons""" - buttonlinks = soup.find_all("a", string=re.compile(">$")) - for link in buttonlinks: - if "class" in link.attrs: - link["class"].append("button") - else: - link["class"] = "button" diff --git a/tool/filter_callouts.py b/tool/filter_callouts.py deleted file mode 100644 index 6fa83e423c..0000000000 --- a/tool/filter_callouts.py +++ /dev/null @@ -1,27 +0,0 @@ -################################################################################ -## Callouts filter ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 2016 ## -## ## -## Looks for sections starting **Note:** or **Caution:** and gives them CSS ## -## classes like "callout note" so they can be styled accordinglyselfselfself. 
diff --git a/tool/filter_buttonize.py b/tool/filter_buttonize.py
deleted file mode 100644
index edda8d69a5..0000000000
--- a/tool/filter_buttonize.py
+++ /dev/null
@@ -1,18 +0,0 @@
-################################################################################
-## Buttonize links                                                            ##
-## Author: Rome Reginelli                                                     ##
-## Copyright: Ripple Labs, Inc. 2016                                          ##
-##                                                                            ##
-## Looks for links ending in >, and adds a "button" class to those links so  ##
-## they can be styled like buttons in the page.                               ##
-################################################################################
-import re
-
-def filter_soup(soup, target=None, page=None, config=None):
-    """make links ending in > render like buttons"""
-    buttonlinks = soup.find_all("a", string=re.compile(">$"))
-    for link in buttonlinks:
-        if "class" in link.attrs:
-            link["class"].append("button")
-        else:
-            link["class"] = "button"
diff --git a/tool/filter_callouts.py b/tool/filter_callouts.py
deleted file mode 100644
index 6fa83e423c..0000000000
--- a/tool/filter_callouts.py
+++ /dev/null
@@ -1,27 +0,0 @@
-################################################################################
-## Callouts filter                                                            ##
-## Author: Rome Reginelli                                                     ##
-## Copyright: Ripple Labs, Inc. 2016                                          ##
-##                                                                            ##
-## Looks for sections starting **Note:** or **Caution:** and gives them CSS  ##
-## classes like "callout note" so they can be styled accordingly.             ##
-################################################################################
-import re
-
-CALLOUT_CLASS_MAPPING = {
-    "note": "devportal-callout note",
-    "warning": "devportal-callout warning",
-    "caution": "devportal-callout caution",
-    "tip": "devportal-callout tip",
-}
-
-def filter_soup(soup, target=None, page=None, config=None):
-    """Give callout blocks appropriate CSS classes based on their intro text."""
-    callout_intro = re.compile(r"(Note|Warning|Tip|Caution):?$", re.I)
-    callouts = soup.find_all(name=["strong","em"], string=callout_intro)
-    for c in callouts:
-        if not c.previous_sibling: #This callout starts a block
-            callout_type = c.string.replace(":","").lower()
-            if callout_type in CALLOUT_CLASS_MAPPING:
-                c.parent["class"] = CALLOUT_CLASS_MAPPING[callout_type]
-            #c.parent["class"] = "callout %s" % callout_type
diff --git a/tool/filter_markdown_in_divs.py b/tool/filter_markdown_in_divs.py
deleted file mode 100644
index 890b71c2f7..0000000000
--- a/tool/filter_markdown_in_divs.py
+++ /dev/null
@@ -1,25 +0,0 @@
-################################################################################
-## Add Markdown Class to Divs filter                                          ##
-## Author: Rome Reginelli                                                     ##
-## Copyright: Ripple Labs, Inc. 2016                                          ##
-##                                                                            ##
-## Finds raw divs in the markdown and adds the markdown=1 attribute to them  ##
-## so that HTML inside those divs gets parsed as markdown.                    ##
-## Some flavors of markdown do this automatically, so this provides           ##
-## compatibility with those.                                                  ##
-################################################################################
-import re
-import logging
-
-def filter_markdown(md, target=None, page=None, config=None):
-    """Python markdown requires markdown="1" on HTML block elements
-    that contain markdown. AND there's a bug where if you use
-    markdown.extensions.extra, it replaces code fences in HTML
-    block elements with garbled text."""
-    def add_markdown_class(m):
-        if m.group(0).find("markdown=") == -1:
-            return m.group(1) + ' markdown="1">'
-        else:
-            return m.group(0)
-
-    logging.info("... adding markdown class to embedded divs...")
-    md = re.sub(r"(<div[^>]*)>", add_markdown_class, md)
-    return md
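The regex in that last filter is easiest to understand by example: it only touches opening div tags that lack a markdown attribute. A runnable sketch (sample input invented):

```python
import re

def add_markdown_class(m):
    # Only add the attribute if the tag doesn't already have one
    if m.group(0).find("markdown=") == -1:
        return m.group(1) + ' markdown="1">'
    return m.group(0)

md = '<div class="callout">\n**Note:** markdown here gets parsed.\n</div>'
print(re.sub(r"(<div[^>]*)>", add_markdown_class, md))
# <div class="callout" markdown="1">
# **Note:** markdown here gets parsed.
# </div>
```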
", html) - html = re.sub(MC_END_REGEX, "
", html) - return html - -def filter_soup(soup, target=None, page=None, config=None): - """Turn a multicode block into the correct syntax for minitabs""" - multicodes = soup.find_all(class_="multicode") - index1 = 0 - for cb_area in multicodes: - cb_area["id"] = "code-%d" % index1 - - codetabs_ul = soup.new_tag("ul") - codetabs_ul["class"] = "codetabs" - cb_area.insert(0,codetabs_ul) - - pres = cb_area.find_all("pre") - index2 = 0 - for pre in pres: - #make a unique ID for this code sample - linkid = "code-%d-%d" % (index1, index2) - - #wrap this code sample in an ID'd div - code_sample_wrapper = soup.new_tag("div", id=linkid) - code_sample_wrapper["class"] = "code_sample" - code_sample_wrapper["style"] = "position: static;" - pre.wrap(code_sample_wrapper) - - #add a link to the tabs ul - linkback = soup.new_tag("a", href=("#%s" % linkid)) - linkback_li = soup.new_tag("li") - linkback_li.append(linkback) - codetabs_ul.append(linkback_li) - - #find the text label for this sample - prev_p = code_sample_wrapper.find_previous_sibling("p") - try: - label = "".join(prev_p.em.strings) - except AttributeError: - label = "Code Sample %d-%d" % (index1, index2) - linkback.string = label - prev_p.decompose() - - index2 += 1 - - index1 += 1 diff --git a/tool/filter_remove_doctoc.py b/tool/filter_remove_doctoc.py deleted file mode 100644 index 25b52cdf75..0000000000 --- a/tool/filter_remove_doctoc.py +++ /dev/null @@ -1,20 +0,0 @@ -################################################################################ -## Remove doctoc filter ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 2016 ## -## ## -## Removes an automatically-generated "doctoc" table of contents, as ## -## delineated by HTML comments, from the markdown source. ## -################################################################################ - - -def filter_markdown(md, target=None, page=None, config=None): - """Strip out doctoc Table of Contents for RippleAPI""" - DOCTOC_START = "" - DOCTOC_END = "" - - doctoc_start_i = md.find(DOCTOC_START) - doctoc_end_i = md.find(DOCTOC_END) - if doctoc_start_i != -1 and doctoc_end_i != -1: - md = md[:doctoc_start_i]+md[doctoc_end_i+len(DOCTOC_END):] - return md diff --git a/tool/filter_standardize_header_ids.py b/tool/filter_standardize_header_ids.py deleted file mode 100644 index 7cb3d8c4fe..0000000000 --- a/tool/filter_standardize_header_ids.py +++ /dev/null @@ -1,17 +0,0 @@ -################################################################################ -## Standardize header IDs ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 2016 ## -## ## -## Replaces underscores with dashes in h1,h2,... element IDs. This provides ## -## compatibility with some other flavors of markdown that generate HTML IDs ## -## differently. ## -################################################################################ -import re - -def filter_soup(soup, target=None, page=None, config=None): - """replace underscores with dashes in h1,h2,etc. for backwards compatibility""" - headers = soup.find_all(name=re.compile("h[0-9]"), id=True) - for h in headers: - if "_" in h["id"]: - h["id"] = h["id"].replace("_", "-") diff --git a/tool/filter_xrefs.py b/tool/filter_xrefs.py deleted file mode 100644 index 5f4fad00f4..0000000000 --- a/tool/filter_xrefs.py +++ /dev/null @@ -1,142 +0,0 @@ -################################################################################ -## XRefs: Intelligent Crossreferences filter ## -## Author: Rome Reginelli ## -## Copyright: Ripple Labs, Inc. 
diff --git a/tool/filter_xrefs.py b/tool/filter_xrefs.py
deleted file mode 100644
index 5f4fad00f4..0000000000
--- a/tool/filter_xrefs.py
+++ /dev/null
@@ -1,142 +0,0 @@
-################################################################################
-## XRefs: Intelligent Crossreferences filter                                  ##
-## Author: Rome Reginelli                                                     ##
-## Copyright: Ripple Labs, Inc. 2017                                          ##
-##                                                                            ##
-## Looks for syntax matching the following format:                            ##
-##     [optional text](XREF: some-link.html#fragment)                         ##
-## and interprets it as cross-references. If some-link.html is a file in the ##
-## current target it becomes a normal hyperlink. If the link text is [] (that##
-## is, blank) it gets replaced with the title of the page.                    ##
-## (Note: we can't look up section titles as that would require parsing the  ##
-## cross-referenced page and could lead to an infinite recursion loop if two ##
-## pages cross-referenced each other.)                                        ##
-## If the file isn't part of the current target but is part of another       ##
-## target, it becomes a non-hyperlink cross reference to the page in the     ##
-## first target that DOES have it. For example:                               ##
-##     "Some Link Title" in _A Target Containing Some Link_                   ##
-################################################################################
-import re
-from logging import warning
-
-# match anything starting with XREF:/xref:, split by the # if there is one
-# dropping any excess whitespace
-xref_regex = re.compile(r"^\s*xref:\s*(?P<xref_file>[^#]+)(?P<xref_frag>#\S+)?\s*?$", re.I)
-
-def find_file_in_target(fname, targetname, config):
-    if fname[-3:] == ".md":
-        # look by markdown file first
-        for page in config["pages"]:
-            if "md" not in page:
-                continue
-            elif ("/" in fname and page["md"] == fname # try to match md file by exact path
-                    and targetname in page.get("targets",[]) # the page appears in this target
-                    and page.get("html","") ): # and finally, the page has an html filename
-                return page
-            elif ( page["md"].split("/")[-1] == fname # match md filename in any directory
-                    and targetname in page.get("targets",[])
-                    and page.get("html","") ):
-                return page
-
-    for page in config["pages"]:
-        if "html" not in page:
-            continue
-        elif page["html"] != fname:
-            continue
-        if targetname in page["targets"]:
-            return page
-    else:
-        return False
-
-def find_file_in_any_target(fname, config):
-    if fname[-3:] == ".md":
-        #print("finding in any target by md")
-        # look by markdown file first
-        for page in config["pages"]:
-            if "md" not in page:
-                continue
-            elif ("/" in fname and page["md"] == fname # try to match md file by exact path
-                    and page.get("targets",[]) # page must appear in some target
-                    and page.get("html","") ): # and page must have an html filename
-                return page
-            elif ( page["md"].split("/")[-1] == fname # match md filename in any folder
-                    and page.get("targets",[])
-                    and page.get("html","") ):
-                return page
-
-    # look by HTML file if it didn't end in .md or if we didn't find it yet
-    for page in config["pages"]:
-        if "html" not in page:
-            continue
-        elif page["html"] == fname and page["targets"]:
-            #page has to have "some" target(s) for it to be worthwhile
-            return page
-    else:
-        return False
-
-def lookup_display_name(targetname, config):
-    for t in config["targets"]:
-        if "name" in t and t["name"] == targetname:
-            display_name = "%s %s %s %s %s" % (
-                t.get("display_name", ""),
-                t.get("product", ""),
-                t.get("version", ""),
-                t.get("guide", ""),
-                t.get("subtitle", "")
-            )
-            if display_name.strip():
-                return display_name
-            else:
-                warning("Target has no display_name/product/version/guide: %s" % targetname)
-                return targetname
-    else:
-        warning("Target not found: %s" % targetname)
-        return targetname
-
-def filter_soup(soup, target={"name":""}, page=None, config={"pages":[]}):
-    """Look for cross-references and replace them with not-hyperlinks if they
-    don't exist in the current target."""
-
-    xrefs = soup.find_all(href=xref_regex)
#print("Crossreferences:", xrefs) - #print("Target pages:", target["pages"]) - - for xref in xrefs: - m = xref_regex.match(xref.attrs["href"]) - xref_file = m.group("xref_file") - xref_frag = m.group("xref_frag") or "" - - xref_page = find_file_in_target(xref_file, target["name"], config) - if xref_page == False: - # Cross-referenced page isn't part of this target - xref_page = find_file_in_any_target(xref_file, config) - if not xref_page: - raise KeyError(("xref to missing file: '%s'. Maybe it's not in the Dactyl config file?")%xref_file) - xref_target_shortname = xref_page["targets"][0] - - ref_target = lookup_display_name(xref_target_shortname, config) - - link_label = " ".join([s for s in xref.stripped_strings]) - # If a link label wasn't provided, generate one from the page name - if not link_label.strip(): - link_label = xref_page["name"] - link_label = link_label.strip() - - # "Link Label" in _Target Display Name_ - span = soup.new_tag("span") - span.attrs["class"] = "dactyl_xref" - span.string = '"%s" in the ' % link_label - em = soup.new_tag("em") - em.string = ref_target - span.append(em) - xref.replace_with(span) - - else: - # The xref is on-target - # First fix the hyperlink. Use the HTML (in case of link-by-md): - xref.attrs["href"] = xref_page["html"]+xref_frag - # If this link's label is only whitespace, fix it - if not [s for s in xref.stripped_strings]: - #print("replacing label for xref", xref) - #print("stripped_strings was", [s for s in xref.stripped_strings]) - xref.string = xref_page["name"] diff --git a/tool/packages.txt b/tool/packages.txt deleted file mode 100644 index 47b0a4fff4..0000000000 --- a/tool/packages.txt +++ /dev/null @@ -1,5 +0,0 @@ -Jinja2==2.8 -Markdown==2.6.2 -watchdog==0.8.3 -requests==2.8.1 -beautifulsoup4==4.4.1