parse_pages - code quality improvements

2025-11-30 00:25:49 +00:00 · 2016-01-14 17:09:46 -08:00
parent 0b8300ed0d
commit 6a9e64a1cf
1 changed files with 132 additions and 107 deletions
--- a/tool/parse_pages.py
+++ b/tool/parse_pages.py
@@ -7,8 +7,8 @@
 # Optionally pre-compile them to HTML (using pandoc & a custom filter)
 ################################################################################

-from jinja2 import Environment, FileSystemLoader
-import os, sys, re
+import os
+import re
 import json
 import argparse

@@ -18,12 +18,13 @@ import subprocess
 #Fetch markdown sources from another repo
 import requests

-#Used for processing and post-processing of markdown
+#Used for processing and pre/post-processing of markdown
+from jinja2 import Environment, FileSystemLoader
 from markdown import markdown
 from bs4 import BeautifulSoup

 #Watchdog stuff
-import time#, logging
+import time
 from watchdog.observers import Observer
 from watchdog.events import PatternMatchingEventHandler

@@ -38,7 +39,7 @@ PRINCE_PAGE_MANIFEST_FILE = "/tmp/devportal-pages.txt"

 PDF_TARGET = "pdf"
 DEFAULT_TARGET = "local"
-MULTICODE_TAB_TARGETS = ["local","ripple.com"]
+MULTICODE_TAB_TARGETS = ["local", "ripple.com"]


 MC_START_REGEX = re.compile("<!-- *<div class='multicode'[^>]*> *-->")
@@ -47,6 +48,7 @@ DOCTOC_START = "<!-- START doctoc generated TOC please keep comment here to allo
 DOCTOC_END = "<!-- END doctoc generated TOC please keep comment here to allow auto update -->"

 def parse_markdown(md, target=DEFAULT_TARGET, pages=None):
+    """Take a markdown string and output HTML for that content"""
    ## Python markdown requires markdown="1" on HTML block elements
    ##     that contain markdown. AND there's a bug where if you use
    ##     markdown.extensions.extra, it replaces code fences in HTML
@@ -57,38 +59,38 @@ def parse_markdown(md, target=DEFAULT_TARGET, pages=None):
 #            return m.group(1) + ' markdown="1">'
 #        else:
 #            return m.group(0)
-#    
+#
 #    md = re.sub("(<div[^>]*)>", add_markdown_class, md)
 #    print("done")
-    
+
    #Strip out doctoc Table of Contents for RippleAPI
    doctoc_start_i = md.find(DOCTOC_START)
    doctoc_end_i = md.find(DOCTOC_END)
    if doctoc_start_i != -1 and doctoc_end_i != -1:
        md = md[:doctoc_start_i]+md[doctoc_end_i+len(DOCTOC_END):]
-    
+
    #the actual markdown parsing is the easy part
    print("parsing markdown...")
    html = markdown(md, extensions=["markdown.extensions.extra",
                                    "markdown.extensions.toc"])
    print("done")
-    
+
    #if target uses multicode tabs, uncomment the divs
    if target in MULTICODE_TAB_TARGETS:
        print("enabling multicode tabs...")
        html = re.sub(MC_START_REGEX, "<div class='multicode'>", html)
        html = re.sub(MC_END_REGEX, "</div>", html)
        print("done")
-    
+
    #replace underscores with dashes in h1,h2,etc. for Flatdoc compatibility
    print("tweaking header IDs...")
    soup = BeautifulSoup(html, "html.parser")
    headers = soup.find_all(name=re.compile("h[0-9]"), id=True)
    for h in headers:
        if "_" in h["id"]:
-            h["id"] = h["id"].replace("_","-")
+            h["id"] = h["id"].replace("_", "-")
    print("done")
-    
+
    #buttonize links ending in >
    print("buttonizing try-it links...")
    buttonlinks = soup.find_all("a", string=re.compile(">$"))
@@ -98,14 +100,14 @@ def parse_markdown(md, target=DEFAULT_TARGET, pages=None):
        else:
            link["class"] = "button"
    print("done")
-    
+
    #Replace links for live site
    if target != DEFAULT_TARGET:
-        print("modifying links for target",target)
+        print("modifying links for target", target)
        if not pages:
            pages = get_pages()
-        
-        links = soup.find_all("a",href=re.compile("^[^.]+\.html"))
+
+        links = soup.find_all("a", href=re.compile(r"^[^.]+\.html"))
        for link in links:
            for page in pages:
                if target in page:
@@ -116,7 +118,7 @@ def parse_markdown(md, target=DEFAULT_TARGET, pages=None):
                        link["href"] = link["href"].replace(local_url,
                                                            target_url)
        print("done")
-    
+
    print("re-rendering HTML")
    html2 = str(soup)
    print("done")
@@ -124,10 +126,12 @@ def parse_markdown(md, target=DEFAULT_TARGET, pages=None):

 MARKDOWN_LINK_REGEX = re.compile(r"(\[([^\]]+)\]\(([^:)]+)\)|\[([^\]]+)\]:\s*(\S+)$)", re.MULTILINE)
 def githubify_markdown(md, target=DEFAULT_TARGET, pages=None):
+    """GitHub-friendly markdown has absolute links, no md in divs"""
    if not pages:
        pages = get_pages()

    class MDLink:
+        """A markdown link, either a reference link or inline link"""
        def __init__(self, fullmatch, label, url, label2, url2):
            self.fullmatch = fullmatch
            if label:
@@ -138,22 +142,18 @@ def githubify_markdown(md, target=DEFAULT_TARGET, pages=None):
                self.label = label2
                self.url = url2
                self.is_reflink = True
-        
+
        def to_markdown(self):
-            s = "["
-            s += self.label
-            s += "]"
+            """Re-represent self as a link in markdown syntax"""
+            s = "[" + self.label + "]"
            if self.is_reflink:
-                s += ": "
-                s += self.url
+                s += ": " + self.url
            else:
-                s += "("
-                s += self.url
-                s += ")"
+                s += "(" + self.url + ")"
            return s

    links = [MDLink(*m) for m in MARKDOWN_LINK_REGEX.findall(md)]
-    
+
    for link in links:
        for page in pages:
            if target in page:
@@ -163,138 +163,158 @@ def githubify_markdown(md, target=DEFAULT_TARGET, pages=None):
                if link.url[:len(local_url)] == local_url:
                    link.url = link.url.replace(local_url, target_url)
                    md = md.replace(link.fullmatch, link.to_markdown())
-    
+
    return md
-    
-def get_pages(target=None,verbose=True):
+
+def get_pages(target=None):
+    """Read pages.json and return an object, optionally filtered
+       to just the pages that this target cares about"""
    with open(PAGE_MANIFEST_FILE) as f:
        pages = json.load(f)
-    
+
    if target:
-    #filter pages that aren't part of this target
+        #filter pages that aren't part of this target
+        def should_include(page, target):
+            if "targets" not in page:
+                return True
+            if target in page["targets"]:
+                return True
+            else:
+                return False
        pages = [page for page in pages
-                 if "targets" not in page or target in page["targets"]
-                ]
+                 if should_include(page, target)]
    return pages

 def render_pages(precompiled, target=DEFAULT_TARGET):
+    """The main work function. Reads pages.json, runs the pre-parser,
+       runs the markdown parser, and writes the resulting
+       HTML files (using the PDF template for the PDF target)."""
    pages = get_pages(target)
    categories = []#ordered, de-duplicated list
    for page in pages:
        if "category" in page and page["category"] not in categories:
            categories.append(page["category"])
-    print("categories:",categories)
-    
+    print("categories:", categories)
+
    env = Environment(loader=FileSystemLoader(os.path.curdir))
    env.lstrip_blocks = True
    env.trim_blocks = True
-    
+
    pp_env = Environment(loader=FileSystemLoader(CONTENT_PATH))
    #Example: if we want to add custom functions to the md files
    #pp_env.globals['foo'] = lambda x: "FOO %s"%x

    for currentpage in pages:
-    
+
        if "md" in currentpage:
            # Documentation file
-    
+
            print("reading template file...")

            doc_template = env.get_template(DOC_TEMPLATE_FILE)
            if target == PDF_TARGET:
                doc_template = env.get_template(PDF_TEMPLATE_FILE)
            print("done")
-            
-    
+
+
            if precompiled:
                if "http:" in currentpage["md"] or "https:" in currentpage["md"]:
-                    #No pre-processing for remote pages
-                    print("fetching remote page",currentpage["name"])
+                    # No pre-processing for remote pages
+                    print("fetching remote page",
+                          currentpage["name"])
                    try:
-                        r = requests.get(currentpage["md"])
-                        if r.status_code == 200:
-                            md_in = r.text
+                        response = requests.get(currentpage["md"])
+                        if response.status_code == 200:
+                            md_in = response.text
                        else:
                            raise requests.RequestException("Status code for page was not 200")
                    except:
-                        print("Skipping page",currentpage["name"],"due to error fetching contents")
+                        print("Skipping page",
+                              currentpage["name"],
+                              "due to error fetching contents")
                        continue
                    print("done")
-                    
+
                else:
-                    ## Read markdown as a template
-                    filein = os.path.join(CONTENT_PATH, currentpage["md"])
-                    print("pre-processing markdown file",currentpage["md"])
+                    # Read markdown as a template
+                    print("pre-processing markdown file",
+                          currentpage["md"])
                    md_raw = pp_env.get_template(currentpage["md"])
-                    md_in = md_raw.render(target=target,pages=pages)
-                
+                    md_in = md_raw.render(target=target,
+                                          pages=pages)
+
                print("parsing markdown for", currentpage["name"])
                doc_html = parse_markdown(md_in, target, pages)
-                
                print("done")
-                
-                print("rendering page",currentpage["name"],"...")
-                out_html = doc_template.render(currentpage=currentpage,
-                                               categories=categories,
-                                               pages=pages, 
-                                               content=doc_html,        
-                                               precompiled=precompiled)
+
+                print("rendering page", currentpage["name"], "...")
+                out_html = doc_template.render(
+                    currentpage=currentpage,
+                    categories=categories,
+                    pages=pages,
+                    content=doc_html,
+                    precompiled=precompiled)
                print("done")
-            
+
            else:
                print("compiling skipped")
-                
-                print("rendering page",currentpage["name"],"...")
-                out_html = doc_template.render(currentpage=currentpage, 
-                                               categories=categories,
-                                               pages=pages, 
-                                               content="", 
-                                               precompiled=precompiled)
+
+                print("rendering page", currentpage["name"], "...")
+                out_html = doc_template.render(
+                    currentpage=currentpage,
+                    categories=categories,
+                    pages=pages,
+                    content="",
+                    precompiled=precompiled)
                print("done")
-        
+
        else:
            # Not a documentation page
            print("reading template file...")
            template = env.get_template(currentpage["template"])
            print("done")
-            
-            
-            print("rendering page",currentpage["name"],"...")
+
+
+            print("rendering page", currentpage["name"], "...")
            out_html = template.render(currentpage=currentpage,
                                       categories=categories,
                                       pages=pages)
            print("done")
-            
-        
+
+
        fileout = os.path.join(BUILD_PATH, currentpage["html"])
-        if (not os.path.isdir(BUILD_PATH)):
-            print("creating build folder",BUILD_PATH)
+        if not os.path.isdir(BUILD_PATH):
+            print("creating build folder", BUILD_PATH)
            os.makedirs(BUILD_PATH)
        with open(fileout, "w") as f:
-            print("writing to file:",fileout,"...")
+            print("writing to file:", fileout, "...")
            f.write(out_html)
            print("done")


 def watch(pre_parse, pdf, target):
+    """Look for changed files and re-generate HTML (and optionally
+       PDF) whenever there's an update. Runs until interrupted."""
    path = ".."
    class UpdaterHandler(PatternMatchingEventHandler):
+        """Any update to a pattern-matched file triggers re-rendering."""
        def on_any_event(self, event):
            print("got event!")
            if pdf:
                make_pdf(pdf)
            render_pages(pre_parse, target)
            print("done rendering")
-    
-    patterns = ["*tool/pages.json","*tool/template-*.html"]
+
+    patterns = ["*tool/pages.json", "*tool/template-*.html"]
    if pre_parse:
        #md only prompts HTML change if pre-parsed
-        patterns.append("*content/*.md",)
+        patterns.extend(["*content/*.md", "*content/code_samples/*"])
    event_handler = UpdaterHandler(patterns=patterns)
    observer = Observer()
    observer.schedule(event_handler, path, recursive=True)
    observer.start()
-    #The above starts an observing thread, so the main thread can just wait
+    # The above starts an observing thread,
+    #   so the main thread can just wait
    try:
        while True:
            time.sleep(1)
@@ -303,18 +323,21 @@ def watch(pre_parse, pdf, target):
    observer.join()

 def make_pdf(outfile):
+    """Use prince to convert several HTML files into a PDF"""
    print("rendering PDF-able versions of pages...")
    render_pages(precompiled=True, target=PDF_TARGET)
    print("done")
-    
+
    args = ['prince', '-o', outfile]
    pages = get_pages(PDF_TARGET)
    args += ["../"+p["html"] for p in pages]
-    print("generating PDF: running ", " ".join(args),"...")
+    print("generating PDF: running ", " ".join(args), "...")
    prince_resp = subprocess.check_output(args, universal_newlines=True)
+    print(prince_resp)


 def githubify(md_file_name, target=DEFAULT_TARGET):
+    """Wrapper - make the markdown resemble GitHub flavor"""
    filein = os.path.join(CONTENT_PATH, md_file_name)
    with open(filein, "r") as f:
        md = f.read()
@@ -324,35 +347,37 @@ def githubify(md_file_name, target=DEFAULT_TARGET):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-            description='Generate static site from markdown and templates.')
+        description='Generate static site from markdown and templates.')
    parser.add_argument("-f", "--flatdoc", action="store_true",
-                       help="Use Flatdoc instead of parsing pages")
-    parser.add_argument("-w","--watch", action="store_true",
-                       help="Watch for changes and re-generate the files. This runs until force-quit.")
-    parser.add_argument("--pdf", type=str, 
-            help="Generate a PDF, too. Requires Prince.")
-    parser.add_argument("-g","--githubify", type=str, help="Output md prepared for GitHub")
-    parser.add_argument("--target", "-t", type=str, default=DEFAULT_TARGET)
-    args = parser.parse_args()
-    pre_parse = not args.flatdoc
-    
-    if args.githubify:
-        githubify(args.githubify, args.target)
+                        help="Use Flatdoc instead of parsing pages")
+    parser.add_argument("-w", "--watch", action="store_true",
+                        help="Watch for changes and re-generate the files. This runs until force-quit.")
+    parser.add_argument("--pdf", type=str,
+                        help="Generate a PDF, too. Requires Prince.")
+    parser.add_argument("-g", "--githubify", type=str,
+                        help="Output md prepared for GitHub")
+    parser.add_argument("--target", "-t", type=str,
+                        default=DEFAULT_TARGET)
+    cli_args = parser.parse_args()
+    pre_parse = not cli_args.flatdoc
+
+    if cli_args.githubify:
+        githubify(cli_args.githubify, cli_args.target)
        exit(0)
-    
-    if args.pdf:
-        if args.pdf[-4:] != ".pdf":
+
+    if cli_args.pdf:
+        if cli_args.pdf[-4:] != ".pdf":
            exit("PDF filename must end in .pdf")
        print("making a pdf...")
-        make_pdf(args.pdf)
+        make_pdf(cli_args.pdf)
        print("pdf done")
-    
+
    #Not an accident that we go on to re-gen files in non-PDF format
    print("rendering pages now")
-    render_pages(precompiled=pre_parse, target=args.target)
+    render_pages(precompiled=pre_parse, target=cli_args.target)
    print("all done")
-    
-    if args.watch:
+
+    if cli_args.watch:
        print("watching for changes...")
-        watch(pre_parse, args.pdf, args.target)
-    
+        watch(pre_parse, cli_args.pdf, cli_args.target)
+