xrpl-dev-portal/tool/dactyl_style_checker.py

#!/usr/bin/env python3
###############################################################################
## Dactyl Style Police                                                       ##
## Author: Rome Reginelli                                                    ##
## Copyright: Ripple Labs, Inc. 2016                                         ##
##                                                                           ##
## Reads the markdown files to try and enforce elements of good style.      ##
###############################################################################
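
# Example invocation (the target name is illustrative):
#   ./dactyl_style_checker.py --config dactyl-config.yml --target my-target --verbose
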
import logging
import argparse
#import nltk
import re
import collections
import yaml
from bs4 import BeautifulSoup
from bs4 import Comment
from bs4 import NavigableString
import dactyl_build
DEFAULT_CONFIG_FILE = "dactyl-config.yml"
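
# Pages can waive specific checks with an in-page HTML comment of the form
# <!-- STYLE_OVERRIDE: term1, term2 -->; this regex parses that comment.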
OVERRIDE_COMMENT_REGEX = r" *STYLE_OVERRIDE: *([\w, -]+)"
logger = logging.getLogger()
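
# The substitution files are assumed to be YAML mappings from a discouraged
# word or phrase to a suggested replacement, for example:
#   utilize: use
#   prior to: before
# (Format inferred from how the lookups are used below.)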
def load_config(config_file=DEFAULT_CONFIG_FILE):
    """Load the Dactyl config plus the word/phrase substitution lists."""
    global config
    dactyl_build.load_config(config_file)
    config = dactyl_build.config
    if "word_substitutions_file" in config:
        with open(config["word_substitutions_file"], "r") as f:
            config["disallowed_words"] = yaml.load(f)
    else:
        logging.warning("No 'word_substitutions_file' found in config.")
        config["disallowed_words"] = {}  # default so later lookups don't KeyError
    if "phrase_substitutions_file" in config:
        with open(config["phrase_substitutions_file"], "r") as f:
            config["disallowed_phrases"] = yaml.load(f)
    else:
        logging.warning("No 'phrase_substitutions_file' found in config.")
        config["disallowed_phrases"] = {}  # default so later lookups don't KeyError
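
# Small text helpers. Illustrative behavior, given the regexes below:
#   tokenize("Plain, simple text!")    -> ["Plain", "simple", "text"]
#   depunctuate("Plain, simple text!") -> "Plain simple text"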
def tokenize(passage):
    words = re.split(r"[\s,.;()!'\"]+", passage)
    return [w for w in words if w]

def depunctuate(passage):
    punctuation = re.compile(r"[,.;()!'\"]")
    return re.sub(punctuation, "", passage)

def check_all_pages(target=None):
    """Reads all pages for a target and checks them for style."""
    target = dactyl_build.get_target(target)
    pages = dactyl_build.get_pages(target)
    pp_env = dactyl_build.setup_pp_env()

    print("Style Checker - checking all pages in target %s" % target["name"])

    style_issues = []
    for page in pages:
        if "md" not in page:
            # Not a doc page, move on
            continue
        logging.info("Checking page %s" % page["name"])
        page_issues = []
        html = dactyl_build.parse_markdown(page, pages=pages, target=target)
        soup = BeautifulSoup(html, "html.parser")
        overrides = get_overrides(soup)

        content_elements = ["p","li","a","em","strong","th","td",
                            "h1","h2","h3","h4","h5","h6"]
        for el in soup.descendants:
            # type() check (not isinstance) means NavigableString subclasses
            # such as Comment don't match here.
            if (type(el) == NavigableString and
                    el.parent.name in content_elements and
                    str(el).strip()):
                passage = str(el).strip()
                passage_issues = check_passage(passage, overrides)
                if passage_issues:
                    page_issues += passage_issues
                #print("'%s' (%s)" % (el, el.parent.name))
        # for el in soup.find_all(content_elements):
        #     for passage in el.stripped_strings:
        #         passage_issues = check_passage(passage, overrides)
        #         if passage_issues:
        #             page_issues += passage_issues

        if page_issues:
            style_issues.append( (page["name"], page_issues) )

    return style_issues
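
# For example, a page containing <!-- STYLE_OVERRIDE: utilize, in order to -->
# yields overrides == ["utilize", "in order to"] (example values only).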
def get_overrides(soup):
    """Collects STYLE_OVERRIDE terms from HTML comments in a parsed page."""
    overrides = []
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        m = re.match(OVERRIDE_COMMENT_REGEX, comment)
        if m:
            new_overrides = m.group(1).split(",")
            new_overrides = [o.strip() for o in new_overrides]
            logging.info("Overrides found: %s" % new_overrides)
            overrides += new_overrides
    return overrides

def check_passage(passage, overrides):
    """Checks an individual string of text for style issues."""
    issues = []
    logging.debug("Checking passage %s" % passage)
    #tokens = nltk.word_tokenize(passage)
    tokens = tokenize(passage)
    for t in tokens:
        if t.lower() in config["disallowed_words"]:
            if t.lower() in overrides:
                logging.info("Unplain word violation %s overridden" % t)
                continue
            issues.append( ("Unplain Word", t.lower()) )
    for phrase, sub in config["disallowed_phrases"].items():
        if phrase.lower() in depunctuate(passage):
            if phrase.lower() in overrides:
                logging.info("Unplain phrase violation %s overridden" % phrase)
                continue
            #logging.warn("Unplain phrase: %s; suggest %s instead" % (phrase, sub))
            issues.append( ("Unplain Phrase", phrase.lower()) )
    return issues
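
# Each issue is an ("Unplain Word"|"Unplain Phrase", term) tuple; the block
# below tallies them per page with collections.Counter.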

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Check content files for style issues.")
    parser.add_argument("--config", "-c", type=str,
        help="Specify path to an alternate config file.")
    parser.add_argument("--verbose", "-v", action="store_true",
        help="Show status messages")
    parser.add_argument("--target", "-t", type=str,
        help="Check the specified target.")
    cli_args = parser.parse_args()

    if cli_args.verbose:
        logging.basicConfig(level=logging.INFO)

    if cli_args.config:
        load_config(cli_args.config)
    else:
        load_config()

    issues = check_all_pages(target=cli_args.target)
    if issues:
        num_issues = sum(len(p[1]) for p in issues)
        print("Found %d issues:" % num_issues)
        for pagename, issuelist in issues:
            print("Page: %s" % pagename)
            c = collections.Counter(issuelist)
            for i, count_i in c.items():
                if i[0] == "Unplain Phrase":
                    print(" Discouraged phrase: %s (%d instances); suggest '%s' instead." %
                          ( i[1], count_i, config["disallowed_phrases"][i[1].lower()] ))
                elif i[0] == "Unplain Word":
                    print(" Discouraged word: %s (%d instances); suggest '%s' instead." %
                          ( i[1], count_i, config["disallowed_words"][i[1].lower()] ))
                else:
                    print(" %s: %s (%d instances)" % (i[0], i[1], count_i))
        exit(1)
    else:
        print("Style check passed with flying colors!")
        exit(0)