Compare commits

...

2 Commits

Author SHA1 Message Date
mDuo13
db877d0d83 Add 'live' link checker powered by Selenium WebDriver 2026-05-21 16:26:52 -07:00
mDuo13
7dc311935e Implement external link checker 2026-05-19 18:45:57 -07:00
4 changed files with 464 additions and 1 deletions

2
.gitignore vendored
View File

@@ -11,6 +11,6 @@ _code-samples/*/js/package-lock.json
_code-samples/*/go/go.sum
_code-samples/*/java/target/
_code-samples/*/*/*[Ss]etup.json
tools/link-cache.json
# PHP
composer.lock

View File

@@ -6,6 +6,7 @@ ignore:
- _code-samples/create-amm/ts/tsconfig.json
- resources/contribute-blog/_blog-template.md
- resources/contribute-documentation/_tutorial-template.md
- tools/*
l10n:
defaultLocale: en-US
locales:

40
tools/broken-links.txt Normal file
View File

@@ -0,0 +1,40 @@
# "Known broken" links file.
# Matching URLs will not be checked by the link checker and won't be
# flagged/reported as broken.
# Typically, these are sites that block crawlers so are falsely reported as
# broken by the link checker, or tend to be slow / unreliable but are generally
# still the proper, official link.
#
# Syntax:
# One fully-qualified http: or https: URL per line.
# Lines starting with # are ignored (comments), but # in the middle of a line is
# treated as part of the URL and parsed as normal.
# * may be used as a wildcard AT THE END OF LINKS ONLY
# These sites block crawlers (often using Cloudflare) with a 403 but typically
# work when accessed from an actual browser:
https://x.com/*
https://www.gnu.org/software/screen/manual/screen.html
https://www.coindesk.com/markets/2015/04/02/1-million-legal-fight-ensnares-ripple-bitstamp-and-jed-mccaleb/
https://www.coindesk.com/markets/2016/02/12/ripple-settles-1-million-lawsuit-with-former-executive-and-founder/
https://bsaaml.ffiec.gov/manual/Introduction/01
https://www.sec.gov/oiea/investor-alerts-and-bulletins/ib_coinofferings
https://www.npmjs.com/*
https://go.dev/dl/
https://search.maven.org/artifact/org.xrpl/xrpl4j-parent
https://xrp.cafe/
https://medium.com/*
# Redocly adds links to claude & ChatGPT to every page. No need to check them
# with a unique query every time:
https://chat.openai.com/?*
https://claude.ai/new?*
# wxPython is still in active development, but their site appears to be dead.
# Maybe it'll come back online, in which case we can remove these:
https://wxpython.org/
https://docs.wxpython.org/*
# The WebSocket tool uses the anchors portion of the URL even though it doesn't
# have a corresponding element by ID, so links to it aren't actually broken:
http://localhost:4000/resources/dev-tools/websocket-api-tool*

422
tools/check-external-links.py Executable file
View File

@@ -0,0 +1,422 @@
#!/usr/bin/env python
"""
Check markdown files for broken links.
Usage (from repo root):
tools/check-external-links.py [folder/to/check/]
If [folder/to/check] is omitted, check ./docs/
Prints a report of broken links. In the default mode, checks external links in
Markdown files only. Pass --live to test against a local Redocly dev server,
which checks *all* links, including anchors and markdoc tags; assumes the
server is already up and running on localhost:4000.
Requires: beautifulsoup4, markdown, requests
For live mode, selenium and Chrome are also required.
Links & sites that often report false-positives can be added to broken-links.txt
to have the link checker skip them.
"""
import argparse
import json
import logging
import os
import re
from time import time, sleep
import requests
from requests.adapters import HTTPAdapter, Retry
from markdown import markdown
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
CHECK_IN_INTERVAL = 30 # Seconds before printing *something* as a keep-alive
DEFAULT_SKIP_PATHS = [
".git",
"node_modules",
".venv",
".claude",
"__pycache__",
"_snippets",
"_code-samples", # Debatably, we might want to link-check the READMEs here
"_api-examples",
"_sources",
]
MAX_RETRIES = 1 # Times to retry if a link doesn't work
TIMEOUT_SECONDS = 8 # Seconds before giving up on a link
RECHECK_INTERVAL = 60*60*24*7 # Seconds before re-checking a link
CACHE_WRITE_INTERVAL = 60 # Save cache after this many seconds even if ongoing
SAME_HOST_DELAY = 0.5 # Seconds to wait before calling the same host again
DEFAULT_CACHE_FILE = "link-cache.json"
CACHE_FILE_FOLDER = "tools" # Check this folder for cache file
KNOWN_BROKEN_LINKS_FILE = "broken-links.txt" # List of links that work "normally" but report false-positives in this link checker
USER_AGENT = "xrpl-dev-portal-link-checker/0.1" # Identify self to websites
REDOCLY_DEV_BASE = "http://localhost:4000/"
UNMATCHED_REFLINK_REGEX = re.compile(r"(\[[^\]]+)?\]\[(\w| )*\]")
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.propagate = False
class LinkChecker:
def __init__(self, topdir,
skip_paths = DEFAULT_SKIP_PATHS,
cache_file = DEFAULT_CACHE_FILE,
known_broken = KNOWN_BROKEN_LINKS_FILE,
live = False
):
self.topdir = topdir
self.skip_paths = skip_paths
self.last_checkin = time()
self.last_cache_update = 0
self.live = live
self.setup_sessions()
self.init_cache(cache_file)
self.init_known_broken(known_broken)
def init_cache(self, cache_file: str):
self.anchor_cache = {}
if not cache_file:
logger.debug("No cache file, not loading anything.")
self.cache = {}
return
# Default to tools/link-cache.json whether the script is run from repo
# top or from within tools
if cache_file == DEFAULT_CACHE_FILE and os.path.basename(os.getcwd()) != CACHE_FILE_FOLDER:
cache_file = os.path.join(CACHE_FILE_FOLDER, cache_file)
self.cache_file = cache_file
try:
with open(cache_file) as f:
self.cache = json.load(f)
except Exception as e:
logger.warning(f"Unable to load cache file {cache_file}")
self.cache = {}
# Invalidate cache entries if last check failed or last success was
# more than RECHECK_INTERVAL ago
invalidate_keys = []
for href, result in self.cache.items():
was_good, time_checked = result
if not was_good or time() - time_checked > RECHECK_INTERVAL:
invalidate_keys.append(href)
if self.trim_trackers(href) != href:
# Probably a cached entry including tracking parameters from
# before changing what tracking parameters get removed
invalidate_keys.append(href)
for href in invalidate_keys:
del self.cache[href]
logger.debug(f"Removed {len(invalidate_keys)} items from cache")
self.last_cache_update = time()
def write_cache(self):
if not self.cache_file:
return
with open(self.cache_file, "w") as f:
json.dump(self.cache, f)
def setup_sessions(self):
retries = Retry(total=MAX_RETRIES, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
self.s = requests.Session()
self.s.mount("https://", HTTPAdapter(max_retries=retries))
self.h = requests.Session()
self.h.mount("http://", HTTPAdapter(max_retries=retries))
self.h.headers.update({"User-Agent": USER_AGENT})
self.s.headers.update({"User-Agent": USER_AGENT})
self.last_host_called = None
if self.live:
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
self.chrome = webdriver.Chrome(options=options)
# Make a second one so we can check for anchors without resetting
# all the references we have in the page the link comes from
self.chrome2 = webdriver.Chrome(options=options)
def init_known_broken(self, known_broken):
self.exact_known_broken = []
self.wildcard_known_broken = []
if known_broken == KNOWN_BROKEN_LINKS_FILE and os.path.basename(os.getcwd()) != CACHE_FILE_FOLDER:
known_broken = os.path.join(CACHE_FILE_FOLDER, known_broken)
try:
with open(known_broken) as f:
kb_text = f.read()
except (FileNotFoundError):
logger.warning("No known broken links file; proceeding without.")
return
for line in kb_text.split("\n"):
line = line.strip()
if not line or line[:1] == "#":
continue
if line[-1:] == "*":
self.wildcard_known_broken.append(line[:-1])
else:
self.exact_known_broken.append(line)
def checkin(self, current_ref: str):
"""
Print output periodically so you know the job is still running, and
save the cache file if it needs updating.
"""
if time() - self.last_checkin > CHECK_IN_INTERVAL:
print(f"... still working ({current_ref}) ...")
self.last_checkin = time()
if time() - self.last_cache_update > CACHE_WRITE_INTERVAL:
self.write_cache()
def walk(self):
logger.info(f"Checking files in {os.path.abspath(self.topdir)}")
externalCache = []
broken_links = []
total_links_checked = 0
last_checkin = time()
for dirpath, dirnames, filenames in os.walk(self.topdir):
self.checkin(f"dir: {dirpath}")
if dirpath in self.skip_paths:
logger.debug(f"Skipping ignored path {dirpath}")
continue
for fname in filenames:
self.checkin(f"file: {fname}")
in_file = os.path.join(dirpath, fname)
if in_file.endswith(".md"):
if self.live:
newly_checked, newly_broken = self.check_file_live(in_file)
else:
newly_checked, newly_broken = self.check_file(in_file)
broken_links += newly_broken
total_links_checked += newly_checked
if in_file.endswith(".page.tsx") and self.live:
newly_checked, newly_broken = self.check_file_live(in_file)
broken_links += newly_broken
total_links_checked += newly_checked
self.report(broken_links, total_links_checked)
def check_file(self, in_file: str):
"""
Given a specific .md file, look for external links in it and check them.
Returns how many were checked and a list of tuples where each member is
a (file,link) pair representing a broken link.
Note that this does not parse Markdoc including partials so it can't
handle the full context of the Redocly parser but those *usually* won't
be needed for external links.
"""
logger.info(f"Checking file {in_file}")
with open(in_file, 'r', encoding="utf-8") as f:
html = markdown(f.read())
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("a")
broken = []
num_checked = 0
for link in links:
self.checkin(f"link: {link}")
if "href" not in link.attrs:
# probably an <a name> type anchor, skip
continue
if (link["href"].startswith("https://") or
link["href"].startswith("http://")):
was_checked, was_good = self.check_link(link["href"])
num_checked += was_checked
if not was_good:
broken.append( (in_file, link["href"]) )
return num_checked, broken
def check_file_live(self, in_file: str):
"""
Given a specific .md file, fetch it from the Redocly dev server and
check for links in it.
"""
suffixes = ["/index.md", ".md", "/index.page.tsx", ".page.tsx"] # order matters
for suffix in suffixes:
if in_file.endswith(suffix):
path = in_file[:-len(suffix)]
break
else:
logger.warning(f"Not checking path that's not an md or page.tsx file: {in_file}")
return (0, [])
url = REDOCLY_DEV_BASE + path
code = self.fetch(url)
if code < 200 or code >= 400:
logger.warning(f"Failed to get page from dev server for file {in_file}")
return 0, []
broken = []
num_checked = 0
logger.info(f"Checking path {path}")
self.chrome.get(REDOCLY_DEV_BASE+path)
# sleep(0.2) # give it a moment in case hydration fails or something
rootlayout = self.chrome.find_element(By.CSS_SELECTOR, '[data-component-name="layouts/RootLayout"]')
try:
links = rootlayout.find_elements(By.CSS_SELECTOR, "a")
pagetext = rootlayout.text
hrefs = [link.get_attribute("href") for link in links]
except StaleElementReferenceException:
# This can happen when hydration fails or the page is updated
# asynchronously, for example by fetching amendment status.
# Try again and hopefully it works this time.
sleep(0.2)
rootlayout = self.chrome.find_element(By.CSS_SELECTOR, '[data-component-name="layouts/RootLayout"]')
pagetext = rootlayout.text
links = rootlayout.find_elements(By.CSS_SELECTOR, "a")
hrefs = [link.get_attribute("href") for link in links]
for href in hrefs:
self.checkin(f"link: {href}")
if not href:
# Probably a name anchor something, skip
continue
if href.startswith(REDOCLY_DEV_BASE):
was_checked, was_good = self.check_dev_link(href)
num_checked += was_checked
if not was_good:
broken.append( (in_file, href) )
elif href.startswith("http://") or href.startswith("https://"):
was_checked, was_good = self.check_link(href)
num_checked += was_checked
if not was_good:
broken.append( (in_file, href) )
broken_reflinks = self.check_for_unparsed_reflinks(pagetext)
for brl in broken_reflinks:
broken.append( (in_file, brl) )
return num_checked, broken
def check_for_unparsed_reflinks(self, text: str):
unparsed_links = []
matches = UNMATCHED_REFLINK_REGEX.finditer(text)
for m in matches:
logger.warning(f"... ... Unparsed reference link: {m.group(0)}")
unparsed_links.append(m.group(0))
return unparsed_links
def is_known_broken(self, href: str):
if href in self.exact_known_broken:
return True
for pattern in self.wildcard_known_broken:
if href.startswith(pattern):
return True
return False
def fetch(self, href: str):
"""
Get status code of a URL, using saved sessions & retries, automatically
failing over from HTTP HEAD to HTTP GET and adding a slight delay to
avoid hammering the same host repeatedly.
"""
proto,predicate = href.split("//",1)
host = predicate.split("/", 1)[0]
if host == self.last_host_called:
sleep(SAME_HOST_DELAY)
self.last_host_called = host
if href.startswith("http://"):
sess = self.h
else:
sess = self.s
try:
code = sess.head(href, timeout=TIMEOUT_SECONDS).status_code
except Exception as e:
logger.debug(f"Error getting {href}: {e}")
try:
code = sess.get(href, timeout=TIMEOUT_SECONDS)
except Exception as e2:
logger.debug(f"Error getting {href}: {e2}")
code = 500
return code
def trim_trackers(self, href: str):
if "?__hstc=" in href:
return href[:href.find("?__hstc=")]
return href
def check_link(self, href: str):
href = self.trim_trackers(href)
if href in self.cache.keys():
logger.debug(f"... Skipping (cached): {href}")
was_good, time_checked = self.cache[href]
return (1, was_good)
if self.is_known_broken(href):
logger.debug(f"... Skipping (known broken): {href}")
return (0, True)
logger.info(f"... Testing link {href}")
code = self.fetch(href)
if code < 200 or code >= 400:
logger.warning(f"... Broken link to {href}")
self.cache[href] = (False, time())
return (1, False)
logger.info("... ... success.")
self.cache[href] = (True, time())
return (1, True)
def check_dev_link(self, href: str):
"""
Check a local dev link; if it has an anchor, use the Chrome driver to
check for the presence of an element with a matching ID, otherwise fall
back to just checking that the page exists.
"""
if self.is_known_broken(href):
logger.debug(f"... Skipping (known broken): {href}")
return (0, True)
if "#" in href and href.split("#",1)[1].strip():
if href in self.anchor_cache.keys():
return (1, self.anchor_cache[href])
id = href.split("#",1)[1].strip()
# Use the selenium driver to check for the exact anchor
self.chrome2.get(href)
sleep(0.3) # delay to give it time for hydration failures
try:
el = self.chrome2.find_element(By.ID, id)
except NoSuchElementException:
el = None
if el:
self.anchor_cache[href] = True
return (1, True)
else:
self.anchor_cache[href] = False
return (1, False)
return self.check_link(href)
def report(self, broken_links: list, total_links_checked: int):
print("---------------------------------------------------------------")
print(f"{len(broken_links)} broken links found among {total_links_checked} total links.")
last_printed_in_file = None
for in_file, href in broken_links:
if in_file != last_printed_in_file:
print("File:", in_file)
last_printed_in_file = in_file
print(" Link:", href)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="XRPL Dev Portal link checker")
noisiness = parser.add_mutually_exclusive_group(required=False)
noisiness.add_argument("--quiet", "-q", action="store_true",
help="Suppress informational status messages")
noisiness.add_argument("--debug", "-d", action="store_true",
help="Print debug-level log messages")
parser.add_argument("--live", "-l", action="store_true",
help="Use a Selenium-powered browser session to get rendered docs")
parser.add_argument("path", type=str, nargs="?", default="docs/",
help="Check *.md files in this directory (including subdirs)")
cli_args = parser.parse_args()
if cli_args.quiet:
logger.setLevel(logging.WARNING)
elif cli_args.debug:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
l = LinkChecker(cli_args.path, live=cli_args.live)
try:
l.walk()
except (KeyboardInterrupt) as e:
l.write_cache()