Add 'live' link checker powered by Selenium WebDriver

This commit is contained in:
mDuo13
2026-05-21 16:26:52 -07:00
parent 7dc311935e
commit db877d0d83
3 changed files with 182 additions and 14 deletions

View File

@@ -6,6 +6,7 @@ ignore:
- _code-samples/create-amm/ts/tsconfig.json
- resources/contribute-blog/_blog-template.md
- resources/contribute-documentation/_tutorial-template.md
- tools/*
l10n:
defaultLocale: en-US
locales:

View File

@@ -1,10 +1,40 @@
# Known broken links file. Syntax:
# One fully-qualified http: or https: URL per line.
# Lines starting with # are ignored
# * may be used as a wildcard AT THE END OF LINKS ONLY
# "Known broken" links file.
# Matching URLs will not be checked by the link checker and won't be
# flagged/reported as broken.
# Typically, these are sites that block crawlers so are falsely reported as
# broken by the link checker, or tend to be slow / unreliable but are generally
# still the proper, official link.
#
# Syntax:
# One fully-qualified http: or https: URL per line.
# Lines starting with # are ignored (comments), but # in the middle of a line is
# treated as part of the URL and parsed as normal.
# * may be used as a wildcard AT THE END OF LINKS ONLY
# These sites block crawlers (often using Cloudflare) with a 403 but typically
# work when accessed from an actual browser:
https://x.com/*
https://www.gnu.org/software/screen/manual/screen.html
https://www.coindesk.com/markets/2015/04/02/1-million-legal-fight-ensnares-ripple-bitstamp-and-jed-mccaleb/
https://www.coindesk.com/markets/2016/02/12/ripple-settles-1-million-lawsuit-with-former-executive-and-founder/
https://bsaaml.ffiec.gov/manual/Introduction/01
https://www.sec.gov/oiea/investor-alerts-and-bulletins/ib_coinofferings
https://www.npmjs.com/*
https://go.dev/dl/
https://search.maven.org/artifact/org.xrpl/xrpl4j-parent
https://xrp.cafe/
https://medium.com/*
# Redocly adds links to Claude & ChatGPT to every page, each with a unique
# query string. No need to check them every time:
https://chat.openai.com/?*
https://claude.ai/new?*
# wxPython is still in active development, but their site appears to be dead.
# Maybe it'll come back online, in which case we can remove these:
https://wxpython.org/
https://docs.wxpython.org/*
# The WebSocket tool uses the anchor portion of the URL even though the page
# has no element with a matching ID, so links to it aren't actually broken:
http://localhost:4000/resources/dev-tools/websocket-api-tool*

View File

@@ -1,15 +1,19 @@
#!/usr/bin/env python
"""
Check markdown files for broken external links.
Check markdown files for broken links.
Usage (from repo root):
tools/check-external-links.py [folder/to/check/]
If [folder/to/check] is omitted, check ./docs/
Prints a report of broken links. Doesn't check relative (in-site) links.
Prints a report of broken links. In the default mode, checks external links in
Markdown files only. Pass --live to test against a local Redocly dev server,
which checks *all* links, including anchors and markdoc tags; assumes the
server is already up and running on localhost:4000.
Requires: Beautiful Soup 4, python-markdown, requests
Requires: beautifulsoup4, markdown, requests
For live mode, selenium and Chrome are also required.
Links & sites that often report false-positives can be added to broken-links.txt
to have the link checker skip them.
@@ -19,12 +23,16 @@ import argparse
import json
import logging
import os
import re
from time import time, sleep
import requests
from requests.adapters import HTTPAdapter, Retry
from markdown import markdown
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
CHECK_IN_INTERVAL = 30 # Seconds before printing *something* as a keep-alive
DEFAULT_SKIP_PATHS = [
@@ -33,6 +41,10 @@ DEFAULT_SKIP_PATHS = [
".venv",
".claude",
"__pycache__",
"_snippets",
"_code-samples", # Debatably, we might want to link-check the READMEs here
"_api-examples",
"_sources",
]
MAX_RETRIES = 1 # Times to retry if a link doesn't work
TIMEOUT_SECONDS = 8 # Seconds before giving up on a link
@@ -43,6 +55,8 @@ DEFAULT_CACHE_FILE = "link-cache.json"
CACHE_FILE_FOLDER = "tools" # Check this folder for cache file
KNOWN_BROKEN_LINKS_FILE = "broken-links.txt" # List of links that work "normally" but report false-positives in this link checker
USER_AGENT = "xrpl-dev-portal-link-checker/0.1" # Identify self to websites
REDOCLY_DEV_BASE = "http://localhost:4000/"
UNMATCHED_REFLINK_REGEX = re.compile(r"(\[[^\]]+)?\]\[(\w| )*\]")
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
@@ -52,17 +66,20 @@ class LinkChecker:
def __init__(self, topdir,
skip_paths = DEFAULT_SKIP_PATHS,
cache_file = DEFAULT_CACHE_FILE,
known_broken = KNOWN_BROKEN_LINKS_FILE
known_broken = KNOWN_BROKEN_LINKS_FILE,
live = False
):
self.topdir = topdir
self.skip_paths = skip_paths
self.last_checkin = time()
self.last_cache_update = 0
self.live = live
self.setup_sessions()
self.init_cache(cache_file)
self.init_known_broken(known_broken)
def init_cache(self, cache_file: str):
self.anchor_cache = {}
if not cache_file:
logger.debug("No cache file, not loading anything.")
self.cache = {}
@@ -87,6 +104,10 @@ class LinkChecker:
was_good, time_checked = result
if not was_good or time() - time_checked > RECHECK_INTERVAL:
invalidate_keys.append(href)
if self.trim_trackers(href) != href:
# Probably a cached entry including tracking parameters from
# before changing what tracking parameters get removed
invalidate_keys.append(href)
for href in invalidate_keys:
del self.cache[href]
logger.debug(f"Removed {len(invalidate_keys)} items from cache")
@@ -109,6 +130,14 @@ class LinkChecker:
self.s.headers.update({"User-Agent": USER_AGENT})
self.last_host_called = None
if self.live:
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
self.chrome = webdriver.Chrome(options=options)
# Make a second one so we can check for anchors without resetting
# all the references we have in the page the link comes from
self.chrome2 = webdriver.Chrome(options=options)
def init_known_broken(self, known_broken):
self.exact_known_broken = []
self.wildcard_known_broken = []
@@ -143,7 +172,7 @@ class LinkChecker:
self.write_cache()
def walk(self):
logger.info(f"Checking *.md files in {os.path.abspath(self.topdir)}")
logger.info(f"Checking files in {os.path.abspath(self.topdir)}")
externalCache = []
broken_links = []
total_links_checked = 0
@@ -158,7 +187,14 @@ class LinkChecker:
in_file = os.path.join(dirpath, fname)
if in_file.endswith(".md"):
newly_checked, newly_broken = self.check_file(in_file)
if self.live:
newly_checked, newly_broken = self.check_file_live(in_file)
else:
newly_checked, newly_broken = self.check_file(in_file)
broken_links += newly_broken
total_links_checked += newly_checked
if in_file.endswith(".page.tsx") and self.live:
newly_checked, newly_broken = self.check_file_live(in_file)
broken_links += newly_broken
total_links_checked += newly_checked
self.report(broken_links, total_links_checked)
@@ -185,14 +221,79 @@ class LinkChecker:
if "href" not in link.attrs:
# probably an <a name> type anchor, skip
continue
if (link["href"].startswith("//") or
link["href"].startswith("https://") or
if (link["href"].startswith("https://") or
link["href"].startswith("http://")):
was_checked, was_good = self.check_link(link["href"])
num_checked += was_checked
if not was_good:
broken.append( (in_file, link["href"]) )
return num_checked, broken
def check_file_live(self, in_file: str):
    """
    Check the links on the rendered version of one source file.

    Maps a .md or .page.tsx file path to its URL on the Redocly dev server,
    loads that page in the headless browser, and checks every link found
    inside the root layout element. Links under REDOCLY_DEV_BASE go through
    check_dev_link(); other http(s) links go through check_link(). The page
    text is also scanned for reference-style links that were left unparsed.

    Returns a tuple (num_checked, broken), where broken is a list of
    (in_file, href_or_text) tuples. Returns (0, []) if the file type is not
    recognized, the dev server does not serve the page successfully, or the
    rendered page could not be read.
    """
    # Order matters: the "/index" variants must be tried before the bare
    # extensions so that "foo/index.md" maps to "foo", not "foo/index".
    suffixes = ["/index.md", ".md", "/index.page.tsx", ".page.tsx"]
    for suffix in suffixes:
        if in_file.endswith(suffix):
            path = in_file[:-len(suffix)]
            break
    else:
        logger.warning(f"Not checking path that's not an md or page.tsx file: {in_file}")
        return (0, [])

    url = REDOCLY_DEV_BASE + path
    code = self.fetch(url)
    if code < 200 or code >= 400:
        logger.warning(f"Failed to get page from dev server for file {in_file}")
        return 0, []

    logger.info(f"Checking path {path}")
    self.chrome.get(url)

    # Reading the DOM can fail transiently: references go stale when
    # hydration fails or the page updates asynchronously (for example by
    # fetching amendment status), and the layout element may not exist yet.
    # Retry a bounded number of times instead of letting one page abort
    # the whole run.
    layout_selector = '[data-component-name="layouts/RootLayout"]'
    max_attempts = 3
    for _attempt in range(max_attempts):
        try:
            rootlayout = self.chrome.find_element(By.CSS_SELECTOR, layout_selector)
            links = rootlayout.find_elements(By.CSS_SELECTOR, "a")
            pagetext = rootlayout.text
            hrefs = [link.get_attribute("href") for link in links]
            break
        except (StaleElementReferenceException, NoSuchElementException):
            sleep(0.2)
    else:
        logger.warning(f"Failed to read rendered page for file {in_file}")
        return 0, []

    broken = []
    num_checked = 0
    for href in hrefs:
        if not href:
            # Probably a named anchor or similar with no target; skip it.
            continue
        self.checkin(f"link: {href}")
        if href.startswith(REDOCLY_DEV_BASE):
            was_checked, was_good = self.check_dev_link(href)
        elif href.startswith(("http://", "https://")):
            was_checked, was_good = self.check_link(href)
        else:
            # Other schemes (mailto:, etc.) are not checked.
            continue
        num_checked += was_checked
        if not was_good:
            broken.append((in_file, href))

    # Reference links that Markdown failed to resolve show up as literal
    # "[text][ref]" in the rendered page; report those as broken too.
    for brl in self.check_for_unparsed_reflinks(pagetext):
        broken.append((in_file, brl))
    return num_checked, broken
def check_for_unparsed_reflinks(self, text: str):
    """
    Find reference-style link syntax left unrendered in page text.

    Returns a list of the matched strings, in order of appearance, and
    logs a warning for each one.
    """
    leftovers = [found.group(0) for found in UNMATCHED_REFLINK_REGEX.finditer(text)]
    for snippet in leftovers:
        logger.warning(f"... ... Unparsed reference link: {snippet}")
    return leftovers
def is_known_broken(self, href: str):
if href in self.exact_known_broken:
@@ -231,7 +332,13 @@ class LinkChecker:
return code
def check_link(self, href):
def trim_trackers(self, href: str):
    """
    Return href with everything from the first "?__hstc=" onward removed.

    URLs that do not contain that tracking parameter at the start of the
    query string are returned unchanged.
    """
    # partition() yields the whole string as the head when the separator
    # is absent, so no separate membership test is needed.
    base, _marker, _tracking = href.partition("?__hstc=")
    return base
def check_link(self, href: str):
href = self.trim_trackers(href)
if href in self.cache.keys():
logger.debug(f"... Skipping (cached): {href}")
was_good, time_checked = self.cache[href]
@@ -248,6 +355,34 @@ class LinkChecker:
logger.info("... ... success.")
self.cache[href] = (True, time())
return (1, True)
def check_dev_link(self, href: str):
    """
    Check a link that points at the local dev server.

    If the link has a non-empty anchor (the part after "#"), load the
    target page in the second browser instance and look for an element
    whose ID matches the anchor. Otherwise fall back to check_link(), which
    only verifies that the page exists.

    Returns a tuple (num_checked, was_good). Known-broken links are
    reported as (0, True) without being checked. Anchor results are
    remembered in self.anchor_cache, keyed by the full href.
    """
    from urllib.parse import unquote

    if self.is_known_broken(href):
        logger.debug(f"... Skipping (known broken): {href}")
        return (0, True)

    anchor = href.partition("#")[2].strip()
    if not anchor:
        return self.check_link(href)

    if href in self.anchor_cache:
        return (1, self.anchor_cache[href])

    # Use the selenium driver to check for the exact anchor.
    self.chrome2.get(href)
    sleep(0.3)  # delay to give it time for hydration failures

    # The href reported by the browser may have a percent-encoded fragment
    # (e.g. non-ASCII headings), while the element ID in the DOM is not
    # encoded. Try the anchor as written first, then its decoded form.
    candidates = [anchor]
    decoded = unquote(anchor)
    if decoded != anchor:
        candidates.append(decoded)

    found = False
    for candidate in candidates:
        try:
            self.chrome2.find_element(By.ID, candidate)
        except NoSuchElementException:
            continue
        found = True
        break

    self.anchor_cache[href] = found
    return (1, found)
def report(self, broken_links: list, total_links_checked: int):
print("---------------------------------------------------------------")
@@ -266,6 +401,8 @@ if __name__ == "__main__":
help="Suppress informational status messages")
noisiness.add_argument("--debug", "-d", action="store_true",
help="Print debug-level log messages")
parser.add_argument("--live", "-l", action="store_true",
help="Use a Selenium-powered browser session to get rendered docs")
parser.add_argument("path", type=str, nargs="?", default="docs/",
help="Check *.md files in this directory (including subdirs)")
@@ -277,7 +414,7 @@ if __name__ == "__main__":
else:
logger.setLevel(logging.INFO)
l = LinkChecker(cli_args.path)
l = LinkChecker(cli_args.path, live=cli_args.live)
try:
l.walk()
except (KeyboardInterrupt) as e: