From b5dcbe8d1cf20099cf81a618798c99df6621b01e Mon Sep 17 00:00:00 2001 From: Pedro Brochado Date: Thu, 29 Jan 2026 18:15:19 -0300 Subject: [PATCH 1/2] Add linkchecker to pulp-docs --- pyproject.toml | 1 + src/linkchecker/__init__.py | 5 ++ src/linkchecker/cli.py | 119 ++++++++++++++++++++++++++++++ tests/assets/invalid_links/bar.md | 9 +++ tests/assets/invalid_links/foo.md | 0 tests/conftest.py | 33 +++++++++ tests/test_linkchecker.py | 65 ++++++++++++++++ tests/test_testutils.py | 26 +++++++ 8 files changed, 258 insertions(+) create mode 100644 src/linkchecker/__init__.py create mode 100644 src/linkchecker/cli.py create mode 100644 tests/assets/invalid_links/bar.md create mode 100644 tests/assets/invalid_links/foo.md create mode 100644 tests/conftest.py create mode 100644 tests/test_linkchecker.py create mode 100644 tests/test_testutils.py diff --git a/pyproject.toml b/pyproject.toml index f01e7f35..755b88b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ readme = "README.md" [project.scripts] pulp-docs = "pulp_docs.cli:main" +linkchecker = "linkchecker.cli:main" [project.entry-points."mkdocs.plugins"] PulpDocs = "pulp_docs.plugin:PulpDocsPlugin" diff --git a/src/linkchecker/__init__.py b/src/linkchecker/__init__.py new file mode 100644 index 00000000..5260cbab --- /dev/null +++ b/src/linkchecker/__init__.py @@ -0,0 +1,5 @@ +from linkchecker.cli import linkchecker + +__all__ = [ + "linkchecker", +] diff --git a/src/linkchecker/cli.py b/src/linkchecker/cli.py new file mode 100644 index 00000000..466ecf6b --- /dev/null +++ b/src/linkchecker/cli.py @@ -0,0 +1,119 @@ +import os +from typing import NamedTuple + +HEADER_ERROR = "Broken links:" + + +class LinkError(NamedTuple): + link_target: str + src_filename: str + src_line: str + src_lineno: int + + +def linkchecker(component_rootdir: str, filenames: list[str]) -> int: + cumulative_errors = [] + for file in filenames: + link_errors = check_file(component_rootdir, file) + if not link_errors: + continue + cumulative_errors.extend(link_errors) + report_errors(link_errors=cumulative_errors, component_rootdir=component_rootdir) + return 0 + + +def check_file(component_rootdir: str, src_filename: str) -> list[LinkError]: + if not file_exists(src_filename): + # log.warning(f"{file} does not exist.") + return [] + + link_errors = [] + with open(src_filename, "r") as fd: + for src_lineno, src_line in enumerate(fd): + invalid_links = check_line(src_line, component_rootdir) + for link_target in invalid_links: + link_error = LinkError( + link_target=link_target, + src_line=src_line, + src_filename=src_filename, + src_lineno=src_lineno, + ) + link_errors.append(link_error) + return link_errors + + +def check_line(line: str, basedir: str) -> list[str]: + """Return invalid link in line.""" + invalid_links = [] + relative_path, component_name = os.path.split(basedir) + + for link in get_links(line): + # Filter out external component links + if not link.startswith(f"{component_name}/"): + continue + link_target = os.path.join(relative_path, link) + if not file_exists(link_target): + # Store original site: format + invalid_links.append(link) + return invalid_links + + +def get_links(line: str) -> list[str]: + """Extract site: links from a markdown line.""" + import re + + links = [] + # Match inline links: [text](site:path) + inline_pattern = r"\[([^\]]+)\]\(site:([^\)]+)\)" + # Match reference links: [ref]: site:path + reference_pattern = r"\[[^\]]+\]:\s*site:([^\s]+)" + + for match in re.finditer(inline_pattern, line): + links.append(match.group(2)) + for match in re.finditer(reference_pattern, line): + links.append(match.group(1)) + + return links + + +def file_exists(file: str) -> bool: + """Check if a file exists, treating .md extension as optional.""" + if os.path.exists(file): + return True + # Try with .md extension + if os.path.exists(file + ".md"): + return True + return False + + +def report_errors(link_errors: list[LinkError], component_rootdir: str): + """Print link errors to stdout.""" + if not link_errors: + return + print(HEADER_ERROR) + for error in link_errors: + line_str = error.src_line.strip() + filename = os.path.relpath(error.src_filename, component_rootdir) + lineno = error.src_lineno + 1 + print(f"{filename}:{lineno}:{line_str}") + + +def parse_arguments(): + """Parse command line arguments.""" + import argparse + + parser = argparse.ArgumentParser(description="Check markdown links") + parser.add_argument("basedir", help="Base directory for link checking") + parser.add_argument("files", nargs="+", help="Markdown files to check") + args = parser.parse_args() + return args.basedir, args.files + + +def main(): + """CLI entry point for the linkchecker command.""" + basedir, files = parse_arguments() + exit(linkchecker(basedir, files)) + + +if __name__ == "__main__": + main() diff --git a/tests/assets/invalid_links/bar.md b/tests/assets/invalid_links/bar.md new file mode 100644 index 00000000..057d8b2d --- /dev/null +++ b/tests/assets/invalid_links/bar.md @@ -0,0 +1,9 @@ +[valid](site:A/foo) +[invalid](site:A/NOEXIST) +[ignored](site:B/foo) +[ignored](site:B/bar) + +[valid]: (site:A/foo +[invalid]: (site:A/NOEXIST +[ignored]: (site:B/foo +[ignored]: (site:B/bar diff --git a/tests/assets/invalid_links/foo.md b/tests/assets/invalid_links/foo.md new file mode 100644 index 00000000..e69de29b diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..fa59ef72 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,33 @@ +from pathlib import Path +from textwrap import dedent + +import pytest + + +@pytest.fixture +def create_file(tmp_path: Path): + def _create_file(filename: str, content: str) -> Path: + if not filename: + raise ValueError("filename can't be empty") + newfile = tmp_path / filename + newfile.parent.mkdir(parents=True, exist_ok=True) + newfile.write_text(dedent(content)) + return newfile + + return _create_file + + +@pytest.fixture +def create_tree(create_file, tmp_path): + def _create_tree(tree_text: str) -> tuple[Path, list[Path]]: + files = [] + fragments = dedent(tree_text).split("=== ") + for fragment in fragments: + if not fragment.strip(): + continue + filename, content = fragment.split("\n", maxsplit=1) + content = content.strip() or f"Placeholder for {filename=}" + files.append(create_file(filename.strip(), content)) + return tmp_path, files + + return _create_tree diff --git a/tests/test_linkchecker.py b/tests/test_linkchecker.py new file mode 100644 index 00000000..167c21bf --- /dev/null +++ b/tests/test_linkchecker.py @@ -0,0 +1,65 @@ +from textwrap import dedent +from typing import NamedTuple + +import pytest + +from linkchecker.cli import HEADER_ERROR, linkchecker + + +class Scenario(NamedTuple): + tree: str + component_dir: str + exit_code: int + output: str + + +COMMON_TREE = """ +=== A/docs/guides/foo.md +=== A/docs/reference/bar.md +=== B/docs/guides/foo.md +=== B/docs/reference/bar.md +""" + +cases = [ + Scenario( + tree=f""" + === A/docs/index.md + [valid](site:A/docs/guides/foo) + [valid](site:A/docs/guides/foo.md) + + [valid]: site:A/docs/reference/bar + [valid]: site:A/docs/reference/bar.md + {COMMON_TREE} + """, + component_dir="A", + exit_code=0, + output="", + ), + Scenario( + tree=f""" + === A/docs/index.md + [valid](site:A/docs/guides/foo) + [valid](site:A/docs/reference/bar) + [invalid](site:A/docs/guides/NOEXIT.md) + [invalid](site:A/docs/reference/NOEXIT.md) + {COMMON_TREE} + """, + component_dir="A", + exit_code=0, + output=f""" + {HEADER_ERROR} + docs/index.md:3:[invalid](site:A/docs/guides/NOEXIT.md) + docs/index.md:4:[invalid](site:A/docs/reference/NOEXIT.md) + """, + ), +] + + +@pytest.mark.parametrize("case", cases) +def test_linkchecker_main(case, create_tree, capsys): + """Test checking all files in the docs directory.""" + basedir, files = create_tree(case.tree) + exit_code = linkchecker(str(basedir / case.component_dir), files) + out, err = capsys.readouterr() + assert exit_code == case.exit_code + assert out.strip() == dedent(case.output).strip() diff --git a/tests/test_testutils.py b/tests/test_testutils.py new file mode 100644 index 00000000..b4d59921 --- /dev/null +++ b/tests/test_testutils.py @@ -0,0 +1,26 @@ +def test_create_tree(create_tree): + tree = """ + === docs/index.md + # hello world + + [foo](bar.md) + + === docs/foo.md + This is the foo file + """ + rootdir, files = create_tree(tree) + created_file_content = {} + for file in rootdir.rglob("*"): + if not file.is_file(): + continue + content = file.read_text() + created_file_content[str(file.relative_to(rootdir))] = content + + assert len(files) == 2 + assert list(created_file_content.keys()) == ["docs/foo.md", "docs/index.md"] + + assert "[foo](bar.md)" in created_file_content["docs/index.md"] + assert "[foo](bar.md)" not in created_file_content["docs/foo.md"] + + assert "This is the foo file" in created_file_content["docs/foo.md"] + assert "This is the foo file" not in created_file_content["docs/index.md"] From 1f7986186fdcab37ed23134e6e8aeb1153139324 Mon Sep 17 00:00:00 2001 From: Pedro Brochado Date: Thu, 29 Jan 2026 19:17:09 -0300 Subject: [PATCH 2/2] Add pre-commit hooks and test pre-commit config --- .pre-commit-config.yaml | 2 +- .pre-commit-hooks.yaml | 10 +++++ pyproject.toml | 2 +- src/linkchecker/cli.py | 40 ++++++++++++++----- test_requirements.txt | 1 + tests/assets/invalid_links/bar.md | 9 ----- tests/assets/invalid_links/component-a/bar.md | 9 +++++ .../invalid_links/{ => component-a}/foo.md | 0 tests/assets/invalid_links/component-b/bar.md | 9 +++++ tests/assets/invalid_links/component-b/foo.md | 0 tests/conftest.py | 39 ++++++++++++++++++ tests/test_linkchecker.py | 21 +++++++--- 12 files changed, 115 insertions(+), 27 deletions(-) create mode 100644 .pre-commit-hooks.yaml delete mode 100644 tests/assets/invalid_links/bar.md create mode 100644 tests/assets/invalid_links/component-a/bar.md rename tests/assets/invalid_links/{ => component-a}/foo.md (100%) create mode 100644 tests/assets/invalid_links/component-b/bar.md create mode 100644 tests/assets/invalid_links/component-b/foo.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 625b1a75..fa57c092 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ --- repos: - repo: "https://github.com/pre-commit/pre-commit-hooks" - rev: "v5.0.0" + rev: "v6.0.0" hooks: - id: "trailing-whitespace" exclude: "^pulpproject.org" diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml new file mode 100644 index 00000000..b6518591 --- /dev/null +++ b/.pre-commit-hooks.yaml @@ -0,0 +1,10 @@ +--- +- id: "linkchecker" + name: "Link Checker" + description: "Validates site: links in markdown files" + entry: "linkchecker" + language: "python" + types: ["markdown"] + pass_filenames: true + require_serial: true +... diff --git a/pyproject.toml b/pyproject.toml index 755b88b4..325a1f20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ PulpDocs = "pulp_docs.plugin:PulpDocsPlugin" ########### [tool.hatch.build.targets.wheel] -packages = ["src/pulp_docs"] +packages = ["src/pulp_docs", "src/linkchecker"] [tool.hatch.build.targets.wheel.force-include] # enable using the installed package directly for development diff --git a/src/linkchecker/cli.py b/src/linkchecker/cli.py index 466ecf6b..bb8a87f3 100644 --- a/src/linkchecker/cli.py +++ b/src/linkchecker/cli.py @@ -1,7 +1,8 @@ +import argparse import os from typing import NamedTuple -HEADER_ERROR = "Broken links:" +HEADER_ERROR = "Found {n} broken links:" class LinkError(NamedTuple): @@ -19,6 +20,8 @@ def linkchecker(component_rootdir: str, filenames: list[str]) -> int: continue cumulative_errors.extend(link_errors) report_errors(link_errors=cumulative_errors, component_rootdir=component_rootdir) + if cumulative_errors: + return 1 return 0 @@ -47,14 +50,15 @@ def check_line(line: str, basedir: str) -> list[str]: invalid_links = [] relative_path, component_name = os.path.split(basedir) - for link in get_links(line): + for original_link in get_links(line): + link = original_link.removeprefix("site:") # Filter out external component links if not link.startswith(f"{component_name}/"): continue link_target = os.path.join(relative_path, link) if not file_exists(link_target): # Store original site: format - invalid_links.append(link) + invalid_links.append(original_link) return invalid_links @@ -64,9 +68,9 @@ def get_links(line: str) -> list[str]: links = [] # Match inline links: [text](site:path) - inline_pattern = r"\[([^\]]+)\]\(site:([^\)]+)\)" + inline_pattern = r"\[([^\]]+)\]\((site:[^\)]+)\)" # Match reference links: [ref]: site:path - reference_pattern = r"\[[^\]]+\]:\s*site:([^\s]+)" + reference_pattern = r"\[[^\]]+\]:\s*(site:[^\s]+)" for match in re.finditer(inline_pattern, line): links.append(match.group(2)) @@ -90,23 +94,37 @@ def report_errors(link_errors: list[LinkError], component_rootdir: str): """Print link errors to stdout.""" if not link_errors: return - print(HEADER_ERROR) + print(HEADER_ERROR.format(n=len(link_errors))) for error in link_errors: - line_str = error.src_line.strip() + # line_str = error.src_line.strip()[:85] + " (...)" filename = os.path.relpath(error.src_filename, component_rootdir) lineno = error.src_lineno + 1 - print(f"{filename}:{lineno}:{line_str}") + print(f"{filename}:{lineno}:{error.link_target}") def parse_arguments(): """Parse command line arguments.""" - import argparse parser = argparse.ArgumentParser(description="Check markdown links") - parser.add_argument("basedir", help="Base directory for link checking") + parser.add_argument( + "--basedir", + default=".", + help="Base directory for link checking (default: current directory)", + ) parser.add_argument("files", nargs="+", help="Markdown files to check") args = parser.parse_args() - return args.basedir, args.files + + # Shell-expand and normalize the basedir path + basedir = os.path.expanduser(args.basedir) + basedir = os.path.expandvars(basedir) + basedir = os.path.abspath(basedir) + + if not os.path.exists(basedir): + parser.error(f"basedir does not exist: {basedir}") + if not os.path.isdir(basedir): + parser.error(f"basedir is not a directory: {basedir}") + + return basedir, args.files def main(): diff --git a/test_requirements.txt b/test_requirements.txt index 0cebed5f..cff7831c 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1 +1,2 @@ +pre-commit~=4.5 pytest~=8.4 diff --git a/tests/assets/invalid_links/bar.md b/tests/assets/invalid_links/bar.md deleted file mode 100644 index 057d8b2d..00000000 --- a/tests/assets/invalid_links/bar.md +++ /dev/null @@ -1,9 +0,0 @@ -[valid](site:A/foo) -[invalid](site:A/NOEXIST) -[ignored](site:B/foo) -[ignored](site:B/bar) - -[valid]: (site:A/foo -[invalid]: (site:A/NOEXIST -[ignored]: (site:B/foo -[ignored]: (site:B/bar diff --git a/tests/assets/invalid_links/component-a/bar.md b/tests/assets/invalid_links/component-a/bar.md new file mode 100644 index 00000000..470b73d6 --- /dev/null +++ b/tests/assets/invalid_links/component-a/bar.md @@ -0,0 +1,9 @@ +[valid](site:component-a/foo) +[invalid](site:component-a/NOEXIST1) +[ignored](site:component-b/foo) +[ignored](site:component-b/bar) + +[valid]: site:component-a/foo +[invalid]: site:component-a/NOEXIST2 +[ignored]: site:component-b/foo +[ignored]: site:component-b/bar diff --git a/tests/assets/invalid_links/foo.md b/tests/assets/invalid_links/component-a/foo.md similarity index 100% rename from tests/assets/invalid_links/foo.md rename to tests/assets/invalid_links/component-a/foo.md diff --git a/tests/assets/invalid_links/component-b/bar.md b/tests/assets/invalid_links/component-b/bar.md new file mode 100644 index 00000000..8f375b0e --- /dev/null +++ b/tests/assets/invalid_links/component-b/bar.md @@ -0,0 +1,9 @@ +[valid](site:component-b/foo) +[invalid](site:component-b/NOEXIST) +[ignored](site:component-a/foo) +[ignored](site:component-a/bar) + +[valid]: (site:component-b/foo +[invalid]: (site:component-b/NOEXIST +[ignored]: (site:component-a/foo +[ignored]: (site:component-a/bar diff --git a/tests/assets/invalid_links/component-b/foo.md b/tests/assets/invalid_links/component-b/foo.md new file mode 100644 index 00000000..e69de29b diff --git a/tests/conftest.py b/tests/conftest.py index fa59ef72..734bce01 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,6 @@ +import os +import shutil +import subprocess from pathlib import Path from textwrap import dedent @@ -31,3 +34,39 @@ def _create_tree(tree_text: str) -> tuple[Path, list[Path]]: return tmp_path, files return _create_tree + + +@pytest.fixture +def repository_root() -> Path: + return Path(__file__).parent.parent.resolve() + + +@pytest.fixture +def assets_tmpdir(repository_root: Path, tmp_path: Path) -> Path: + assets_dir = repository_root / "tests" / "assets" + _assets_tmpdir = tmp_path / "assets" + + def ignore_fn(src, names): + return [".git"] + + shutil.copytree(assets_dir, _assets_tmpdir, ignore=ignore_fn) + return _assets_tmpdir.resolve() + + +@pytest.fixture +def precommit_test(repository_root: Path, assets_tmpdir: Path): + def git(*args: str): + if not os.getcwd().startswith("/tmp"): + RuntimeError("This must be used in a temporary directory.") + subprocess.check_call(("git",) + args) + + def _precommit_test(hookid: str, fixture: str) -> tuple[int, str, str]: + component_dir = assets_tmpdir / fixture + os.chdir(component_dir) + git("init") + git("add", ".") + cmd = ["pre-commit", "try-repo", str(repository_root), hookid, "-a", "-v"] + result = subprocess.run(cmd, capture_output=True) + return result.returncode, result.stdout.decode(), result.stderr.decode() + + return _precommit_test diff --git a/tests/test_linkchecker.py b/tests/test_linkchecker.py index 167c21bf..116a56a5 100644 --- a/tests/test_linkchecker.py +++ b/tests/test_linkchecker.py @@ -45,21 +45,32 @@ class Scenario(NamedTuple): {COMMON_TREE} """, component_dir="A", - exit_code=0, + exit_code=1, output=f""" - {HEADER_ERROR} - docs/index.md:3:[invalid](site:A/docs/guides/NOEXIT.md) - docs/index.md:4:[invalid](site:A/docs/reference/NOEXIT.md) + {HEADER_ERROR.format(n=2)} + docs/index.md:3:site:A/docs/guides/NOEXIT.md + docs/index.md:4:site:A/docs/reference/NOEXIT.md """, ), ] @pytest.mark.parametrize("case", cases) -def test_linkchecker_main(case, create_tree, capsys): +def test_linkchecker_main(case: Scenario, create_tree, capsys): """Test checking all files in the docs directory.""" basedir, files = create_tree(case.tree) exit_code = linkchecker(str(basedir / case.component_dir), files) out, err = capsys.readouterr() assert exit_code == case.exit_code assert out.strip() == dedent(case.output).strip() + + +def test_precommit_hook(precommit_test): + exitcode, stdout, stderr = precommit_test( + hookid="linkchecker", fixture="invalid_links/component-a" + ) + print(stdout) + assert exitcode == 1 + assert HEADER_ERROR.format(n=2) in stdout + assert "site:component-a/NOEXIST1" in stdout + assert "site:component-a/NOEXIST2" in stdout