"""Check ``site:`` links in markdown files against the files on disk.

A ``site:`` link has the form ``site:<component>/<path>``.  Only links that
point into the component being checked are validated; links into other
components are ignored because their sources are not available locally.
"""

import argparse
import os
import re
import sys
from typing import NamedTuple

HEADER_ERROR = "Found {n} broken links:"

# Compiled once at import time instead of once per scanned line.
# Inline links: [text](site:path)
_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\((site:[^\)]+)\)")
# Reference links: [ref]: site:path
_REFERENCE_LINK_RE = re.compile(r"\[[^\]]+\]:\s*(site:[^\s]+)")


class LinkError(NamedTuple):
    """A broken ``site:`` link together with the place it was found."""

    link_target: str  # the link as written in the source, "site:" prefix included
    src_filename: str  # path of the scanned file, exactly as passed by the caller
    src_line: str  # full text of the offending line
    src_lineno: int  # zero-based line index; report_errors adds 1 for display


def linkchecker(component_rootdir: str, filenames: list[str]) -> int:
    """Check every file in *filenames* and print a report of broken links.

    Args:
        component_rootdir: Root directory of the component being checked.  Its
            last path segment is the component name used in ``site:`` links.
        filenames: Markdown files to scan.

    Returns:
        ``1`` if at least one broken link was found, ``0`` otherwise, so the
        value can be used directly as a process exit code.
    """
    cumulative_errors: list[LinkError] = []
    for file in filenames:
        cumulative_errors.extend(check_file(component_rootdir, file))
    report_errors(link_errors=cumulative_errors, component_rootdir=component_rootdir)
    return 1 if cumulative_errors else 0


def check_file(component_rootdir: str, src_filename: str) -> list[LinkError]:
    """Return one ``LinkError`` per broken ``site:`` link in *src_filename*.

    Paths that are not regular files (missing, directories, or names that only
    exist with an added ``.md`` suffix) yield an empty list instead of failing
    when opened.
    """
    # file_exists() is deliberately lenient about the ".md" suffix, which is
    # right for link targets but wrong here: we must be able to open this path.
    if not os.path.isfile(src_filename):
        return []

    link_errors: list[LinkError] = []
    # Explicit encoding keeps results independent of the platform locale;
    # errors="replace" keeps a stray undecodable byte from aborting the run.
    with open(src_filename, "r", encoding="utf-8", errors="replace") as fd:
        for src_lineno, src_line in enumerate(fd):
            link_errors.extend(
                LinkError(
                    link_target=link_target,
                    src_filename=src_filename,
                    src_line=src_line,
                    src_lineno=src_lineno,
                )
                for link_target in check_line(src_line, component_rootdir)
            )
    return link_errors


def check_line(line: str, basedir: str) -> list[str]:
    """Return the ``site:`` links in *line* whose target file does not exist.

    Args:
        line: A single line of markdown.
        basedir: Root directory of the component; links are resolved relative
            to its parent directory.

    Returns:
        The broken links in their original ``site:...`` form.
    """
    invalid_links = []
    # abspath() normalises "." and trailing separators, which would otherwise
    # make os.path.split() return an empty or meaningless component name.
    relative_path, component_name = os.path.split(os.path.abspath(basedir))

    for original_link in get_links(line):
        link = original_link.removeprefix("site:")
        # Filter out external component links
        if not link.startswith(f"{component_name}/"):
            continue
        # A "#fragment" addresses a heading inside the page, not a file on disk.
        link_path = link.partition("#")[0]
        link_target = os.path.join(relative_path, link_path)
        if not file_exists(link_target):
            # Store original site: format
            invalid_links.append(original_link)
    return invalid_links


def get_links(line: str) -> list[str]:
    """Extract ``site:`` links from a markdown line.

    Inline links are returned first, followed by reference-style links, each
    group in order of appearance.
    """
    links = [match.group(2) for match in _INLINE_LINK_RE.finditer(line)]
    links.extend(match.group(1) for match in _REFERENCE_LINK_RE.finditer(line))
    return links


def file_exists(file: str) -> bool:
    """Check if a path exists, treating the ``.md`` extension as optional."""
    # fspath() lets callers pass pathlib.Path objects, which do not support "+".
    path = os.fspath(file)
    return os.path.exists(path) or os.path.exists(path + ".md")


def report_errors(link_errors: list[LinkError], component_rootdir: str):
    """Print link errors to stdout as ``<relative file>:<line>:<link>``.

    Nothing is printed when *link_errors* is empty.
    """
    if not link_errors:
        return
    print(HEADER_ERROR.format(n=len(link_errors)))
    for error in link_errors:
        filename = os.path.relpath(error.src_filename, component_rootdir)
        lineno = error.src_lineno + 1  # stored zero-based, shown one-based
        print(f"{filename}:{lineno}:{error.link_target}")


def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Argument list to parse; ``None`` (the default) reads
            ``sys.argv[1:]``.

    Returns:
        A ``(basedir, files)`` tuple where *basedir* is an absolute path to an
        existing directory.  Invalid input terminates via ``parser.error``.
    """
    parser = argparse.ArgumentParser(description="Check markdown links")
    parser.add_argument(
        "--basedir",
        default=".",
        help="Base directory for link checking (default: current directory)",
    )
    parser.add_argument("files", nargs="+", help="Markdown files to check")
    args = parser.parse_args(argv)

    # Shell-expand and normalize the basedir path
    basedir = os.path.expanduser(args.basedir)
    basedir = os.path.expandvars(basedir)
    basedir = os.path.abspath(basedir)

    if not os.path.exists(basedir):
        parser.error(f"basedir does not exist: {basedir}")
    if not os.path.isdir(basedir):
        parser.error(f"basedir is not a directory: {basedir}")

    return basedir, args.files


def main(argv=None):
    """CLI entry point for the linkchecker command."""
    basedir, files = parse_arguments(argv)
    # sys.exit, not the builtin exit(): the latter is injected by the `site`
    # module and is absent under `python -S` or in frozen builds.
    sys.exit(linkchecker(basedir, files))


if __name__ == "__main__":
    main()
"""Shared pytest fixtures for building throw-away file trees and running hooks."""

import shutil
import subprocess
from pathlib import Path
from textwrap import dedent

import pytest


def split_tree(tree_text: str) -> list[tuple[str, str]]:
    """Split a tree description into ``(filename, content)`` pairs.

    Each file starts with a ``=== <filename>`` header line; everything up to
    the next header is its content.  Files without content get a placeholder
    so they are never written empty.
    """
    entries = []
    for fragment in dedent(tree_text).split("=== "):
        if not fragment.strip():
            continue
        # partition() tolerates a header with no trailing newline, where
        # unpacking the result of split("\n", maxsplit=1) would raise.
        filename, _, content = fragment.partition("\n")
        content = content.strip() or f"Placeholder for {filename=}"
        entries.append((filename.strip(), content))
    return entries


def resolve_inside(root: Path, relative: str) -> Path:
    """Resolve *relative* under *root*, refusing paths that escape *root*.

    Raises:
        RuntimeError: If the resolved path is not located inside *root*.
    """
    root = root.resolve()
    target = (root / relative).resolve()
    if not target.is_relative_to(root):
        raise RuntimeError(f"{target} is outside the temporary directory {root}")
    return target


@pytest.fixture
def create_file(tmp_path: Path):
    """Return a factory that writes a dedented file below ``tmp_path``."""

    def _create_file(filename: str, content: str) -> Path:
        if not filename:
            raise ValueError("filename can't be empty")
        newfile = tmp_path / filename
        newfile.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so fixtures do not depend on the platform locale.
        newfile.write_text(dedent(content), encoding="utf-8")
        return newfile

    return _create_file


@pytest.fixture
def create_tree(create_file, tmp_path):
    """Return a factory that materialises a ``=== filename`` tree description.

    The factory returns ``(tmp_path, created_files)``.
    """

    def _create_tree(tree_text: str) -> tuple[Path, list[Path]]:
        files = [
            create_file(filename, content)
            for filename, content in split_tree(tree_text)
        ]
        return tmp_path, files

    return _create_tree


@pytest.fixture
def repository_root() -> Path:
    """Absolute path of the repository (the parent of this tests directory)."""
    return Path(__file__).parent.parent.resolve()


@pytest.fixture
def assets_tmpdir(repository_root: Path, tmp_path: Path) -> Path:
    """Copy ``tests/assets`` into ``tmp_path`` and return the resolved copy."""
    assets_dir = repository_root / "tests" / "assets"
    _assets_tmpdir = tmp_path / "assets"
    shutil.copytree(assets_dir, _assets_tmpdir, ignore=shutil.ignore_patterns(".git"))
    return _assets_tmpdir.resolve()


@pytest.fixture
def precommit_test(repository_root: Path, assets_tmpdir: Path):
    """Return a runner that executes a hook of this repository via pre-commit.

    The runner takes a hook id and the name of an asset directory, and returns
    ``(returncode, stdout, stderr)`` of ``pre-commit try-repo``.
    """

    def _precommit_test(hookid: str, fixture: str) -> tuple[int, str, str]:
        # `git init` / `git add .` must never touch a real checkout, so the
        # working directory is verified to live inside the temporary copy.
        component_dir = resolve_inside(assets_tmpdir, fixture)
        # cwd= is used instead of os.chdir() so the process-wide working
        # directory is not changed for tests that run afterwards.
        subprocess.check_call(("git", "init"), cwd=component_dir)
        subprocess.check_call(("git", "add", "."), cwd=component_dir)
        cmd = ["pre-commit", "try-repo", str(repository_root), hookid, "-a", "-v"]
        result = subprocess.run(cmd, capture_output=True, cwd=component_dir)
        return result.returncode, result.stdout.decode(), result.stderr.decode()

    return _precommit_test
def test_create_tree(create_tree):
    """The create_tree fixture writes each ``===`` section to its own file."""
    tree = """
    === docs/index.md
    # hello world

    [foo](bar.md)

    === docs/foo.md
    This is the foo file
    """
    rootdir, files = create_tree(tree)
    # Keys use forward slashes on every platform so they match the literals below.
    created_file_content = {
        file.relative_to(rootdir).as_posix(): file.read_text()
        for file in rootdir.rglob("*")
        if file.is_file()
    }

    assert len(files) == 2
    # Directory iteration order is filesystem-dependent, so compare sorted names.
    assert sorted(created_file_content) == ["docs/foo.md", "docs/index.md"]

    assert "[foo](bar.md)" in created_file_content["docs/index.md"]
    assert "[foo](bar.md)" not in created_file_content["docs/foo.md"]

    assert "This is the foo file" in created_file_content["docs/foo.md"]
    assert "This is the foo file" not in created_file_content["docs/index.md"]