Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ on:
paths:
- 'docs/src/content/docs/progress/**'
- 'docs/astro.config.mjs'
- 'scripts/benchmark_manifest_ops.py'
pull_request:
paths: ['docs/**']
paths:
- 'docs/**'
- 'scripts/benchmark_manifest_ops.py'
- '.github/workflows/docs.yml'
workflow_dispatch:

permissions:
Expand Down Expand Up @@ -49,6 +53,9 @@ jobs:
working-directory: ./docs
run: npm ci

- name: Generate manifest benchmark results
run: python3 scripts/benchmark_manifest_ops.py --work-dir "$RUNNER_TEMP" --markdown --update-doc docs/src/content/docs/progress/autoloop-go-migration.mdx

- name: Build documentation
working-directory: ./docs
run: npm run build
Expand Down
14 changes: 9 additions & 5 deletions docs/src/content/docs/progress/autoloop-go-migration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -201,15 +201,19 @@ Autoloop tracks `python_lines_migrated_pct = (migrated_python_lines / original_p

### Manifest operations benchmark (`scripts/benchmark_manifest_ops.py`)

The script `scripts/benchmark_manifest_ops.py` exists in the repository. A local run was attempted but could not complete (permission denied in the sandbox environment). Results from the previous documented run (2026-05-13) are shown below for reference; re-run locally to get current values.
`scripts/benchmark_manifest_ops.py` runs in CI with scratch space rooted in the runner temp directory, and the docs workflow regenerates this block before publishing to GitHub Pages.

{/* benchmark_manifest_ops:start */}
The table below is generated by `scripts/benchmark_manifest_ops.py` during the docs build.

| Scale | `check_collision` speedup | `sync_remove_files` speedup | `cleanup_empty_parents` speedup | Scoped uninstall speedup |
|---|---:|---:|---:|---:|
| Current: 10 pkgs, 50 paths | 18.1x | 0.8x | 0.7x | 1.4x |
| Growing: 50 pkgs, 250 paths | 17.4x | 1.6x | 0.5x | 12.2x |
| Large monorepo: 100 pkgs, 2,000 paths | 1,606.6x | 2.2x | 0.6x | 26.0x |
| Current: 10 pkgs, 50 paths | 22.1x | 1.8x | 1.0x | 2.0x |
| Growing: 50 pkgs, 250 paths | 55.4x | 2.1x | 0.5x | 9.2x |
| Large monorepo: 100 pkgs, 2,000 paths | 1018.7x | 1.7x | 0.5x | 17.8x |

`cleanup_empty_parents` shows a small regression at scale (0.5x-0.9x) because the batch bottom-up algorithm has higher constant overhead than the legacy per-file walk-up at low deleted-file counts. This is expected and acceptable given the gains on the other three operations.
`cleanup_empty_parents` may show a small regression at low deleted-file counts because the batch bottom-up algorithm has higher constant overhead than the legacy per-file walk-up. This is expected and acceptable given the gains on the other three operations.
{/* benchmark_manifest_ops:end */}

### Go build/test validation

Expand Down
192 changes: 147 additions & 45 deletions scripts/benchmark_manifest_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
- Optimization 4: Scoped uninstall file set (removed packages only)

Usage:
uv run python scripts/benchmark_manifest_ops.py
python3 scripts/benchmark_manifest_ops.py
python3 scripts/benchmark_manifest_ops.py --work-dir "$RUNNER_TEMP" --markdown --update-doc docs/src/content/docs/progress/autoloop-go-migration.mdx
"""

import os
import argparse
import shutil
import tempfile
import time
import sys
import shutil
from pathlib import Path

# ---------------------------------------------------------------------------
Expand All @@ -35,6 +35,14 @@
PACKAGES = 50
FILES_PER_PACKAGE = 5
INTEGRATOR_TYPES = 6 # prompts, agents-gh, agents-cl, commands, skills, hooks
DOC_START = "{/* benchmark_manifest_ops:start */}"
DOC_END = "{/* benchmark_manifest_ops:end */}"
HELP_DESCRIPTION = "Benchmark manifest-based collision detection and sync operations."
SCALES = [
("Current (10 pkgs x 5 files = 50 paths)", "Current: 10 pkgs, 50 paths", 10, 5),
("Growing (50 pkgs x 5 files = 250 paths)", "Growing: 50 pkgs, 250 paths", 50, 5),
("Large monorepo (100 pkgs x 20 files = 2000 paths)", "Large monorepo: 100 pkgs, 2,000 paths", 100, 20),
]


def build_managed_files(n_packages: int, files_per_pkg: int) -> set:
Expand All @@ -53,9 +61,7 @@ def build_managed_files(n_packages: int, files_per_pkg: int) -> set:

def check_collision_OLD(rel_path: str, managed_files: set) -> bool:
    """Original O(M) per call — rebuilds normalized set.

    Returns True when *rel_path* does NOT collide with a managed path
    (i.e. deployment may proceed). Path separators are normalized to
    forward slashes on every call, which is the cost being benchmarked.
    """
    normalized_managed = {entry.replace("\\", "/") for entry in managed_files}
    candidate = rel_path.replace("\\", "/")
    return candidate not in normalized_managed


# ---------------------------------------------------------------------------
Expand All @@ -68,9 +74,7 @@ def normalize_managed_files(managed_files: set) -> set:

def check_collision_NEW(rel_path: str, managed_files_normalized: set) -> bool:
    """Optimized O(1) lookup against pre-normalized set.

    Returns True when *rel_path* is absent from the pre-normalized set
    (no collision); only the single candidate path is normalized here.
    """
    candidate = rel_path.replace("\\", "/")
    return candidate not in managed_files_normalized


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -125,39 +129,57 @@ def timeit(fn, *args, iterations: int = 1000) -> float:
return (time.perf_counter() - start) * 1000


def run_benchmarks():
print("=" * 72)
print("APM Manifest Operations Benchmark")
print("=" * 72)
def _make_temp_dir(work_dir: Path | None) -> Path:
"""Create benchmark scratch space inside a caller-approved directory."""
if work_dir is not None:
work_dir.mkdir(parents=True, exist_ok=True)
return Path(
tempfile.mkdtemp(
prefix="apm-manifest-ops-",
dir=str(work_dir) if work_dir is not None else None,
)
)


for scale_label, n_pkgs, n_files in [
("Current (10 pkgs × 5 files = 50 paths)", 10, 5),
("Growing (50 pkgs × 5 files = 250 paths)", 50, 5),
("Large monorepo (100 pkgs × 20 files = 2000 paths)", 100, 20),
]:
def _format_speedup(speedup: float) -> str:
if speedup == float("inf"):
return "inf x"
return f"{speedup:.1f}x"


def run_benchmarks(work_dir: Path | None = None, emit_text: bool = True) -> list[dict[str, str]]:
results = []
if emit_text:
print("=" * 72)
print("APM Manifest Operations Benchmark")
print("=" * 72)

for scale_label, markdown_label, n_pkgs, n_files in SCALES:
managed = build_managed_files(n_pkgs, n_files)
M = len(managed)
print(f"\n{'─' * 72}")
print(f"Scale: {scale_label} (M={M})")
print(f"{'─' * 72}")
if emit_text:
print(f"\n{'─' * 72}")
print(f"Scale: {scale_label} (M={M})")
print(f"{'─' * 72}")

# -- Benchmark 1: check_collision ----------------------------------
#
# Simulate: P=n_pkgs packages × F=n_files files × I=6 integrators
# Each call does one collision check.
calls = n_pkgs * n_files * INTEGRATOR_TYPES
test_path = f".github/prompts/pkg-0-file-0.md"
test_path = ".github/prompts/pkg-0-file-0.md"

old_time = timeit(check_collision_OLD, test_path, managed, iterations=calls)
normalized = normalize_managed_files(managed)
norm_time = timeit(normalize_managed_files, managed, iterations=1)
new_time = norm_time + timeit(check_collision_NEW, test_path, normalized, iterations=calls)

print(f"\n check_collision ({calls:,} calls):")
print(f" OLD (set rebuild per call): {old_time:>8.2f} ms")
print(f" NEW (pre-normalized O(1)): {new_time:>8.2f} ms")
speedup = old_time / new_time if new_time > 0 else float("inf")
print(f" Speedup: {speedup:>8.1f}×")
if emit_text:
print(f"\n check_collision ({calls:,} calls):")
print(f" OLD (set rebuild per call): {old_time:>8.2f} ms")
print(f" NEW (pre-normalized O(1)): {new_time:>8.2f} ms")
print(f" Speedup: {speedup:>8.1f}x")

# -- Benchmark 2: sync_remove_files --------------------------------
#
Expand All @@ -182,11 +204,12 @@ def run_benchmarks():
sync_remove_new(buckets[prefix])
new_sync += (time.perf_counter() - t0) * 1000

print(f"\n sync_remove_files ({iters} uninstall cycles × 6 integrators):")
print(f" OLD (6× full-set scan): {old_sync:>8.2f} ms")
print(f" NEW (pre-partitioned): {new_sync:>8.2f} ms")
speedup2 = old_sync / new_sync if new_sync > 0 else float("inf")
print(f" Speedup: {speedup2:>8.1f}×")
if emit_text:
print(f"\n sync_remove_files ({iters} uninstall cycles x 6 integrators):")
print(f" OLD (6x full-set scan): {old_sync:>8.2f} ms")
print(f" NEW (pre-partitioned): {new_sync:>8.2f} ms")
print(f" Speedup: {speedup2:>8.1f}x")

# -- Benchmark 3: empty-parent cleanup ----------------------------
#
Expand All @@ -208,7 +231,7 @@ def _make_tree(base: Path, count: int, nest: int):
return paths

# OLD: per-file walk-up
tmp_old = Path(tempfile.mkdtemp())
tmp_old = _make_temp_dir(work_dir)
try:
files_old = _make_tree(tmp_old, n_deleted, depth)
for f in files_old:
Expand All @@ -230,7 +253,7 @@ def _make_tree(base: Path, count: int, nest: int):
shutil.rmtree(tmp_old, ignore_errors=True)

# NEW: batch bottom-up
tmp_new = Path(tempfile.mkdtemp())
tmp_new = _make_temp_dir(work_dir)
try:
files_new = _make_tree(tmp_new, n_deleted, depth)
for f in files_new:
Expand All @@ -253,11 +276,12 @@ def _make_tree(base: Path, count: int, nest: int):
finally:
shutil.rmtree(tmp_new, ignore_errors=True)

print(f"\n cleanup_empty_parents ({n_deleted} deleted files, depth={depth}):")
print(f" OLD (per-file walk-up): {old_parent_ms:>8.2f} ms")
print(f" NEW (batch bottom-up): {new_parent_ms:>8.2f} ms")
speedup3 = old_parent_ms / new_parent_ms if new_parent_ms > 0 else float("inf")
print(f" Speedup: {speedup3:>8.1f}×")
if emit_text:
print(f"\n cleanup_empty_parents ({n_deleted} deleted files, depth={depth}):")
print(f" OLD (per-file walk-up): {old_parent_ms:>8.2f} ms")
print(f" NEW (batch bottom-up): {new_parent_ms:>8.2f} ms")
print(f" Speedup: {speedup3:>8.1f}x")

# -- Benchmark 4: scoped vs. union-all deployed files --------------
#
Expand Down Expand Up @@ -294,15 +318,93 @@ def _make_tree(base: Path, count: int, nest: int):
_ = [p for p in removed_files if p.startswith(prefix)]
new_scope_ms = (time.perf_counter() - t0) * 1000

print(f"\n scoped uninstall set (removing {removed_count}/{n_pkgs} pkgs, {iters4} cycles):")
print(f" OLD (union ALL {len(all_files)} paths): {old_scope_ms:>8.2f} ms")
print(f" NEW (union removed {len(removed_files)} paths): {new_scope_ms:>8.2f} ms")
speedup4 = old_scope_ms / new_scope_ms if new_scope_ms > 0 else float("inf")
print(f" Speedup: {speedup4:>8.1f}×")

print(f"\n{'=' * 72}")
print("Done.")
if emit_text:
print(f"\n scoped uninstall set (removing {removed_count}/{n_pkgs} pkgs, {iters4} cycles):")
print(f" OLD (union ALL {len(all_files)} paths): {old_scope_ms:>8.2f} ms")
print(f" NEW (union removed {len(removed_files)} paths): {new_scope_ms:>8.2f} ms")
print(f" Speedup: {speedup4:>8.1f}x")

results.append(
{
"scale": markdown_label,
"check_collision": _format_speedup(speedup),
"sync_remove_files": _format_speedup(speedup2),
"cleanup_empty_parents": _format_speedup(speedup3),
"scoped_uninstall": _format_speedup(speedup4),
}
)

if emit_text:
print(f"\n{'=' * 72}")
print("Done.")

return results


def render_markdown(results: list[dict[str, str]]) -> str:
    """Render the docs benchmark block (markers included) from result rows.

    Each entry in *results* supplies the ``scale``, ``check_collision``,
    ``sync_remove_files``, ``cleanup_empty_parents`` and
    ``scoped_uninstall`` cells of one table row.
    """
    header = [
        DOC_START,
        "The table below is generated by `scripts/benchmark_manifest_ops.py` during the docs build.",
        "",
        "| Scale | `check_collision` speedup | `sync_remove_files` speedup | `cleanup_empty_parents` speedup | Scoped uninstall speedup |",
        "|---|---:|---:|---:|---:|",
    ]
    rows = [
        f"| {row['scale']} | {row['check_collision']} | {row['sync_remove_files']} | "
        f"{row['cleanup_empty_parents']} | {row['scoped_uninstall']} |"
        for row in results
    ]
    footer = [
        "",
        "`cleanup_empty_parents` may show a small regression at low deleted-file counts because the batch bottom-up algorithm has higher constant overhead than the legacy per-file walk-up. This is expected and acceptable given the gains on the other three operations.",
        DOC_END,
    ]
    return "\n".join(header + rows + footer)


def update_doc(path: Path, markdown: str) -> None:
    """Replace the marker-delimited benchmark block in *path* with *markdown*.

    Raises ValueError when either the start or end marker cannot be found
    (the end marker must appear at or after the start marker).
    """
    original = path.read_text(encoding="utf-8")
    begin = original.find(DOC_START)
    if begin == -1:
        raise ValueError(f"Could not update {path}: missing {DOC_START} marker")
    finish = original.find(DOC_END, begin)
    if finish == -1:
        raise ValueError(f"Could not update {path}: missing {DOC_END} marker")
    finish += len(DOC_END)
    path.write_text(f"{original[:begin]}{markdown}{original[finish:]}", encoding="utf-8")


def parse_args() -> argparse.Namespace:
    """Define and parse the benchmark's command-line interface.

    Returns a namespace with ``work_dir`` (Path or None), ``markdown``
    (bool) and ``update_doc`` (Path or None) attributes.
    """
    parser = argparse.ArgumentParser(description=HELP_DESCRIPTION)
    work_dir_help = "Writable directory for temporary benchmark trees (defaults to Python's temp dir)."
    parser.add_argument("--work-dir", type=Path, default=None, help=work_dir_help)
    markdown_help = "Emit the docs markdown benchmark table instead of the console report."
    parser.add_argument("--markdown", action="store_true", help=markdown_help)
    update_doc_help = "Replace the generated benchmark block in the given docs page."
    parser.add_argument("--update-doc", type=Path, default=None, help=update_doc_help)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # --markdown suppresses the human-readable console report so only the
    # generated table (or the update confirmation) reaches stdout.
    benchmark_results = run_benchmarks(work_dir=args.work_dir, emit_text=not args.markdown)
    wants_markdown = args.markdown or args.update_doc is not None
    if wants_markdown:
        benchmark_markdown = render_markdown(benchmark_results)
        if args.update_doc is None:
            print(benchmark_markdown)
        else:
            update_doc(args.update_doc, benchmark_markdown)
            print(f"Updated {args.update_doc}")
54 changes: 54 additions & 0 deletions tests/scripts/test_benchmark_manifest_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import importlib.util
from pathlib import Path


def load_benchmark_module():
    """Import ``scripts/benchmark_manifest_ops.py`` as a standalone module.

    The script lives outside any package, so it is loaded directly from its
    file path via importlib rather than through a normal import statement.
    """
    repo_root = Path(__file__).resolve().parents[2]
    script_path = repo_root / "scripts" / "benchmark_manifest_ops.py"
    spec = importlib.util.spec_from_file_location("benchmark_manifest_ops", script_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_run_benchmarks_uses_configured_work_dir(tmp_path, monkeypatch):
    """run_benchmarks must confine all temp trees to work_dir and clean them up."""
    benchmark = load_benchmark_module()
    created_paths = []
    # Keep a handle on the real mkdtemp before patching so the wrapper can
    # delegate to it.
    original_mkdtemp = benchmark.tempfile.mkdtemp

    # Wrapper records every temp directory the benchmark creates while
    # preserving mkdtemp's normal behavior (returns a str path).
    def recording_mkdtemp(*args, **kwargs):
        path = Path(original_mkdtemp(*args, **kwargs))
        created_paths.append(path)
        return str(path)

    monkeypatch.setattr(benchmark.tempfile, "mkdtemp", recording_mkdtemp)

    # emit_text=False keeps the console report quiet during the test run.
    results = benchmark.run_benchmarks(work_dir=tmp_path, emit_text=False)

    # One result row per configured scale, in declaration order.
    assert [result["scale"] for result in results] == [
        "Current: 10 pkgs, 50 paths",
        "Growing: 50 pkgs, 250 paths",
        "Large monorepo: 100 pkgs, 2,000 paths",
    ]
    # The benchmark must actually have created scratch trees...
    assert created_paths
    # ...all of them inside the caller-approved work_dir...
    paths_outside_work_dir = [path for path in created_paths if not path.is_relative_to(tmp_path)]
    assert not paths_outside_work_dir, f"Paths outside work dir: {paths_outside_work_dir}"
    # ...and every one must be removed again by the time it returns.
    assert not any(tmp_path.iterdir())

    # The same results must render into the marker-delimited docs block.
    markdown = benchmark.render_markdown(results)
    assert benchmark.DOC_START in markdown
    assert benchmark.DOC_END in markdown
    assert "| Current: 10 pkgs, 50 paths |" in markdown


def test_update_doc_replaces_generated_block(tmp_path):
    """update_doc swaps only the marker-delimited block, leaving the rest intact."""
    benchmark = load_benchmark_module()
    doc_path = tmp_path / "page.mdx"
    stale_block = f"{benchmark.DOC_START}\nold content\n{benchmark.DOC_END}"
    fresh_block = f"{benchmark.DOC_START}\nnew content\n{benchmark.DOC_END}"
    doc_path.write_text(f"before\n{stale_block}\nafter\n")

    benchmark.update_doc(doc_path, fresh_block)

    # Surrounding text is untouched; only the block between markers changed.
    assert doc_path.read_text() == f"before\n{fresh_block}\nafter\n"