From 503eec25aa324124beb38ee1d24268415e0302ca Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 9 Mar 2026 13:05:48 +0100 Subject: [PATCH] Add Fermi sanity check validation gate for plan assumptions (#88) Rule-based validation that audits distilled assumptions for common failure modes: missing quantification, implausibly wide ranges (>100x span ratio), and low-confidence claims without evidence references. Runs as a pipeline task after DistillAssumptionsTask with no LLM calls. Co-Authored-By: Claude Opus 4.6 --- worker_plan/worker_plan_api/filenames.py | 2 + .../plan/run_plan_pipeline.py | 45 +++ .../validation/__init__.py | 0 .../validation/fermi_sanity_check.py | 265 ++++++++++++++++++ 4 files changed, 312 insertions(+) create mode 100644 worker_plan/worker_plan_internal/validation/__init__.py create mode 100644 worker_plan/worker_plan_internal/validation/fermi_sanity_check.py diff --git a/worker_plan/worker_plan_api/filenames.py b/worker_plan/worker_plan_api/filenames.py index 82abbea8..61a85ade 100644 --- a/worker_plan/worker_plan_api/filenames.py +++ b/worker_plan/worker_plan_api/filenames.py @@ -37,6 +37,8 @@ class FilenameEnum(str, Enum): REVIEW_ASSUMPTIONS_MARKDOWN = "003-9-review_assumptions.md" CONSOLIDATE_ASSUMPTIONS_FULL_MARKDOWN = "003-10-consolidate_assumptions_full.md" CONSOLIDATE_ASSUMPTIONS_SHORT_MARKDOWN = "003-11-consolidate_assumptions_short.md" + FERMI_SANITY_CHECK_RAW = "003-12-fermi_sanity_check_raw.json" + FERMI_SANITY_CHECK_MARKDOWN = "003-13-fermi_sanity_check.md" PRE_PROJECT_ASSESSMENT_RAW = "004-1-pre_project_assessment_raw.json" PRE_PROJECT_ASSESSMENT = "004-2-pre_project_assessment.json" PROJECT_PLAN_RAW = "005-1-project_plan_raw.json" diff --git a/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py b/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py index fa5e8dbd..ea9f72f8 100644 --- a/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py +++ b/worker_plan/worker_plan_internal/plan/run_plan_pipeline.py 
@@ -35,6 +35,7 @@ from worker_plan_internal.assume.make_assumptions import MakeAssumptions from worker_plan_internal.assume.distill_assumptions import DistillAssumptions from worker_plan_internal.assume.review_assumptions import ReviewAssumptions +from worker_plan_internal.validation.fermi_sanity_check import run_fermi_sanity_check from worker_plan_internal.assume.shorten_markdown import ShortenMarkdown from worker_plan_internal.expert.pre_project_assessment import PreProjectAssessment from worker_plan_internal.plan.project_plan import ProjectPlan @@ -1020,6 +1021,49 @@ def run_with_llm(self, llm: LLM) -> None: review_assumptions.save_markdown(str(output_markdown_path)) +class FermiSanityCheckTask(PlanTask): + """ + Rule-based validation gate for plan assumptions. + No LLM calls β€” runs fast and catches common failure modes. + """ + def requires(self): + return { + 'distill_assumptions': self.clone(DistillAssumptionsTask), + } + + def output(self): + return { + 'raw': self.local_target(FilenameEnum.FERMI_SANITY_CHECK_RAW), + 'markdown': self.local_target(FilenameEnum.FERMI_SANITY_CHECK_MARKDOWN), + } + + def run_inner(self): + # Read the distilled assumptions JSON. + with self.input()['distill_assumptions']['raw'].open("r") as f: + raw_data = json.loads(f.read()) + + # Extract assumption_list from the response field. + response = raw_data.get("response", {}) + assumption_list = response.get("assumption_list", []) + + if not assumption_list: + logger.warning("No assumptions found for Fermi sanity check.") + + report = run_fermi_sanity_check(assumption_list) + + # Write outputs. 
+ report.save_raw(self.output()['raw'].path) + report.save_markdown(self.output()['markdown'].path) + + logger.info( + "Fermi sanity check: %s (pass=%d, warn=%d, fail=%d)", + report.overall_result.value, + report.pass_count, + report.warn_count, + report.fail_count, + ) + + class ConsolidateAssumptionsMarkdownTask(PlanTask): """ Combines multiple small markdown documents into a single big document. @@ -3757,6 +3801,7 @@ def requires(self): 'make_assumptions': self.clone(MakeAssumptionsTask), 'assumptions': self.clone(DistillAssumptionsTask), 'review_assumptions': self.clone(ReviewAssumptionsTask), + 'fermi_sanity_check': self.clone(FermiSanityCheckTask), 'consolidate_assumptions_markdown': self.clone(ConsolidateAssumptionsMarkdownTask), 'pre_project_assessment': self.clone(PreProjectAssessmentTask), 'project_plan': self.clone(ProjectPlanTask), diff --git a/worker_plan/worker_plan_internal/validation/__init__.py b/worker_plan/worker_plan_internal/validation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/worker_plan/worker_plan_internal/validation/fermi_sanity_check.py b/worker_plan/worker_plan_internal/validation/fermi_sanity_check.py new file mode 100644 index 00000000..575249f3 --- /dev/null +++ b/worker_plan/worker_plan_internal/validation/fermi_sanity_check.py @@ -0,0 +1,265 @@ +""" +Rule-based validation gate for plan assumptions. + +Audits assumptions from the pipeline for common failure modes: +- Missing quantification (no numbers/bounds) +- Implausibly wide spans (upper/lower ratio > 100x) +- Missing evidence for low-confidence claims +- Budget/timeline/team values that look ungrounded + +This is a pure rule-based check β€” no LLM calls. It runs fast and catches +the most common failure modes in LLM-generated assumptions. 
+ See: docs/proposals/88-fermi-sanity-check-validation-gate.md + PROMPT> python -m worker_plan_internal.validation.fermi_sanity_check +""" +import json +import logging +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Optional +from enum import Enum + +logger = logging.getLogger(__name__) + + +class CheckResult(str, Enum): + PASS = "pass" + WARN = "warn" + FAIL = "fail" + + +@dataclass +class AssumptionCheck: + """Result of validating a single assumption.""" + assumption_index: int + assumption_text: str + result: CheckResult + checks: List[dict] = field(default_factory=list) + + def add_check(self, name: str, result: CheckResult, detail: str) -> None: + self.checks.append({"name": name, "result": result.value, "detail": detail}) + # Promote the overall result to the worst seen. + if result == CheckResult.FAIL: + self.result = CheckResult.FAIL + elif result == CheckResult.WARN and self.result != CheckResult.FAIL: + self.result = CheckResult.WARN + + +# Regex for numbers (integers, decimals, with optional currency/unit suffixes). +_NUMBER_PATTERN = re.compile( + r'\b\d[\d,]*\.?\d*\s*(?:%|USD|EUR|GBP|JPY|CNY|INR|BRL|CHF|CAD|AUD|' + r'million|billion|thousand|hundred|M|B|K|k|' + r'years?|months?|weeks?|days?|hours?|' + r'people|persons?|FTEs?|employees?|staff|' + r'km|miles?|m|meters?|feet|' + r'kg|lbs?|tons?|tonnes?|' + r'units?|items?|pieces?|' + r'sq\s*m|sq\s*ft|hectares?|acres?)?\b', + re.IGNORECASE, +) + +# Patterns that suggest a range (two values forming bounds). +# NOTE: separator is an alternation, not a character class β€” '[-–—to]+' would +# wrongly accept any run of the letters 't'/'o' (e.g. "5 o 10") as a range. +_RANGE_PATTERN = re.compile( + r'\d[\d,]*\.?\d*\s*(?:[-–—]|to)\s*\d[\d,]*\.?\d*', + re.IGNORECASE, +) + +# Weak confidence markers.
+_LOW_CONFIDENCE_MARKERS = [ + "approximately", "roughly", "around", "about", + "estimated", "estimate", "assumed", "assuming", + "unclear", "uncertain", "unknown", + "best guess", "placeholder", "tbd", "to be determined", +] + + +def _has_quantification(text: str) -> bool: + """True if the assumption contains at least one number with context.""" + return bool(_NUMBER_PATTERN.search(text)) + + +def _has_range(text: str) -> bool: + """True if the assumption specifies a range (lower-upper bounds).""" + return bool(_RANGE_PATTERN.search(text)) + + +def _extract_range_ratio(text: str) -> Optional[float]: + """Extract the ratio of upper/lower from the first range found.""" + match = _RANGE_PATTERN.search(text) + if not match: + return None + nums = re.findall(r'\d[\d,]*\.?\d*', match.group()) + if len(nums) < 2: + return None + try: + lower = float(nums[0].replace(",", "")) + upper = float(nums[1].replace(",", "")) + if lower <= 0: + return None + return upper / lower + except (ValueError, ZeroDivisionError): + return None + + +def _has_low_confidence_marker(text: str) -> bool: + text_lower = text.lower() + return any(marker in text_lower for marker in _LOW_CONFIDENCE_MARKERS) + + +def _has_evidence_reference(text: str) -> bool: + """True if the text references evidence, sources, or data.""" + evidence_markers = [ + "based on", "according to", "data shows", "research", + "study", "survey", "report", "analysis", "benchmark", + "historical", "comparable", "industry standard", + "source:", "ref:", "citation", + ] + text_lower = text.lower() + return any(marker in text_lower for marker in evidence_markers) + + +def validate_assumption(index: int, text: str) -> AssumptionCheck: + """Run all validation rules on a single assumption.""" + check = AssumptionCheck( + assumption_index=index, + assumption_text=text, + result=CheckResult.PASS, + ) + + # Rule 1: Quantification present + if not _has_quantification(text): + check.add_check( + "quantification", + CheckResult.WARN, + "No 
numeric values found. Consider adding specific numbers, bounds, or ranges.", + ) + else: + check.add_check("quantification", CheckResult.PASS, "Contains numeric values.") + + # Rule 2: Range bounds (only if quantified) + if _has_quantification(text) and _has_range(text): + ratio = _extract_range_ratio(text) + if ratio is not None: + if ratio > 100: + check.add_check( + "span_ratio", + CheckResult.FAIL, + f"Range span ratio is {ratio:.0f}x (>100x). Bounds are implausibly wide.", + ) + elif ratio > 50: + check.add_check( + "span_ratio", + CheckResult.WARN, + f"Range span ratio is {ratio:.0f}x (>50x). Consider tightening bounds.", + ) + else: + check.add_check("span_ratio", CheckResult.PASS, f"Range span ratio is {ratio:.1f}x.") + + # Rule 3: Low confidence without evidence + if _has_low_confidence_marker(text) and not _has_evidence_reference(text): + check.add_check( + "evidence_for_uncertainty", + CheckResult.WARN, + "Low-confidence language detected but no evidence reference. Consider citing a source.", + ) + + return check + + +@dataclass +class FermiSanityCheckReport: + """Full validation report for a set of assumptions.""" + checks: List[AssumptionCheck] + + @property + def total(self) -> int: + return len(self.checks) + + @property + def pass_count(self) -> int: + return sum(1 for c in self.checks if c.result == CheckResult.PASS) + + @property + def warn_count(self) -> int: + return sum(1 for c in self.checks if c.result == CheckResult.WARN) + + @property + def fail_count(self) -> int: + return sum(1 for c in self.checks if c.result == CheckResult.FAIL) + + @property + def pass_rate_pct(self) -> float: + if self.total == 0: + return 100.0 + return (self.pass_count / self.total) * 100.0 + + @property + def overall_result(self) -> CheckResult: + if self.fail_count > 0: + return CheckResult.FAIL + if self.warn_count > 0: + return CheckResult.WARN + return CheckResult.PASS + + def to_dict(self) -> dict: + return { + "overall_result": self.overall_result.value, + 
"pass_rate_pct": round(self.pass_rate_pct, 1), + "total": self.total, + "pass_count": self.pass_count, + "warn_count": self.warn_count, + "fail_count": self.fail_count, + "checks": [ + { + "index": c.assumption_index, + "text": c.assumption_text, + "result": c.result.value, + "checks": c.checks, + } + for c in self.checks + ], + } + + def save_raw(self, file_path: str) -> None: + Path(file_path).write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8") + + def save_markdown(self, file_path: str) -> None: + lines = ["# Fermi Sanity Check Report\n"] + lines.append(f"**Overall Result:** {self.overall_result.value}") + lines.append(f"**Pass Rate:** {self.pass_rate_pct:.1f}%") + lines.append(f"**Summary:** {self.pass_count} pass, {self.warn_count} warn, {self.fail_count} fail out of {self.total}\n") + + for check in self.checks: + icon = {"pass": "OK", "warn": "WARN", "fail": "FAIL"}[check.result.value] + lines.append(f"## [{icon}] Assumption {check.assumption_index + 1}\n") + lines.append(f"> {check.assumption_text}\n") + for c in check.checks: + result_icon = {"pass": "OK", "warn": "WARN", "fail": "FAIL"}[c["result"]] + lines.append(f"- **{c['name']}** [{result_icon}]: {c['detail']}") + lines.append("") + + Path(file_path).write_text("\n".join(lines), encoding="utf-8") + + +def run_fermi_sanity_check(assumptions: List[str]) -> FermiSanityCheckReport: + """Validate a list of assumption strings and return the report.""" + checks = [validate_assumption(i, text) for i, text in enumerate(assumptions)] + return FermiSanityCheckReport(checks=checks) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + sample_assumptions = [ + "The project budget is estimated at 500,000-2,000,000 EUR over 3 years.", + "Approximately 15 team members will be needed, though this is unclear.", + "The timeline is roughly 18 months based on industry benchmarks for similar projects.", + "Stakeholder engagement will be sufficient.", + "Infrastructure costs range from 
1,000 to 500,000 USD depending on scale.", + ] + + report = run_fermi_sanity_check(sample_assumptions) + print(json.dumps(report.to_dict(), indent=2))