From e4527ba7db7b18be24930f65edf0cc6eb4d1705d Mon Sep 17 00:00:00 2001
From: hubertpysklo <hubert@uni.minerva.edu>
Date: Thu, 12 Mar 2026 16:01:26 +0530
Subject: [PATCH] AGENTS.MD

---
 AGENTS.md                                     |  444 +++-
 .../bayesian_boostrapping_results.json        | 1901 +----------------
 2 files changed, 405 insertions(+), 1940 deletions(-)
diff --git a/AGENTS.md b/AGENTS.md
index 3e6c63e..3a90a6c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -11,7 +11,7 @@ reproducible environments** backed by PostgreSQL schema cloning.
 ```
 ┌──────────────────────────┐       ┌──────────────────────┐
 │  Evaluation Client       │       │   Agent Sandbox      │
-│  (prime eval / SDK)      │──────▶│   (Docker container) │
+│  (SDK / notebooks)       │──────▶│   (Docker container) │
 │                          │       │                      │
 │  1. initEnv              │       │  Runs agent code     │
 │  2. startRun             │       │  Makes API calls ──┐ │
@@ -170,26 +170,367 @@ pytest tests/performance/test_box_bench_perf.py -v -s
 pytest tests/integration/ -v
 ```
 
-## Running Evaluations Locally
+## Running Evaluations
 
-```bash
-# 1. Activate the bench environment's venv
-source third_party/prime-environments/environments/agent_diff_bench/.venv/bin/activate
+### Using the SDK (Local, No External Dependencies)
+
+The SDK can fetch test suites directly from the platform — no HuggingFace or
+third-party tooling needed. This is the primary way to run evaluations.
+
+```python
+from agent_diff import AgentDiff, BashExecutorProxy
+
+client = AgentDiff()  # uses AGENT_DIFF_API_KEY and AGENT_DIFF_BASE_URL env vars
+
+# List available test suites
+suites = client.list_test_suites()
+for s in suites.suites:
+    print(f"{s.id}  {s.name}")
+
+# Get a specific suite with its tests
+suite = client.get_test_suite(suite_id="<suite-uuid>", expand=True)
+
+# Run each test
+for test in suite.tests:
+    env = client.init_env(
+        templateService=test.type,
+        templateName=test.template_schema,
+        impersonateUserId=test.impersonate_user_id,
+    )
+    run = client.start_run(envId=env.environmentId)
+    bash = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)
+
+    # --- your agent loop goes here, calling bash.execute(command) ---
+
+    client.evaluate_run(runId=run.runId, expectedOutput=test.expected_output)
+    result = client.get_results_for_run(runId=run.runId)
+    print(f"{test.name}: {'PASS' if result.passed else 'FAIL'} score={result.score}")
+
+    client.delete_env(envId=env.environmentId)
+```
+
+### Using HuggingFace Dataset
+
+Alternatively, load tasks from the published HuggingFace dataset:
+
+```python
+import json
+from agent_diff import AgentDiff, BashExecutorProxy
+from datasets import load_dataset
+client = AgentDiff()
+dataset = load_dataset("hubertmarek/agent-diff-bench", split="test")
+
+for example in dataset:
+    info = json.loads(example["info"])
+    expected = json.loads(example["answer"])
+
+    env = client.init_env(
+        templateService=info["service"],
+        templateName=info["seed_template"],
+        impersonateUserId=info["impersonate_user_id"],
+    )
+    run = client.start_run(envId=env.environmentId)
+    bash = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key)
+
+    # --- your agent loop goes here, calling bash.execute(command) ---
+
+    client.evaluate_run(runId=run.runId, expectedOutput=expected)
+    result = client.get_results_for_run(runId=run.runId)
+    print(f"{example['test_id']}: {'PASS' if result.passed else 'FAIL'} score={result.score}")
 
-# 2. Install the environment package
-cd third_party/prime-environments/environments/agent_diff_bench
-uv pip install -e .
+    client.delete_env(envId=env.environmentId)
+```
+
+See `examples/react_agent_benchmark.ipynb` and `examples/langchain_agent_benchmark.ipynb`
+for full runnable notebook examples.
+
+---
+
+## SessionManager & Isolation Architecture
+
+The isolation system is the core of Agent-Diff. It allows every evaluation run to
+operate on its own independent copy of a service's database without cross-contamination.
+
+### SessionManager (`src/platform/isolationEngine/session.py`)
+
+`SessionManager` wraps a single SQLAlchemy `Engine` and provides scoped sessions
+at two levels:
+
+1. **Meta sessions** — operate on the `public` schema where platform tables live
+   (`TemplateEnvironment`, `RunTimeEnvironment`, `Test`, `TestSuite`, etc.):
+
+   ```python
+   with session_manager.with_meta_session() as session:
+       # session is bound to `public` schema
+       env = session.query(RunTimeEnvironment).filter(...).one()
+   ```
+
+2. **Environment sessions** — operate on an isolated `state_<uuid>` schema that
+   contains the cloned service data for one evaluation run:
+
+   ```python
+   with session_manager.with_session_for_environment(env_id) as session:
+       # session is bound to `state_abc123...` schema
+       # all ORM queries hit the cloned tables for this environment only
+   ```
+
+   Internally this calls `lookup_environment(env_id)` to find the schema name from
+   the `RunTimeEnvironment` table, then uses SQLAlchemy's `schema_translate_map`
+   to redirect all unqualified table references to that schema:
 
-# 3. Run evaluation (from the agent_diff_bench directory)
-uv run prime eval run agent-diff-bench \
-  -m "openai/gpt-5-mini" \
-  -n 5 -r 3 -s \
-  -a '{"agentdiff_api_key": "ad_live_sk_..."}'
+   ```python
+   translated = base_engine.execution_options(schema_translate_map={None: schema})
+   ```
+
+   This means service code (Box, Slack, etc.) never needs to know which schema it's
+   hitting — the ORM models declare tables without a schema, and the engine-level
+   translation handles routing transparently.
+
+### IsolationMiddleware (`src/platform/api/middleware.py`)
+
+`IsolationMiddleware` is the Starlette middleware that sits in front of all
+`/api/env/{env_id}/services/...` requests. It is what connects HTTP requests to
+the correct isolated database session:
+
+1. **Extract `env_id`** from the URL path
+2. **Authenticate** the API key via `get_principal_id()`
+3. **Look up environment** in the meta DB to retrieve `impersonate_user_id`
+4. **Open a scoped DB session** via `session_manager.with_session_for_environment(env_id)`
+5. **Attach to `request.state`** so service handlers can access it:
+   - `request.state.db_session` — SQLAlchemy session scoped to the environment schema
+   - `request.state.environment_id` — the environment UUID string
+   - `request.state.impersonate_user_id` — which user the agent is acting as
+   - `request.state.impersonate_email` — alternative email-based impersonation
+   - `request.state.principal_id` — the authenticated API key owner
+
+Every service route handler accesses these via helper functions like:
+
+```python
+def _session(request: Request) -> Session:
+    return getattr(request.state, "db_session", None)
 ```
 
-Results are saved to: `third_party/prime-environments/environments/agent_diff_bench/eval_results/`
+### How Environments Are Created
+
+When `initEnv` is called:
+
+1. `TemplateManager.resolve_init_template()` finds the template by service+name (or ID)
+2. `CoreIsolationEngine.create_environment()` either claims a pre-built schema from
+   the pool or clones one from the template:
+   - `PoolManager.claim_ready_schema()` — fast path, reuses a pre-built clone
+   - `EnvironmentHandler.clone_schema_from_environment()` — slow path, creates schema +
+     copies tables + copies data from the template
+3. A `RunTimeEnvironment` row is written to the meta `public` schema with TTL and status
+4. The environment ID is returned to the caller
+
+---
+
+## Adding a New Service
 
-## Database Seeding
+Each service follows a consistent pattern. Study the existing ones:
+
+| Service  | API Style | Routes file | DB schema | DB operations |
+|----------|-----------|------------|-----------|---------------|
+| Slack    | Web API (flat endpoints) | `services/slack/api/methods.py` | `services/slack/database/schema.py` | `services/slack/database/operations.py` |
+| Box      | REST (resource paths)    | `services/box/api/routes.py`    | `services/box/database/schema.py`   | `services/box/database/operations.py`   |
+| Calendar | REST (Google style)      | `services/calendar/api/methods.py` | `services/calendar/database/schema.py` | `services/calendar/database/operations.py` |
+| Linear   | GraphQL (Ariadne)        | `services/linear/api/resolvers.py` | `services/linear/database/schema.py` | (inline in resolvers) |
+
+### Step-by-step
+
+**1. Create the service directory structure:**
+
+```
+backend/src/services/myservice/
+  __init__.py
+  database/
+    __init__.py
+    base.py            # DeclarativeBase subclass
+    schema.py          # SQLAlchemy ORM models
+    operations.py      # CRUD functions (take a Session argument)
+  api/
+    __init__.py
+    routes.py          # Starlette Route list
+```
+
+**2. Define the database schema** in `database/schema.py`:
+
+```python
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
+from sqlalchemy import String, DateTime
+
+class Base(DeclarativeBase):
+    pass
+
+class MyEntity(Base):
+    __tablename__ = "my_entities"
+    id: Mapped[str] = mapped_column(String(50), primary_key=True)
+    name: Mapped[str] = mapped_column(String(255))
+    # ...
+```
+
+Each service has its own `Base` — this is important because `Base.metadata` is used
+independently during schema creation and cloning.
+
+**3. Write route handlers** that read from `request.state`:
+
+```python
+from starlette.requests import Request
+from starlette.responses import JSONResponse
+from starlette.routing import Route
+
+def _session(request: Request):
+    return getattr(request.state, "db_session", None)
+
+def _user_id(request: Request):
+    return getattr(request.state, "impersonate_user_id", None)
+
+async def list_entities(request: Request):
+    session = _session(request)
+    entities = session.query(MyEntity).all()
+    return JSONResponse({"items": [{"id": e.id, "name": e.name} for e in entities]})
+
+routes = [
+    Route("/entities", list_entities, methods=["GET"]),
+    Route("/entities/{id}", get_entity, methods=["GET"]),
+    # ...
+]
+```
+
+The key contract: your handlers must only use `request.state.db_session` for DB access.
+The IsolationMiddleware has already scoped this session to the correct environment schema.
+
+**4. Mount the service in `src/platform/api/main.py`:**
+
+```python
+from src.services.myservice.api.routes import routes as myservice_routes
+
+# Inside create_app():
+myservice_router = Router(myservice_routes)
+app.mount("/api/env/{env_id}/services/myservice", myservice_router)
+```
+
+**5. Write a seed script** in `backend/utils/seed_myservice_template.py` that:
+- Creates the PostgreSQL schema (e.g. `myservice_default`)
+- Uses `Base.metadata.create_all()` to create tables
+- Inserts seed data from a JSON file
+- Registers the template via `EnvironmentHandler.register_template()`
+
+Follow `seed_slack_template.py` as a reference — it shows the full pattern including
+schema creation, table ordering, and template registration.
+
+**6. Add seed data** in `examples/myservice/seeds/myservice_default.json` and copy it to
+`backend/seeds/myservice/` for Docker builds.
+
+**7. Register the seed script** in the Docker startup command in `ops/docker-compose.yml`:
+
+```yaml
+command: >
+  sh -c "
+    alembic upgrade head &&
+    if [ \"$$SEED\" = 'true' ]; then
+      # ... existing seed scripts ...
+      python utils/seed_myservice_template.py;
+    fi &&
+    uvicorn src.platform.api.main:app --host 0.0.0.0 --port 8000
+  "
+```
+
+### GraphQL Services (Linear Pattern)
+
+If the service uses GraphQL instead of REST, follow the Linear pattern:
+
+- Define a `.graphql` schema file in `services/myservice/api/schema/`
+- Write Ariadne resolvers in `services/myservice/api/resolvers.py`
+- Create a custom `GraphQL` subclass (like `LinearGraphQL`) that extracts
+  `request.state.db_session` and passes it into the resolver context
+- Mount with `app.mount(...)` passing the GraphQL ASGI app directly
+
+---
+
+## Adding Test Suites
+
+Test suites define evaluation tasks with expected state-change assertions. They are
+loaded into the platform DB by `backend/utils/seed_tests.py`.
+
+### Test Suite JSON Format
+
+```json
+{
+  "name": "My Service Bench",
+  "description": "Benchmark tests for MyService",
+  "owner": "dev-user",
+  "ignore_fields": {
+    "global": ["created_at", "modified_at"]
+  },
+  "tests": [
+    {
+      "id": "test_1",
+      "name": "Create an entity",
+      "prompt": "Create an entity named 'foo' in the workspace.",
+      "type": "actionEval",
+      "seed_template": "myservice_default",
+      "impersonate_user_id": "user-123",
+      "assertions": [
+        {
+          "diff_type": "added",
+          "entity": "my_entities",
+          "where": { "name": { "eq": "foo" } },
+          "expected_count": 1
+        }
+      ]
+    }
+  ]
+}
+```
+
+Key fields per test:
+- **`id`** — unique string ID within the suite (used to generate deterministic UUIDs)
+- **`prompt`** — the natural language task given to the agent
+- **`type`** — typically `"actionEval"` for state-diff-based evaluation
+- **`seed_template`** — which template schema to clone (e.g. `"slack_default"`)
+- **`impersonate_user_id`** — which user the agent acts as
+- **`assertions`** — list of expected state diffs (added/updated/deleted rows, field
+  value checks). See the existing bench files for assertion patterns.
+
+Suite-level `ignore_fields` are merged into every test's expected output — use for
+timestamps and auto-generated fields that vary between runs.
+
+### Where to Put Test Suite Files
+
+Test suites live in two mirrored locations:
+
+- **`examples/{service}/testsuites/{suite_name}.json`** — canonical source for local dev
+- **`backend/seeds/testsuites/{suite_name}.json`** — copied here for Docker builds
+
+The seed script `seed_tests.py` checks `backend/seeds/testsuites/` first (Docker path),
+then falls back to scanning `examples/*/testsuites/*.json` (local dev path).
+
+### How Seeding Works
+
+`seed_tests.py` is idempotent — it can be re-run safely:
+
+1. Scans for `*.json` files in the testsuites directory
+2. For each file, checks if a suite with the same `name` + `owner` already exists
+3. If it exists, deletes all its tests and memberships, then re-creates them
+4. If new, creates a `TestSuite` with a deterministic UUID (from `uuid5(namespace, "suite:{owner}:{name}")`)
+5. Creates a `Test` row for each test entry, with a deterministic UUID
+6. Creates `TestMembership` rows linking tests to the suite
+
+### Running the Seeder
+
+```bash
+# Local (requires DATABASE_URL in env or .env)
+cd backend
+python utils/seed_tests.py
+
+# Docker (runs automatically when SEED=true)
+docker-compose up  # in ops/
+```
+
+---
+
+## Database Seeding (Templates)
 
 Templates are seeded from JSON files in `backend/seeds/` (Docker) or `examples/` (local).
 
@@ -200,48 +541,67 @@ Seed scripts in `backend/utils/`:
 - `seed_calendar_template.py` — creates calendar_base
 - `seed_tests.py` — loads test suite JSON files
 
+Each seed script follows the same pattern:
+1. Create the PostgreSQL schema with `CREATE SCHEMA {name}`
+2. Create tables using the service's `Base.metadata.create_all()`
+3. Insert data from the JSON seed file in foreign-key-safe table order
+4. Register the template in `TemplateEnvironment` with service, name, location, table_order
+
 On Railway, seeding runs automatically on deploy when `SEED=true` env var is set.
 The Dockerfile startup script runs Alembic migrations then all seed scripts.
 
-## Performance Profiling
+## Git LFS
 
-All `[PERF]` log lines are instrumented for performance tracking:
+Large binary and data files are tracked with Git LFS. Patterns are defined in
+`.gitattributes`:
 
-- **Middleware**: `[PERF] GET /api/env/.../services/box/... total=Xms auth=Xms meta_db=Xms handler=Xms`
-- **Box operations**: `[PERF] search_content TOTAL=Xms`, `[PERF] get_folder_by_id(...) time=Xms`
-- **Box schema**: `[PERF] File._get_path_collection depth=N time=Xms`
-- **Calendar**: `[PERF] Calendar events_list took Xms`
+- `examples/box/seeds/filesystem/**` — Box seed files (PDFs, CSVs, mhtml, etc.)
+- `experiments/kdd 2026/evaluation_outputs/**/*.json` — experiment checkpoint JSONs
+- `experiments/kdd 2026/bayesian_bootstrap_results/qualitative/*` — bootstrap results
 
-Filter with: `grep "\[PERF\]"` in Railway logs.
+If you add new large files (>1MB), add a matching pattern to `.gitattributes` **before**
+committing them. Adding the pattern after the fact only affects future commits — files
+already committed as regular blobs need `git lfs migrate import --no-rewrite` to convert.
 
 ## Key Directories
 
 ```
 backend/
   src/
-    platform/          # Platform API (initEnv, runs, evaluation)
+    platform/
+      api/
+        main.py          # App factory, middleware wiring, service mounting
+        middleware.py    # PlatformMiddleware + IsolationMiddleware
+        routes.py        # Platform API endpoints (initEnv, runs, evaluation)
+      isolationEngine/
+        session.py       # SessionManager (meta + environment sessions)
+        core.py          # CoreIsolationEngine (create/delete environments)
+        environment.py   # EnvironmentHandler (schema cloning, template registration)
+        pool.py          # PoolManager (pre-built schema pool)
+        templateManager.py # Template resolution logic
+      evaluationEngine/  # State-diff evaluation, assertion engine
+      testManager/       # Test suite management
+      db/schema.py       # Platform ORM models (TemplateEnvironment, RunTimeEnvironment, Test, etc.)
     services/
-      box/             # Box API replica
-      slack/           # Slack API replica
-      linear/          # Linear API replica
-      calendar/        # Calendar API replica
+      box/               # Box API replica (REST)
+      slack/             # Slack API replica (Web API)
+      linear/            # Linear API replica (GraphQL / Ariadne)
+      calendar/          # Calendar API replica (REST, Google style)
   tests/
-    integration/       # Full-stack integration tests
-    performance/       # Performance/benchmark tests
-    validation/        # API parity tests
-    unit/              # Unit tests
-  utils/               # Seed scripts
-  seeds/               # Seed data JSON files (for Docker)
+    integration/         # Full-stack integration tests
+    performance/         # Performance/benchmark tests
+    validation/          # API parity tests
+    unit/                # Unit tests
+  utils/                 # Seed scripts (seed_*_template.py, seed_tests.py)
+  seeds/                 # Seed data JSON files (for Docker)
 
-sdk/agent-diff-python/ # Python SDK (agent_diff package)
+sdk/agent-diff-python/   # Python SDK (agent_diff package)
 
 examples/
-  box/                 # Box seed data + test suites
-  linear/              # Linear seed data + test suites
-  slack/               # Slack seed data + test suites
-  calendar/            # Calendar seed data
-
-third_party/prime-environments/environments/agent_diff_bench/
-  agent_diff_bench.py  # Entry point for prime eval
-  src/environment.py   # Environment setup (initEnv, startRun, etc.)
+  box/                   # Box seed data + test suites
+  linear/                # Linear seed data + test suites
+  slack/                 # Slack seed data + test suites
+  calendar/              # Calendar seed data
+  react_agent_benchmark.ipynb       # ReAct agent evaluation notebook
+  langchain_agent_benchmark.ipynb   # LangChain agent evaluation notebook
 ```
diff --git a/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json b/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json
index 24aada2..60fdf3c 100644
--- a/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json	
+++ b/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json	
@@ -1,1898 +1,3 @@
-{
-  "runs": [
-    {
-      "run_id": "3232708f-7551-4cba-8cd4-859337027096",
-      "timestamp": "2026-02-09T01:15:47.774549",
-      "description": "Per-model recovery strategy effectiveness (delta = with_strategy - without_strategy)",
-      "metadata": {
-        "cluster_key": "test_id",
-        "score_field": "score (base_score 0-100)",
-        "total_runs_with_recovery": 3625
-      },
-      "results": {
-        "analysis_type": "per_model_strategy_effectiveness",
-        "n_models": 9,
-        "n_strategies": 11,
-        "strategies_analyzed": [
-          "retry_same",
-          "retry_modified_params",
-          "switch_tool",
-          "lookup_correct_value",
-          "backtrack",
-          "parse_error_message",
-          "change_strategy",
-          "break_into_steps",
-          "verify_prerequisites",
-          "skip_and_continue",
-          "use_fallback"
-        ],
-        "min_usage_threshold": 0.05,
-        "n_bootstrap_draws": 10000,
-        "results_by_model": [
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "retry_same",
-            "delta": -13.670952478879222,
-            "ci_lo": -26.405423864456555,
-            "ci_hi": -1.1496987738885156,
-            "p_gt_0": 0.0155,
-            "p_lt_0": 0.9845,
-            "n_with": 48,
-            "n_without": 313,
-            "usage_rate": 0.1329639889196676,
-            "rank": 9
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "retry_modified_params",
-            "delta": -3.5376743187005713,
-            "ci_lo": -13.194239582548864,
-            "ci_hi": 6.098779036481566,
-            "p_gt_0": 0.2375,
-            "p_lt_0": 0.7625,
-            "n_with": 157,
-            "n_without": 204,
-            "usage_rate": 0.43490304709141275,
-            "rank": 5
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "switch_tool",
-            "delta": -9.780547145545386,
-            "ci_lo": -28.50861768916372,
-            "ci_hi": 7.446339852685261,
-            "p_gt_0": 0.1448,
-            "p_lt_0": 0.8552,
-            "n_with": 21,
-            "n_without": 340,
-            "usage_rate": 0.05817174515235457,
-            "rank": 7
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "lookup_correct_value",
-            "delta": 17.30312301886274,
-            "ci_lo": 5.870108985176336,
-            "ci_hi": 28.5956370570439,
-            "p_gt_0": 0.9984,
-            "p_lt_0": 0.0016,
-            "n_with": 277,
-            "n_without": 84,
-            "usage_rate": 0.7673130193905817,
-            "rank": 2
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "backtrack",
-            "delta": -16.870918001853607,
-            "ci_lo": -28.59058283605294,
-            "ci_hi": -5.200585945040907,
-            "p_gt_0": 0.0027,
-            "p_lt_0": 0.9973,
-            "n_with": 73,
-            "n_without": 288,
-            "usage_rate": 0.20221606648199447,
-            "rank": 11
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "parse_error_message",
-            "delta": -13.722357772149305,
-            "ci_lo": -23.128316453216478,
-            "ci_hi": -4.3734826996324285,
-            "p_gt_0": 0.0024,
-            "p_lt_0": 0.9976,
-            "n_with": 157,
-            "n_without": 204,
-            "usage_rate": 0.43490304709141275,
-            "rank": 10
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "change_strategy",
-            "delta": -7.875780734590356,
-            "ci_lo": -17.947717608774926,
-            "ci_hi": 1.8109216676435835,
-            "p_gt_0": 0.0621,
-            "p_lt_0": 0.9379,
-            "n_with": 126,
-            "n_without": 235,
-            "usage_rate": 0.3490304709141274,
-            "rank": 6
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "break_into_steps",
-            "delta": 31.502451436469325,
-            "ci_lo": 17.792240906956607,
-            "ci_hi": 44.85454555502225,
-            "p_gt_0": 1.0,
-            "p_lt_0": 0.0,
-            "n_with": 309,
-            "n_without": 52,
-            "usage_rate": 0.8559556786703602,
-            "rank": 1
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "verify_prerequisites",
-            "delta": 12.956636864754728,
-            "ci_lo": 3.424139647638783,
-            "ci_hi": 22.44115374481087,
-            "p_gt_0": 0.9966,
-            "p_lt_0": 0.0034,
-            "n_with": 181,
-            "n_without": 180,
-            "usage_rate": 0.5013850415512465,
-            "rank": 3
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "skip_and_continue",
-            "delta": -12.288213612965258,
-            "ci_lo": -26.968059902206118,
-            "ci_hi": 1.287467224431798,
-            "p_gt_0": 0.0372,
-            "p_lt_0": 0.9628,
-            "n_with": 30,
-            "n_without": 331,
-            "usage_rate": 0.08310249307479224,
-            "rank": 8
-          },
-          {
-            "model": "anthropic/claude-haiku-4.5",
-            "strategy": "use_fallback",
-            "delta": -0.36833640817840557,
-            "ci_lo": -21.258429526507612,
-            "ci_hi": 17.1812955632496,
-            "p_gt_0": 0.5139,
-            "p_lt_0": 0.4861,
-            "n_with": 20,
-            "n_without": 341,
-            "usage_rate": 0.055401662049861494,
-            "rank": 4
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "retry_same",
-            "delta": -14.188100554537266,
-            "ci_lo": -35.76112154358545,
-            "ci_hi": 0.5063026676328819,
-            "p_gt_0": 0.0304,
-            "p_lt_0": 0.9696,
-            "n_with": 12,
-            "n_without": 416,
-            "usage_rate": 0.028037383177570093,
-            "rank": 11
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "retry_modified_params",
-            "delta": -6.638198731887192,
-            "ci_lo": -12.136623496593883,
-            "ci_hi": -1.4223226757095515,
-            "p_gt_0": 0.0064,
-            "p_lt_0": 0.9936,
-            "n_with": 168,
-            "n_without": 260,
-            "usage_rate": 0.3925233644859813,
-            "rank": 5
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "switch_tool",
-            "delta": -5.481083655104449,
-            "ci_lo": -18.629707585529463,
-            "ci_hi": 3.8066212041323673,
-            "p_gt_0": 0.161,
-            "p_lt_0": 0.839,
-            "n_with": 30,
-            "n_without": 398,
-            "usage_rate": 0.07009345794392523,
-            "rank": 3
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "lookup_correct_value",
-            "delta": 6.663740829223979,
-            "ci_lo": -1.8242866382482426,
-            "ci_hi": 17.509424517822136,
-            "p_gt_0": 0.9259,
-            "p_lt_0": 0.0741,
-            "n_with": 377,
-            "n_without": 51,
-            "usage_rate": 0.8808411214953271,
-            "rank": 1
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "backtrack",
-            "delta": -7.898583321582811,
-            "ci_lo": -16.78344824003216,
-            "ci_hi": -0.8870673106196507,
-            "p_gt_0": 0.0128,
-            "p_lt_0": 0.9872,
-            "n_with": 76,
-            "n_without": 352,
-            "usage_rate": 0.17757009345794392,
-            "rank": 8
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "parse_error_message",
-            "delta": -5.540272236291402,
-            "ci_lo": -11.291035414817875,
-            "ci_hi": -0.18840971351639632,
-            "p_gt_0": 0.0215,
-            "p_lt_0": 0.9785,
-            "n_with": 147,
-            "n_without": 281,
-            "usage_rate": 0.34345794392523366,
-            "rank": 4
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "change_strategy",
-            "delta": -7.48541195012128,
-            "ci_lo": -13.361977662551174,
-            "ci_hi": -2.1007550135388335,
-            "p_gt_0": 0.0039,
-            "p_lt_0": 0.9961,
-            "n_with": 165,
-            "n_without": 263,
-            "usage_rate": 0.3855140186915888,
-            "rank": 7
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "break_into_steps",
-            "delta": -8.192502077144422,
-            "ci_lo": -12.218749037288632,
-            "ci_hi": -3.1261223156091877,
-            "p_gt_0": 0.0015,
-            "p_lt_0": 0.9985,
-            "n_with": 419,
-            "n_without": 9,
-            "usage_rate": 0.9789719626168224,
-            "rank": 9
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "verify_prerequisites",
-            "delta": 3.3649165088143738,
-            "ci_lo": -2.2266992286166243,
-            "ci_hi": 9.491704662686349,
-            "p_gt_0": 0.8786,
-            "p_lt_0": 0.1214,
-            "n_with": 296,
-            "n_without": 132,
-            "usage_rate": 0.6915887850467289,
-            "rank": 2
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "skip_and_continue",
-            "delta": -11.580075697984528,
-            "ci_lo": -26.126098020927113,
-            "ci_hi": -0.6978544527320647,
-            "p_gt_0": 0.0166,
-            "p_lt_0": 0.9834,
-            "n_with": 30,
-            "n_without": 398,
-            "usage_rate": 0.07009345794392523,
-            "rank": 10
-          },
-          {
-            "model": "deepseek/deepseek-v3.2",
-            "strategy": "use_fallback",
-            "delta": -7.150749930583509,
-            "ci_lo": -18.334503520561586,
-            "ci_hi": 1.5354556117732354,
-            "p_gt_0": 0.0634,
-            "p_lt_0": 0.9366,
-            "n_with": 43,
-            "n_without": 385,
-            "usage_rate": 0.10046728971962617,
-            "rank": 6
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "retry_same",
-            "delta": -22.88786562675688,
-            "ci_lo": -36.54470006610749,
-            "ci_hi": -9.883326388604234,
-            "p_gt_0": 0.0001,
-            "p_lt_0": 0.9999,
-            "n_with": 36,
-            "n_without": 372,
-            "usage_rate": 0.08823529411764706,
-            "rank": 11
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "retry_modified_params",
-            "delta": -0.36544721236025773,
-            "ci_lo": -7.600840041539863,
-            "ci_hi": 6.961833702763105,
-            "p_gt_0": 0.4563,
-            "p_lt_0": 0.5437,
-            "n_with": 162,
-            "n_without": 246,
-            "usage_rate": 0.39705882352941174,
-            "rank": 6
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "switch_tool",
-            "delta": -2.5508642536741157,
-            "ci_lo": -18.51177678737422,
-            "ci_hi": 10.05521388841985,
-            "p_gt_0": 0.3939,
-            "p_lt_0": 0.6061,
-            "n_with": 28,
-            "n_without": 380,
-            "usage_rate": 0.06862745098039216,
-            "rank": 9
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "lookup_correct_value",
-            "delta": 23.264983516345882,
-            "ci_lo": 11.969469386546896,
-            "ci_hi": 35.32080862468233,
-            "p_gt_0": 0.9999,
-            "p_lt_0": 0.0001,
-            "n_with": 350,
-            "n_without": 58,
-            "usage_rate": 0.8578431372549019,
-            "rank": 1
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "backtrack",
-            "delta": -1.1062962149436983,
-            "ci_lo": -10.205454929391083,
-            "ci_hi": 6.808863042799657,
-            "p_gt_0": 0.4165,
-            "p_lt_0": 0.5835,
-            "n_with": 89,
-            "n_without": 319,
-            "usage_rate": 0.2181372549019608,
-            "rank": 7
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "parse_error_message",
-            "delta": -2.031387443012713,
-            "ci_lo": -9.283337951130392,
-            "ci_hi": 5.030873089731125,
-            "p_gt_0": 0.2911,
-            "p_lt_0": 0.7089,
-            "n_with": 137,
-            "n_without": 271,
-            "usage_rate": 0.33578431372549017,
-            "rank": 8
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "change_strategy",
-            "delta": 0.1456861206541943,
-            "ci_lo": -7.672728144645115,
-            "ci_hi": 7.703861831092565,
-            "p_gt_0": 0.5222,
-            "p_lt_0": 0.4778,
-            "n_with": 130,
-            "n_without": 278,
-            "usage_rate": 0.31862745098039214,
-            "rank": 4
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "break_into_steps",
-            "delta": 15.659963708654931,
-            "ci_lo": 4.971144475459527,
-            "ci_hi": 27.07229043438363,
-            "p_gt_0": 0.9991,
-            "p_lt_0": 0.0009,
-            "n_with": 348,
-            "n_without": 60,
-            "usage_rate": 0.8529411764705882,
-            "rank": 2
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "verify_prerequisites",
-            "delta": 8.498155484368027,
-            "ci_lo": 1.7654139742943358,
-            "ci_hi": 15.3771571336391,
-            "p_gt_0": 0.9938,
-            "p_lt_0": 0.0062,
-            "n_with": 194,
-            "n_without": 214,
-            "usage_rate": 0.47549019607843135,
-            "rank": 3
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "skip_and_continue",
-            "delta": 0.0908140649130664,
-            "ci_lo": -13.932378871275402,
-            "ci_hi": 10.992675173775162,
-            "p_gt_0": 0.5341,
-            "p_lt_0": 0.4659,
-            "n_with": 21,
-            "n_without": 387,
-            "usage_rate": 0.051470588235294115,
-            "rank": 5
-          },
-          {
-            "model": "google/gemini-3-flash-preview",
-            "strategy": "use_fallback",
-            "delta": -10.493423096520843,
-            "ci_lo": -27.046348750015174,
-            "ci_hi": 3.991294987747103,
-            "p_gt_0": 0.0845,
-            "p_lt_0": 0.9155,
-            "n_with": 20,
-            "n_without": 388,
-            "usage_rate": 0.049019607843137254,
-            "rank": 10
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "retry_same",
-            "delta": -30.75329513068521,
-            "ci_lo": -39.126101595909596,
-            "ci_hi": -22.03053754818564,
-            "p_gt_0": 0.0,
-            "p_lt_0": 1.0,
-            "n_with": 156,
-            "n_without": 226,
-            "usage_rate": 0.4083769633507853,
-            "rank": 11
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "retry_modified_params",
-            "delta": -16.902076581419532,
-            "ci_lo": -27.21661704594761,
-            "ci_hi": -6.3529625473618365,
-            "p_gt_0": 0.0011,
-            "p_lt_0": 0.9989,
-            "n_with": 290,
-            "n_without": 92,
-            "usage_rate": 0.7591623036649214,
-            "rank": 9
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "switch_tool",
-            "delta": -18.35914151795654,
-            "ci_lo": -29.57729780711268,
-            "ci_hi": -6.180325704866352,
-            "p_gt_0": 0.0021,
-            "p_lt_0": 0.9979,
-            "n_with": 59,
-            "n_without": 323,
-            "usage_rate": 0.1544502617801047,
-            "rank": 10
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "lookup_correct_value",
-            "delta": 12.537408555273544,
-            "ci_lo": 3.0476468930474963,
-            "ci_hi": 21.8270806179885,
-            "p_gt_0": 0.9953,
-            "p_lt_0": 0.0047,
-            "n_with": 234,
-            "n_without": 148,
-            "usage_rate": 0.612565445026178,
-            "rank": 2
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "backtrack",
-            "delta": -5.5448336087675605,
-            "ci_lo": -14.930901117661586,
-            "ci_hi": 3.8153504045085525,
-            "p_gt_0": 0.1284,
-            "p_lt_0": 0.8716,
-            "n_with": 125,
-            "n_without": 257,
-            "usage_rate": 0.32722513089005234,
-            "rank": 5
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "parse_error_message",
-            "delta": -9.585729735519266,
-            "ci_lo": -19.855676889721124,
-            "ci_hi": 0.6330294605123999,
-            "p_gt_0": 0.0343,
-            "p_lt_0": 0.9657,
-            "n_with": 282,
-            "n_without": 100,
-            "usage_rate": 0.7382198952879581,
-            "rank": 6
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "change_strategy",
-            "delta": -11.22957928106905,
-            "ci_lo": -20.857994513889917,
-            "ci_hi": -1.6382879768882503,
-            "p_gt_0": 0.0103,
-            "p_lt_0": 0.9897,
-            "n_with": 251,
-            "n_without": 131,
-            "usage_rate": 0.6570680628272252,
-            "rank": 7
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "break_into_steps",
-            "delta": 16.313899724511856,
-            "ci_lo": 6.712120011211371,
-            "ci_hi": 25.801760009787802,
-            "p_gt_0": 0.9996,
-            "p_lt_0": 0.0004,
-            "n_with": 255,
-            "n_without": 127,
-            "usage_rate": 0.6675392670157068,
-            "rank": 1
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "verify_prerequisites",
-            "delta": 3.8544987268740596,
-            "ci_lo": -7.313387902712735,
-            "ci_hi": 15.173087424603711,
-            "p_gt_0": 0.7497,
-            "p_lt_0": 0.2503,
-            "n_with": 77,
-            "n_without": 305,
-            "usage_rate": 0.20157068062827224,
-            "rank": 3
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "skip_and_continue",
-            "delta": -5.000170297906893,
-            "ci_lo": -14.644049357903315,
-            "ci_hi": 4.945912602475521,
-            "p_gt_0": 0.1585,
-            "p_lt_0": 0.8415,
-            "n_with": 91,
-            "n_without": 291,
-            "usage_rate": 0.23821989528795812,
-            "rank": 4
-          },
-          {
-            "model": "meta-llama/llama-4-scout",
-            "strategy": "use_fallback",
-            "delta": -13.412495085337778,
-            "ci_lo": -25.859492913196565,
-            "ci_hi": 0.1896623137258898,
-            "p_gt_0": 0.0267,
-            "p_lt_0": 0.9733,
-            "n_with": 44,
-            "n_without": 338,
-            "usage_rate": 0.11518324607329843,
-            "rank": 8
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "retry_same",
-            "delta": -10.871197522137857,
-            "ci_lo": -24.409322311410907,
-            "ci_hi": 0.2718312653732015,
-            "p_gt_0": 0.0292,
-            "p_lt_0": 0.9708,
-            "n_with": 30,
-            "n_without": 398,
-            "usage_rate": 0.07009345794392523,
-            "rank": 9
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "retry_modified_params",
-            "delta": -3.991243726034971,
-            "ci_lo": -9.273382636579214,
-            "ci_hi": 1.3620362507785932,
-            "p_gt_0": 0.0702,
-            "p_lt_0": 0.9298,
-            "n_with": 227,
-            "n_without": 201,
-            "usage_rate": 0.530373831775701,
-            "rank": 7
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "switch_tool",
-            "delta": 1.2572701429250817,
-            "ci_lo": -9.75658246981209,
-            "ci_hi": 8.76722005472962,
-            "p_gt_0": 0.6603,
-            "p_lt_0": 0.3397,
-            "n_with": 27,
-            "n_without": 401,
-            "usage_rate": 0.0630841121495327,
-            "rank": 3
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "lookup_correct_value",
-            "delta": 13.427418480834746,
-            "ci_lo": 5.178825155041258,
-            "ci_hi": 22.715860911705352,
-            "p_gt_0": 0.9999,
-            "p_lt_0": 0.0001,
-            "n_with": 352,
-            "n_without": 76,
-            "usage_rate": 0.822429906542056,
-            "rank": 1
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "backtrack",
-            "delta": -1.2082621150299944,
-            "ci_lo": -9.742499316071864,
-            "ci_hi": 5.717615184192211,
-            "p_gt_0": 0.4051,
-            "p_lt_0": 0.5949,
-            "n_with": 70,
-            "n_without": 358,
-            "usage_rate": 0.16355140186915887,
-            "rank": 5
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "parse_error_message",
-            "delta": -3.8101440959857515,
-            "ci_lo": -9.246835945418736,
-            "ci_hi": 1.6203188546943876,
-            "p_gt_0": 0.085,
-            "p_lt_0": 0.915,
-            "n_with": 208,
-            "n_without": 220,
-            "usage_rate": 0.48598130841121495,
-            "rank": 6
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "change_strategy",
-            "delta": -4.8581712104614425,
-            "ci_lo": -10.206364974693907,
-            "ci_hi": 0.3219879507011329,
-            "p_gt_0": 0.0324,
-            "p_lt_0": 0.9676,
-            "n_with": 210,
-            "n_without": 218,
-            "usage_rate": 0.49065420560747663,
-            "rank": 8
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "break_into_steps",
-            "delta": 0.6965722255026323,
-            "ci_lo": -6.867126640240155,
-            "ci_hi": 10.381321377227321,
-            "p_gt_0": 0.5289,
-            "p_lt_0": 0.4711,
-            "n_with": 393,
-            "n_without": 35,
-            "usage_rate": 0.9182242990654206,
-            "rank": 4
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "verify_prerequisites",
-            "delta": 1.9140202826861041,
-            "ci_lo": -3.124658400301976,
-            "ci_hi": 7.235743815950695,
-            "p_gt_0": 0.764,
-            "p_lt_0": 0.236,
-            "n_with": 235,
-            "n_without": 193,
-            "usage_rate": 0.5490654205607477,
-            "rank": 2
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "skip_and_continue",
-            "delta": -11.379797716296984,
-            "ci_lo": -21.32156683059864,
-            "ci_hi": -2.9800207916094914,
-            "p_gt_0": 0.0025,
-            "p_lt_0": 0.9975,
-            "n_with": 47,
-            "n_without": 381,
-            "usage_rate": 0.10981308411214953,
-            "rank": 10
-          },
-          {
-            "model": "mistralai/devstral-2512",
-            "strategy": "use_fallback",
-            "delta": -17.053620266185956,
-            "ci_lo": -31.12086198868929,
-            "ci_hi": -4.830656333277274,
-            "p_gt_0": 0.0018,
-            "p_lt_0": 0.9982,
-            "n_with": 29,
-            "n_without": 399,
-            "usage_rate": 0.06775700934579439,
-            "rank": 11
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "retry_same",
-            "delta": -14.94681310063999,
-            "ci_lo": -30.042167925544508,
-            "ci_hi": -1.2143930288475762,
-            "p_gt_0": 0.0161,
-            "p_lt_0": 0.9839,
-            "n_with": 28,
-            "n_without": 379,
-            "usage_rate": 0.0687960687960688,
-            "rank": 10
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "retry_modified_params",
-            "delta": 2.563053489094396,
-            "ci_lo": -4.241087779817882,
-            "ci_hi": 9.431783679888426,
-            "p_gt_0": 0.7711,
-            "p_lt_0": 0.2289,
-            "n_with": 175,
-            "n_without": 232,
-            "usage_rate": 0.42997542997543,
-            "rank": 4
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "switch_tool",
-            "delta": -2.120302518182182,
-            "ci_lo": -16.425874982124956,
-            "ci_hi": 8.943403844852325,
-            "p_gt_0": 0.4048,
-            "p_lt_0": 0.5952,
-            "n_with": 22,
-            "n_without": 385,
-            "usage_rate": 0.05405405405405406,
-            "rank": 6
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "lookup_correct_value",
-            "delta": 27.63876052170265,
-            "ci_lo": 17.57734194611147,
-            "ci_hi": 37.900459227317455,
-            "p_gt_0": 1.0,
-            "p_lt_0": 0.0,
-            "n_with": 340,
-            "n_without": 67,
-            "usage_rate": 0.8353808353808354,
-            "rank": 1
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "backtrack",
-            "delta": 1.4478534603940318,
-            "ci_lo": -7.377841574505178,
-            "ci_hi": 9.361635851942477,
-            "p_gt_0": 0.6439,
-            "p_lt_0": 0.3561,
-            "n_with": 76,
-            "n_without": 331,
-            "usage_rate": 0.18673218673218672,
-            "rank": 5
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "parse_error_message",
-            "delta": -2.5352487320565045,
-            "ci_lo": -10.131303192929202,
-            "ci_hi": 4.913736525891758,
-            "p_gt_0": 0.2575,
-            "p_lt_0": 0.7425,
-            "n_with": 149,
-            "n_without": 258,
-            "usage_rate": 0.36609336609336607,
-            "rank": 7
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "change_strategy",
-            "delta": -2.888959311641334,
-            "ci_lo": -10.41568531090378,
-            "ci_hi": 4.446592546551432,
-            "p_gt_0": 0.2208,
-            "p_lt_0": 0.7792,
-            "n_with": 142,
-            "n_without": 265,
-            "usage_rate": 0.3488943488943489,
-            "rank": 8
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "break_into_steps",
-            "delta": 21.34368407696186,
-            "ci_lo": 6.677416266517301,
-            "ci_hi": 37.005647009625285,
-            "p_gt_0": 0.9985,
-            "p_lt_0": 0.0015,
-            "n_with": 376,
-            "n_without": 31,
-            "usage_rate": 0.9238329238329238,
-            "rank": 2
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "verify_prerequisites",
-            "delta": 12.81547665202495,
-            "ci_lo": 5.914537543007963,
-            "ci_hi": 19.538816837669604,
-            "p_gt_0": 0.9999,
-            "p_lt_0": 0.0001,
-            "n_with": 197,
-            "n_without": 210,
-            "usage_rate": 0.48402948402948404,
-            "rank": 3
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "skip_and_continue",
-            "delta": -4.92614599206687,
-            "ci_lo": -14.185253589768433,
-            "ci_hi": 4.061437603499923,
-            "p_gt_0": 0.144,
-            "p_lt_0": 0.856,
-            "n_with": 45,
-            "n_without": 362,
-            "usage_rate": 0.11056511056511056,
-            "rank": 9
-          },
-          {
-            "model": "moonshotai/kimi-k2-0905",
-            "strategy": "use_fallback",
-            "delta": -19.85300433047656,
-            "ci_lo": -35.95572567962597,
-            "ci_hi": -5.509031693992781,
-            "p_gt_0": 0.0023,
-            "p_lt_0": 0.9977,
-            "n_with": 27,
-            "n_without": 380,
-            "usage_rate": 0.06633906633906633,
-            "rank": 11
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "retry_same",
-            "delta": 0.7921348493221181,
-            "ci_lo": -8.217049237888544,
-            "ci_hi": 8.6506535112169,
-            "p_gt_0": 0.5896,
-            "p_lt_0": 0.4104,
-            "n_with": 70,
-            "n_without": 313,
-            "usage_rate": 0.18276762402088773,
-            "rank": 6
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "retry_modified_params",
-            "delta": 0.43459904631507407,
-            "ci_lo": -6.629157840423662,
-            "ci_hi": 7.663041513102244,
-            "p_gt_0": 0.5473,
-            "p_lt_0": 0.4527,
-            "n_with": 218,
-            "n_without": 165,
-            "usage_rate": 0.5691906005221932,
-            "rank": 7
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "switch_tool",
-            "delta": -14.505097728985476,
-            "ci_lo": -35.29924844915861,
-            "ci_hi": 3.6944530131282303,
-            "p_gt_0": 0.0666,
-            "p_lt_0": 0.9334,
-            "n_with": 16,
-            "n_without": 367,
-            "usage_rate": 0.04177545691906005,
-            "rank": 11
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "lookup_correct_value",
-            "delta": 6.562993073588253,
-            "ci_lo": -2.557030189427241,
-            "ci_hi": 16.716936671415294,
-            "p_gt_0": 0.9182,
-            "p_lt_0": 0.0818,
-            "n_with": 304,
-            "n_without": 79,
-            "usage_rate": 0.793733681462141,
-            "rank": 2
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "backtrack",
-            "delta": 1.2582914135034888,
-            "ci_lo": -9.021542391262608,
-            "ci_hi": 9.833824168822426,
-            "p_gt_0": 0.6267,
-            "p_lt_0": 0.3733,
-            "n_with": 55,
-            "n_without": 328,
-            "usage_rate": 0.14360313315926893,
-            "rank": 5
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "parse_error_message",
-            "delta": 3.8295274321876307,
-            "ci_lo": -3.4849220833694674,
-            "ci_hi": 10.824180599952456,
-            "p_gt_0": 0.8547,
-            "p_lt_0": 0.1453,
-            "n_with": 172,
-            "n_without": 211,
-            "usage_rate": 0.4490861618798956,
-            "rank": 3
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "change_strategy",
-            "delta": -0.2654544163244912,
-            "ci_lo": -7.65915124419173,
-            "ci_hi": 7.051386868606802,
-            "p_gt_0": 0.4762,
-            "p_lt_0": 0.5238,
-            "n_with": 138,
-            "n_without": 245,
-            "usage_rate": 0.360313315926893,
-            "rank": 8
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "break_into_steps",
-            "delta": 9.129573540337937,
-            "ci_lo": -1.6905459585992744,
-            "ci_hi": 20.973889864192227,
-            "p_gt_0": 0.9453,
-            "p_lt_0": 0.0547,
-            "n_with": 337,
-            "n_without": 46,
-            "usage_rate": 0.8798955613577023,
-            "rank": 1
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "verify_prerequisites",
-            "delta": 3.5290513306313236,
-            "ci_lo": -3.561829056656785,
-            "ci_hi": 10.605508712100065,
-            "p_gt_0": 0.8371,
-            "p_lt_0": 0.1629,
-            "n_with": 156,
-            "n_without": 227,
-            "usage_rate": 0.4073107049608355,
-            "rank": 4
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "skip_and_continue",
-            "delta": -7.11287592644546,
-            "ci_lo": -18.42436524562686,
-            "ci_hi": 2.676079346993676,
-            "p_gt_0": 0.084,
-            "p_lt_0": 0.916,
-            "n_with": 26,
-            "n_without": 357,
-            "usage_rate": 0.06788511749347259,
-            "rank": 10
-          },
-          {
-            "model": "openai/gpt-oss-120b",
-            "strategy": "use_fallback",
-            "delta": -2.693459946451506,
-            "ci_lo": -16.473386569642685,
-            "ci_hi": 8.67382944263421,
-            "p_gt_0": 0.3536,
-            "p_lt_0": 0.6464,
-            "n_with": 31,
-            "n_without": 352,
-            "usage_rate": 0.08093994778067885,
-            "rank": 9
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "retry_same",
-            "delta": -36.16427400478764,
-            "ci_lo": -48.907611542160744,
-            "ci_hi": -23.47952365051624,
-            "p_gt_0": 0.0,
-            "p_lt_0": 1.0,
-            "n_with": 48,
-            "n_without": 368,
-            "usage_rate": 0.11538461538461539,
-            "rank": 11
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "retry_modified_params",
-            "delta": -12.006718045243717,
-            "ci_lo": -19.08720143702119,
-            "ci_hi": -5.106186416114163,
-            "p_gt_0": 0.0003,
-            "p_lt_0": 0.9997,
-            "n_with": 196,
-            "n_without": 220,
-            "usage_rate": 0.47115384615384615,
-            "rank": 6
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "switch_tool",
-            "delta": -12.012970843871116,
-            "ci_lo": -24.593354395304814,
-            "ci_hi": -0.9141324110497081,
-            "p_gt_0": 0.0162,
-            "p_lt_0": 0.9838,
-            "n_with": 40,
-            "n_without": 376,
-            "usage_rate": 0.09615384615384616,
-            "rank": 7
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "lookup_correct_value",
-            "delta": 17.304343249575936,
-            "ci_lo": 7.557247400304122,
-            "ci_hi": 27.88739625912021,
-            "p_gt_0": 0.9999,
-            "p_lt_0": 0.0001,
-            "n_with": 330,
-            "n_without": 86,
-            "usage_rate": 0.7932692307692307,
-            "rank": 2
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "backtrack",
-            "delta": -10.194686020350298,
-            "ci_lo": -19.95478528777536,
-            "ci_hi": -1.4029736834860957,
-            "p_gt_0": 0.0116,
-            "p_lt_0": 0.9884,
-            "n_with": 74,
-            "n_without": 342,
-            "usage_rate": 0.1778846153846154,
-            "rank": 5
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "parse_error_message",
-            "delta": -9.125018859671272,
-            "ci_lo": -15.809533560721597,
-            "ci_hi": -2.5074691422163338,
-            "p_gt_0": 0.0038,
-            "p_lt_0": 0.9962,
-            "n_with": 202,
-            "n_without": 214,
-            "usage_rate": 0.4855769230769231,
-            "rank": 4
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "change_strategy",
-            "delta": -13.65453407958844,
-            "ci_lo": -21.300701332285666,
-            "ci_hi": -6.507512403487294,
-            "p_gt_0": 0.0001,
-            "p_lt_0": 0.9999,
-            "n_with": 166,
-            "n_without": 250,
-            "usage_rate": 0.39903846153846156,
-            "rank": 8
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "break_into_steps",
-            "delta": 20.317388231829593,
-            "ci_lo": 9.16939116008763,
-            "ci_hi": 32.011773935226415,
-            "p_gt_0": 1.0,
-            "p_lt_0": 0.0,
-            "n_with": 356,
-            "n_without": 60,
-            "usage_rate": 0.8557692307692307,
-            "rank": 1
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "verify_prerequisites",
-            "delta": 4.477513873857788,
-            "ci_lo": -2.0905671452852235,
-            "ci_hi": 10.993040116102074,
-            "p_gt_0": 0.913,
-            "p_lt_0": 0.087,
-            "n_with": 164,
-            "n_without": 252,
-            "usage_rate": 0.3942307692307692,
-            "rank": 3
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "skip_and_continue",
-            "delta": -15.954033523893935,
-            "ci_lo": -25.68040461446644,
-            "ci_hi": -6.921313988084732,
-            "p_gt_0": 0.0002,
-            "p_lt_0": 0.9998,
-            "n_with": 58,
-            "n_without": 358,
-            "usage_rate": 0.13942307692307693,
-            "rank": 9
-          },
-          {
-            "model": "qwen/qwen3-vl-235b-a22b-instruct",
-            "strategy": "use_fallback",
-            "delta": -25.70524743986402,
-            "ci_lo": -47.5444640074283,
-            "ci_hi": -5.733473043076171,
-            "p_gt_0": 0.0045,
-            "p_lt_0": 0.9955,
-            "n_with": 20,
-            "n_without": 396,
-            "usage_rate": 0.04807692307692308,
-            "rank": 10
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "retry_same",
-            "delta": -32.15201735248772,
-            "ci_lo": -54.37906388531154,
-            "ci_hi": -7.8972247889490035,
-            "p_gt_0": 0.0051,
-            "p_lt_0": 0.9949,
-            "n_with": 11,
-            "n_without": 401,
-            "usage_rate": 0.02669902912621359,
-            "rank": 11
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "retry_modified_params",
-            "delta": -10.946842069805498,
-            "ci_lo": -18.45623910116072,
-            "ci_hi": -3.1408382573564295,
-            "p_gt_0": 0.0039,
-            "p_lt_0": 0.9961,
-            "n_with": 214,
-            "n_without": 198,
-            "usage_rate": 0.5194174757281553,
-            "rank": 8
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "switch_tool",
-            "delta": -10.52544168932759,
-            "ci_lo": -23.516932575654625,
-            "ci_hi": 1.3525361562209557,
-            "p_gt_0": 0.0421,
-            "p_lt_0": 0.9579,
-            "n_with": 34,
-            "n_without": 378,
-            "usage_rate": 0.0825242718446602,
-            "rank": 7
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "lookup_correct_value",
-            "delta": 0.5114340335954243,
-            "ci_lo": -7.585424772905799,
-            "ci_hi": 9.479036401925407,
-            "p_gt_0": 0.5395,
-            "p_lt_0": 0.4605,
-            "n_with": 327,
-            "n_without": 85,
-            "usage_rate": 0.7936893203883495,
-            "rank": 2
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "backtrack",
-            "delta": 1.1204542282706016,
-            "ci_lo": -8.124457128516607,
-            "ci_hi": 9.698204260777805,
-            "p_gt_0": 0.6082,
-            "p_lt_0": 0.3918,
-            "n_with": 88,
-            "n_without": 324,
-            "usage_rate": 0.21359223300970873,
-            "rank": 1
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "parse_error_message",
-            "delta": -9.060388671357135,
-            "ci_lo": -16.607785125894747,
-            "ci_hi": -1.5726541401240306,
-            "p_gt_0": 0.0094,
-            "p_lt_0": 0.9906,
-            "n_with": 223,
-            "n_without": 189,
-            "usage_rate": 0.5412621359223301,
-            "rank": 6
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "change_strategy",
-            "delta": -7.6004227553770045,
-            "ci_lo": -15.525292681836946,
-            "ci_hi": 0.16463485865282165,
-            "p_gt_0": 0.0283,
-            "p_lt_0": 0.9717,
-            "n_with": 151,
-            "n_without": 261,
-            "usage_rate": 0.36650485436893204,
-            "rank": 5
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "break_into_steps",
-            "delta": -1.2305121365587328,
-            "ci_lo": -11.28174771473798,
-            "ci_hi": 9.97852201029388,
-            "p_gt_0": 0.4001,
-            "p_lt_0": 0.5999,
-            "n_with": 355,
-            "n_without": 57,
-            "usage_rate": 0.8616504854368932,
-            "rank": 4
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "verify_prerequisites",
-            "delta": -0.06582422531849998,
-            "ci_lo": -7.856554853319606,
-            "ci_hi": 7.497168446306337,
-            "p_gt_0": 0.4983,
-            "p_lt_0": 0.5017,
-            "n_with": 178,
-            "n_without": 234,
-            "usage_rate": 0.4320388349514563,
-            "rank": 3
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "skip_and_continue",
-            "delta": -17.234302279539854,
-            "ci_lo": -27.06889007172422,
-            "ci_hi": -7.938852148235506,
-            "p_gt_0": 0.0,
-            "p_lt_0": 1.0,
-            "n_with": 56,
-            "n_without": 356,
-            "usage_rate": 0.13592233009708737,
-            "rank": 9
-          },
-          {
-            "model": "x-ai/grok-4.1-fast",
-            "strategy": "use_fallback",
-            "delta": -18.223368522567686,
-            "ci_lo": -32.35601942300494,
-            "ci_hi": -4.608205740786368,
-            "p_gt_0": 0.003,
-            "p_lt_0": 0.997,
-            "n_with": 34,
-            "n_without": 378,
-            "usage_rate": 0.0825242718446602,
-            "rank": 10
-          }
-        ]
-      }
-    },
-    {
-      "run_id": "0d8e1dce-f8ad-4855-bdb8-97906aa589e5",
-      "timestamp": "2026-02-09T13:05:48.239918",
-      "description": "Recovery Strategy Delta: Top 4 vs Bottom 5 (median split, including llama)",
-      "metadata": {
-        "split_method": "median_split_top4_vs_bottom5",
-        "top_models": [
-          "deepseek/deepseek-v3.2",
-          "mistralai/devstral-2512",
-          "moonshotai/kimi-k2-0905",
-          "qwen/qwen3-vl-235b-a22b-instruct"
-        ],
-        "bottom_models": [
-          "anthropic/claude-haiku-4.5",
-          "google/gemini-3-flash-preview",
-          "meta-llama/llama-4-scout",
-          "openai/gpt-oss-120b",
-          "x-ai/grok-4.1-fast"
-        ],
-        "includes_llama": true,
-        "min_usage_threshold": 0.05,
-        "strategies_analyzed": [
-          "retry_same",
-          "retry_modified_params",
-          "switch_tool",
-          "lookup_correct_value",
-          "backtrack",
-          "parse_error_message",
-          "change_strategy",
-          "break_into_steps",
-          "verify_prerequisites",
-          "skip_and_continue",
-          "use_fallback"
-        ],
-        "strategies_excluded": [
-          "handle_ui_obstacle",
-          "wait_and_retry",
-          "other_recovery_strategy"
-        ],
-        "n_draws": 10000,
-        "seed": 42
-      },
-      "results": {
-        "retry_same": {
-          "mean": -0.09103604921658948,
-          "median": -0.09109260994987882,
-          "lo": -0.10957884846730745,
-          "hi": -0.0719766860823506,
-          "p_gt_0": 0.0,
-          "p_lt_0": 1.0,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "retry_modified_params": {
-          "mean": -0.07951564582181299,
-          "median": -0.07947052083112649,
-          "lo": -0.10828638500319165,
-          "hi": -0.05012203842085975,
-          "p_gt_0": 0.0,
-          "p_lt_0": 1.0,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "switch_tool": {
-          "mean": -0.009124047556068765,
-          "median": -0.00920404485653227,
-          "lo": -0.025539357832924447,
-          "hi": 0.0072456718548228796,
-          "p_gt_0": 0.1402,
-          "p_lt_0": 0.8598,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "lookup_correct_value": {
-          "mean": 0.06622385682643525,
-          "median": 0.06618886826991816,
-          "lo": 0.04354034480804078,
-          "hi": 0.08911349208358349,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "backtrack": {
-          "mean": -0.03966271271009249,
-          "median": -0.03945044443112419,
-          "lo": -0.06588868149538812,
-          "hi": -0.01328529782285673,
-          "p_gt_0": 0.0017,
-          "p_lt_0": 0.9983,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "parse_error_message": {
-          "mean": -0.07303215462932654,
-          "median": -0.07314696379010416,
-          "lo": -0.10321193918873811,
-          "hi": -0.04298633030130917,
-          "p_gt_0": 0.0,
-          "p_lt_0": 1.0,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "change_strategy": {
-          "mean": -0.0035037361411441024,
-          "median": -0.0037119582349364466,
-          "lo": -0.03194505967257071,
-          "hi": 0.024790497912982545,
-          "p_gt_0": 0.4029,
-          "p_lt_0": 0.5971,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "break_into_steps": {
-          "mean": 0.09508089679697927,
-          "median": 0.09506391711573636,
-          "lo": 0.07409295465445424,
-          "hi": 0.11691936055130064,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "verify_prerequisites": {
-          "mean": 0.13058965519621055,
-          "median": 0.13083256078708905,
-          "lo": 0.10005243716485707,
-          "hi": 0.1608955933944794,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "skip_and_continue": {
-          "mean": -0.007385318964547756,
-          "median": -0.007389518236925505,
-          "lo": -0.02436596949165244,
-          "hi": 0.009816767711473884,
-          "p_gt_0": 0.2,
-          "p_lt_0": 0.8,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        },
-        "use_fallback": {
-          "mean": -0.004892152991523062,
-          "median": -0.005073872845546126,
-          "lo": -0.02163104032438776,
-          "hi": 0.012701314015813678,
-          "p_gt_0": 0.2864,
-          "p_lt_0": 0.7136,
-          "n_clusters": 434,
-          "n_a": 1677,
-          "n_b": 1938
-        }
-      }
-    },
-    {
-      "run_id": "ad81ff57-25e1-4e3a-956f-d409cd15dd36",
-      "timestamp": "2026-02-09T13:07:49.743370",
-      "description": "Recovery Strategy Delta: Top 4 vs Bottom 4 (median split, excluding llama outlier)",
-      "metadata": {
-        "split_method": "median_split_top4_vs_bottom4",
-        "top_models": [
-          "deepseek/deepseek-v3.2",
-          "mistralai/devstral-2512",
-          "moonshotai/kimi-k2-0905",
-          "qwen/qwen3-vl-235b-a22b-instruct"
-        ],
-        "bottom_models": [
-          "anthropic/claude-haiku-4.5",
-          "google/gemini-3-flash-preview",
-          "openai/gpt-oss-120b",
-          "x-ai/grok-4.1-fast"
-        ],
-        "excluded_models": [
-          "meta-llama/llama-4-scout"
-        ],
-        "exclusion_reason": "llama excluded as statistical outlier",
-        "includes_llama": false,
-        "min_usage_threshold": 0.05,
-        "strategies_analyzed": [
-          "retry_same",
-          "retry_modified_params",
-          "switch_tool",
-          "lookup_correct_value",
-          "backtrack",
-          "parse_error_message",
-          "change_strategy",
-          "break_into_steps",
-          "verify_prerequisites",
-          "skip_and_continue",
-          "use_fallback"
-        ],
-        "strategies_excluded": [
-          "handle_ui_obstacle",
-          "wait_and_retry",
-          "other_recovery_strategy"
-        ],
-        "n_draws": 10000,
-        "seed": 42
-      },
-      "results": {
-        "retry_same": {
-          "mean": -0.031652743116263454,
-          "median": -0.03160324178012844,
-          "lo": -0.05016915407630905,
-          "hi": -0.013034324016886496,
-          "p_gt_0": 0.0003,
-          "p_lt_0": 0.9997,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "retry_modified_params": {
-          "mean": -0.02161081261538253,
-          "median": -0.02165067239813062,
-          "lo": -0.05158566895808769,
-          "hi": 0.008820442173665115,
-          "p_gt_0": 0.0825,
-          "p_lt_0": 0.9175,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "switch_tool": {
-          "mean": 0.006117927318999696,
-          "median": 0.006054446625149555,
-          "lo": -0.00996929027863427,
-          "hi": 0.02248732864405629,
-          "p_gt_0": 0.7681,
-          "p_lt_0": 0.2319,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "lookup_correct_value": {
-          "mean": 0.03291443477255676,
-          "median": 0.03290993922423108,
-          "lo": 0.010056096665411796,
-          "hi": 0.05553228093568756,
-          "p_gt_0": 0.9982,
-          "p_lt_0": 0.0018,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "backtrack": {
-          "mean": -0.016004131905160052,
-          "median": -0.015947622906689496,
-          "lo": -0.042801713950029865,
-          "hi": 0.01102089652206714,
-          "p_gt_0": 0.1196,
-          "p_lt_0": 0.8804,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "parse_error_message": {
-          "mean": -0.015371608203945751,
-          "median": -0.015367242135045728,
-          "lo": -0.04594248661910706,
-          "hi": 0.015852663423577936,
-          "p_gt_0": 0.1621,
-          "p_lt_0": 0.8379,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "change_strategy": {
-          "mean": 0.0593181818093854,
-          "median": 0.059294457855713384,
-          "lo": 0.030479928337650213,
-          "hi": 0.08871075534968872,
-          "p_gt_0": 0.9999,
-          "p_lt_0": 0.0001,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "break_into_steps": {
-          "mean": 0.05582580083110896,
-          "median": 0.055803819368386964,
-          "lo": 0.03338900451524528,
-          "hi": 0.07796549475144765,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "verify_prerequisites": {
-          "mean": 0.0796079631326316,
-          "median": 0.07980403092340446,
-          "lo": 0.04780512557757043,
-          "hi": 0.11116572394418382,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "skip_and_continue": {
-          "mean": 0.02181295514133002,
-          "median": 0.02188015089318647,
-          "lo": 0.00550088914951375,
-          "hi": 0.037811566821686106,
-          "p_gt_0": 0.9943,
-          "p_lt_0": 0.0057,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        },
-        "use_fallback": {
-          "mean": 0.005542053085614652,
-          "median": 0.005370788590476558,
-          "lo": -0.01163616726486241,
-          "hi": 0.022986279703769395,
-          "p_gt_0": 0.7299,
-          "p_lt_0": 0.2701,
-          "n_clusters": 432,
-          "n_a": 1674,
-          "n_b": 1559
-        }
-      }
-    },
-    {
-      "run_id": "dcbe05e2-56ae-444a-b9ff-6330e1db704b",
-      "timestamp": "2026-02-09T03:42:06.737725",
-      "description": "API Knowledge Error Ratio: no_docs vs with_docs",
-      "metadata": {
-        "statistic": "weighted_api_knowledge_ratio",
-        "api_knowledge_errors": [
-          "endpoint_selection",
-          "parameter_misuse"
-        ],
-        "comparison": "no_docs vs with_docs (ratio of API errors to total errors)",
-        "bootstrap_method": "bayes_bootstrap_delta_clustered",
-        "cluster_key": "test_id",
-        "n_draws": 100000,
-        "seed": 42,
-        "interpretation": "positive delta = docs reduce API knowledge error proportion",
-        "n_runs_analyzed": 3955
-      },
-      "results": {
-        "claude-haiku-4.5": {
-          "mean": 0.040584999247051334,
-          "median": 0.040411382084580685,
-          "lo": 0.022880560929575713,
-          "hi": 0.0592542604024031,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 220,
-          "n_a": 220,
-          "n_b": 220
-        },
-        "deepseek-v3.2": {
-          "mean": 0.0827939248534559,
-          "median": 0.08264530013667731,
-          "lo": 0.03879754002291,
-          "hi": 0.1273327511847746,
-          "p_gt_0": 0.9999,
-          "p_lt_0": 0.0001,
-          "n_clusters": 218,
-          "n_a": 218,
-          "n_b": 218
-        },
-        "devstral-2512": {
-          "mean": 0.09932723249662846,
-          "median": 0.09937916396991449,
-          "lo": 0.06282952406498639,
-          "hi": 0.1356795256695502,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 222,
-          "n_a": 222,
-          "n_b": 222
-        },
-        "gemini-3-flash-preview": {
-          "mean": 0.01945807285926072,
-          "median": 0.019460189434683868,
-          "lo": -0.0061190424196176305,
-          "hi": 0.04524343743648777,
-          "p_gt_0": 0.93244,
-          "p_lt_0": 0.06756,
-          "n_clusters": 221,
-          "n_a": 221,
-          "n_b": 221
-        },
-        "gpt-oss-120b": {
-          "mean": 0.03579026894740811,
-          "median": 0.03573244387144292,
-          "lo": 0.017685278670376746,
-          "hi": 0.05425853745165322,
-          "p_gt_0": 0.99992,
-          "p_lt_0": 8e-05,
-          "n_clusters": 220,
-          "n_a": 220,
-          "n_b": 220
-        },
-        "grok-4.1-fast": {
-          "mean": 0.04454445004438593,
-          "median": 0.04458698609248999,
-          "lo": 0.02222556413559329,
-          "hi": 0.06672968928884401,
-          "p_gt_0": 0.99994,
-          "p_lt_0": 6e-05,
-          "n_clusters": 218,
-          "n_a": 218,
-          "n_b": 218
-        },
-        "kimi-k2-0905": {
-          "mean": 0.0647231912310758,
-          "median": 0.06470822980606873,
-          "lo": 0.034092411817744625,
-          "hi": 0.09516436723859083,
-          "p_gt_0": 0.99999,
-          "p_lt_0": 1e-05,
-          "n_clusters": 217,
-          "n_a": 217,
-          "n_b": 217
-        },
-        "llama-4-scout": {
-          "mean": 0.015108065386857522,
-          "median": 0.01514639283103629,
-          "lo": 0.00039617586850217043,
-          "hi": 0.029740066793261338,
-          "p_gt_0": 0.97755,
-          "p_lt_0": 0.02245,
-          "n_clusters": 201,
-          "n_a": 201,
-          "n_b": 201
-        },
-        "qwen3-vl-235b-a22b-instruct": {
-          "mean": 0.05699903371532211,
-          "median": 0.05707557533237209,
-          "lo": 0.030735106002460502,
-          "hi": 0.08282484619874553,
-          "p_gt_0": 0.99997,
-          "p_lt_0": 3e-05,
-          "n_clusters": 220,
-          "n_a": 220,
-          "n_b": 220
-        }
-      }
-    },
-    {
-      "run_id": "57dc2fc5-45bc-4df3-b4bd-3e0bfe273196",
-      "timestamp": "2026-02-09T13:02:31.571505",
-      "description": "API Knowledge Error Absolute Rate: no_docs vs with_docs",
-      "metadata": {
-        "statistic": "weighted_api_knowledge_rate",
-        "api_knowledge_errors": [
-          "endpoint_selection",
-          "parameter_misuse"
-        ],
-        "comparison": "no_docs vs with_docs (absolute rate, not ratio)",
-        "bootstrap_method": "bayes_bootstrap_delta_clustered",
-        "cluster_key": "test_id",
-        "n_draws": 100000,
-        "seed": 42,
-        "interpretation": "positive delta = docs reduce absolute API knowledge error count",
-        "n_runs_analyzed": 3955
-      },
-      "results": {
-        "claude-haiku-4.5": {
-          "mean": 0.2137941426479078,
-          "median": 0.21305118327847652,
-          "lo": 0.12588787283599542,
-          "hi": 0.3060362483668092,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 220,
-          "n_a": 220,
-          "n_b": 220
-        },
-        "deepseek-v3.2": {
-          "mean": 0.18803051898661474,
-          "median": 0.1871681665881832,
-          "lo": 0.10723387291656583,
-          "hi": 0.27295111855349335,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 218,
-          "n_a": 218,
-          "n_b": 218
-        },
-        "devstral-2512": {
-          "mean": 0.42810180654321794,
-          "median": 0.42767954420834675,
-          "lo": 0.32962398945822163,
-          "hi": 0.5293243098945404,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 222,
-          "n_a": 222,
-          "n_b": 222
-        },
-        "gemini-3-flash-preview": {
-          "mean": 0.10861533225990665,
-          "median": 0.1081297070156765,
-          "lo": 0.02982038231077991,
-          "hi": 0.19032342313962403,
-          "p_gt_0": 0.99659,
-          "p_lt_0": 0.00341,
-          "n_clusters": 221,
-          "n_a": 221,
-          "n_b": 221
-        },
-        "gpt-oss-120b": {
-          "mean": 0.19542576426829905,
-          "median": 0.19495858260884366,
-          "lo": 0.10747117586556258,
-          "hi": 0.28596175957234665,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 220,
-          "n_a": 220,
-          "n_b": 220
-        },
-        "grok-4.1-fast": {
-          "mean": 0.29337007770491824,
-          "median": 0.29316892988260357,
-          "lo": 0.18808424618275657,
-          "hi": 0.3996602493261472,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 218,
-          "n_a": 218,
-          "n_b": 218
-        },
-        "kimi-k2-0905": {
-          "mean": 0.17974225215601594,
-          "median": 0.1791712367421608,
-          "lo": 0.08430573831277695,
-          "hi": 0.27865431520693223,
-          "p_gt_0": 0.99988,
-          "p_lt_0": 0.00012,
-          "n_clusters": 217,
-          "n_a": 217,
-          "n_b": 217
-        },
-        "llama-4-scout": {
-          "mean": 0.2785938666995269,
-          "median": 0.279094134979458,
-          "lo": 0.1548222612026634,
-          "hi": 0.39967386641674946,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 201,
-          "n_a": 201,
-          "n_b": 201
-        },
-        "qwen3-vl-235b-a22b-instruct": {
-          "mean": 0.42258769140462166,
-          "median": 0.42222519420523874,
-          "lo": 0.3279747731692246,
-          "hi": 0.519200255881689,
-          "p_gt_0": 1.0,
-          "p_lt_0": 0.0,
-          "n_clusters": 220,
-          "n_a": 220,
-          "n_b": 220
-        }
-      }
-    }
-  ]
-}
\ No newline at end of file
+version https://git-lfs.github.com/spec/v1
+oid sha256:74f53bd694440de2f5f07dcc2d29806313592df802bdf85be82002f78e3908ba
+size 60375