From e4527ba7db7b18be24930f65edf0cc6eb4d1705d Mon Sep 17 00:00:00 2001 From: hubertpysklo Date: Thu, 12 Mar 2026 16:01:26 +0530 Subject: [PATCH] AGENTS.MD --- AGENTS.md | 444 +++- .../bayesian_boostrapping_results.json | 1901 +---------------- 2 files changed, 405 insertions(+), 1940 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 3e6c63e..3a90a6c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -11,7 +11,7 @@ reproducible environments** backed by PostgreSQL schema cloning. ``` ┌──────────────────────────┐ ┌──────────────────────┐ │ Evaluation Client │ │ Agent Sandbox │ -│ (prime eval / SDK) │──────▶│ (Docker container) │ +│ (SDK / notebooks) │──────▶│ (Docker container) │ │ │ │ │ │ 1. initEnv │ │ Runs agent code │ │ 2. startRun │ │ Makes API calls ──┐ │ @@ -170,26 +170,367 @@ pytest tests/performance/test_box_bench_perf.py -v -s pytest tests/integration/ -v ``` -## Running Evaluations Locally +## Running Evaluations -```bash -# 1. Activate the bench environment's venv -source third_party/prime-environments/environments/agent_diff_bench/.venv/bin/activate +### Using the SDK (Local, No External Dependencies) + +The SDK can fetch test suites directly from the platform — no HuggingFace or +third-party tooling needed. This is the primary way to run evaluations. 
+ +```python +from agent_diff import AgentDiff, BashExecutorProxy + +client = AgentDiff() # uses AGENT_DIFF_API_KEY and AGENT_DIFF_BASE_URL env vars + +# List available test suites +suites = client.list_test_suites() +for s in suites.suites: + print(f"{s.id} {s.name}") + +# Get a specific suite with its tests +suite = client.get_test_suite(suite_id="<suite-id>", expand=True) + +# Run each test +for test in suite.tests: + env = client.init_env( + templateService=test.type, + templateName=test.template_schema, + impersonateUserId=test.impersonate_user_id, + ) + run = client.start_run(envId=env.environmentId) + bash = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key) + + # --- your agent loop goes here, calling bash.execute(command) --- + + client.evaluate_run(runId=run.runId, expectedOutput=test.expected_output) + result = client.get_results_for_run(runId=run.runId) + print(f"{test.name}: {'PASS' if result.passed else 'FAIL'} score={result.score}") + + client.delete_env(envId=env.environmentId) +``` + +### Using HuggingFace Dataset + +Alternatively, load tasks from the published HuggingFace dataset: + +```python +import json +from agent_diff import AgentDiff, BashExecutorProxy +from datasets import load_dataset + +client = AgentDiff() +dataset = load_dataset("hubertmarek/agent-diff-bench", split="test") +for example in dataset: + info = json.loads(example["info"]) + expected = json.loads(example["answer"]) + + env = client.init_env( + templateService=info["service"], + templateName=info["seed_template"], + impersonateUserId=info["impersonate_user_id"], + ) + run = client.start_run(envId=env.environmentId) + bash = BashExecutorProxy(env.environmentId, base_url=client.base_url, api_key=client.api_key) + + # --- your agent loop goes here, calling bash.execute(command) --- + + client.evaluate_run(runId=run.runId, expectedOutput=expected) + result = client.get_results_for_run(runId=run.runId) + print(f"{example['test_id']}: {'PASS' if result.passed else 
'FAIL'} score={result.score}") -# 2. Install the environment package -cd third_party/prime-environments/environments/agent_diff_bench -uv pip install -e . + client.delete_env(envId=env.environmentId) +``` + +See `examples/react_agent_benchmark.ipynb` and `examples/langchain_agent_benchmark.ipynb` +for full runnable notebook examples. + +--- + +## SessionManager & Isolation Architecture + +The isolation system is the core of Agent-Diff. It allows every evaluation run to +operate on its own independent copy of a service's database without cross-contamination. + +### SessionManager (`src/platform/isolationEngine/session.py`) + +`SessionManager` wraps a single SQLAlchemy `Engine` and provides scoped sessions +at two levels: + +1. **Meta sessions** — operate on the `public` schema where platform tables live + (`TemplateEnvironment`, `RunTimeEnvironment`, `Test`, `TestSuite`, etc.): + + ```python + with session_manager.with_meta_session() as session: + # session is bound to `public` schema + env = session.query(RunTimeEnvironment).filter(...).one() + ``` + +2. **Environment sessions** — operate on an isolated `state_<id>` schema that + contains the cloned service data for one evaluation run: + + ```python + with session_manager.with_session_for_environment(env_id) as session: + # session is bound to `state_abc123...` schema + # all ORM queries hit the cloned tables for this environment only + ``` + + Internally this calls `lookup_environment(env_id)` to find the schema name from + the `RunTimeEnvironment` table, then uses SQLAlchemy's `schema_translate_map` + to redirect all unqualified table references to that schema: -# 3. Run evaluation (from the agent_diff_bench directory) -uv run prime eval run agent-diff-bench \ - -m "openai/gpt-5-mini" \ - -n 5 -r 3 -s \ - -a '{"agentdiff_api_key": "ad_live_sk_..."}' + ```python + translated = base_engine.execution_options(schema_translate_map={None: schema}) + ``` + + This means service code (Box, Slack, etc.) 
never needs to know which schema it's + hitting — the ORM models declare tables without a schema, and the engine-level + translation handles routing transparently. + +### IsolationMiddleware (`src/platform/api/middleware.py`) + +`IsolationMiddleware` is the Starlette middleware that sits in front of all +`/api/env/{env_id}/services/...` requests. It is what connects HTTP requests to +the correct isolated database session: + +1. **Extract `env_id`** from the URL path +2. **Authenticate** the API key via `get_principal_id()` +3. **Look up environment** in the meta DB to retrieve `impersonate_user_id` +4. **Open a scoped DB session** via `session_manager.with_session_for_environment(env_id)` +5. **Attach to `request.state`** so service handlers can access it: + - `request.state.db_session` — SQLAlchemy session scoped to the environment schema + - `request.state.environment_id` — the environment UUID string + - `request.state.impersonate_user_id` — which user the agent is acting as + - `request.state.impersonate_email` — alternative email-based impersonation + - `request.state.principal_id` — the authenticated API key owner + +Every service route handler accesses these via helper functions like: + +```python +def _session(request: Request) -> Session: + return getattr(request.state, "db_session", None) ``` -Results are saved to: `third_party/prime-environments/environments/agent_diff_bench/eval_results/` +### How Environments Are Created + +When `initEnv` is called: + +1. `TemplateManager.resolve_init_template()` finds the template by service+name (or ID) +2. `CoreIsolationEngine.create_environment()` either claims a pre-built schema from + the pool or clones one from the template: + - `PoolManager.claim_ready_schema()` — fast path, reuses a pre-built clone + - `EnvironmentHandler.clone_schema_from_environment()` — slow path, creates schema + + copies tables + copies data from the template +3. 
A `RunTimeEnvironment` row is written to the meta `public` schema with TTL and status +4. The environment ID is returned to the caller + +--- + +## Adding a New Service -## Database Seeding +Each service follows a consistent pattern. Study the existing ones: + +| Service | API Style | Routes file | DB schema | DB operations | +|----------|-----------|------------|-----------|---------------| +| Slack | Web API (flat endpoints) | `services/slack/api/methods.py` | `services/slack/database/schema.py` | `services/slack/database/operations.py` | +| Box | REST (resource paths) | `services/box/api/routes.py` | `services/box/database/schema.py` | `services/box/database/operations.py` | +| Calendar | REST (Google style) | `services/calendar/api/methods.py` | `services/calendar/database/schema.py` | `services/calendar/database/operations.py` | +| Linear | GraphQL (Ariadne) | `services/linear/api/resolvers.py` | `services/linear/database/schema.py` | (inline in resolvers) | + +### Step-by-step + +**1. Create the service directory structure:** + +``` +backend/src/services/myservice/ + __init__.py + database/ + __init__.py + base.py # DeclarativeBase subclass + schema.py # SQLAlchemy ORM models + operations.py # CRUD functions (take a Session argument) + api/ + __init__.py + routes.py # Starlette Route list +``` + +**2. Define the database schema** in `database/schema.py`: + +```python +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column +from sqlalchemy import String, DateTime + +class Base(DeclarativeBase): + pass + +class MyEntity(Base): + __tablename__ = "my_entities" + id: Mapped[str] = mapped_column(String(50), primary_key=True) + name: Mapped[str] = mapped_column(String(255)) + # ... +``` + +Each service has its own `Base` — this is important because `Base.metadata` is used +independently during schema creation and cloning. + +**3. 
Write route handlers** that read from `request.state`: + +```python +from starlette.requests import Request +from starlette.responses import JSONResponse +from starlette.routing import Route + +def _session(request: Request): + return getattr(request.state, "db_session", None) + +def _user_id(request: Request): + return getattr(request.state, "impersonate_user_id", None) + +async def list_entities(request: Request): + session = _session(request) + entities = session.query(MyEntity).all() + return JSONResponse({"items": [...]}) + +routes = [ + Route("/entities", list_entities, methods=["GET"]), + Route("/entities/{id}", get_entity, methods=["GET"]), + # ... +] +``` + +The key contract: your handlers must only use `request.state.db_session` for DB access. +The IsolationMiddleware has already scoped this session to the correct environment schema. + +**4. Mount the service in `src/platform/api/main.py`:** + +```python +from src.services.myservice.api.routes import routes as myservice_routes + +# Inside create_app(): +myservice_router = Router(myservice_routes) +app.mount("/api/env/{env_id}/services/myservice", myservice_router) +``` + +**5. Write a seed script** in `backend/utils/seed_myservice_template.py` that: +- Creates the PostgreSQL schema (e.g. `myservice_default`) +- Uses `Base.metadata.create_all()` to create tables +- Inserts seed data from a JSON file +- Registers the template via `EnvironmentHandler.register_template()` + +Follow `seed_slack_template.py` as a reference — it shows the full pattern including +schema creation, table ordering, and template registration. + +**6. Add seed data** in `examples/myservice/seeds/myservice_default.json` and copy to +`backend/seeds/myservice/` for Docker builds. + +**7. Register the seed script** in the Docker startup command in `ops/docker-compose.yml`: + +```yaml +command: > + sh -c " + alembic upgrade head && + if [ \"$$SEED\" = 'true' ]; then + # ... existing seed scripts ... 
+ python utils/seed_myservice_template.py; + fi && + uvicorn src.platform.api.main:app --host 0.0.0.0 --port 8000 + " +``` + +### GraphQL Services (Linear Pattern) + +If the service uses GraphQL instead of REST, follow the Linear pattern: + +- Define a `.graphql` schema file in `services/myservice/api/schema/` +- Write Ariadne resolvers in `services/myservice/api/resolvers.py` +- Create a custom `GraphQL` subclass (like `LinearGraphQL`) that extracts + `request.state.db_session` and passes it into the resolver context +- Mount with `app.mount(...)` passing the GraphQL ASGI app directly + +--- + +## Adding Test Suites + +Test suites define evaluation tasks with expected state-change assertions. They are +loaded into the platform DB by `backend/utils/seed_tests.py`. + +### Test Suite JSON Format + +```json +{ + "name": "My Service Bench", + "description": "Benchmark tests for MyService", + "owner": "dev-user", + "ignore_fields": { + "global": ["created_at", "modified_at"] + }, + "tests": [ + { + "id": "test_1", + "name": "Create an entity", + "prompt": "Create an entity named 'foo' in the workspace.", + "type": "actionEval", + "seed_template": "myservice_default", + "impersonate_user_id": "user-123", + "assertions": [ + { + "diff_type": "added", + "entity": "my_entities", + "where": { "name": { "eq": "foo" } }, + "expected_count": 1 + } + ] + } + ] +} +``` + +Key fields per test: +- **`id`** — unique string ID within the suite (used to generate deterministic UUIDs) +- **`prompt`** — the natural language task given to the agent +- **`type`** — typically `"actionEval"` for state-diff-based evaluation +- **`seed_template`** — which template schema to clone (e.g. `"slack_default"`) +- **`impersonate_user_id`** — which user the agent acts as +- **`assertions`** — list of expected state diffs (added/updated/deleted rows, field + value checks). See the existing bench files for assertion patterns. 
+ +Suite-level `ignore_fields` are merged into every test's expected output — use for +timestamps and auto-generated fields that vary between runs. + +### Where to Put Test Suite Files + +Test suites live in two mirrored locations: + +- **`examples/{service}/testsuites/{suite_name}.json`** — canonical source for local dev +- **`backend/seeds/testsuites/{suite_name}.json`** — copied here for Docker builds + +The seed script `seed_tests.py` checks `backend/seeds/testsuites/` first (Docker path), +then falls back to scanning `examples/*/testsuites/*.json` (local dev path). + +### How Seeding Works + +`seed_tests.py` is idempotent — it can be re-run safely: + +1. Scans for `*.json` files in the testsuites directory +2. For each file, checks if a suite with the same `name` + `owner` already exists +3. If it exists, deletes all its tests and memberships, then re-creates them +4. If new, creates a `TestSuite` with a deterministic UUID (from `uuid5(namespace, "suite:{owner}:{name}")`) +5. Creates a `Test` row for each test entry, with a deterministic UUID +6. Creates `TestMembership` rows linking tests to the suite + +### Running the Seeder + +```bash +# Local (requires DATABASE_URL in env or .env) +cd backend +python utils/seed_tests.py + +# Docker (runs automatically when SEED=true) +docker-compose up # in ops/ +``` + +--- + +## Database Seeding (Templates) Templates are seeded from JSON files in `backend/seeds/` (Docker) or `examples/` (local). @@ -200,48 +541,67 @@ Seed scripts in `backend/utils/`: - `seed_calendar_template.py` — creates calendar_base - `seed_tests.py` — loads test suite JSON files +Each seed script follows the same pattern: +1. Create the PostgreSQL schema with `CREATE SCHEMA {name}` +2. Create tables using the service's `Base.metadata.create_all()` +3. Insert data from the JSON seed file in foreign-key-safe table order +4. 
Register the template in `TemplateEnvironment` with service, name, location, table_order + On Railway, seeding runs automatically on deploy when `SEED=true` env var is set. The Dockerfile startup script runs Alembic migrations then all seed scripts. -## Performance Profiling +## Git LFS -All `[PERF]` log lines are instrumented for performance tracking: +Large binary and data files are tracked with Git LFS. Patterns are defined in +`.gitattributes`: -- **Middleware**: `[PERF] GET /api/env/.../services/box/... total=Xms auth=Xms meta_db=Xms handler=Xms` -- **Box operations**: `[PERF] search_content TOTAL=Xms`, `[PERF] get_folder_by_id(...) time=Xms` -- **Box schema**: `[PERF] File._get_path_collection depth=N time=Xms` -- **Calendar**: `[PERF] Calendar events_list took Xms` +- `examples/box/seeds/filesystem/**` — Box seed files (PDFs, CSVs, mhtml, etc.) +- `experiments/kdd 2026/evaluation_outputs/**/*.json` — experiment checkpoint JSONs +- `experiments/kdd 2026/bayesian_bootstrap_results/qualitative/*` — bootstrap results -Filter with: `grep "\[PERF\]"` in Railway logs. +If you add new large files (>1MB), add a matching pattern to `.gitattributes` **before** +committing them. Adding the pattern after the fact only affects future commits — files +already committed as regular blobs need `git lfs migrate import --no-rewrite` to convert. 
## Key Directories ``` backend/ src/ - platform/ # Platform API (initEnv, runs, evaluation) + platform/ + api/ + main.py # App factory, middleware wiring, service mounting + middleware.py # PlatformMiddleware + IsolationMiddleware + routes.py # Platform API endpoints (initEnv, runs, evaluation) + isolationEngine/ + session.py # SessionManager (meta + environment sessions) + core.py # CoreIsolationEngine (create/delete environments) + environment.py # EnvironmentHandler (schema cloning, template registration) + pool.py # PoolManager (pre-built schema pool) + templateManager.py # Template resolution logic + evaluationEngine/ # State-diff evaluation, assertion engine + testManager/ # Test suite management + db/schema.py # Platform ORM models (TemplateEnvironment, RunTimeEnvironment, Test, etc.) services/ - box/ # Box API replica - slack/ # Slack API replica - linear/ # Linear API replica - calendar/ # Calendar API replica + box/ # Box API replica (REST) + slack/ # Slack API replica (Web API) + linear/ # Linear API replica (GraphQL / Ariadne) + calendar/ # Calendar API replica (REST, Google style) tests/ - integration/ # Full-stack integration tests - performance/ # Performance/benchmark tests - validation/ # API parity tests - unit/ # Unit tests - utils/ # Seed scripts - seeds/ # Seed data JSON files (for Docker) + integration/ # Full-stack integration tests + performance/ # Performance/benchmark tests + validation/ # API parity tests + unit/ # Unit tests + utils/ # Seed scripts (seed_*_template.py, seed_tests.py) + seeds/ # Seed data JSON files (for Docker) -sdk/agent-diff-python/ # Python SDK (agent_diff package) +sdk/agent-diff-python/ # Python SDK (agent_diff package) examples/ - box/ # Box seed data + test suites - linear/ # Linear seed data + test suites - slack/ # Slack seed data + test suites - calendar/ # Calendar seed data - -third_party/prime-environments/environments/agent_diff_bench/ - agent_diff_bench.py # Entry point for prime eval - src/environment.py 
# Environment setup (initEnv, startRun, etc.) + box/ # Box seed data + test suites + linear/ # Linear seed data + test suites + slack/ # Slack seed data + test suites + calendar/ # Calendar seed data + react_agent_benchmark.ipynb # ReAct agent evaluation notebook + langchain_agent_benchmark.ipynb # LangChain agent evaluation notebook ``` diff --git a/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json b/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json index 24aada2..60fdf3c 100644 --- a/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json +++ b/experiments/kdd 2026/bayesian_bootstrap_results/qualitative/bayesian_boostrapping_results.json @@ -1,1898 +1,3 @@ -{ - "runs": [ - { - "run_id": "3232708f-7551-4cba-8cd4-859337027096", - "timestamp": "2026-02-09T01:15:47.774549", - "description": "Per-model recovery strategy effectiveness (delta = with_strategy - without_strategy)", - "metadata": { - "cluster_key": "test_id", - "score_field": "score (base_score 0-100)", - "total_runs_with_recovery": 3625 - }, - "results": { - "analysis_type": "per_model_strategy_effectiveness", - "n_models": 9, - "n_strategies": 11, - "strategies_analyzed": [ - "retry_same", - "retry_modified_params", - "switch_tool", - "lookup_correct_value", - "backtrack", - "parse_error_message", - "change_strategy", - "break_into_steps", - "verify_prerequisites", - "skip_and_continue", - "use_fallback" - ], - "min_usage_threshold": 0.05, - "n_bootstrap_draws": 10000, - "results_by_model": [ - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "retry_same", - "delta": -13.670952478879222, - "ci_lo": -26.405423864456555, - "ci_hi": -1.1496987738885156, - "p_gt_0": 0.0155, - "p_lt_0": 0.9845, - "n_with": 48, - "n_without": 313, - "usage_rate": 0.1329639889196676, - "rank": 9 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "retry_modified_params", - "delta": 
-3.5376743187005713, - "ci_lo": -13.194239582548864, - "ci_hi": 6.098779036481566, - "p_gt_0": 0.2375, - "p_lt_0": 0.7625, - "n_with": 157, - "n_without": 204, - "usage_rate": 0.43490304709141275, - "rank": 5 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "switch_tool", - "delta": -9.780547145545386, - "ci_lo": -28.50861768916372, - "ci_hi": 7.446339852685261, - "p_gt_0": 0.1448, - "p_lt_0": 0.8552, - "n_with": 21, - "n_without": 340, - "usage_rate": 0.05817174515235457, - "rank": 7 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "lookup_correct_value", - "delta": 17.30312301886274, - "ci_lo": 5.870108985176336, - "ci_hi": 28.5956370570439, - "p_gt_0": 0.9984, - "p_lt_0": 0.0016, - "n_with": 277, - "n_without": 84, - "usage_rate": 0.7673130193905817, - "rank": 2 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "backtrack", - "delta": -16.870918001853607, - "ci_lo": -28.59058283605294, - "ci_hi": -5.200585945040907, - "p_gt_0": 0.0027, - "p_lt_0": 0.9973, - "n_with": 73, - "n_without": 288, - "usage_rate": 0.20221606648199447, - "rank": 11 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "parse_error_message", - "delta": -13.722357772149305, - "ci_lo": -23.128316453216478, - "ci_hi": -4.3734826996324285, - "p_gt_0": 0.0024, - "p_lt_0": 0.9976, - "n_with": 157, - "n_without": 204, - "usage_rate": 0.43490304709141275, - "rank": 10 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "change_strategy", - "delta": -7.875780734590356, - "ci_lo": -17.947717608774926, - "ci_hi": 1.8109216676435835, - "p_gt_0": 0.0621, - "p_lt_0": 0.9379, - "n_with": 126, - "n_without": 235, - "usage_rate": 0.3490304709141274, - "rank": 6 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "break_into_steps", - "delta": 31.502451436469325, - "ci_lo": 17.792240906956607, - "ci_hi": 44.85454555502225, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_with": 309, - "n_without": 52, - "usage_rate": 0.8559556786703602, - 
"rank": 1 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "verify_prerequisites", - "delta": 12.956636864754728, - "ci_lo": 3.424139647638783, - "ci_hi": 22.44115374481087, - "p_gt_0": 0.9966, - "p_lt_0": 0.0034, - "n_with": 181, - "n_without": 180, - "usage_rate": 0.5013850415512465, - "rank": 3 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "skip_and_continue", - "delta": -12.288213612965258, - "ci_lo": -26.968059902206118, - "ci_hi": 1.287467224431798, - "p_gt_0": 0.0372, - "p_lt_0": 0.9628, - "n_with": 30, - "n_without": 331, - "usage_rate": 0.08310249307479224, - "rank": 8 - }, - { - "model": "anthropic/claude-haiku-4.5", - "strategy": "use_fallback", - "delta": -0.36833640817840557, - "ci_lo": -21.258429526507612, - "ci_hi": 17.1812955632496, - "p_gt_0": 0.5139, - "p_lt_0": 0.4861, - "n_with": 20, - "n_without": 341, - "usage_rate": 0.055401662049861494, - "rank": 4 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "retry_same", - "delta": -14.188100554537266, - "ci_lo": -35.76112154358545, - "ci_hi": 0.5063026676328819, - "p_gt_0": 0.0304, - "p_lt_0": 0.9696, - "n_with": 12, - "n_without": 416, - "usage_rate": 0.028037383177570093, - "rank": 11 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "retry_modified_params", - "delta": -6.638198731887192, - "ci_lo": -12.136623496593883, - "ci_hi": -1.4223226757095515, - "p_gt_0": 0.0064, - "p_lt_0": 0.9936, - "n_with": 168, - "n_without": 260, - "usage_rate": 0.3925233644859813, - "rank": 5 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "switch_tool", - "delta": -5.481083655104449, - "ci_lo": -18.629707585529463, - "ci_hi": 3.8066212041323673, - "p_gt_0": 0.161, - "p_lt_0": 0.839, - "n_with": 30, - "n_without": 398, - "usage_rate": 0.07009345794392523, - "rank": 3 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "lookup_correct_value", - "delta": 6.663740829223979, - "ci_lo": -1.8242866382482426, - "ci_hi": 17.509424517822136, - "p_gt_0": 
0.9259, - "p_lt_0": 0.0741, - "n_with": 377, - "n_without": 51, - "usage_rate": 0.8808411214953271, - "rank": 1 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "backtrack", - "delta": -7.898583321582811, - "ci_lo": -16.78344824003216, - "ci_hi": -0.8870673106196507, - "p_gt_0": 0.0128, - "p_lt_0": 0.9872, - "n_with": 76, - "n_without": 352, - "usage_rate": 0.17757009345794392, - "rank": 8 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "parse_error_message", - "delta": -5.540272236291402, - "ci_lo": -11.291035414817875, - "ci_hi": -0.18840971351639632, - "p_gt_0": 0.0215, - "p_lt_0": 0.9785, - "n_with": 147, - "n_without": 281, - "usage_rate": 0.34345794392523366, - "rank": 4 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "change_strategy", - "delta": -7.48541195012128, - "ci_lo": -13.361977662551174, - "ci_hi": -2.1007550135388335, - "p_gt_0": 0.0039, - "p_lt_0": 0.9961, - "n_with": 165, - "n_without": 263, - "usage_rate": 0.3855140186915888, - "rank": 7 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "break_into_steps", - "delta": -8.192502077144422, - "ci_lo": -12.218749037288632, - "ci_hi": -3.1261223156091877, - "p_gt_0": 0.0015, - "p_lt_0": 0.9985, - "n_with": 419, - "n_without": 9, - "usage_rate": 0.9789719626168224, - "rank": 9 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "verify_prerequisites", - "delta": 3.3649165088143738, - "ci_lo": -2.2266992286166243, - "ci_hi": 9.491704662686349, - "p_gt_0": 0.8786, - "p_lt_0": 0.1214, - "n_with": 296, - "n_without": 132, - "usage_rate": 0.6915887850467289, - "rank": 2 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "skip_and_continue", - "delta": -11.580075697984528, - "ci_lo": -26.126098020927113, - "ci_hi": -0.6978544527320647, - "p_gt_0": 0.0166, - "p_lt_0": 0.9834, - "n_with": 30, - "n_without": 398, - "usage_rate": 0.07009345794392523, - "rank": 10 - }, - { - "model": "deepseek/deepseek-v3.2", - "strategy": "use_fallback", - "delta": 
-7.150749930583509, - "ci_lo": -18.334503520561586, - "ci_hi": 1.5354556117732354, - "p_gt_0": 0.0634, - "p_lt_0": 0.9366, - "n_with": 43, - "n_without": 385, - "usage_rate": 0.10046728971962617, - "rank": 6 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "retry_same", - "delta": -22.88786562675688, - "ci_lo": -36.54470006610749, - "ci_hi": -9.883326388604234, - "p_gt_0": 0.0001, - "p_lt_0": 0.9999, - "n_with": 36, - "n_without": 372, - "usage_rate": 0.08823529411764706, - "rank": 11 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "retry_modified_params", - "delta": -0.36544721236025773, - "ci_lo": -7.600840041539863, - "ci_hi": 6.961833702763105, - "p_gt_0": 0.4563, - "p_lt_0": 0.5437, - "n_with": 162, - "n_without": 246, - "usage_rate": 0.39705882352941174, - "rank": 6 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "switch_tool", - "delta": -2.5508642536741157, - "ci_lo": -18.51177678737422, - "ci_hi": 10.05521388841985, - "p_gt_0": 0.3939, - "p_lt_0": 0.6061, - "n_with": 28, - "n_without": 380, - "usage_rate": 0.06862745098039216, - "rank": 9 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "lookup_correct_value", - "delta": 23.264983516345882, - "ci_lo": 11.969469386546896, - "ci_hi": 35.32080862468233, - "p_gt_0": 0.9999, - "p_lt_0": 0.0001, - "n_with": 350, - "n_without": 58, - "usage_rate": 0.8578431372549019, - "rank": 1 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "backtrack", - "delta": -1.1062962149436983, - "ci_lo": -10.205454929391083, - "ci_hi": 6.808863042799657, - "p_gt_0": 0.4165, - "p_lt_0": 0.5835, - "n_with": 89, - "n_without": 319, - "usage_rate": 0.2181372549019608, - "rank": 7 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "parse_error_message", - "delta": -2.031387443012713, - "ci_lo": -9.283337951130392, - "ci_hi": 5.030873089731125, - "p_gt_0": 0.2911, - "p_lt_0": 0.7089, - "n_with": 137, - "n_without": 271, - "usage_rate": 
0.33578431372549017, - "rank": 8 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "change_strategy", - "delta": 0.1456861206541943, - "ci_lo": -7.672728144645115, - "ci_hi": 7.703861831092565, - "p_gt_0": 0.5222, - "p_lt_0": 0.4778, - "n_with": 130, - "n_without": 278, - "usage_rate": 0.31862745098039214, - "rank": 4 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "break_into_steps", - "delta": 15.659963708654931, - "ci_lo": 4.971144475459527, - "ci_hi": 27.07229043438363, - "p_gt_0": 0.9991, - "p_lt_0": 0.0009, - "n_with": 348, - "n_without": 60, - "usage_rate": 0.8529411764705882, - "rank": 2 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "verify_prerequisites", - "delta": 8.498155484368027, - "ci_lo": 1.7654139742943358, - "ci_hi": 15.3771571336391, - "p_gt_0": 0.9938, - "p_lt_0": 0.0062, - "n_with": 194, - "n_without": 214, - "usage_rate": 0.47549019607843135, - "rank": 3 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "skip_and_continue", - "delta": 0.0908140649130664, - "ci_lo": -13.932378871275402, - "ci_hi": 10.992675173775162, - "p_gt_0": 0.5341, - "p_lt_0": 0.4659, - "n_with": 21, - "n_without": 387, - "usage_rate": 0.051470588235294115, - "rank": 5 - }, - { - "model": "google/gemini-3-flash-preview", - "strategy": "use_fallback", - "delta": -10.493423096520843, - "ci_lo": -27.046348750015174, - "ci_hi": 3.991294987747103, - "p_gt_0": 0.0845, - "p_lt_0": 0.9155, - "n_with": 20, - "n_without": 388, - "usage_rate": 0.049019607843137254, - "rank": 10 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "retry_same", - "delta": -30.75329513068521, - "ci_lo": -39.126101595909596, - "ci_hi": -22.03053754818564, - "p_gt_0": 0.0, - "p_lt_0": 1.0, - "n_with": 156, - "n_without": 226, - "usage_rate": 0.4083769633507853, - "rank": 11 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "retry_modified_params", - "delta": -16.902076581419532, - "ci_lo": -27.21661704594761, - 
"ci_hi": -6.3529625473618365, - "p_gt_0": 0.0011, - "p_lt_0": 0.9989, - "n_with": 290, - "n_without": 92, - "usage_rate": 0.7591623036649214, - "rank": 9 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "switch_tool", - "delta": -18.35914151795654, - "ci_lo": -29.57729780711268, - "ci_hi": -6.180325704866352, - "p_gt_0": 0.0021, - "p_lt_0": 0.9979, - "n_with": 59, - "n_without": 323, - "usage_rate": 0.1544502617801047, - "rank": 10 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "lookup_correct_value", - "delta": 12.537408555273544, - "ci_lo": 3.0476468930474963, - "ci_hi": 21.8270806179885, - "p_gt_0": 0.9953, - "p_lt_0": 0.0047, - "n_with": 234, - "n_without": 148, - "usage_rate": 0.612565445026178, - "rank": 2 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "backtrack", - "delta": -5.5448336087675605, - "ci_lo": -14.930901117661586, - "ci_hi": 3.8153504045085525, - "p_gt_0": 0.1284, - "p_lt_0": 0.8716, - "n_with": 125, - "n_without": 257, - "usage_rate": 0.32722513089005234, - "rank": 5 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "parse_error_message", - "delta": -9.585729735519266, - "ci_lo": -19.855676889721124, - "ci_hi": 0.6330294605123999, - "p_gt_0": 0.0343, - "p_lt_0": 0.9657, - "n_with": 282, - "n_without": 100, - "usage_rate": 0.7382198952879581, - "rank": 6 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "change_strategy", - "delta": -11.22957928106905, - "ci_lo": -20.857994513889917, - "ci_hi": -1.6382879768882503, - "p_gt_0": 0.0103, - "p_lt_0": 0.9897, - "n_with": 251, - "n_without": 131, - "usage_rate": 0.6570680628272252, - "rank": 7 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "break_into_steps", - "delta": 16.313899724511856, - "ci_lo": 6.712120011211371, - "ci_hi": 25.801760009787802, - "p_gt_0": 0.9996, - "p_lt_0": 0.0004, - "n_with": 255, - "n_without": 127, - "usage_rate": 0.6675392670157068, - "rank": 1 - }, - { - "model": "meta-llama/llama-4-scout", - 
"strategy": "verify_prerequisites", - "delta": 3.8544987268740596, - "ci_lo": -7.313387902712735, - "ci_hi": 15.173087424603711, - "p_gt_0": 0.7497, - "p_lt_0": 0.2503, - "n_with": 77, - "n_without": 305, - "usage_rate": 0.20157068062827224, - "rank": 3 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "skip_and_continue", - "delta": -5.000170297906893, - "ci_lo": -14.644049357903315, - "ci_hi": 4.945912602475521, - "p_gt_0": 0.1585, - "p_lt_0": 0.8415, - "n_with": 91, - "n_without": 291, - "usage_rate": 0.23821989528795812, - "rank": 4 - }, - { - "model": "meta-llama/llama-4-scout", - "strategy": "use_fallback", - "delta": -13.412495085337778, - "ci_lo": -25.859492913196565, - "ci_hi": 0.1896623137258898, - "p_gt_0": 0.0267, - "p_lt_0": 0.9733, - "n_with": 44, - "n_without": 338, - "usage_rate": 0.11518324607329843, - "rank": 8 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "retry_same", - "delta": -10.871197522137857, - "ci_lo": -24.409322311410907, - "ci_hi": 0.2718312653732015, - "p_gt_0": 0.0292, - "p_lt_0": 0.9708, - "n_with": 30, - "n_without": 398, - "usage_rate": 0.07009345794392523, - "rank": 9 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "retry_modified_params", - "delta": -3.991243726034971, - "ci_lo": -9.273382636579214, - "ci_hi": 1.3620362507785932, - "p_gt_0": 0.0702, - "p_lt_0": 0.9298, - "n_with": 227, - "n_without": 201, - "usage_rate": 0.530373831775701, - "rank": 7 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "switch_tool", - "delta": 1.2572701429250817, - "ci_lo": -9.75658246981209, - "ci_hi": 8.76722005472962, - "p_gt_0": 0.6603, - "p_lt_0": 0.3397, - "n_with": 27, - "n_without": 401, - "usage_rate": 0.0630841121495327, - "rank": 3 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "lookup_correct_value", - "delta": 13.427418480834746, - "ci_lo": 5.178825155041258, - "ci_hi": 22.715860911705352, - "p_gt_0": 0.9999, - "p_lt_0": 0.0001, - "n_with": 352, - "n_without": 76, - 
"usage_rate": 0.822429906542056, - "rank": 1 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "backtrack", - "delta": -1.2082621150299944, - "ci_lo": -9.742499316071864, - "ci_hi": 5.717615184192211, - "p_gt_0": 0.4051, - "p_lt_0": 0.5949, - "n_with": 70, - "n_without": 358, - "usage_rate": 0.16355140186915887, - "rank": 5 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "parse_error_message", - "delta": -3.8101440959857515, - "ci_lo": -9.246835945418736, - "ci_hi": 1.6203188546943876, - "p_gt_0": 0.085, - "p_lt_0": 0.915, - "n_with": 208, - "n_without": 220, - "usage_rate": 0.48598130841121495, - "rank": 6 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "change_strategy", - "delta": -4.8581712104614425, - "ci_lo": -10.206364974693907, - "ci_hi": 0.3219879507011329, - "p_gt_0": 0.0324, - "p_lt_0": 0.9676, - "n_with": 210, - "n_without": 218, - "usage_rate": 0.49065420560747663, - "rank": 8 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "break_into_steps", - "delta": 0.6965722255026323, - "ci_lo": -6.867126640240155, - "ci_hi": 10.381321377227321, - "p_gt_0": 0.5289, - "p_lt_0": 0.4711, - "n_with": 393, - "n_without": 35, - "usage_rate": 0.9182242990654206, - "rank": 4 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "verify_prerequisites", - "delta": 1.9140202826861041, - "ci_lo": -3.124658400301976, - "ci_hi": 7.235743815950695, - "p_gt_0": 0.764, - "p_lt_0": 0.236, - "n_with": 235, - "n_without": 193, - "usage_rate": 0.5490654205607477, - "rank": 2 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "skip_and_continue", - "delta": -11.379797716296984, - "ci_lo": -21.32156683059864, - "ci_hi": -2.9800207916094914, - "p_gt_0": 0.0025, - "p_lt_0": 0.9975, - "n_with": 47, - "n_without": 381, - "usage_rate": 0.10981308411214953, - "rank": 10 - }, - { - "model": "mistralai/devstral-2512", - "strategy": "use_fallback", - "delta": -17.053620266185956, - "ci_lo": -31.12086198868929, - "ci_hi": 
-4.830656333277274, - "p_gt_0": 0.0018, - "p_lt_0": 0.9982, - "n_with": 29, - "n_without": 399, - "usage_rate": 0.06775700934579439, - "rank": 11 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "retry_same", - "delta": -14.94681310063999, - "ci_lo": -30.042167925544508, - "ci_hi": -1.2143930288475762, - "p_gt_0": 0.0161, - "p_lt_0": 0.9839, - "n_with": 28, - "n_without": 379, - "usage_rate": 0.0687960687960688, - "rank": 10 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "retry_modified_params", - "delta": 2.563053489094396, - "ci_lo": -4.241087779817882, - "ci_hi": 9.431783679888426, - "p_gt_0": 0.7711, - "p_lt_0": 0.2289, - "n_with": 175, - "n_without": 232, - "usage_rate": 0.42997542997543, - "rank": 4 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "switch_tool", - "delta": -2.120302518182182, - "ci_lo": -16.425874982124956, - "ci_hi": 8.943403844852325, - "p_gt_0": 0.4048, - "p_lt_0": 0.5952, - "n_with": 22, - "n_without": 385, - "usage_rate": 0.05405405405405406, - "rank": 6 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "lookup_correct_value", - "delta": 27.63876052170265, - "ci_lo": 17.57734194611147, - "ci_hi": 37.900459227317455, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_with": 340, - "n_without": 67, - "usage_rate": 0.8353808353808354, - "rank": 1 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "backtrack", - "delta": 1.4478534603940318, - "ci_lo": -7.377841574505178, - "ci_hi": 9.361635851942477, - "p_gt_0": 0.6439, - "p_lt_0": 0.3561, - "n_with": 76, - "n_without": 331, - "usage_rate": 0.18673218673218672, - "rank": 5 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "parse_error_message", - "delta": -2.5352487320565045, - "ci_lo": -10.131303192929202, - "ci_hi": 4.913736525891758, - "p_gt_0": 0.2575, - "p_lt_0": 0.7425, - "n_with": 149, - "n_without": 258, - "usage_rate": 0.36609336609336607, - "rank": 7 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": 
"change_strategy", - "delta": -2.888959311641334, - "ci_lo": -10.41568531090378, - "ci_hi": 4.446592546551432, - "p_gt_0": 0.2208, - "p_lt_0": 0.7792, - "n_with": 142, - "n_without": 265, - "usage_rate": 0.3488943488943489, - "rank": 8 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "break_into_steps", - "delta": 21.34368407696186, - "ci_lo": 6.677416266517301, - "ci_hi": 37.005647009625285, - "p_gt_0": 0.9985, - "p_lt_0": 0.0015, - "n_with": 376, - "n_without": 31, - "usage_rate": 0.9238329238329238, - "rank": 2 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "verify_prerequisites", - "delta": 12.81547665202495, - "ci_lo": 5.914537543007963, - "ci_hi": 19.538816837669604, - "p_gt_0": 0.9999, - "p_lt_0": 0.0001, - "n_with": 197, - "n_without": 210, - "usage_rate": 0.48402948402948404, - "rank": 3 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "skip_and_continue", - "delta": -4.92614599206687, - "ci_lo": -14.185253589768433, - "ci_hi": 4.061437603499923, - "p_gt_0": 0.144, - "p_lt_0": 0.856, - "n_with": 45, - "n_without": 362, - "usage_rate": 0.11056511056511056, - "rank": 9 - }, - { - "model": "moonshotai/kimi-k2-0905", - "strategy": "use_fallback", - "delta": -19.85300433047656, - "ci_lo": -35.95572567962597, - "ci_hi": -5.509031693992781, - "p_gt_0": 0.0023, - "p_lt_0": 0.9977, - "n_with": 27, - "n_without": 380, - "usage_rate": 0.06633906633906633, - "rank": 11 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "retry_same", - "delta": 0.7921348493221181, - "ci_lo": -8.217049237888544, - "ci_hi": 8.6506535112169, - "p_gt_0": 0.5896, - "p_lt_0": 0.4104, - "n_with": 70, - "n_without": 313, - "usage_rate": 0.18276762402088773, - "rank": 6 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "retry_modified_params", - "delta": 0.43459904631507407, - "ci_lo": -6.629157840423662, - "ci_hi": 7.663041513102244, - "p_gt_0": 0.5473, - "p_lt_0": 0.4527, - "n_with": 218, - "n_without": 165, - "usage_rate": 
0.5691906005221932, - "rank": 7 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "switch_tool", - "delta": -14.505097728985476, - "ci_lo": -35.29924844915861, - "ci_hi": 3.6944530131282303, - "p_gt_0": 0.0666, - "p_lt_0": 0.9334, - "n_with": 16, - "n_without": 367, - "usage_rate": 0.04177545691906005, - "rank": 11 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "lookup_correct_value", - "delta": 6.562993073588253, - "ci_lo": -2.557030189427241, - "ci_hi": 16.716936671415294, - "p_gt_0": 0.9182, - "p_lt_0": 0.0818, - "n_with": 304, - "n_without": 79, - "usage_rate": 0.793733681462141, - "rank": 2 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "backtrack", - "delta": 1.2582914135034888, - "ci_lo": -9.021542391262608, - "ci_hi": 9.833824168822426, - "p_gt_0": 0.6267, - "p_lt_0": 0.3733, - "n_with": 55, - "n_without": 328, - "usage_rate": 0.14360313315926893, - "rank": 5 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "parse_error_message", - "delta": 3.8295274321876307, - "ci_lo": -3.4849220833694674, - "ci_hi": 10.824180599952456, - "p_gt_0": 0.8547, - "p_lt_0": 0.1453, - "n_with": 172, - "n_without": 211, - "usage_rate": 0.4490861618798956, - "rank": 3 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "change_strategy", - "delta": -0.2654544163244912, - "ci_lo": -7.65915124419173, - "ci_hi": 7.051386868606802, - "p_gt_0": 0.4762, - "p_lt_0": 0.5238, - "n_with": 138, - "n_without": 245, - "usage_rate": 0.360313315926893, - "rank": 8 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "break_into_steps", - "delta": 9.129573540337937, - "ci_lo": -1.6905459585992744, - "ci_hi": 20.973889864192227, - "p_gt_0": 0.9453, - "p_lt_0": 0.0547, - "n_with": 337, - "n_without": 46, - "usage_rate": 0.8798955613577023, - "rank": 1 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "verify_prerequisites", - "delta": 3.5290513306313236, - "ci_lo": -3.561829056656785, - "ci_hi": 10.605508712100065, - "p_gt_0": 0.8371, - "p_lt_0": 
0.1629, - "n_with": 156, - "n_without": 227, - "usage_rate": 0.4073107049608355, - "rank": 4 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "skip_and_continue", - "delta": -7.11287592644546, - "ci_lo": -18.42436524562686, - "ci_hi": 2.676079346993676, - "p_gt_0": 0.084, - "p_lt_0": 0.916, - "n_with": 26, - "n_without": 357, - "usage_rate": 0.06788511749347259, - "rank": 10 - }, - { - "model": "openai/gpt-oss-120b", - "strategy": "use_fallback", - "delta": -2.693459946451506, - "ci_lo": -16.473386569642685, - "ci_hi": 8.67382944263421, - "p_gt_0": 0.3536, - "p_lt_0": 0.6464, - "n_with": 31, - "n_without": 352, - "usage_rate": 0.08093994778067885, - "rank": 9 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "retry_same", - "delta": -36.16427400478764, - "ci_lo": -48.907611542160744, - "ci_hi": -23.47952365051624, - "p_gt_0": 0.0, - "p_lt_0": 1.0, - "n_with": 48, - "n_without": 368, - "usage_rate": 0.11538461538461539, - "rank": 11 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "retry_modified_params", - "delta": -12.006718045243717, - "ci_lo": -19.08720143702119, - "ci_hi": -5.106186416114163, - "p_gt_0": 0.0003, - "p_lt_0": 0.9997, - "n_with": 196, - "n_without": 220, - "usage_rate": 0.47115384615384615, - "rank": 6 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "switch_tool", - "delta": -12.012970843871116, - "ci_lo": -24.593354395304814, - "ci_hi": -0.9141324110497081, - "p_gt_0": 0.0162, - "p_lt_0": 0.9838, - "n_with": 40, - "n_without": 376, - "usage_rate": 0.09615384615384616, - "rank": 7 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "lookup_correct_value", - "delta": 17.304343249575936, - "ci_lo": 7.557247400304122, - "ci_hi": 27.88739625912021, - "p_gt_0": 0.9999, - "p_lt_0": 0.0001, - "n_with": 330, - "n_without": 86, - "usage_rate": 0.7932692307692307, - "rank": 2 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "backtrack", - "delta": 
-10.194686020350298, - "ci_lo": -19.95478528777536, - "ci_hi": -1.4029736834860957, - "p_gt_0": 0.0116, - "p_lt_0": 0.9884, - "n_with": 74, - "n_without": 342, - "usage_rate": 0.1778846153846154, - "rank": 5 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "parse_error_message", - "delta": -9.125018859671272, - "ci_lo": -15.809533560721597, - "ci_hi": -2.5074691422163338, - "p_gt_0": 0.0038, - "p_lt_0": 0.9962, - "n_with": 202, - "n_without": 214, - "usage_rate": 0.4855769230769231, - "rank": 4 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "change_strategy", - "delta": -13.65453407958844, - "ci_lo": -21.300701332285666, - "ci_hi": -6.507512403487294, - "p_gt_0": 0.0001, - "p_lt_0": 0.9999, - "n_with": 166, - "n_without": 250, - "usage_rate": 0.39903846153846156, - "rank": 8 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "break_into_steps", - "delta": 20.317388231829593, - "ci_lo": 9.16939116008763, - "ci_hi": 32.011773935226415, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_with": 356, - "n_without": 60, - "usage_rate": 0.8557692307692307, - "rank": 1 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "verify_prerequisites", - "delta": 4.477513873857788, - "ci_lo": -2.0905671452852235, - "ci_hi": 10.993040116102074, - "p_gt_0": 0.913, - "p_lt_0": 0.087, - "n_with": 164, - "n_without": 252, - "usage_rate": 0.3942307692307692, - "rank": 3 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "skip_and_continue", - "delta": -15.954033523893935, - "ci_lo": -25.68040461446644, - "ci_hi": -6.921313988084732, - "p_gt_0": 0.0002, - "p_lt_0": 0.9998, - "n_with": 58, - "n_without": 358, - "usage_rate": 0.13942307692307693, - "rank": 9 - }, - { - "model": "qwen/qwen3-vl-235b-a22b-instruct", - "strategy": "use_fallback", - "delta": -25.70524743986402, - "ci_lo": -47.5444640074283, - "ci_hi": -5.733473043076171, - "p_gt_0": 0.0045, - "p_lt_0": 0.9955, - "n_with": 20, - "n_without": 
396, - "usage_rate": 0.04807692307692308, - "rank": 10 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "retry_same", - "delta": -32.15201735248772, - "ci_lo": -54.37906388531154, - "ci_hi": -7.8972247889490035, - "p_gt_0": 0.0051, - "p_lt_0": 0.9949, - "n_with": 11, - "n_without": 401, - "usage_rate": 0.02669902912621359, - "rank": 11 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "retry_modified_params", - "delta": -10.946842069805498, - "ci_lo": -18.45623910116072, - "ci_hi": -3.1408382573564295, - "p_gt_0": 0.0039, - "p_lt_0": 0.9961, - "n_with": 214, - "n_without": 198, - "usage_rate": 0.5194174757281553, - "rank": 8 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "switch_tool", - "delta": -10.52544168932759, - "ci_lo": -23.516932575654625, - "ci_hi": 1.3525361562209557, - "p_gt_0": 0.0421, - "p_lt_0": 0.9579, - "n_with": 34, - "n_without": 378, - "usage_rate": 0.0825242718446602, - "rank": 7 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "lookup_correct_value", - "delta": 0.5114340335954243, - "ci_lo": -7.585424772905799, - "ci_hi": 9.479036401925407, - "p_gt_0": 0.5395, - "p_lt_0": 0.4605, - "n_with": 327, - "n_without": 85, - "usage_rate": 0.7936893203883495, - "rank": 2 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "backtrack", - "delta": 1.1204542282706016, - "ci_lo": -8.124457128516607, - "ci_hi": 9.698204260777805, - "p_gt_0": 0.6082, - "p_lt_0": 0.3918, - "n_with": 88, - "n_without": 324, - "usage_rate": 0.21359223300970873, - "rank": 1 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "parse_error_message", - "delta": -9.060388671357135, - "ci_lo": -16.607785125894747, - "ci_hi": -1.5726541401240306, - "p_gt_0": 0.0094, - "p_lt_0": 0.9906, - "n_with": 223, - "n_without": 189, - "usage_rate": 0.5412621359223301, - "rank": 6 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "change_strategy", - "delta": -7.6004227553770045, - "ci_lo": -15.525292681836946, - "ci_hi": 0.16463485865282165, - "p_gt_0": 
0.0283, - "p_lt_0": 0.9717, - "n_with": 151, - "n_without": 261, - "usage_rate": 0.36650485436893204, - "rank": 5 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "break_into_steps", - "delta": -1.2305121365587328, - "ci_lo": -11.28174771473798, - "ci_hi": 9.97852201029388, - "p_gt_0": 0.4001, - "p_lt_0": 0.5999, - "n_with": 355, - "n_without": 57, - "usage_rate": 0.8616504854368932, - "rank": 4 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "verify_prerequisites", - "delta": -0.06582422531849998, - "ci_lo": -7.856554853319606, - "ci_hi": 7.497168446306337, - "p_gt_0": 0.4983, - "p_lt_0": 0.5017, - "n_with": 178, - "n_without": 234, - "usage_rate": 0.4320388349514563, - "rank": 3 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "skip_and_continue", - "delta": -17.234302279539854, - "ci_lo": -27.06889007172422, - "ci_hi": -7.938852148235506, - "p_gt_0": 0.0, - "p_lt_0": 1.0, - "n_with": 56, - "n_without": 356, - "usage_rate": 0.13592233009708737, - "rank": 9 - }, - { - "model": "x-ai/grok-4.1-fast", - "strategy": "use_fallback", - "delta": -18.223368522567686, - "ci_lo": -32.35601942300494, - "ci_hi": -4.608205740786368, - "p_gt_0": 0.003, - "p_lt_0": 0.997, - "n_with": 34, - "n_without": 378, - "usage_rate": 0.0825242718446602, - "rank": 10 - } - ] - } - }, - { - "run_id": "0d8e1dce-f8ad-4855-bdb8-97906aa589e5", - "timestamp": "2026-02-09T13:05:48.239918", - "description": "Recovery Strategy Delta: Top 4 vs Bottom 5 (median split, including llama)", - "metadata": { - "split_method": "median_split_top4_vs_bottom5", - "top_models": [ - "deepseek/deepseek-v3.2", - "mistralai/devstral-2512", - "moonshotai/kimi-k2-0905", - "qwen/qwen3-vl-235b-a22b-instruct" - ], - "bottom_models": [ - "anthropic/claude-haiku-4.5", - "google/gemini-3-flash-preview", - "meta-llama/llama-4-scout", - "openai/gpt-oss-120b", - "x-ai/grok-4.1-fast" - ], - "includes_llama": true, - "min_usage_threshold": 0.05, - "strategies_analyzed": [ - "retry_same", - 
"retry_modified_params", - "switch_tool", - "lookup_correct_value", - "backtrack", - "parse_error_message", - "change_strategy", - "break_into_steps", - "verify_prerequisites", - "skip_and_continue", - "use_fallback" - ], - "strategies_excluded": [ - "handle_ui_obstacle", - "wait_and_retry", - "other_recovery_strategy" - ], - "n_draws": 10000, - "seed": 42 - }, - "results": { - "retry_same": { - "mean": -0.09103604921658948, - "median": -0.09109260994987882, - "lo": -0.10957884846730745, - "hi": -0.0719766860823506, - "p_gt_0": 0.0, - "p_lt_0": 1.0, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "retry_modified_params": { - "mean": -0.07951564582181299, - "median": -0.07947052083112649, - "lo": -0.10828638500319165, - "hi": -0.05012203842085975, - "p_gt_0": 0.0, - "p_lt_0": 1.0, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "switch_tool": { - "mean": -0.009124047556068765, - "median": -0.00920404485653227, - "lo": -0.025539357832924447, - "hi": 0.0072456718548228796, - "p_gt_0": 0.1402, - "p_lt_0": 0.8598, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "lookup_correct_value": { - "mean": 0.06622385682643525, - "median": 0.06618886826991816, - "lo": 0.04354034480804078, - "hi": 0.08911349208358349, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "backtrack": { - "mean": -0.03966271271009249, - "median": -0.03945044443112419, - "lo": -0.06588868149538812, - "hi": -0.01328529782285673, - "p_gt_0": 0.0017, - "p_lt_0": 0.9983, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "parse_error_message": { - "mean": -0.07303215462932654, - "median": -0.07314696379010416, - "lo": -0.10321193918873811, - "hi": -0.04298633030130917, - "p_gt_0": 0.0, - "p_lt_0": 1.0, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "change_strategy": { - "mean": -0.0035037361411441024, - "median": -0.0037119582349364466, - "lo": -0.03194505967257071, - "hi": 0.024790497912982545, - "p_gt_0": 0.4029, - 
"p_lt_0": 0.5971, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "break_into_steps": { - "mean": 0.09508089679697927, - "median": 0.09506391711573636, - "lo": 0.07409295465445424, - "hi": 0.11691936055130064, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "verify_prerequisites": { - "mean": 0.13058965519621055, - "median": 0.13083256078708905, - "lo": 0.10005243716485707, - "hi": 0.1608955933944794, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "skip_and_continue": { - "mean": -0.007385318964547756, - "median": -0.007389518236925505, - "lo": -0.02436596949165244, - "hi": 0.009816767711473884, - "p_gt_0": 0.2, - "p_lt_0": 0.8, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - }, - "use_fallback": { - "mean": -0.004892152991523062, - "median": -0.005073872845546126, - "lo": -0.02163104032438776, - "hi": 0.012701314015813678, - "p_gt_0": 0.2864, - "p_lt_0": 0.7136, - "n_clusters": 434, - "n_a": 1677, - "n_b": 1938 - } - } - }, - { - "run_id": "ad81ff57-25e1-4e3a-956f-d409cd15dd36", - "timestamp": "2026-02-09T13:07:49.743370", - "description": "Recovery Strategy Delta: Top 4 vs Bottom 4 (median split, excluding llama outlier)", - "metadata": { - "split_method": "median_split_top4_vs_bottom4", - "top_models": [ - "deepseek/deepseek-v3.2", - "mistralai/devstral-2512", - "moonshotai/kimi-k2-0905", - "qwen/qwen3-vl-235b-a22b-instruct" - ], - "bottom_models": [ - "anthropic/claude-haiku-4.5", - "google/gemini-3-flash-preview", - "openai/gpt-oss-120b", - "x-ai/grok-4.1-fast" - ], - "excluded_models": [ - "meta-llama/llama-4-scout" - ], - "exclusion_reason": "llama excluded as statistical outlier", - "includes_llama": false, - "min_usage_threshold": 0.05, - "strategies_analyzed": [ - "retry_same", - "retry_modified_params", - "switch_tool", - "lookup_correct_value", - "backtrack", - "parse_error_message", - "change_strategy", - "break_into_steps", - "verify_prerequisites", - 
"skip_and_continue", - "use_fallback" - ], - "strategies_excluded": [ - "handle_ui_obstacle", - "wait_and_retry", - "other_recovery_strategy" - ], - "n_draws": 10000, - "seed": 42 - }, - "results": { - "retry_same": { - "mean": -0.031652743116263454, - "median": -0.03160324178012844, - "lo": -0.05016915407630905, - "hi": -0.013034324016886496, - "p_gt_0": 0.0003, - "p_lt_0": 0.9997, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "retry_modified_params": { - "mean": -0.02161081261538253, - "median": -0.02165067239813062, - "lo": -0.05158566895808769, - "hi": 0.008820442173665115, - "p_gt_0": 0.0825, - "p_lt_0": 0.9175, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "switch_tool": { - "mean": 0.006117927318999696, - "median": 0.006054446625149555, - "lo": -0.00996929027863427, - "hi": 0.02248732864405629, - "p_gt_0": 0.7681, - "p_lt_0": 0.2319, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "lookup_correct_value": { - "mean": 0.03291443477255676, - "median": 0.03290993922423108, - "lo": 0.010056096665411796, - "hi": 0.05553228093568756, - "p_gt_0": 0.9982, - "p_lt_0": 0.0018, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "backtrack": { - "mean": -0.016004131905160052, - "median": -0.015947622906689496, - "lo": -0.042801713950029865, - "hi": 0.01102089652206714, - "p_gt_0": 0.1196, - "p_lt_0": 0.8804, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "parse_error_message": { - "mean": -0.015371608203945751, - "median": -0.015367242135045728, - "lo": -0.04594248661910706, - "hi": 0.015852663423577936, - "p_gt_0": 0.1621, - "p_lt_0": 0.8379, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "change_strategy": { - "mean": 0.0593181818093854, - "median": 0.059294457855713384, - "lo": 0.030479928337650213, - "hi": 0.08871075534968872, - "p_gt_0": 0.9999, - "p_lt_0": 0.0001, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "break_into_steps": { - "mean": 0.05582580083110896, - "median": 0.055803819368386964, - 
"lo": 0.03338900451524528, - "hi": 0.07796549475144765, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "verify_prerequisites": { - "mean": 0.0796079631326316, - "median": 0.07980403092340446, - "lo": 0.04780512557757043, - "hi": 0.11116572394418382, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "skip_and_continue": { - "mean": 0.02181295514133002, - "median": 0.02188015089318647, - "lo": 0.00550088914951375, - "hi": 0.037811566821686106, - "p_gt_0": 0.9943, - "p_lt_0": 0.0057, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - }, - "use_fallback": { - "mean": 0.005542053085614652, - "median": 0.005370788590476558, - "lo": -0.01163616726486241, - "hi": 0.022986279703769395, - "p_gt_0": 0.7299, - "p_lt_0": 0.2701, - "n_clusters": 432, - "n_a": 1674, - "n_b": 1559 - } - } - }, - { - "run_id": "dcbe05e2-56ae-444a-b9ff-6330e1db704b", - "timestamp": "2026-02-09T03:42:06.737725", - "description": "API Knowledge Error Ratio: no_docs vs with_docs", - "metadata": { - "statistic": "weighted_api_knowledge_ratio", - "api_knowledge_errors": [ - "endpoint_selection", - "parameter_misuse" - ], - "comparison": "no_docs vs with_docs (ratio of API errors to total errors)", - "bootstrap_method": "bayes_bootstrap_delta_clustered", - "cluster_key": "test_id", - "n_draws": 100000, - "seed": 42, - "interpretation": "positive delta = docs reduce API knowledge error proportion", - "n_runs_analyzed": 3955 - }, - "results": { - "claude-haiku-4.5": { - "mean": 0.040584999247051334, - "median": 0.040411382084580685, - "lo": 0.022880560929575713, - "hi": 0.0592542604024031, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 220, - "n_a": 220, - "n_b": 220 - }, - "deepseek-v3.2": { - "mean": 0.0827939248534559, - "median": 0.08264530013667731, - "lo": 0.03879754002291, - "hi": 0.1273327511847746, - "p_gt_0": 0.9999, - "p_lt_0": 0.0001, - "n_clusters": 218, - "n_a": 218, - "n_b": 218 - }, - "devstral-2512": { 
- "mean": 0.09932723249662846, - "median": 0.09937916396991449, - "lo": 0.06282952406498639, - "hi": 0.1356795256695502, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 222, - "n_a": 222, - "n_b": 222 - }, - "gemini-3-flash-preview": { - "mean": 0.01945807285926072, - "median": 0.019460189434683868, - "lo": -0.0061190424196176305, - "hi": 0.04524343743648777, - "p_gt_0": 0.93244, - "p_lt_0": 0.06756, - "n_clusters": 221, - "n_a": 221, - "n_b": 221 - }, - "gpt-oss-120b": { - "mean": 0.03579026894740811, - "median": 0.03573244387144292, - "lo": 0.017685278670376746, - "hi": 0.05425853745165322, - "p_gt_0": 0.99992, - "p_lt_0": 8e-05, - "n_clusters": 220, - "n_a": 220, - "n_b": 220 - }, - "grok-4.1-fast": { - "mean": 0.04454445004438593, - "median": 0.04458698609248999, - "lo": 0.02222556413559329, - "hi": 0.06672968928884401, - "p_gt_0": 0.99994, - "p_lt_0": 6e-05, - "n_clusters": 218, - "n_a": 218, - "n_b": 218 - }, - "kimi-k2-0905": { - "mean": 0.0647231912310758, - "median": 0.06470822980606873, - "lo": 0.034092411817744625, - "hi": 0.09516436723859083, - "p_gt_0": 0.99999, - "p_lt_0": 1e-05, - "n_clusters": 217, - "n_a": 217, - "n_b": 217 - }, - "llama-4-scout": { - "mean": 0.015108065386857522, - "median": 0.01514639283103629, - "lo": 0.00039617586850217043, - "hi": 0.029740066793261338, - "p_gt_0": 0.97755, - "p_lt_0": 0.02245, - "n_clusters": 201, - "n_a": 201, - "n_b": 201 - }, - "qwen3-vl-235b-a22b-instruct": { - "mean": 0.05699903371532211, - "median": 0.05707557533237209, - "lo": 0.030735106002460502, - "hi": 0.08282484619874553, - "p_gt_0": 0.99997, - "p_lt_0": 3e-05, - "n_clusters": 220, - "n_a": 220, - "n_b": 220 - } - } - }, - { - "run_id": "57dc2fc5-45bc-4df3-b4bd-3e0bfe273196", - "timestamp": "2026-02-09T13:02:31.571505", - "description": "API Knowledge Error Absolute Rate: no_docs vs with_docs", - "metadata": { - "statistic": "weighted_api_knowledge_rate", - "api_knowledge_errors": [ - "endpoint_selection", - "parameter_misuse" - ], - 
"comparison": "no_docs vs with_docs (absolute rate, not ratio)", - "bootstrap_method": "bayes_bootstrap_delta_clustered", - "cluster_key": "test_id", - "n_draws": 100000, - "seed": 42, - "interpretation": "positive delta = docs reduce absolute API knowledge error count", - "n_runs_analyzed": 3955 - }, - "results": { - "claude-haiku-4.5": { - "mean": 0.2137941426479078, - "median": 0.21305118327847652, - "lo": 0.12588787283599542, - "hi": 0.3060362483668092, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 220, - "n_a": 220, - "n_b": 220 - }, - "deepseek-v3.2": { - "mean": 0.18803051898661474, - "median": 0.1871681665881832, - "lo": 0.10723387291656583, - "hi": 0.27295111855349335, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 218, - "n_a": 218, - "n_b": 218 - }, - "devstral-2512": { - "mean": 0.42810180654321794, - "median": 0.42767954420834675, - "lo": 0.32962398945822163, - "hi": 0.5293243098945404, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 222, - "n_a": 222, - "n_b": 222 - }, - "gemini-3-flash-preview": { - "mean": 0.10861533225990665, - "median": 0.1081297070156765, - "lo": 0.02982038231077991, - "hi": 0.19032342313962403, - "p_gt_0": 0.99659, - "p_lt_0": 0.00341, - "n_clusters": 221, - "n_a": 221, - "n_b": 221 - }, - "gpt-oss-120b": { - "mean": 0.19542576426829905, - "median": 0.19495858260884366, - "lo": 0.10747117586556258, - "hi": 0.28596175957234665, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 220, - "n_a": 220, - "n_b": 220 - }, - "grok-4.1-fast": { - "mean": 0.29337007770491824, - "median": 0.29316892988260357, - "lo": 0.18808424618275657, - "hi": 0.3996602493261472, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 218, - "n_a": 218, - "n_b": 218 - }, - "kimi-k2-0905": { - "mean": 0.17974225215601594, - "median": 0.1791712367421608, - "lo": 0.08430573831277695, - "hi": 0.27865431520693223, - "p_gt_0": 0.99988, - "p_lt_0": 0.00012, - "n_clusters": 217, - "n_a": 217, - "n_b": 217 - }, - "llama-4-scout": { - "mean": 0.2785938666995269, - 
"median": 0.279094134979458, - "lo": 0.1548222612026634, - "hi": 0.39967386641674946, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 201, - "n_a": 201, - "n_b": 201 - }, - "qwen3-vl-235b-a22b-instruct": { - "mean": 0.42258769140462166, - "median": 0.42222519420523874, - "lo": 0.3279747731692246, - "hi": 0.519200255881689, - "p_gt_0": 1.0, - "p_lt_0": 0.0, - "n_clusters": 220, - "n_a": 220, - "n_b": 220 - } - } - } - ] -} \ No newline at end of file +version https://git-lfs.github.com/spec/v1 +oid sha256:74f53bd694440de2f5f07dcc2d29806313592df802bdf85be82002f78e3908ba +size 60375