From 6c3bb5efd8707678282fd2a4aeed3e940200c1b6 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 17:07:26 +0100 Subject: [PATCH 01/19] Add stop_reason field to plan_status response (Proposal 114-I1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agents can now distinguish user-initiated stops from actual failures: stop_reason is "user_requested" when plan_stop was called, null for real errors. Computed from existing stop_requested DB column — no migration needed. Co-Authored-By: Claude Opus 4.6 --- .../114-mcp-interface-feedback-stress-test.md | 6 +- mcp_cloud/handlers.py | 3 + mcp_cloud/schemas.py | 1 + mcp_cloud/tests/test_plan_status_tool.py | 60 +++++++++++++++++++ mcp_cloud/tool_models.py | 18 ++++++ mcp_local/planexe_mcp_local.py | 2 + 6 files changed, 88 insertions(+), 2 deletions(-) diff --git a/docs/proposals/114-mcp-interface-feedback-stress-test.md b/docs/proposals/114-mcp-interface-feedback-stress-test.md index 346b1ac3..e4e04075 100644 --- a/docs/proposals/114-mcp-interface-feedback-stress-test.md +++ b/docs/proposals/114-mcp-interface-feedback-stress-test.md @@ -19,6 +19,8 @@ The testing surfaced concrete MCP interface friction points that affect agent op ### I1 — `failed` state conflates user-stop and actual failure +**Status:** Implemented (Option B — `stop_reason` field on `plan_status`). + **Problem:** When `plan_stop` is called, the plan transitions to `failed`. When a worker crashes, it also transitions to `failed`. The agent operator cannot distinguish between: - User-initiated stop (nothing went wrong) - Actual failure (network drop, model error, worker crash) @@ -186,7 +188,7 @@ This enables prompt iteration tracking without changing existing behavior for pl | Issue | Existing Proposal | Gap | |-------|-------------------|-----| -| I1 (stopped vs failed) | 87 §4 (deferred) | Needs its own decision — Option A vs B | +| I1 (stopped vs failed) | 87 §4 (deferred) | **Implemented** (Option B — `stop_reason` field) | | I2 (failure diagnostics) | 113 (logs only) | Not surfaced to MCP consumer | | I3 (plan_delete) | None | New | | I4 (idempotency) | None | **Implemented** (PR #242) | @@ -217,7 +219,7 @@ If accepted, I1–I4 and I7–I9 should be added to Proposal 70's quick-win chec | Priority | Issues | Rationale | |----------|--------|-----------| | P1 | I2 (failure diagnostics) | Biggest observability gap. Agent cannot help users debug failures without this. | -| P1 | I1 (stopped vs failed) | Small change (Option B), high diagnostic value. Prerequisite for good plan_resume UX. | +| ~~P1~~ | ~~I1 (stopped vs failed)~~ | **Implemented** (Option B). `stop_reason` field on `plan_status`: `"user_requested"` when `plan_stop` was called, `null` for actual errors. | | P2 | I7 (stall detection) | Prevents agents from waiting indefinitely on stuck plans. | | P2 | I6 (download TTL) | Low effort, reduces friction. | | P2 | I5 (rich SSE events) | Eliminates polling for SSE-capable clients. | diff --git a/mcp_cloud/handlers.py b/mcp_cloud/handlers.py index c95e88d6..369b153d 100644 --- a/mcp_cloud/handlers.py +++ b/mcp_cloud/handlers.py @@ -338,6 +338,9 @@ async def handle_plan_status(arguments: dict[str, Any]) -> CallToolResult: }, "files_count": len(files), "files": files[:10], + "stop_reason": "user_requested" if ( + state == "failed" and plan_snapshot.get("stop_requested") + ) else None, } if state == "failed": diff --git a/mcp_cloud/schemas.py b/mcp_cloud/schemas.py index 91049f61..4c49a79d 100644 --- a/mcp_cloud/schemas.py +++ b/mcp_cloud/schemas.py @@ -177,6 +177,7 @@ class ToolDefinition: "Poll at reasonable intervals (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " "(baseline profile) and may take longer on higher-quality profiles. " "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "When state is 'failed', check stop_reason: 'user_requested' means plan_stop was called (consider plan_resume); null means an actual error. " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "Note: steps vary in duration — early steps complete quickly while later steps (review, report generation) " "take longer. Do not use progress_percentage to estimate time remaining. " diff --git a/mcp_cloud/tests/test_plan_status_tool.py b/mcp_cloud/tests/test_plan_status_tool.py index a96d6468..02595223 100644 --- a/mcp_cloud/tests/test_plan_status_tool.py +++ b/mcp_cloud/tests/test_plan_status_tool.py @@ -217,6 +217,66 @@ def test_plan_status_includes_current_step(self): self.assertEqual(result.structuredContent["current_step"], "SWOT Analysis") + def test_plan_status_user_stopped_shows_stop_reason(self): + """Failed plan with stop_requested=True → stop_reason='user_requested'.""" + plan_id = str(uuid.uuid4()) + plan_snapshot = { + "id": plan_id, + "state": PlanState.failed, + "stop_requested": True, + "progress_percentage": 42.0, + "timestamp_created": datetime.now(UTC), + } + with patch( + "mcp_cloud.handlers._get_plan_status_snapshot_sync", + return_value=plan_snapshot, + ), patch( + "mcp_cloud.handlers.fetch_file_list_from_worker_plan", new=AsyncMock(return_value=[]) + ): + result = asyncio.run(handle_plan_status({"plan_id": plan_id})) + + self.assertEqual(result.structuredContent["stop_reason"], "user_requested") + + def test_plan_status_actual_failure_shows_null_stop_reason(self): + """Failed plan with stop_requested=False → stop_reason=None.""" + plan_id = str(uuid.uuid4()) + plan_snapshot = { + "id": plan_id, + "state": PlanState.failed, + "stop_requested": False, + "progress_percentage": 5.5, + "timestamp_created": datetime.now(UTC), + } + with patch( + "mcp_cloud.handlers._get_plan_status_snapshot_sync", + return_value=plan_snapshot, + ), patch( + "mcp_cloud.handlers.fetch_file_list_from_worker_plan", new=AsyncMock(return_value=[]) + ): + result = asyncio.run(handle_plan_status({"plan_id": plan_id})) + + self.assertIsNone(result.structuredContent["stop_reason"]) + + def test_plan_status_non_failed_shows_null_stop_reason(self): + """Processing plan → stop_reason=None regardless of stop_requested.""" + plan_id = str(uuid.uuid4()) + plan_snapshot = { + "id": plan_id, + "state": PlanState.processing, + "stop_requested": True, + "progress_percentage": 50.0, + "timestamp_created": datetime.now(UTC), + } + with patch( + "mcp_cloud.handlers._get_plan_status_snapshot_sync", + return_value=plan_snapshot, + ), patch( + "mcp_cloud.handlers.fetch_file_list_from_worker_plan", new=AsyncMock(return_value=[]) + ): + result = asyncio.run(handle_plan_status({"plan_id": plan_id})) + + self.assertIsNone(result.structuredContent["stop_reason"]) + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 7762c225..dc28f964 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -214,6 +214,15 @@ class PlanStatusSuccess(BaseModel): "Run `curl -N ` in a background shell — auto-closes on completion/failure." ), ) + stop_reason: str | None = Field( + default=None, + description=( + "Why the plan entered failed state. " + "'user_requested' means plan_stop was called (not an error — consider plan_resume). " + "null means an actual error (worker crash, model error, timeout). " + "Only set when state is 'failed'; always null for other states." + ), + ) class PlanStatusOutput(BaseModel): @@ -268,6 +277,15 @@ class PlanStatusOutput(BaseModel): "Run `curl -N ` in a background shell — auto-closes on completion/failure." ), ) + stop_reason: str | None = Field( + default=None, + description=( + "Why the plan entered failed state. " + "'user_requested' means plan_stop was called (not an error — consider plan_resume). " + "null means an actual error (worker crash, model error, timeout). " + "Only set when state is 'failed'; always null for other states." + ), + ) error: ErrorDetail | None = None diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 801f92d9..6f6cef22 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -554,6 +554,7 @@ class ToolDefinition: }, }, }, + "stop_reason": {"type": ["string", "null"]}, }, } @@ -759,6 +760,7 @@ class ToolDefinition: "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " "(baseline profile) and may take longer on higher-quality profiles. " "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "When state is 'failed', check stop_reason: 'user_requested' means plan_stop was called (consider plan_resume); null means an actual error. " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " "Unknown plan_id returns PLAN_NOT_FOUND (or REMOTE_ERROR when transport fails). " From 812858c1ae7cc26c82e19dfc62a8a99e68e7e3c2 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 17:12:59 +0100 Subject: [PATCH 02/19] Show "stopped" instead of "failed" for user-initiated stops in frontend The frontend now uses stop_reason to distinguish user-initiated stops from actual failures across all views: - /plan/meta endpoint includes stop_reason in response - Plan detail page shows "stopped" in status bar (initial + polled) - Plan list shows orange "stopped" chip instead of red "failed" - Dashboard recent tasks show "Stopped" label with orange dot Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/src/app.py | 22 +++++++++++++++++-- frontend_multi_user/templates/index.html | 5 +++-- .../templates/plan_iframe.html | 7 ++++-- frontend_multi_user/templates/plan_list.html | 5 +++++ 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/frontend_multi_user/src/app.py b/frontend_multi_user/src/app.py index 77f910d8..96b95eef 100644 --- a/frontend_multi_user/src/app.py +++ b/frontend_multi_user/src/app.py @@ -2146,6 +2146,7 @@ def index(): self.db.session.query( PlanItem.id, PlanItem.state, + PlanItem.stop_requested, func.substr(PlanItem.prompt, 1, 240).label("prompt_preview"), ) .filter(uid_filter) @@ -2165,6 +2166,7 @@ def index(): self.db.session.query( PlanItem.id, PlanItem.state, + PlanItem.stop_requested, ) .filter(uid_filter) .order_by(PlanItem.timestamp_created.desc()) @@ -2178,10 +2180,15 @@ def index(): prompt_text = self._load_prompt_preview_safe(task.id) else: prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" + state = task.state if isinstance(task.state, PlanState) else None + display_state_name = None + if state and state.name == "failed" and task.stop_requested: + display_state_name = "stopped" recent_tasks.append( SimpleNamespace( id=str(task.id), - state=task.state if isinstance(task.state, PlanState) else None, + state=state, + display_state_name=display_state_name, prompt=prompt_text, ) ) @@ -3213,6 +3220,7 @@ def plan(): PlanItem.id, PlanItem.timestamp_created, PlanItem.state, + PlanItem.stop_requested, func.substr(PlanItem.prompt, 1, 240).label("prompt_preview"), ) .filter(uid_filter) @@ -3232,6 +3240,7 @@ def plan(): PlanItem.id, PlanItem.timestamp_created, PlanItem.state, + PlanItem.stop_requested, ) .filter(uid_filter) .order_by(PlanItem.timestamp_created.desc()) @@ -3246,11 +3255,14 @@ def plan(): prompt_text = self._load_prompt_preview_safe(task.id) else: prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" + state_name = task.state.name if isinstance(task.state, PlanState) else "pending" + if state_name == "failed" and task.stop_requested: + state_name = "stopped" rows.append({ "id": str(task.id), "created_compact": created_compact, "created_relative": self._format_relative_time(ts), - "status": task.state.name if isinstance(task.state, PlanState) else "pending", + "status": state_name, "prompt": prompt_text, }) return render_template("plan_list.html", plan_rows=rows) @@ -3470,6 +3482,11 @@ def plan_meta(): state_name = task.state.name if isinstance(task.state, PlanState) else "pending" telemetry = self._build_plan_telemetry(task, include_raw=False) failure_trace = self._build_plan_failure_trace(task) + stop_reason = ( + "user_requested" + if state_name == "failed" and task.stop_requested + else None + ) return jsonify({ "id": str(task.id), "state": state_name, @@ -3478,6 +3495,7 @@ def plan_meta(): "generated_report_html": bool(task.has_generated_report_html), "run_zip_snapshot": bool(task.has_run_zip_snapshot), "stop_requested": bool(task.stop_requested), + "stop_reason": stop_reason, "telemetry": telemetry, "failure_trace": failure_trace, }), 200 diff --git a/frontend_multi_user/templates/index.html b/frontend_multi_user/templates/index.html index f2144233..9556ad6c 100644 --- a/frontend_multi_user/templates/index.html +++ b/frontend_multi_user/templates/index.html @@ -278,6 +278,7 @@ .task-status.processing { background: #f59e0b; } .task-status.pending { background: #94a3b8; } .task-status.failed { background: #ef4444; } + .task-status.stopped { background: #ea580c; } .task-prompt { flex: 1; font-size: 0.9rem; @@ -619,10 +620,10 @@

Recent Plans

{% for task in recent_tasks %}
- + {{ task.prompt }} - {% if task.state %}{{ task.state.name | capitalize }}{% else %}Pending{% endif %} + {% if task.display_state_name %}{{ task.display_state_name | capitalize }}{% elif task.state %}{{ task.state.name | capitalize }}{% else %}Pending{% endif %}
diff --git a/frontend_multi_user/templates/plan_iframe.html b/frontend_multi_user/templates/plan_iframe.html index 46ca04dc..6791c046 100644 --- a/frontend_multi_user/templates/plan_iframe.html +++ b/frontend_multi_user/templates/plan_iframe.html @@ -541,7 +541,9 @@

id: {{ task.id }}{% if task.timestamp_created %} · Created: {{ task.timestamp_created.strftime("%Y-%m-%d %H:%M:%S") }}{% endif %}

Status: - {% if task.state %} + {% if task.state and task.state.name == 'failed' and task.stop_requested %} + stopped + {% elif task.state %} {{ task.state.name | lower }} {% else %} pending @@ -1124,6 +1126,7 @@ function updatePanel(meta) { const state = (meta.state || "pending").toLowerCase(); currentPlanState = state; + const displayState = (state === "failed" && meta.stop_reason === "user_requested") ? "stopped" : state; const isCompleted = state === "completed"; const isActive = state === "pending" || state === "processing"; @@ -1131,7 +1134,7 @@ stateEl.style.display = "none"; } else { stateEl.style.display = ""; - const parts = [state]; + const parts = [displayState]; if (meta.progress_percentage !== undefined && meta.progress_percentage !== null) { parts.push(Math.round(meta.progress_percentage) + "%"); } diff --git a/frontend_multi_user/templates/plan_list.html b/frontend_multi_user/templates/plan_list.html index e4f2e165..72f0475f 100644 --- a/frontend_multi_user/templates/plan_list.html +++ b/frontend_multi_user/templates/plan_list.html @@ -98,6 +98,11 @@ color: #c62828; background: #ffebee; } + .status-chip.status-stopped { + border-color: #e65100; + color: #e65100; + background: #fff3e0; + } .plan-cell-prompt { white-space: nowrap; From cb8de688d2f13c3f797553a535b76338ce55ee74 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 17:16:56 +0100 Subject: [PATCH 03/19] Update proposal 114 with I1 implementation notes Document what was actually implemented vs proposed: minimal stop_reason vocabulary (user_requested vs null), no DB changes needed, frontend coverage beyond original MCP-only scope. Note I2 as prerequisite for richer stop_reason values. Co-Authored-By: Claude Opus 4.6 --- .../114-mcp-interface-feedback-stress-test.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/proposals/114-mcp-interface-feedback-stress-test.md b/docs/proposals/114-mcp-interface-feedback-stress-test.md index e4e04075..133cce30 100644 --- a/docs/proposals/114-mcp-interface-feedback-stress-test.md +++ b/docs/proposals/114-mcp-interface-feedback-stress-test.md @@ -19,7 +19,7 @@ The testing surfaced concrete MCP interface friction points that affect agent op ### I1 — `failed` state conflates user-stop and actual failure -**Status:** Implemented (Option B — `stop_reason` field on `plan_status`). +**Status:** Implemented (PR #244, Option B — `stop_reason` field on `plan_status`). **Problem:** When `plan_stop` is called, the plan transitions to `failed`. When a worker crashes, it also transitions to `failed`. The agent operator cannot distinguish between: - User-initiated stop (nothing went wrong) @@ -38,13 +38,21 @@ This matters because the correct response differs: user stop suggests `plan_resu Option B is less disruptive and can be added to `plan_status` without changing state enum values. -**Affected files:** `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py` (plan_status response), DB model (new column or use existing `parameters` JSONB). +**Implemented approach (PR #244):** Option B with a minimal initial vocabulary: `stop_reason` is `"user_requested"` when `plan_stop` was called, `null` otherwise. Computed at response time from the existing `stop_requested` DB boolean — no migration, no new DB columns. The richer values proposed above (`"worker_crash"`, `"timeout"`, `"model_error"`) require the failure diagnostics infrastructure from I2; once I2 is implemented, `stop_reason` can be extended to report finer-grained failure causes. + +The implementation also went beyond the original MCP-only scope: +- `mcp_local` proxy schema and description updated to match +- `frontend_multi_user` shows "stopped" (orange) instead of "failed" (red) across plan list, plan detail status bar, and dashboard recent tasks + +**Affected files (originally proposed):** `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py` (plan_status response), DB model (new column or use existing `parameters` JSONB). + +**Affected files (actual):** `mcp_cloud/handlers.py`, `mcp_cloud/tool_models.py`, `mcp_cloud/schemas.py`, `mcp_local/planexe_mcp_local.py`, `frontend_multi_user/src/app.py`, `frontend_multi_user/templates/plan_iframe.html`, `frontend_multi_user/templates/plan_list.html`, `frontend_multi_user/templates/index.html`. No DB-layer changes were needed — `stop_reason` is computed from the existing `stop_requested` column at response time. --- ### I2 — No failure diagnostics in `plan_status` -**Priority: High — identified as the single biggest observability gap.** +**Priority: High — identified as the single biggest observability gap. Also prerequisite for extending I1's `stop_reason` with finer-grained failure causes (`"worker_crash"`, `"timeout"`, `"model_error"`).** **Problem:** When a plan fails, `plan_status` returns `state: "failed"` with no `failure_reason`, `last_error`, or `failed_step` field. The agent can only tell the user "it failed" without explaining why. From 57d362d43de7680d63548f2253eb72588a5bc8c5 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 17:20:37 +0100 Subject: [PATCH 04/19] Suppress redundant progress message for user-stopped plans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a plan is stopped by the user, the status bar previously showed "stopped · 42% · Stop requested by user." — the progress message is redundant since "stopped" already communicates user intent. Now shows just "stopped · 42%". Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/templates/plan_iframe.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend_multi_user/templates/plan_iframe.html b/frontend_multi_user/templates/plan_iframe.html index 6791c046..7311198d 100644 --- a/frontend_multi_user/templates/plan_iframe.html +++ b/frontend_multi_user/templates/plan_iframe.html @@ -551,7 +551,7 @@ {% if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}% {% endif %} - {% if task.progress_message %} + {% if task.progress_message and not (task.state and task.state.name == 'failed' and task.stop_requested) %} · {{ task.progress_message }} {% endif %} @@ -1138,7 +1138,7 @@ if (meta.progress_percentage !== undefined && meta.progress_percentage !== null) { parts.push(Math.round(meta.progress_percentage) + "%"); } - if (meta.progress_message) { + if (meta.progress_message && displayState !== "stopped") { parts.push(meta.progress_message); } stateTextEl.textContent = parts.join(" · "); From 5ca6fc9f4f76e36c3685c7f2dc340120793b5de8 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 17:24:58 +0100 Subject: [PATCH 05/19] Show "stopped" in Flask Admin for user-stopped plans The PlanItem admin list view now displays "stopped" (orange) instead of "failed" when stop_requested is true, matching the frontend behavior. The DB filter still operates on the raw state column. Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/src/planexe_modelviews.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/frontend_multi_user/src/planexe_modelviews.py b/frontend_multi_user/src/planexe_modelviews.py index e1935f5b..ecfee4a8 100644 --- a/frontend_multi_user/src/planexe_modelviews.py +++ b/frontend_multi_user/src/planexe_modelviews.py @@ -136,6 +136,11 @@ class PlanItemView(AdminOnlyModelView): column_filters = ['state', 'timestamp_created', 'user_id'] column_formatters = { 'id': lambda v, c, m, p: str(m.id)[:8] if m.id else '', + 'state': lambda v, c, m, p: ( + Markup('stopped') + if m.state and m.state.name == 'failed' and m.stop_requested + else (m.state.name if m.state else '') + ), 'prompt': lambda v, c, m, p: m.prompt[:100] + '...' if m.prompt and len(m.prompt) > 100 else m.prompt, 'view_plan': lambda v, c, m, p: Markup( f'View' From 73a57c398987bcd91ebc274618f7e98e9b5d3367 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 18:53:31 +0100 Subject: [PATCH 06/19] Add dedicated PlanState.stopped enum value (Proposal 114-I1, Option A) Replace computed stop_reason/display_state_name logic with proper state semantics: plan_stop now transitions to PlanState.stopped instead of PlanState.failed, and the stop_reason field is removed from API responses. plan_retry and plan_resume accept both failed and stopped states. Includes DB migration to add 'stopped' to PostgreSQL planstate enum across all three startup paths (mcp_cloud, worker, frontend). Co-Authored-By: Claude Opus 4.6 --- database_api/model_planitem.py | 1 + .../114-mcp-interface-feedback-stress-test.md | 12 ++--- frontend_multi_user/src/app.py | 29 ++++++------ frontend_multi_user/src/planexe_modelviews.py | 2 +- frontend_multi_user/templates/account.html | 1 + frontend_multi_user/templates/index.html | 4 +- .../templates/plan_iframe.html | 29 ++++++------ mcp_cloud/db_queries.py | 13 +++--- mcp_cloud/db_setup.py | 13 +++++- mcp_cloud/handlers.py | 12 +++-- mcp_cloud/schemas.py | 20 ++++----- mcp_cloud/sse.py | 2 +- mcp_cloud/tests/test_plan_status_tool.py | 37 +++++----------- mcp_cloud/tool_models.py | 44 ++++++------------- mcp_local/planexe_mcp_local.py | 25 +++++------ worker_plan_database/app.py | 9 ++++ 16 files changed, 116 insertions(+), 137 deletions(-) diff --git a/database_api/model_planitem.py b/database_api/model_planitem.py index 6bb39b2a..6e417511 100644 --- a/database_api/model_planitem.py +++ b/database_api/model_planitem.py @@ -37,6 +37,7 @@ class PlanState(enum.Enum): processing = 2 completed = 3 failed = 4 + stopped = 5 class PlanItem(db.Model): diff --git a/docs/proposals/114-mcp-interface-feedback-stress-test.md b/docs/proposals/114-mcp-interface-feedback-stress-test.md index 133cce30..1c36d00a 100644 --- a/docs/proposals/114-mcp-interface-feedback-stress-test.md +++ b/docs/proposals/114-mcp-interface-feedback-stress-test.md @@ -19,7 +19,7 @@ The testing surfaced concrete MCP interface friction points that affect agent op ### I1 — `failed` state conflates user-stop and actual failure -**Status:** Implemented (PR #244, Option B — `stop_reason` field on `plan_status`). +**Status:** Implemented (Option A — dedicated `PlanState.stopped` DB state). Supersedes PR #244's Option B (`stop_reason` field). **Problem:** When `plan_stop` is called, the plan transitions to `failed`. When a worker crashes, it also transitions to `failed`. The agent operator cannot distinguish between: - User-initiated stop (nothing went wrong) @@ -38,15 +38,9 @@ This matters because the correct response differs: user stop suggests `plan_resu Option B is less disruptive and can be added to `plan_status` without changing state enum values. -**Implemented approach (PR #244):** Option B with a minimal initial vocabulary: `stop_reason` is `"user_requested"` when `plan_stop` was called, `null` otherwise. Computed at response time from the existing `stop_requested` DB boolean — no migration, no new DB columns. The richer values proposed above (`"worker_crash"`, `"timeout"`, `"model_error"`) require the failure diagnostics infrastructure from I2; once I2 is implemented, `stop_reason` can be extended to report finer-grained failure causes. +**Implemented approach:** Option A — a dedicated `PlanState.stopped` enum value (value 5). `plan_stop` now transitions plans to the `stopped` state instead of `failed`. The `stop_reason` field introduced in PR #244 (Option B) has been removed — the state itself communicates intent. `plan_retry` and `plan_resume` accept both `failed` and `stopped` states. DB migration adds `'stopped'` to the PostgreSQL `planstate` enum type. -The implementation also went beyond the original MCP-only scope: -- `mcp_local` proxy schema and description updated to match -- `frontend_multi_user` shows "stopped" (orange) instead of "failed" (red) across plan list, plan detail status bar, and dashboard recent tasks - -**Affected files (originally proposed):** `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py` (plan_status response), DB model (new column or use existing `parameters` JSONB). - -**Affected files (actual):** `mcp_cloud/handlers.py`, `mcp_cloud/tool_models.py`, `mcp_cloud/schemas.py`, `mcp_local/planexe_mcp_local.py`, `frontend_multi_user/src/app.py`, `frontend_multi_user/templates/plan_iframe.html`, `frontend_multi_user/templates/plan_list.html`, `frontend_multi_user/templates/index.html`. No DB-layer changes were needed — `stop_reason` is computed from the existing `stop_requested` column at response time. +**Affected files:** `database_api/model_planitem.py`, `mcp_cloud/db_setup.py`, `worker_plan_database/app.py`, `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py`, `mcp_cloud/sse.py`, `mcp_cloud/tool_models.py`, `mcp_cloud/schemas.py`, `mcp_local/planexe_mcp_local.py`, `frontend_multi_user/src/app.py`, `frontend_multi_user/src/planexe_modelviews.py`, `frontend_multi_user/templates/plan_iframe.html`, `frontend_multi_user/templates/index.html`, `frontend_multi_user/templates/account.html`, `mcp_cloud/tests/test_plan_status_tool.py`. --- diff --git a/frontend_multi_user/src/app.py b/frontend_multi_user/src/app.py index 96b95eef..8a555bfb 100644 --- a/frontend_multi_user/src/app.py +++ b/frontend_multi_user/src/app.py @@ -496,6 +496,14 @@ def _ensure_step_count_columns() -> None: if "current_step" not in columns: conn.execute(text("ALTER TABLE task_item ADD COLUMN IF NOT EXISTS current_step VARCHAR(128)")) + def _ensure_stopped_state() -> None: + """Add 'stopped' value to the planstate enum type (idempotent).""" + with self.db.engine.begin() as conn: + try: + conn.execute(text("ALTER TYPE planstate ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.warning("Could not add 'stopped' to planstate enum: %s", exc) + def _ensure_planitem_indexes() -> None: insp = inspect(self.db.engine) table_names = set(insp.get_table_names()) @@ -553,6 +561,7 @@ def _create_tables_with_retry(attempts: int = 5, delay_seconds: float = 2.0) -> _ensure_user_account_columns() _ensure_multi_api_key_columns() _ensure_step_count_columns() + _ensure_stopped_state() _ensure_planitem_indexes() _seed_initial_records() return @@ -2181,14 +2190,10 @@ def index(): else: prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" state = task.state if isinstance(task.state, PlanState) else None - display_state_name = None - if state and state.name == "failed" and task.stop_requested: - display_state_name = "stopped" recent_tasks.append( SimpleNamespace( id=str(task.id), state=state, - display_state_name=display_state_name, prompt=prompt_text, ) ) @@ -3256,8 +3261,6 @@ def plan(): else: prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" state_name = task.state.name if isinstance(task.state, PlanState) else "pending" - if state_name == "failed" and task.stop_requested: - state_name = "stopped" rows.append({ "id": str(task.id), "created_compact": created_compact, @@ -3354,7 +3357,7 @@ def plan_stop(): task.stop_requested = True task.stop_requested_timestamp = datetime.now(UTC) if task.state in (PlanState.pending, PlanState.processing): - task.state = PlanState.failed + task.state = PlanState.stopped task.progress_message = "Stop requested by user." self.db.session.commit() return redirect(url_for('plan', id=run_id)) @@ -3369,8 +3372,8 @@ def plan_retry(): if not current_user.is_admin and str(task.user_id) != str(current_user.id): return jsonify({"error": "Forbidden"}), 403 - if task.state == PlanState.processing and not bool(task.stop_requested): - return jsonify({"error": "Task is currently processing. Stop it first before retrying."}), 409 + if task.state not in (PlanState.failed, PlanState.stopped) and not bool(task.stop_requested): + return jsonify({"error": "Task is not in a retryable state. Stop it first before retrying."}), 409 raw_profile = request.form.get("model_profile") selected_model_profile = normalize_model_profile(raw_profile).value @@ -3413,7 +3416,7 @@ def plan_resume(): if not current_user.is_admin and str(task.user_id) != str(current_user.id): abort(403) - if task.state != PlanState.failed: + if task.state not in (PlanState.failed, PlanState.stopped): return redirect(url_for('plan', id=run_id)) # Reject resume if the snapshot was created by a different pipeline version. @@ -3482,11 +3485,6 @@ def plan_meta(): state_name = task.state.name if isinstance(task.state, PlanState) else "pending" telemetry = self._build_plan_telemetry(task, include_raw=False) failure_trace = self._build_plan_failure_trace(task) - stop_reason = ( - "user_requested" - if state_name == "failed" and task.stop_requested - else None - ) return jsonify({ "id": str(task.id), "state": state_name, @@ -3495,7 +3493,6 @@ def plan_meta(): "generated_report_html": bool(task.has_generated_report_html), "run_zip_snapshot": bool(task.has_run_zip_snapshot), "stop_requested": bool(task.stop_requested), - "stop_reason": stop_reason, "telemetry": telemetry, "failure_trace": failure_trace, }), 200 diff --git a/frontend_multi_user/src/planexe_modelviews.py b/frontend_multi_user/src/planexe_modelviews.py index ecfee4a8..73d0fc4a 100644 --- a/frontend_multi_user/src/planexe_modelviews.py +++ b/frontend_multi_user/src/planexe_modelviews.py @@ -138,7 +138,7 @@ class PlanItemView(AdminOnlyModelView): 'id': lambda v, c, m, p: str(m.id)[:8] if m.id else '', 'state': lambda v, c, m, p: ( Markup('stopped') - if m.state and m.state.name == 'failed' and m.stop_requested + if m.state and m.state.name == 'stopped' else (m.state.name if m.state else '') ), 'prompt': lambda v, c, m, p: m.prompt[:100] + '...' if m.prompt and len(m.prompt) > 100 else m.prompt, diff --git a/frontend_multi_user/templates/account.html b/frontend_multi_user/templates/account.html index c035fffd..2abdd8b9 100644 --- a/frontend_multi_user/templates/account.html +++ b/frontend_multi_user/templates/account.html @@ -425,6 +425,7 @@ .status-badge.completed, .status-badge.paid { background: #dcfce7; color: #166534; } .status-badge.pending { background: #fef3c7; color: #92400e; } .status-badge.failed, .status-badge.cancelled { background: #fee2e2; color: #991b1b; } + .status-badge.stopped { background: #ffedd5; color: #9a3412; } .empty-hint { text-align: center; padding: 24px 16px; diff --git a/frontend_multi_user/templates/index.html b/frontend_multi_user/templates/index.html index 9556ad6c..39c9ad40 100644 --- a/frontend_multi_user/templates/index.html +++ b/frontend_multi_user/templates/index.html @@ -620,10 +620,10 @@

Recent Plans

{% for task in recent_tasks %}
- + {{ task.prompt }} - {% if task.display_state_name %}{{ task.display_state_name | capitalize }}{% elif task.state %}{{ task.state.name | capitalize }}{% else %}Pending{% endif %} + {% if task.state %}{{ task.state.name | capitalize }}{% else %}Pending{% endif %}
diff --git a/frontend_multi_user/templates/plan_iframe.html b/frontend_multi_user/templates/plan_iframe.html index 7311198d..caa54b0e 100644 --- a/frontend_multi_user/templates/plan_iframe.html +++ b/frontend_multi_user/templates/plan_iframe.html @@ -541,9 +541,7 @@

id: {{ task.id }}{% if task.timestamp_created %} · Created: {{ task.timestamp_created.strftime("%Y-%m-%d %H:%M:%S") }}{% endif %}

Status: - {% if task.state and task.state.name == 'failed' and task.stop_requested %} - stopped - {% elif task.state %} + {% if task.state %} {{ task.state.name | lower }} {% else %} pending @@ -551,14 +549,14 @@ {% if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}% {% endif %} - {% if task.progress_message and not (task.state and task.state.name == 'failed' and task.stop_requested) %} + {% if task.progress_message and not (task.state and task.state.name == 'stopped') %} · {{ task.progress_message }} {% endif %}

-
+
@@ -569,21 +567,21 @@ - + {% if resume_error == "version_mismatch" %} Resume unavailable — plan was created with a different PlanExe version. Use Retry. {% endif %}
- Retry - Resume + Retry + Resume -
+
- Stop + Stop Download Report Download Report @@ -1126,7 +1124,6 @@ function updatePanel(meta) { const state = (meta.state || "pending").toLowerCase(); currentPlanState = state; - const displayState = (state === "failed" && meta.stop_reason === "user_requested") ? "stopped" : state; const isCompleted = state === "completed"; const isActive = state === "pending" || state === "processing"; @@ -1134,25 +1131,25 @@ stateEl.style.display = "none"; } else { stateEl.style.display = ""; - const parts = [displayState]; + const parts = [state]; if (meta.progress_percentage !== undefined && meta.progress_percentage !== null) { parts.push(Math.round(meta.progress_percentage) + "%"); } - if (meta.progress_message && displayState !== "stopped") { + if (meta.progress_message && state !== "stopped") { parts.push(meta.progress_message); } stateTextEl.textContent = parts.join(" · "); } - const canRetry = state === "failed" || !!meta.stop_requested; - const canResume = state === "failed"; + const canRetry = state === "failed" || state === "stopped"; + const canResume = state === "failed" || state === "stopped"; setVisible("retry-resume-form", canRetry); setVisible("retry-disabled", !canRetry); setVisible("resume-disabled", !canRetry); var resumeBtn = document.getElementById("resume-btn"); if (resumeBtn) resumeBtn.disabled = !canResume; - const canStop = (state === "pending" || state === "processing") && !meta.stop_requested; + const canStop = (state === "pending" || state === "processing"); setVisible("stop-form", canStop); setVisible("stop-disabled", !canStop); diff --git a/mcp_cloud/db_queries.py b/mcp_cloud/db_queries.py index da51aca5..b0a45032 100644 --- a/mcp_cloud/db_queries.py +++ b/mcp_cloud/db_queries.py @@ -241,10 +241,10 @@ def _request_plan_stop_sync(plan_id: str) -> Optional[dict[str, Any]]: if plan.state in (PlanState.pending, PlanState.processing): plan.stop_requested = True plan.stop_requested_timestamp = datetime.now(UTC) - plan.state = PlanState.failed + plan.state = PlanState.stopped plan.progress_message = "Stop requested by user." db.session.commit() - logger.info("Stop requested for plan %s; state set to failed.", plan_id) + logger.info("Stop requested for plan %s; state set to stopped.", plan_id) stop_requested = True return { "state": get_plan_state_mapping(plan.state), @@ -257,11 +257,11 @@ def _retry_failed_plan_sync(plan_id: str, model_profile: str, caller_metadata: O plan = find_plan_by_id(plan_id) if plan is None: return None - if plan.state != PlanState.failed: + if plan.state not in (PlanState.failed, PlanState.stopped): return { "error": { "code": "PLAN_NOT_FAILED", - "message": f"Plan is not in failed state: {plan_id}", + "message": f"Plan is not in failed or stopped state: {plan_id}", } } @@ -333,11 +333,11 @@ def _resume_plan_sync(plan_id: str, model_profile: str) -> Optional[dict[str, An plan = find_plan_by_id(plan_id) if plan is None: return None - if plan.state != PlanState.failed: + if plan.state not in (PlanState.failed, PlanState.stopped): return { "error": { "code": "PLAN_NOT_RESUMABLE", - "message": f"Plan is not in failed state: {plan_id}", + "message": f"Plan is not in failed or stopped state: {plan_id}", } } @@ -475,6 +475,7 @@ def get_plan_state_mapping(plan_state: PlanState) -> str: PlanState.processing: "processing", PlanState.completed: "completed", PlanState.failed: "failed", + PlanState.stopped: "stopped", } return mapping.get(plan_state, "pending") diff --git a/mcp_cloud/db_setup.py b/mcp_cloud/db_setup.py index d2698c21..6bc3672a 100644 --- a/mcp_cloud/db_setup.py +++ b/mcp_cloud/db_setup.py @@ -103,10 +103,19 @@ def ensure_step_count_columns() -> None: except Exception as exc: logger.warning("Schema update failed for %s: %s", stmt, exc, exc_info=True) +def ensure_stopped_state() -> None: + """Add 'stopped' value to the planstate enum type (idempotent).""" + with db.engine.begin() as conn: + try: + conn.execute(text("ALTER TYPE planstate ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.warning("Could not add 'stopped' to planstate enum: %s", exc) + with app.app_context(): ensure_planitem_stop_columns() ensure_multi_api_key_columns() ensure_step_count_columns() + ensure_stopped_state() # Shown in MCP initialize (e.g. Inspector) so clients know what PlanExe does. PLANEXE_SERVER_INSTRUCTIONS = ( @@ -136,12 +145,12 @@ def ensure_step_count_columns() -> None: "If plan generation fails before completing all steps, call plan_resume to continue from where it left off without discarding completed work. " "Use plan_retry instead for a full restart (plan must be in failed state). " "Both accept a failed plan_id and optional model_profile (defaults to baseline). " - "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will eventually transition to failed. " + "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will transition to the stopped state. " "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " "Tool errors use {error:{code,message}}. plan_file_info returns {ready:false,reason:...} while the artifact is not yet ready; check readiness by testing whether download_url is present in the response. " "plan_file_info download_url is the absolute URL where the requested artifact can be downloaded. " "To list recent plans for a user call plan_list; returns plan_id, state, progress_percentage, created_at, and prompt_excerpt for each plan. " - "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "Troubleshooting: if plan_status stays in pending for longer than 5 minutes, the plan was likely queued but not picked up by a worker (server issue). " "If plan_status is in processing and output files do not change for longer than 20 minutes, the plan_create likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " diff --git a/mcp_cloud/handlers.py b/mcp_cloud/handlers.py index 369b153d..fd2e436c 100644 --- a/mcp_cloud/handlers.py +++ b/mcp_cloud/handlers.py @@ -338,16 +338,13 @@ async def handle_plan_status(arguments: dict[str, Any]) -> CallToolResult: }, "files_count": len(files), "files": files[:10], - "stop_reason": "user_requested" if ( - state == "failed" and plan_snapshot.get("stop_requested") - ) else None, } if state == "failed": message = plan_snapshot.get("progress_message") or "Plan generation failed." response["error"] = {"code": "generation_failed", "message": message} - if state not in ("completed", "failed"): + if state not in ("completed", "failed", "stopped"): base_url = _get_download_base_url() if base_url: response["sse_url"] = f"{base_url}/sse/plan/{plan_uuid}" @@ -578,6 +575,13 @@ async def handle_plan_file_info(arguments: dict[str, Any]) -> CallToolResult: structuredContent=response, isError=False, ) + if plan_state == PlanState.stopped: + response = {"ready": False, "reason": "stopped"} + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(response))], + structuredContent=response, + isError=False, + ) if plan_state == PlanState.failed: message = plan_snapshot["progress_message"] or "Plan generation failed." response = {"ready": False, "reason": "failed", "error": {"code": "generation_failed", "message": message}} diff --git a/mcp_cloud/schemas.py b/mcp_cloud/schemas.py index 4c49a79d..319894da 100644 --- a/mcp_cloud/schemas.py +++ b/mcp_cloud/schemas.py @@ -176,8 +176,8 @@ class ToolDefinition: "This is the primary way to check progress — it returns structured JSON with all progress fields. " "Poll at reasonable intervals (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " "(baseline profile) and may take longer on higher-quality profiles. " - "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " - "When state is 'failed', check stop_reason: 'user_requested' means plan_stop was called (consider plan_resume); null means an actual error. " + "State contract: pending/processing => keep polling; completed => download is ready; " + "failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "Note: steps vary in duration — early steps complete quickly while later steps (review, report generation) " "take longer. Do not use progress_percentage to estimate time remaining. " @@ -203,7 +203,7 @@ class ToolDefinition: description=( "Request the plan generation to stop. Pass the plan_id (the UUID returned by plan_create). " "Stopping is asynchronous: the stop flag is set immediately but the plan may continue briefly before halting. " - "A stopped plan will eventually transition to the failed state. " + "A stopped plan will transition to the stopped state. " "If the plan is already completed or failed, stop_requested returns false (the plan already finished). " "Unknown plan_id returns error code PLAN_NOT_FOUND." ), @@ -219,10 +219,10 @@ class ToolDefinition: ToolDefinition( name="plan_retry", description=( - "Retry a plan that is currently in failed state. " - "Pass the failed plan_id and optionally model_profile (defaults to baseline). " + "Retry a plan that is currently in failed or stopped state. " + "Pass the plan_id and optionally model_profile (defaults to baseline). " "The plan is reset to pending, prior artifacts are cleared, and the same plan_id is requeued for processing. " - "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_FAILED when the plan is not in failed state." + "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_FAILED when the plan is not in failed or stopped state." ), input_schema=PLAN_RETRY_INPUT_SCHEMA, output_schema=PLAN_RETRY_OUTPUT_SCHEMA, @@ -236,13 +236,13 @@ class ToolDefinition: ToolDefinition( name="plan_resume", description=( - "Resume a failed plan without discarding completed intermediary files. " + "Resume a failed or stopped plan without discarding completed intermediary files. " "Plan generation restarts from the first incomplete step, skipping all steps that already produced output files. " - "Use plan_resume when plan_status shows 'failed' and plan generation was interrupted before completing all steps " + "Use plan_resume when plan_status shows 'failed' or 'stopped' and plan generation was interrupted before completing all steps " "(network drop, timeout, plan_stop, worker crash). " "For a full restart or to change model_profile, use plan_retry instead. " - "Only failed plans can be resumed. " - "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed state. " + "Only failed or stopped plans can be resumed. " + "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed or stopped state. " "Returns PIPELINE_VERSION_MISMATCH when the snapshot was created by a different pipeline version; use plan_retry instead." ), input_schema=PLAN_RESUME_INPUT_SCHEMA, diff --git a/mcp_cloud/sse.py b/mcp_cloud/sse.py index ff12e38f..39dbe51b 100644 --- a/mcp_cloud/sse.py +++ b/mcp_cloud/sse.py @@ -24,7 +24,7 @@ SSE_MAX_CONNECTIONS_PER_CLIENT: int = int(os.environ.get("PLANEXE_SSE_MAX_CONNECTIONS", "5")) SSE_MAX_TOTAL_CONNECTIONS: int = int(os.environ.get("PLANEXE_SSE_MAX_TOTAL_CONNECTIONS", "200")) -TERMINAL_STATES = {"completed", "failed"} +TERMINAL_STATES = {"completed", "failed", "stopped"} class SSEConnectionLimitError(Exception): diff --git a/mcp_cloud/tests/test_plan_status_tool.py b/mcp_cloud/tests/test_plan_status_tool.py index 02595223..4c8a4d66 100644 --- a/mcp_cloud/tests/test_plan_status_tool.py +++ b/mcp_cloud/tests/test_plan_status_tool.py @@ -217,12 +217,12 @@ def test_plan_status_includes_current_step(self): self.assertEqual(result.structuredContent["current_step"], "SWOT Analysis") - def test_plan_status_user_stopped_shows_stop_reason(self): - """Failed plan with stop_requested=True → stop_reason='user_requested'.""" + def test_plan_status_stopped_returns_stopped_state(self): + """User-stopped plan has state='stopped' and no stop_reason field.""" plan_id = str(uuid.uuid4()) plan_snapshot = { "id": plan_id, - "state": PlanState.failed, + "state": PlanState.stopped, "stop_requested": True, "progress_percentage": 42.0, "timestamp_created": datetime.now(UTC), @@ -235,10 +235,11 @@ def test_plan_status_user_stopped_shows_stop_reason(self): ): result = asyncio.run(handle_plan_status({"plan_id": plan_id})) - self.assertEqual(result.structuredContent["stop_reason"], "user_requested") + self.assertEqual(result.structuredContent["state"], "stopped") + self.assertNotIn("stop_reason", result.structuredContent) - def test_plan_status_actual_failure_shows_null_stop_reason(self): - """Failed plan with stop_requested=False → stop_reason=None.""" + def test_plan_status_actual_failure_has_no_stop_reason(self): + """Failed plan response does not contain stop_reason field.""" plan_id = str(uuid.uuid4()) plan_snapshot = { "id": plan_id, @@ -255,27 +256,9 @@ def test_plan_status_actual_failure_shows_null_stop_reason(self): ): result = asyncio.run(handle_plan_status({"plan_id": plan_id})) - self.assertIsNone(result.structuredContent["stop_reason"]) - - def test_plan_status_non_failed_shows_null_stop_reason(self): - """Processing plan → stop_reason=None regardless of stop_requested.""" - plan_id = str(uuid.uuid4()) - plan_snapshot = { - "id": plan_id, - "state": PlanState.processing, - "stop_requested": True, - "progress_percentage": 50.0, - "timestamp_created": datetime.now(UTC), - } - with patch( - "mcp_cloud.handlers._get_plan_status_snapshot_sync", - return_value=plan_snapshot, - ), patch( - "mcp_cloud.handlers.fetch_file_list_from_worker_plan", new=AsyncMock(return_value=[]) - ): - result = asyncio.run(handle_plan_status({"plan_id": plan_id})) - - self.assertIsNone(result.structuredContent["stop_reason"]) + self.assertEqual(result.structuredContent["state"], "failed") + self.assertNotIn("stop_reason", result.structuredContent) + self.assertIn("error", result.structuredContent) if __name__ == "__main__": diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index dc28f964..ead08213 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -107,7 +107,7 @@ class PlanStopInput(BaseModel): class PlanRetryInput(BaseModel): plan_id: str = Field( ..., - description="UUID of the failed plan to retry.", + description="UUID of the failed or stopped plan to retry.", ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( default="baseline", @@ -167,11 +167,12 @@ class PlanStatusSuccess(BaseModel): ..., description="Plan UUID returned by plan_create." ) - state: Literal["pending", "processing", "completed", "failed"] = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] = Field( ..., description=( "Caller contract: pending/processing => keep polling; " - "completed => download is ready; failed => terminal error." + "completed => download is ready; failed => terminal error; " + "stopped => user called plan_stop (consider plan_resume)." ), ) progress_percentage: float = Field( @@ -214,15 +215,6 @@ class PlanStatusSuccess(BaseModel): "Run `curl -N ` in a background shell — auto-closes on completion/failure." ), ) - stop_reason: str | None = Field( - default=None, - description=( - "Why the plan entered failed state. " - "'user_requested' means plan_stop was called (not an error — consider plan_resume). " - "null means an actual error (worker crash, model error, timeout). " - "Only set when state is 'failed'; always null for other states." - ), - ) class PlanStatusOutput(BaseModel): @@ -230,11 +222,12 @@ class PlanStatusOutput(BaseModel): default=None, description="Plan UUID returned by plan_create." ) - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description=( "Caller contract: pending/processing => keep polling; " - "completed => download is ready; failed => terminal error." + "completed => download is ready; failed => terminal error; " + "stopped => user called plan_stop (consider plan_resume)." ), ) progress_percentage: float | None = Field( @@ -277,20 +270,11 @@ class PlanStatusOutput(BaseModel): "Run `curl -N ` in a background shell — auto-closes on completion/failure." ), ) - stop_reason: str | None = Field( - default=None, - description=( - "Why the plan entered failed state. " - "'user_requested' means plan_stop was called (not an error — consider plan_resume). " - "null means an actual error (worker crash, model error, timeout). " - "Only set when state is 'failed'; always null for other states." - ), - ) error: ErrorDetail | None = None class PlanStopOutput(BaseModel): - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description="Current plan state after stop request.", ) @@ -304,9 +288,9 @@ class PlanStopOutput(BaseModel): class PlanRetryOutput(BaseModel): plan_id: str | None = Field( default=None, - description="Plan UUID that was retried (same ID as the failed plan).", + description="Plan UUID that was retried (same ID as the failed or stopped plan).", ) - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description="Current plan state after retry request.", ) @@ -332,7 +316,7 @@ class PlanRetryOutput(BaseModel): class PlanResumeInput(BaseModel): plan_id: str = Field( ..., - description="UUID of the failed plan to resume.", + description="UUID of the failed or stopped plan to resume.", ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( default="baseline", @@ -345,9 +329,9 @@ class PlanResumeInput(BaseModel): class PlanResumeOutput(BaseModel): plan_id: str | None = Field( default=None, - description="Plan UUID that was resumed (same ID as the failed plan).", + description="Plan UUID that was resumed (same ID as the failed or stopped plan).", ) - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description="Current plan state after resume request.", ) @@ -423,7 +407,7 @@ class PlanListInput(BaseModel): class PlanListItem(BaseModel): plan_id: str = Field(..., description="Plan UUID.") - state: Literal["pending", "processing", "completed", "failed"] = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] = Field( ..., description="Current plan state.", ) diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 6f6cef22..facc9628 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -554,7 +554,6 @@ class ToolDefinition: }, }, }, - "stop_reason": {"type": ["string", "null"]}, }, } @@ -759,8 +758,8 @@ class ToolDefinition: "Returns status and progress of the plan currently being created. " "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " "(baseline profile) and may take longer on higher-quality profiles. " - "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " - "When state is 'failed', check stop_reason: 'user_requested' means plan_stop was called (consider plan_resume); null means an actual error. " + "State contract: pending/processing => keep polling; completed => download is ready; " + "failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " "Unknown plan_id returns PLAN_NOT_FOUND (or REMOTE_ERROR when transport fails). " @@ -782,7 +781,7 @@ class ToolDefinition: description=( "Request the plan generation to stop. Pass the plan_id (the UUID returned by plan_create). " "Stopping is asynchronous: the stop flag is set immediately but the plan may continue briefly before halting. " - "A stopped plan will eventually transition to the failed state. " + "A stopped plan will transition to the stopped state. " "If the plan is already completed or failed, stop_requested returns false (the plan already finished). " "Unknown plan_id returns PLAN_NOT_FOUND (or REMOTE_ERROR when transport fails)." ), @@ -798,10 +797,10 @@ class ToolDefinition: ToolDefinition( name="plan_retry", description=( - "Retry a plan that is currently in failed state. " - "Pass the failed plan_id and optionally model_profile (defaults to baseline). " + "Retry a plan that is currently in failed or stopped state. " + "Pass the plan_id and optionally model_profile (defaults to baseline). " "The same plan_id is requeued and reset to pending on the cloud service. " - "Unknown plan_id returns PLAN_NOT_FOUND; non-failed plans return PLAN_NOT_FAILED." + "Unknown plan_id returns PLAN_NOT_FOUND; non-failed/stopped plans return PLAN_NOT_FAILED." ), input_schema=PLAN_RETRY_INPUT_SCHEMA, output_schema=PLAN_RETRY_OUTPUT_SCHEMA, @@ -815,13 +814,13 @@ class ToolDefinition: ToolDefinition( name="plan_resume", description=( - "Resume a failed plan without discarding completed intermediary files. " + "Resume a failed or stopped plan without discarding completed intermediary files. " "Plan generation restarts from the first incomplete step, skipping all steps that already produced output files. " - "Use plan_resume when plan_status shows 'failed' and plan generation was interrupted before completing all steps " + "Use plan_resume when plan_status shows 'failed' or 'stopped' and plan generation was interrupted before completing all steps " "(network drop, timeout, plan_stop, worker crash). " "For a full restart or to change model_profile, use plan_retry instead. " - "Only failed plans can be resumed. " - "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed state. " + "Only failed or stopped plans can be resumed. " + "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed or stopped state. " "Returns PIPELINE_VERSION_MISMATCH when the snapshot was created by a different pipeline version; use plan_retry instead." ), input_schema=PLAN_RESUME_INPUT_SCHEMA, @@ -896,12 +895,12 @@ class ToolDefinition: "If plan generation fails before completing all steps, call plan_resume to continue from where it left off without discarding completed work. " "Use plan_retry instead for a full restart. " "Both accept the failed plan_id and optional model_profile (defaults to baseline). " - "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will eventually transition to failed. " + "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will transition to the stopped state. " "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " "Tool errors use {error:{code,message}}. plan_download may return REMOTE_ERROR or DOWNLOAD_FAILED. " "plan_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " "To list recent plans for a user call plan_list; returns plan_id, state, progress_percentage, created_at, and prompt_excerpt. " - "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "Troubleshooting: if plan_status stays in pending for longer than 5 minutes, the plan was likely queued but not picked up by a worker (server issue). " "If plan_status is in processing and output files do not change for longer than 20 minutes, the run likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " diff --git a/worker_plan_database/app.py b/worker_plan_database/app.py index 71e35fc4..934705f9 100644 --- a/worker_plan_database/app.py +++ b/worker_plan_database/app.py @@ -313,6 +313,14 @@ def ensure_multi_api_key_columns() -> None: except Exception as exc: logger.warning("Schema update failed for %s: %s", stmt, exc, exc_info=True) +def ensure_stopped_state() -> None: + """Add 'stopped' value to the planstate enum type (idempotent).""" + with db.engine.begin() as conn: + try: + conn.execute(text("ALTER TYPE planstate ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.warning("Could not add 'stopped' to planstate enum: %s", exc) + def worker_process_started() -> None: planexe_worker_id = os.environ.get("PLANEXE_WORKER_ID") event_context = { @@ -1314,6 +1322,7 @@ def startup_worker(): ensure_token_metrics_columns() ensure_fractional_credit_columns() ensure_multi_api_key_columns() + ensure_stopped_state() logger.debug(f"Ensured database tables exist.") WorkerItem.upsert_heartbeat(worker_id=WORKER_ID) except Exception as e: From 9865c5c048271d3411ef0ee95756e5807610f81c Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 19:31:29 +0100 Subject: [PATCH 07/19] Fix stale Option B references in proposal 114 Update cross-reference table, priority table, I2/I3/I8 descriptions to reflect Option A (PlanState.stopped) instead of the superseded Option B (stop_reason field). Co-Authored-By: Claude Opus 4.6 --- .../114-mcp-interface-feedback-stress-test.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/proposals/114-mcp-interface-feedback-stress-test.md b/docs/proposals/114-mcp-interface-feedback-stress-test.md index 1c36d00a..9baf4ae9 100644 --- a/docs/proposals/114-mcp-interface-feedback-stress-test.md +++ b/docs/proposals/114-mcp-interface-feedback-stress-test.md @@ -46,7 +46,7 @@ Option B is less disruptive and can be added to `plan_status` without changing s ### I2 — No failure diagnostics in `plan_status` -**Priority: High — identified as the single biggest observability gap. Also prerequisite for extending I1's `stop_reason` with finer-grained failure causes (`"worker_crash"`, `"timeout"`, `"model_error"`).** +**Priority: High — identified as the single biggest observability gap.** **Problem:** When a plan fails, `plan_status` returns `state: "failed"` with no `failure_reason`, `last_error`, or `failed_step` field. The agent can only tell the user "it failed" without explaining why. @@ -79,7 +79,7 @@ During the stress test, Plan 1 (20f1cfac) stalled at 5.5% with zero diagnostic i | Tool | Behavior | |------|----------| -| `plan_delete` | Permanently remove a plan from the user's list. Only allowed for terminal states (`failed`, `completed`). | +| `plan_delete` | Permanently remove a plan from the user's list. Only allowed for terminal states (`completed`, `failed`, `stopped`). | | `plan_archive` | Soft-delete: plan is hidden from `plan_list` but remains in DB for auditing. | `plan_archive` is safer for a billing system where plan records may need to be retained. @@ -160,7 +160,7 @@ This would eliminate the need for `plan_status` polling entirely when SSE is ava ``` plan_wait(plan_id, timeout_seconds=1200): Blocks until the plan reaches a terminal state -(completed, failed) or the timeout expires. Returns the final plan_status response. +(completed, failed, stopped) or the timeout expires. Returns the final plan_status response. ``` **Implementation:** Server-side long-poll using the existing SSE infrastructure. The handler subscribes to the plan's state changes and returns when terminal or timeout. @@ -190,7 +190,7 @@ This enables prompt iteration tracking without changing existing behavior for pl | Issue | Existing Proposal | Gap | |-------|-------------------|-----| -| I1 (stopped vs failed) | 87 §4 (deferred) | **Implemented** (Option B — `stop_reason` field) | +| I1 (stopped vs failed) | 87 §4 (deferred) | **Implemented** (Option A — dedicated `PlanState.stopped`) | | I2 (failure diagnostics) | 113 (logs only) | Not surfaced to MCP consumer | | I3 (plan_delete) | None | New | | I4 (idempotency) | None | **Implemented** (PR #242) | @@ -221,7 +221,7 @@ If accepted, I1–I4 and I7–I9 should be added to Proposal 70's quick-win chec | Priority | Issues | Rationale | |----------|--------|-----------| | P1 | I2 (failure diagnostics) | Biggest observability gap. Agent cannot help users debug failures without this. | -| ~~P1~~ | ~~I1 (stopped vs failed)~~ | **Implemented** (Option B). `stop_reason` field on `plan_status`: `"user_requested"` when `plan_stop` was called, `null` for actual errors. | +| ~~P1~~ | ~~I1 (stopped vs failed)~~ | **Implemented** (Option A). Dedicated `PlanState.stopped` enum value — `plan_stop` transitions to `stopped`, not `failed`. | | P2 | I7 (stall detection) | Prevents agents from waiting indefinitely on stuck plans. | | P2 | I6 (download TTL) | Low effort, reduces friction. | | P2 | I5 (rich SSE events) | Eliminates polling for SSE-capable clients. | From 393ea1f28f8d0e40b1bb15ebef742a5e3c471cc9 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 19:46:16 +0100 Subject: [PATCH 08/19] Update 114-I1 status in promising-directions.md Co-Authored-By: Claude Opus 4.6 --- docs/proposals/111-promising-directions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/proposals/111-promising-directions.md b/docs/proposals/111-promising-directions.md index e3f4a7e0..9c03abed 100644 --- a/docs/proposals/111-promising-directions.md +++ b/docs/proposals/111-promising-directions.md @@ -24,7 +24,7 @@ Agents need PlanExe runs to complete reliably without human intervention. A fail | **103** | Pipeline Hardening for Local Models | Fix silent truncation and context-window overflows. Critical for agents running local models where failures are subtle | | **113** | LLM Error Traceability | ✅ **Implemented (PR #237)**. `LLMChatError` replaces generic `ValueError` across 38 call sites. Root cause preserved for error classification; `error_id` UUID enables log-to-metrics cross-referencing. Agents can programmatically diagnose failures | | **101** | Luigi Resume Enhancements | Webhook hooks on task completion/failure — agents can subscribe to events instead of polling | -| **114-I1** | Stopped vs Failed State | `plan_stop` and worker crashes both produce `failed` — agents can't distinguish user-initiated stops from actual errors. Add `stop_reason` field or a new `stopped` state | +| **114-I1** | Stopped vs Failed State | ✅ **Implemented**. Dedicated `PlanState.stopped` enum value — `plan_stop` transitions to `stopped`, not `failed`. Agents can now distinguish user-initiated stops from actual errors. `plan_retry` and `plan_resume` accept both `failed` and `stopped` | | **114-I2** | Failure Diagnostics in `plan_status` | When a plan fails, no `failure_reason`, `failed_step`, or `last_error` is returned. Biggest observability gap — agents can only say "it failed" without explaining why. Extends #113 to the MCP consumer surface | | **114-I7** | Stalled-Plan Detection | No `last_progress_at` or `last_llm_call_at` timestamps. Agents can't distinguish "slow step" from "stuck worker". Complements #87 §8 | @@ -107,7 +107,7 @@ Phase 1: Reliable foundation (nearly complete) ├─ #110 Usage metrics ✅ (PR #219, #236, #237) ├─ #113 Error traceability ✅ (PR #237) ├─ #58 Prompt boost ⚙️ (open PR #222) - ├─ #114-I1 Stopped vs failed state ← next priority + ├─ #114-I1 Stopped vs failed state ✅ ├─ #114-I2 Failure diagnostics in plan_status ← next priority (biggest gap) └─ #114-I7 Stalled-plan detection From fa21bca5b020696f11483c6e383ac9ccb6d0707c Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 19:57:32 +0100 Subject: [PATCH 09/19] Fix stopped-state migration to use correct PostgreSQL enum type name The PostgreSQL enum type is named `taskstate` (from the original `TaskState` Python class), not `planstate` (the renamed class). The migration silently failed, so `'stopped'` was never added to the enum, causing Internal Server Error on plan stop. Try both type names to support existing and fresh databases. Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/src/app.py | 17 ++++++++++++----- mcp_cloud/db_setup.py | 16 +++++++++++----- worker_plan_database/app.py | 16 +++++++++++----- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/frontend_multi_user/src/app.py b/frontend_multi_user/src/app.py index 8a555bfb..5df97f83 100644 --- a/frontend_multi_user/src/app.py +++ b/frontend_multi_user/src/app.py @@ -497,12 +497,19 @@ def _ensure_step_count_columns() -> None: conn.execute(text("ALTER TABLE task_item ADD COLUMN IF NOT EXISTS current_step VARCHAR(128)")) def _ensure_stopped_state() -> None: - """Add 'stopped' value to the planstate enum type (idempotent).""" + """Add 'stopped' value to the planstate/taskstate enum type (idempotent). + + The PostgreSQL enum type is named ``taskstate`` in databases + created before the TaskState → PlanState Python rename + (proposal 74). Fresh databases will have ``planstate``. + We try both names. + """ with self.db.engine.begin() as conn: - try: - conn.execute(text("ALTER TYPE planstate ADD VALUE IF NOT EXISTS 'stopped'")) - except Exception as exc: - logger.warning("Could not add 'stopped' to planstate enum: %s", exc) + for type_name in ("taskstate", "planstate"): + try: + conn.execute(text(f"ALTER TYPE {type_name} ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.debug("ALTER TYPE %s: %s", type_name, exc) def _ensure_planitem_indexes() -> None: insp = inspect(self.db.engine) diff --git a/mcp_cloud/db_setup.py b/mcp_cloud/db_setup.py index 6bc3672a..66cddc9e 100644 --- a/mcp_cloud/db_setup.py +++ b/mcp_cloud/db_setup.py @@ -104,12 +104,18 @@ def ensure_step_count_columns() -> None: logger.warning("Schema update failed for %s: %s", stmt, exc, exc_info=True) def ensure_stopped_state() -> None: - """Add 'stopped' value to the planstate enum type (idempotent).""" + """Add 'stopped' value to the planstate/taskstate enum type (idempotent). + + The PostgreSQL enum type is named ``taskstate`` in databases created before + the TaskState → PlanState Python rename (proposal 74). Fresh databases + created after that rename will have ``planstate``. We try both names. + """ with db.engine.begin() as conn: - try: - conn.execute(text("ALTER TYPE planstate ADD VALUE IF NOT EXISTS 'stopped'")) - except Exception as exc: - logger.warning("Could not add 'stopped' to planstate enum: %s", exc) + for type_name in ("taskstate", "planstate"): + try: + conn.execute(text(f"ALTER TYPE {type_name} ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.debug("ALTER TYPE %s: %s", type_name, exc) with app.app_context(): ensure_planitem_stop_columns() diff --git a/worker_plan_database/app.py b/worker_plan_database/app.py index 934705f9..2dd4ba24 100644 --- a/worker_plan_database/app.py +++ b/worker_plan_database/app.py @@ -314,12 +314,18 @@ def ensure_multi_api_key_columns() -> None: logger.warning("Schema update failed for %s: %s", stmt, exc, exc_info=True) def ensure_stopped_state() -> None: - """Add 'stopped' value to the planstate enum type (idempotent).""" + """Add 'stopped' value to the planstate/taskstate enum type (idempotent). + + The PostgreSQL enum type is named ``taskstate`` in databases created before + the TaskState → PlanState Python rename (proposal 74). Fresh databases + created after that rename will have ``planstate``. We try both names. + """ with db.engine.begin() as conn: - try: - conn.execute(text("ALTER TYPE planstate ADD VALUE IF NOT EXISTS 'stopped'")) - except Exception as exc: - logger.warning("Could not add 'stopped' to planstate enum: %s", exc) + for type_name in ("taskstate", "planstate"): + try: + conn.execute(text(f"ALTER TYPE {type_name} ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.debug("ALTER TYPE %s: %s", type_name, exc) def worker_process_started() -> None: planexe_worker_id = os.environ.get("PLANEXE_WORKER_ID") From 91c73661010a55254d21f5ca9658fbe82fa36562 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:04:54 +0100 Subject: [PATCH 10/19] Fix worker to transition user-stopped plans to PlanState.stopped The worker's post-pipeline logic always set PlanState.failed for non-completed plans, even when the user requested the stop. Now checks stop_requested and uses PlanState.stopped for user stops. Co-Authored-By: Claude Opus 4.6 --- worker_plan_database/app.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/worker_plan_database/app.py b/worker_plan_database/app.py index 2dd4ba24..79043204 100644 --- a/worker_plan_database/app.py +++ b/worker_plan_database/app.py @@ -1054,7 +1054,8 @@ def execute_pipeline_for_job( db.session.add(event) db.session.commit() else: - update_task_state_with_retry(task_id, PlanState.failed) + final_state = PlanState.stopped if stop_requested else PlanState.failed + update_task_state_with_retry(task_id, final_state) billing_result = _charge_usage_credits_once(task_id=task_id, run_id_dir=run_id_dir, success=False) event_context["machai_error_message"] = machai_error_message or "" event_context.update({ @@ -1067,7 +1068,7 @@ def execute_pipeline_for_job( event = _new_model( EventItem, event_type=EventType.TASK_FAILED, - message=f"Processing -> Failed", + message=f"Processing -> {'Stopped' if stop_requested else 'Failed'}", context=event_context ) db.session.add(event) From ddc99c2379355b0a8f157b97913344576ab07509 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:08:39 +0100 Subject: [PATCH 11/19] Handle stopped status in progress polling iframe The run_via_database.html progress poller only knew about pending, processing, completed, and failed. The new stopped state fell through to the unhandled-status error path. Co-Authored-By: Claude Opus 4.6 --- .../templates/run_via_database.html | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/frontend_multi_user/templates/run_via_database.html b/frontend_multi_user/templates/run_via_database.html index 0f900120..1b3aae9a 100644 --- a/frontend_multi_user/templates/run_via_database.html +++ b/frontend_multi_user/templates/run_via_database.html @@ -158,20 +158,18 @@ const is_pending = data.status == "pending"; const is_processing = data.status == "processing"; const is_failed = data.status == "failed"; + const is_stopped = data.status == "stopped"; progressMessageElement.textContent = data.progress_message || 'No progress message'; - // console.log("is_completed", is_completed); - // console.log("is_processing", is_processing); - // console.log("is_failed", is_failed); if (is_pending || is_processing || is_completed) { // There is a connection with the server, so ignore past errors consecutive_error_count = 0; + } else if (is_stopped) { + throw new PlanExeError("Plan stopped by user"); + } else if (is_failed) { + throw new PlanExeError("Failed to generate plan"); } else { - if (is_failed) { - throw new PlanExeError("Failed to generate plan"); - } else { - throw new PlanExeError(`Unhandled status: ${data.status}`); - } + throw new PlanExeError(`Unhandled status: ${data.status}`); } // Update progress bar From 3ee23e9afee2a163a3aeda157ceab0f83e660066 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:13:55 +0100 Subject: [PATCH 12/19] Fix whitespace in plan status text on plan_iframe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jinja indentation caused extra whitespace between "Status:" and the state name. Use {%- whitespace control to produce clean single-space output like "Status: stopped · 23%". Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/templates/plan_iframe.html | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/frontend_multi_user/templates/plan_iframe.html b/frontend_multi_user/templates/plan_iframe.html index caa54b0e..ee11d1aa 100644 --- a/frontend_multi_user/templates/plan_iframe.html +++ b/frontend_multi_user/templates/plan_iframe.html @@ -541,17 +541,9 @@

id: {{ task.id }}{% if task.timestamp_created %} · Created: {{ task.timestamp_created.strftime("%Y-%m-%d %H:%M:%S") }}{% endif %}

Status: - {% if task.state %} - {{ task.state.name | lower }} - {% else %} - pending - {% endif %} - {% if task.progress_percentage is not none %} - · {{ "%.0f"|format(task.progress_percentage) }}% - {% endif %} - {% if task.progress_message and not (task.state and task.state.name == 'stopped') %} - · {{ task.progress_message }} - {% endif %} + {%- if task.state %} {{ task.state.name | lower }}{% else %} pending{% endif %} + {%- if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}%{% endif %} + {%- if task.progress_message and not (task.state and task.state.name == 'stopped') %} · {{ task.progress_message }}{% endif %}

From 53b9b37ea1bcf462b267d9fb930bec682b900771 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:15:09 +0100 Subject: [PATCH 13/19] Remove outdated "Don't close this page" message from progress iframe Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/templates/run_via_database.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend_multi_user/templates/run_via_database.html b/frontend_multi_user/templates/run_via_database.html index 1b3aae9a..7604954f 100644 --- a/frontend_multi_user/templates/run_via_database.html +++ b/frontend_multi_user/templates/run_via_database.html @@ -60,7 +60,7 @@
-
Don't close this page. Your plan will be visible after it's generated.
+
Your plan will be visible after it's generated.
Elapsed Time: 0:00
From 627ec349cb4244d4b94f4a233f00754bef24da58 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:20:04 +0100 Subject: [PATCH 14/19] Fix missing space between Status: and state text in plan_iframe Inline all Jinja tags on one line to avoid whitespace-stripping eating the space after "Status:". Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/templates/plan_iframe.html | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/frontend_multi_user/templates/plan_iframe.html b/frontend_multi_user/templates/plan_iframe.html index ee11d1aa..474c69c3 100644 --- a/frontend_multi_user/templates/plan_iframe.html +++ b/frontend_multi_user/templates/plan_iframe.html @@ -540,11 +540,7 @@

Plan

id: {{ task.id }}{% if task.timestamp_created %} · Created: {{ task.timestamp_created.strftime("%Y-%m-%d %H:%M:%S") }}{% endif %}

- Status: - {%- if task.state %} {{ task.state.name | lower }}{% else %} pending{% endif %} - {%- if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}%{% endif %} - {%- if task.progress_message and not (task.state and task.state.name == 'stopped') %} · {{ task.progress_message }}{% endif %} - + Status: {% if task.state %}{{ task.state.name | lower }}{% else %}pending{% endif %}{% if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}%{% endif %}{% if task.progress_message and not (task.state and task.state.name == 'stopped') %} · {{ task.progress_message }}{% endif %}

From 7af4b71b247aefc3f08e944761df7cbdd40474c4 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:23:27 +0100 Subject: [PATCH 15/19] Fix spacing between Status: label and state text in plan_iframe inline-flex collapses whitespace between text nodes. Wrap "Status:" in its own span and add gap: 0.3em to the flex container. Co-Authored-By: Claude Opus 4.6 --- frontend_multi_user/templates/plan_iframe.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend_multi_user/templates/plan_iframe.html b/frontend_multi_user/templates/plan_iframe.html index 474c69c3..c912f554 100644 --- a/frontend_multi_user/templates/plan_iframe.html +++ b/frontend_multi_user/templates/plan_iframe.html @@ -186,6 +186,7 @@ .plan-state { display: inline-flex; align-items: center; + gap: 0.3em; margin-top: 6px; font-size: 0.78rem; font-weight: 600; @@ -540,7 +541,7 @@

Plan

id: {{ task.id }}{% if task.timestamp_created %} · Created: {{ task.timestamp_created.strftime("%Y-%m-%d %H:%M:%S") }}{% endif %}

- Status: {% if task.state %}{{ task.state.name | lower }}{% else %}pending{% endif %}{% if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}%{% endif %}{% if task.progress_message and not (task.state and task.state.name == 'stopped') %} · {{ task.progress_message }}{% endif %} + Status: {% if task.state %}{{ task.state.name | lower }}{% else %}pending{% endif %}{% if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}%{% endif %}{% if task.progress_message and not (task.state and task.state.name == 'stopped') %} · {{ task.progress_message }}{% endif %}

From 53c5195fc5f69e4e2b5c8de0775d6934bb4a4159 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:50:53 +0100 Subject: [PATCH 16/19] Update docs to reflect PlanState.stopped across all references - planexe_mcp_interface.md: add stopped to state list, transitions, terminal states, caller contract, error codes, and recovery guidance - autonomous_agent_guide.md: add stopped to state monitoring section - proposal 87: update deferred stopped-state notes as implemented Co-Authored-By: Claude Opus 4.6 --- docs/mcp/autonomous_agent_guide.md | 1 + docs/mcp/planexe_mcp_interface.md | 26 +++++++++++++---------- docs/proposals/87-plan-resume-mcp-tool.md | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/mcp/autonomous_agent_guide.md b/docs/mcp/autonomous_agent_guide.md index 55294e95..6ac7d8bd 100644 --- a/docs/mcp/autonomous_agent_guide.md +++ b/docs/mcp/autonomous_agent_guide.md @@ -68,6 +68,7 @@ Poll every 5 minutes. State transitions: - `processing` → Pipeline is running - `completed` → Report is ready - `failed` → Terminal error (use `plan_resume` or `plan_retry`) +- `stopped` → User called `plan_stop` (use `plan_resume` to continue or `plan_retry` to restart) ### Step 6: Handle failures diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index bbc74b01..76245eb5 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi pipeline stages) that prod Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a strategic project-plan draft (20+ sections) in ~10–20 min. Sections include executive summary, interactive Gantt charts, investor pitch, SWOT, governance, team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections (premortem, self-audit, premise attacks) that stress-test the plan. The output is a draft to refine, not an executable or final document — but it surfaces hard questions the prompter may not have considered. -- **Required interaction order:** Call `example_plans` (optional) and `example_prompts` first. Optional before `plan_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `plan_create`. Then poll `plan_status` (about every 5 minutes); use `plan_download` (mcp_local helper) or `plan_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). If a plan fails before completing all steps, call `plan_resume` to continue from where it left off without discarding completed work. Use `plan_retry` for a full restart (plan must be in failed state). Both accept the failed `plan_id` and optional `model_profile` (default `baseline`). To stop, call `plan_stop` with the `plan_id` from `plan_create`. +- **Required interaction order:** Call `example_plans` (optional) and `example_prompts` first. Optional before `plan_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `plan_create`. Then poll `plan_status` (about every 5 minutes); use `plan_download` (mcp_local helper) or `plan_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal error, `stopped` = user called plan_stop). If a plan fails or is stopped before completing all steps, call `plan_resume` to continue from where it left off without discarding completed work. Use `plan_retry` for a full restart (plan must be in failed or stopped state). Both accept the `plan_id` and optional `model_profile` (default `baseline`). To stop, call `plan_stop` with the `plan_id` from `plan_create`. - **Output:** Self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. The zip contains the intermediary pipeline files (md, json, csv) that fed the report. ### 1.3 Scope of this document @@ -71,10 +71,10 @@ The interface is designed to support: The MCP specification defines two different mechanisms: -- **MCP tools** (e.g. plan_create, plan_status, plan_stop, plan_retry, plan_resume): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls plan_create → receives plan_id → polls plan_status → optionally calls plan_resume or plan_retry on failed → uses plan_file_info (and optionally plan_download via mcp_local). This document specifies those tools. +- **MCP tools** (e.g. plan_create, plan_status, plan_stop, plan_retry, plan_resume): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls plan_create → receives plan_id → polls plan_status → optionally calls plan_resume or plan_retry on failed/stopped → uses plan_file_info (and optionally plan_download via mcp_local). This document specifies those tools. - **MCP tasks protocol** ("Run as task" in some UIs): a separate mechanism where the client can run a tool "as a task" using RPC methods such as tasks/run, tasks/get, tasks/result, tasks/cancel, tasks/list, so the tool runs in the background and the client polls for results. -PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: optionally call `example_plans`; call `example_prompts`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `plan_create`; poll `plan_status`; if failed call `plan_resume` to continue or `plan_retry` for a full restart (optional); then call `plan_file_info` (or `plan_download` via mcp_local) when completed. +PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: optionally call `example_plans`; call `example_prompts`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `plan_create`; poll `plan_status`; if failed or stopped call `plan_resume` to continue or `plan_retry` for a full restart (optional); then call `plan_file_info` (or `plan_download` via mcp_local) when completed. --- @@ -99,7 +99,7 @@ A single execution attempt inside a plan. **Key properties** -- state: pending | processing | completed | failed +- state: pending | processing | completed | failed | stopped - progress_percentage: computed progress percentage (float) - started_at, ended_at @@ -137,13 +137,16 @@ The public MCP `state` field is aligned with `PlanItem.state`: - processing (picked up by a worker) - completed - failed +- stopped (user called `plan_stop`) ### 5.2 Allowed transitions - pending → processing when picked up by a worker - processing → completed via normal success - processing → failed via error +- processing → stopped via `plan_stop` - failed → pending when `plan_retry` or `plan_resume` is accepted +- stopped → pending when `plan_retry` or `plan_resume` is accepted ### 5.3 Invalid transitions @@ -332,10 +335,11 @@ Returns plan status and progress. Used for progress bars and UI states. **Pollin - `processing`: picked up by a worker and in progress. Keep polling. - `completed`: terminal success. Download artifacts now. - `failed`: terminal error. Do not keep polling for completion. +- `stopped`: user called `plan_stop`. Consider `plan_resume` to continue or `plan_retry` to restart. **Terminal states** -- `completed`, `failed` +- `completed`, `failed`, `stopped` **Response** @@ -441,7 +445,7 @@ Retries a plan that is currently in `failed` state. Resume a failed plan without discarding completed intermediary files. Plan generation restarts from the first incomplete step, skipping all steps that already produced output files. -Use `plan_resume` when `plan_status` shows `failed` and plan generation was interrupted before completing all steps (network drop, timeout, `plan_stop`, worker crash). For a full restart or to change `model_profile`, use `plan_retry` instead. +Use `plan_resume` when `plan_status` shows `failed` or `stopped` and plan generation was interrupted before completing all steps (network drop, timeout, `plan_stop`, worker crash). For a full restart or to change `model_profile`, use `plan_retry` instead. **Request** @@ -566,7 +570,7 @@ Recommended practice for MCP clients: Additional semantics: - Every `plan_create` call creates a new independent plan with a new `plan_id`. -- `plan_retry` and `plan_resume` reuse the existing failed `plan_id` (they do not create a new plan id). +- `plan_retry` and `plan_resume` reuse the existing failed or stopped `plan_id` (they do not create a new plan id). - The server does not deduplicate “same prompt” requests into a single shared plan. - Keep your own plan registry/client state if you run multiple plans concurrently. @@ -610,8 +614,8 @@ Cloud/core tool codes: - `INVALID_TOOL`: unknown MCP tool name. - `INTERNAL_ERROR`: uncaught server error. - `PLAN_NOT_FOUND`: plan_id not found. -- `PLAN_NOT_FAILED`: plan_retry called for a plan that is not in failed state. -- `PLAN_NOT_RESUMABLE`: plan_resume called for a plan that is not in failed state. +- `PLAN_NOT_FAILED`: plan_retry called for a plan that is not in failed or stopped state. +- `PLAN_NOT_RESUMABLE`: plan_resume called for a plan that is not in failed or stopped state. - `PIPELINE_VERSION_MISMATCH`: plan_resume snapshot was created by a different pipeline version; use plan_retry instead. - `INVALID_USER_API_KEY`: provided user_api_key is invalid. - `USER_API_KEY_REQUIRED`: deployment requires user_api_key for plan_create. @@ -636,8 +640,8 @@ Local proxy specific codes: - `USER_API_KEY_REQUIRED` - `INSUFFICIENT_CREDITS` - `INVALID_TOOL` -- For `PLAN_NOT_FAILED`: call `plan_retry` only after `plan_status.state == failed`. -- For `PLAN_NOT_RESUMABLE`: call `plan_resume` only after `plan_status.state == failed`. +- For `PLAN_NOT_FAILED`: call `plan_retry` only after `plan_status.state` is `failed` or `stopped`. +- For `PLAN_NOT_RESUMABLE`: call `plan_resume` only after `plan_status.state` is `failed` or `stopped`. - For `PIPELINE_VERSION_MISMATCH`: the snapshot is incompatible with the current pipeline; use `plan_retry` for a clean restart. - For `PLAN_NOT_FOUND`: verify plan_id source and stop polling that id. - For `generation_failed`: treat as terminal failure and surface plan progress_message to user. diff --git a/docs/proposals/87-plan-resume-mcp-tool.md b/docs/proposals/87-plan-resume-mcp-tool.md index 2bbcd64e..150d5333 100644 --- a/docs/proposals/87-plan-resume-mcp-tool.md +++ b/docs/proposals/87-plan-resume-mcp-tool.md @@ -106,7 +106,7 @@ This path is not used by `worker_plan_database` (which runs the pipeline directl | `failed` | ✅ Yes | Terminal state, run dir likely intact | | `stopped` (via `plan_stop`) | ✅ Yes | Intentionally halted, run dir intact | -**Note:** `plan_stop` currently transitions to `failed`. A future improvement could introduce a distinct `stopped` state to disambiguate user-initiated halts from error failures (out of scope for this proposal). +**Note:** `plan_stop` now transitions to the dedicated `stopped` state (implemented in Proposal 114-I1, Option A), disambiguating user-initiated halts from error failures. --- @@ -273,7 +273,7 @@ This requires the worker to write the current task name to a lightweight status ## 9. Out of Scope -- Introducing a distinct `stopped` state (separate from `failed`) — deferred +- ~~Introducing a distinct `stopped` state (separate from `failed`)~~ — implemented (Proposal 114-I1) - Progress scrubbing on resume (reporting accurate % based on remaining tasks) — deferred - `plan_resume` for `mcp_local` (local HTTP worker via `worker_plan/app.py`) — the HTTP worker already supports this via `submit_or_retry="retry"`; a follow-on can wire it up - Resume after partial artifact upload failure — deferred From 0233e8cc405c3a0eadfd62b6373a1a88c73cf5b9 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:52:31 +0100 Subject: [PATCH 17/19] Update proposal 114 I1 with implementation details - Note taskstate vs planstate enum type name issue - Note worker post-pipeline finalization fix - Add all affected files including docs and run_via_database.html - Update proposal 87 overlap reference Co-Authored-By: Claude Opus 4.6 --- docs/proposals/114-mcp-interface-feedback-stress-test.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/proposals/114-mcp-interface-feedback-stress-test.md b/docs/proposals/114-mcp-interface-feedback-stress-test.md index 9baf4ae9..1f655374 100644 --- a/docs/proposals/114-mcp-interface-feedback-stress-test.md +++ b/docs/proposals/114-mcp-interface-feedback-stress-test.md @@ -27,7 +27,7 @@ The testing surfaced concrete MCP interface friction points that affect agent op This matters because the correct response differs: user stop suggests `plan_resume`; actual failure may need `plan_retry` or investigation. -**Overlap:** Proposal 87 (plan_resume) acknowledges this in §4 ("A future improvement could introduce a distinct `stopped` state") but defers it as out of scope. +**Overlap:** Proposal 87 (plan_resume) acknowledged this in §4 — now updated to reflect the implementation. **Options:** @@ -38,9 +38,9 @@ This matters because the correct response differs: user stop suggests `plan_resu Option B is less disruptive and can be added to `plan_status` without changing state enum values. -**Implemented approach:** Option A — a dedicated `PlanState.stopped` enum value (value 5). `plan_stop` now transitions plans to the `stopped` state instead of `failed`. The `stop_reason` field introduced in PR #244 (Option B) has been removed — the state itself communicates intent. `plan_retry` and `plan_resume` accept both `failed` and `stopped` states. DB migration adds `'stopped'` to the PostgreSQL `planstate` enum type. +**Implemented approach:** Option A — a dedicated `PlanState.stopped` enum value (value 5). `plan_stop` now transitions plans to the `stopped` state instead of `failed`. The `stop_reason` field introduced in PR #244 (Option B) has been removed — the state itself communicates intent. `plan_retry` and `plan_resume` accept both `failed` and `stopped` states. DB migration adds `'stopped'` to the PostgreSQL enum type (both `taskstate` for pre-rename databases and `planstate` for fresh databases — the Python class was renamed from `TaskState` to `PlanState` in proposal 74 but the PostgreSQL type name was not changed). The worker's post-pipeline finalization also transitions to `stopped` (not `failed`) when `stop_requested` is true. -**Affected files:** `database_api/model_planitem.py`, `mcp_cloud/db_setup.py`, `worker_plan_database/app.py`, `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py`, `mcp_cloud/sse.py`, `mcp_cloud/tool_models.py`, `mcp_cloud/schemas.py`, `mcp_local/planexe_mcp_local.py`, `frontend_multi_user/src/app.py`, `frontend_multi_user/src/planexe_modelviews.py`, `frontend_multi_user/templates/plan_iframe.html`, `frontend_multi_user/templates/index.html`, `frontend_multi_user/templates/account.html`, `mcp_cloud/tests/test_plan_status_tool.py`. +**Affected files:** `database_api/model_planitem.py`, `mcp_cloud/db_setup.py`, `worker_plan_database/app.py`, `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py`, `mcp_cloud/sse.py`, `mcp_cloud/tool_models.py`, `mcp_cloud/schemas.py`, `mcp_local/planexe_mcp_local.py`, `frontend_multi_user/src/app.py`, `frontend_multi_user/src/planexe_modelviews.py`, `frontend_multi_user/templates/plan_iframe.html`, `frontend_multi_user/templates/run_via_database.html`, `frontend_multi_user/templates/index.html`, `frontend_multi_user/templates/account.html`, `mcp_cloud/tests/test_plan_status_tool.py`, `docs/mcp/planexe_mcp_interface.md`, `docs/mcp/autonomous_agent_guide.md`, `docs/proposals/87-plan-resume-mcp-tool.md`, `docs/proposals/111-promising-directions.md`. --- From 7c6ad53ce2ddae56ec0207d90e83a71c60397465 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:56:12 +0100 Subject: [PATCH 18/19] Incorporate v2 agent perception feedback into proposal 114 - Update context: two sessions (12 plans total), not just one - Add I10: silent partial failures in completed plans - Enhance I2 with recoverable boolean suggestion - Add agent perception section (8.5/10 rating, strengths, trust gaps) - Add evolution table across sessions - Update cross-reference and priority tables Co-Authored-By: Claude Opus 4.6 --- .../114-mcp-interface-feedback-stress-test.md | 78 +++++++++++++++++-- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/docs/proposals/114-mcp-interface-feedback-stress-test.md b/docs/proposals/114-mcp-interface-feedback-stress-test.md index 1f655374..1ffc3a57 100644 --- a/docs/proposals/114-mcp-interface-feedback-stress-test.md +++ b/docs/proposals/114-mcp-interface-feedback-stress-test.md @@ -1,15 +1,18 @@ # Proposal 114: MCP Interface Issues from 10-Plan Agent Stress Test **Date:** 2026-03-11 -**Status:** Proposal -**Source:** Feedback from Claude (Opus 4.6) after a 10-plan stress-testing session operated through Claude Code CLI. +**Status:** Proposal (I1, I4 implemented; remainder open) +**Source:** Feedback from Claude (Opus 4.6) after two stress-testing sessions (12 plans total) operated through Claude Code CLI. **Scope:** MCP interface design, observability, and lifecycle management. Does not cover plan content quality or pipeline output improvements. --- ## 1. Context -A 10-plan stress test was conducted through Claude Code CLI using the PlanExe MCP interface. +Two stress-testing sessions were conducted through Claude Code CLI using the PlanExe MCP interface: + +- **Session 1** (10 plans): Surfaced core MCP interface friction points — state ambiguity, missing diagnostics, lifecycle gaps. +- **Session 2** (2 plans, with stop/resume cycling): Validated the `stopped` state implementation and identified remaining gaps. The testing surfaced concrete MCP interface friction points that affect agent operators — issues with state management, observability, lifecycle hygiene, and workflow ergonomics. This proposal catalogues the MCP-specific issues and maps them against existing proposals. @@ -61,11 +64,14 @@ During the stress test, Plan 1 (20f1cfac) stalled at 5.5% with zero diagnostic i "state": "failed", "failure_reason": "model_error", "failed_step": "016-expert_criticism", - "last_error": "openrouter-gemini-2.0-flash-001 returned invalid_json" + "last_error": "openrouter-gemini-2.0-flash-001 returned invalid_json", + "recoverable": true } ``` -**Implementation path:** The worker already logs errors internally. When transitioning to `failed`, write `failure_reason`, `failed_step`, and `last_error` to the plan's DB record (e.g. in the `parameters` JSONB column or new dedicated columns). `plan_status` handler surfaces these fields when `state == "failed"`. +The `recoverable` boolean would let the agent immediately suggest `plan_resume` (transient/recoverable) vs `plan_retry` (fundamental/non-recoverable) without guessing. + +**Implementation path:** The worker already logs errors internally. When transitioning to `failed`, write `failure_reason`, `failed_step`, `last_error`, and `recoverable` to the plan's DB record (e.g. in the `parameters` JSONB column or new dedicated columns). `plan_status` handler surfaces these fields when `state == "failed"`. **Affected files:** `worker_plan_database/app.py` (error capture on failure), `mcp_cloud/db_queries.py` (store failure info), `mcp_cloud/handlers.py` (include in plan_status response). @@ -186,6 +192,30 @@ This enables prompt iteration tracking without changing existing behavior for pl --- +### I10 — Silent partial failures in completed plans + +**Problem:** A plan can reach `completed` with sections that are empty or stub-quality (e.g. 6/8 experts returning no feedback in the expert criticism step). There is no signal in `plan_status` or `plan_file_info` that the output has quality gaps. The agent has to read individual files to discover this. `completed` means "all 110 steps ran" — not "all sections produced quality output." + +This is a trust gap: the agent cannot confidently tell the user "your plan is ready" without caveats, because it has no visibility into per-section quality. + +**Proposed addition:** A `quality_summary` in the completed plan status or file info: + +```json +{ + "sections_complete": 108, + "sections_partial": 2, + "partial_details": [ + {"step": "016-expert_criticism", "note": "2/8 experts provided feedback"} + ] +} +``` + +**Implementation path:** The worker already knows which steps produced output. A post-pipeline validation pass could check key sections for minimum output quality (e.g. non-empty, expected structure present) and write a summary to the DB. `plan_status` surfaces it when `state == "completed"`. + +**Affected files:** `worker_plan_database/app.py` (quality validation pass), DB model (quality summary column), `mcp_cloud/handlers.py` (include in completed plan_status response). + +--- + ## 3. Cross-Reference with Existing Proposals | Issue | Existing Proposal | Gap | @@ -199,6 +229,7 @@ This enables prompt iteration tracking without changing existing behavior for pl | I7 (stall detection) | 87 §8 (partial) | No explicit stall timestamps | | I8 (plan_wait) | None | New | | I9 (prompt iteration) | None | New | +| I10 (silent partial failures) | None | New | --- @@ -212,7 +243,7 @@ Several items here refine or extend issues already tracked in Proposal 70's chec | I5 (rich SSE) | 70 §5.1 (SSE implemented) | Extension — SSE works but events are thin | | I6 (download TTL) | 70 (signed tokens done) | Refinement — increase TTL | -If accepted, I1–I4 and I7–I9 should be added to Proposal 70's quick-win checklist as new line items. +If accepted, I1–I4 and I7–I10 should be added to Proposal 70's quick-win checklist as new line items. --- @@ -227,5 +258,40 @@ If accepted, I1–I4 and I7–I9 should be added to Proposal 70's quick-win chec | P2 | I5 (rich SSE events) | Eliminates polling for SSE-capable clients. | | P2 | I3 (plan_delete) | Hygiene for multi-plan sessions. | | ~~P3~~ | ~~I4 (idempotency)~~ | **Implemented** (PR #242). Server-side auto-dedup on `(user_id, prompt, model_profile)` within a time window. | +| P2 | I10 (silent partial failures) | Agent cannot trust `completed` means quality output. Undermines confidence in the entire workflow. | | P3 | I8 (plan_wait) | Nice-to-have for shell-less agents. | | P3 | I9 (prompt iteration) | Nice-to-have for iteration workflows. | + +--- + +## 6. Agent Perception (after 12 plans across two sessions) + +**Overall rating: 8.5/10** (up from 8/10 after session 1). The improvement comes from the `stopped` state — a small change with outsized impact on agent confidence. + +### What works well + +1. **Tool descriptions are best-in-class.** The agent operates PlanExe without external documentation, solely from tool descriptions. They specify call order, state contract, timing expectations, error codes, and troubleshooting guidance. +2. **State machine is now clean.** Every state has a single obvious next action: `pending` → wait, `processing` → poll, `completed` → download, `stopped` → suggest resume, `failed` → investigate. No state requires guessing. +3. **SSE completion detection is reliable.** Across 12 plans and multiple stop/resume cycles, the stream always auto-closed on terminal states as expected. +4. **Resume preserves progress.** After stopping at step 12/110, resume continued from step 12, not step 1. The `resume_count` field makes the history visible without the agent needing to track it. +5. **Deduplication prevents accidents.** Same prompt + model_profile within a short window returns the existing plan with `deduplicated=true`. +6. **Progress reporting is honest.** The tool description explicitly warns that `progress_percentage` is not linear and shouldn't be used for time estimates. + +### The stop/resume cycle + +Session 2 tested: create → stop → resume → stop → resume → complete. At no point was the agent confused about state or what to suggest. The `resume_count` field confirmed history without shadow state. The mark of a good state machine — the agent doesn't need to maintain its own bookkeeping. + +### Remaining trust gap + +The agent trusts the state machine to report accurately, trusts resume to preserve progress, trusts SSE to close on terminal states, and trusts tool descriptions to be current. The remaining trust gap is around `completed` plans — `completed` means "all steps ran," not "all sections produced quality output." The interface doesn't distinguish between these yet (see I10). + +### Evolution between sessions + +| Aspect | Session 1 | Session 2 | Direction | +|--------|-----------|-----------|-----------| +| Stop state | `failed` (ambiguous) | `stopped` (clear) | Fixed | +| Resume tracking | No history | `resume_count` field | New | +| Tool descriptions | Excellent | Updated with `stopped` state | Improved | +| Failure diagnostics | Missing | Still missing | Unchanged | +| Silent partial failures | Not surfaced | Not surfaced | Unchanged | +| Plan cleanup | No delete | No delete | Unchanged | From 6dd0ad0d58521540243cdca3161cd34a8dce1a74 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 11 Mar 2026 20:57:06 +0100 Subject: [PATCH 19/19] Add 114-I10 (silent partial failures) to promising-directions.md New issue from v2 agent perception feedback: completed plans may have empty or stub-quality sections with no quality_summary signal. Also updated I2 description to mention the recoverable field. Co-Authored-By: Claude Opus 4.6 --- docs/proposals/111-promising-directions.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/proposals/111-promising-directions.md b/docs/proposals/111-promising-directions.md index 9c03abed..fbfdcb8d 100644 --- a/docs/proposals/111-promising-directions.md +++ b/docs/proposals/111-promising-directions.md @@ -25,8 +25,9 @@ Agents need PlanExe runs to complete reliably without human intervention. A fail | **113** | LLM Error Traceability | ✅ **Implemented (PR #237)**. `LLMChatError` replaces generic `ValueError` across 38 call sites. Root cause preserved for error classification; `error_id` UUID enables log-to-metrics cross-referencing. Agents can programmatically diagnose failures | | **101** | Luigi Resume Enhancements | Webhook hooks on task completion/failure — agents can subscribe to events instead of polling | | **114-I1** | Stopped vs Failed State | ✅ **Implemented**. Dedicated `PlanState.stopped` enum value — `plan_stop` transitions to `stopped`, not `failed`. Agents can now distinguish user-initiated stops from actual errors. `plan_retry` and `plan_resume` accept both `failed` and `stopped` | -| **114-I2** | Failure Diagnostics in `plan_status` | When a plan fails, no `failure_reason`, `failed_step`, or `last_error` is returned. Biggest observability gap — agents can only say "it failed" without explaining why. Extends #113 to the MCP consumer surface | +| **114-I2** | Failure Diagnostics in `plan_status` | When a plan fails, no `failure_reason`, `failed_step`, `last_error`, or `recoverable` flag is returned. Biggest observability gap — agents can only say "it failed" without explaining why or recommending resume vs retry. Extends #113 to the MCP consumer surface | | **114-I7** | Stalled-Plan Detection | No `last_progress_at` or `last_llm_call_at` timestamps. Agents can't distinguish "slow step" from "stuck worker". Complements #87 §8 | +| **114-I10** | Silent Partial Failures in Completed Plans | A plan can reach `completed` with empty or stub-quality sections (e.g. 2/8 experts responding). No `quality_summary` in `plan_status` — agents can't tell if `completed` means "all sections produced quality output" or just "all steps ran." Trust gap for autonomous workflows | --- @@ -109,7 +110,8 @@ Phase 1: Reliable foundation (nearly complete) ├─ #58 Prompt boost ⚙️ (open PR #222) ├─ #114-I1 Stopped vs failed state ✅ ├─ #114-I2 Failure diagnostics in plan_status ← next priority (biggest gap) - └─ #114-I7 Stalled-plan detection + ├─ #114-I7 Stalled-plan detection + └─ #114-I10 Silent partial failures in completed plans Phase 2: Agent-native interface (next) ├─ #86 Remove agent friction points ✅ (PR #223)