diff --git a/database_api/model_planitem.py b/database_api/model_planitem.py index 6bb39b2a..6e417511 100644 --- a/database_api/model_planitem.py +++ b/database_api/model_planitem.py @@ -37,6 +37,7 @@ class PlanState(enum.Enum): processing = 2 completed = 3 failed = 4 + stopped = 5 class PlanItem(db.Model): diff --git a/docs/mcp/autonomous_agent_guide.md b/docs/mcp/autonomous_agent_guide.md index 55294e95..6ac7d8bd 100644 --- a/docs/mcp/autonomous_agent_guide.md +++ b/docs/mcp/autonomous_agent_guide.md @@ -68,6 +68,7 @@ Poll every 5 minutes. State transitions: - `processing` → Pipeline is running - `completed` → Report is ready - `failed` → Terminal error (use `plan_resume` or `plan_retry`) +- `stopped` → User called `plan_stop` (use `plan_resume` to continue or `plan_retry` to restart) ### Step 6: Handle failures diff --git a/docs/mcp/planexe_mcp_interface.md b/docs/mcp/planexe_mcp_interface.md index bbc74b01..76245eb5 100644 --- a/docs/mcp/planexe_mcp_interface.md +++ b/docs/mcp/planexe_mcp_interface.md @@ -15,7 +15,7 @@ The plan is a **project plan**: a DAG of steps (Luigi pipeline stages) that prod Implementors should expose the following to agents so they understand what PlanExe does: - **What:** PlanExe turns a plain-English goal into a strategic project-plan draft (20+ sections) in ~10–20 min. Sections include executive summary, interactive Gantt charts, investor pitch, SWOT, governance, team profiles, work breakdown, scenario comparison, expert criticism, and adversarial sections (premortem, self-audit, premise attacks) that stress-test the plan. The output is a draft to refine, not an executable or final document — but it surfaces hard questions the prompter may not have considered. -- **Required interaction order:** Call `example_plans` (optional) and `example_prompts` first. Optional before `plan_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `plan_create`. Then poll `plan_status` (about every 5 minutes); use `plan_download` (mcp_local helper) or `plan_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal). If a plan fails before completing all steps, call `plan_resume` to continue from where it left off without discarding completed work. Use `plan_retry` for a full restart (plan must be in failed state). Both accept the failed `plan_id` and optional `model_profile` (default `baseline`). To stop, call `plan_stop` with the `plan_id` from `plan_create`. +- **Required interaction order:** Call `example_plans` (optional) and `example_prompts` first. Optional before `plan_create`: call `model_profiles` to inspect profile guidance and available models in each profile. Then complete a non-tool step: formulate a detailed prompt as flowing prose (not structured markdown), typically ~300-800 words, using the examples as a baseline; include objective, scope, constraints, timeline, stakeholders, budget/resources, and success criteria; get user approval. Only after approval, call `plan_create`. Then poll `plan_status` (about every 5 minutes); use `plan_download` (mcp_local helper) or `plan_file_info` (mcp_cloud tool) when complete (`pending`/`processing` = keep polling, `completed` = download now, `failed` = terminal error, `stopped` = user called plan_stop). If a plan fails or is stopped before completing all steps, call `plan_resume` to continue from where it left off without discarding completed work. Use `plan_retry` for a full restart (plan must be in failed or stopped state). Both accept the `plan_id` and optional `model_profile` (default `baseline`). To stop, call `plan_stop` with the `plan_id` from `plan_create`. - **Output:** Self-contained interactive HTML report (~700KB) with collapsible sections and interactive Gantt charts — open in a browser. The zip contains the intermediary pipeline files (md, json, csv) that fed the report. ### 1.3 Scope of this document @@ -71,10 +71,10 @@ The interface is designed to support: The MCP specification defines two different mechanisms: -- **MCP tools** (e.g. plan_create, plan_status, plan_stop, plan_retry, plan_resume): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls plan_create → receives plan_id → polls plan_status → optionally calls plan_resume or plan_retry on failed → uses plan_file_info (and optionally plan_download via mcp_local). This document specifies those tools. +- **MCP tools** (e.g. plan_create, plan_status, plan_stop, plan_retry, plan_resume): the server exposes named tools; the client calls them and receives a response. PlanExe's interface is **tool-based**: the agent calls plan_create → receives plan_id → polls plan_status → optionally calls plan_resume or plan_retry on failed/stopped → uses plan_file_info (and optionally plan_download via mcp_local). This document specifies those tools. - **MCP tasks protocol** ("Run as task" in some UIs): a separate mechanism where the client can run a tool "as a task" using RPC methods such as tasks/run, tasks/get, tasks/result, tasks/cancel, tasks/list, so the tool runs in the background and the client polls for results. -PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: optionally call `example_plans`; call `example_prompts`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `plan_create`; poll `plan_status`; if failed call `plan_resume` to continue or `plan_retry` for a full restart (optional); then call `plan_file_info` (or `plan_download` via mcp_local) when completed. +PlanExe **does not** use or advertise the MCP tasks protocol. Implementors and clients should use the **tools only**. Do not enable "Run as task" for PlanExe; many clients (e.g. Cursor) and the Python MCP SDK do not support the tasks protocol properly. Intended flow: optionally call `example_plans`; call `example_prompts`; optionally call `model_profiles`; perform the non-tool prompt drafting/approval step; call `plan_create`; poll `plan_status`; if failed or stopped call `plan_resume` to continue or `plan_retry` for a full restart (optional); then call `plan_file_info` (or `plan_download` via mcp_local) when completed. --- @@ -99,7 +99,7 @@ A single execution attempt inside a plan. **Key properties** -- state: pending | processing | completed | failed +- state: pending | processing | completed | failed | stopped - progress_percentage: computed progress percentage (float) - started_at, ended_at @@ -137,13 +137,16 @@ The public MCP `state` field is aligned with `PlanItem.state`: - processing (picked up by a worker) - completed - failed +- stopped (user called `plan_stop`) ### 5.2 Allowed transitions - pending → processing when picked up by a worker - processing → completed via normal success - processing → failed via error +- processing → stopped via `plan_stop` - failed → pending when `plan_retry` or `plan_resume` is accepted +- stopped → pending when `plan_retry` or `plan_resume` is accepted ### 5.3 Invalid transitions @@ -332,10 +335,11 @@ Returns plan status and progress. Used for progress bars and UI states. **Pollin - `processing`: picked up by a worker and in progress. Keep polling. - `completed`: terminal success. Download artifacts now. - `failed`: terminal error. Do not keep polling for completion. +- `stopped`: user called `plan_stop`. Consider `plan_resume` to continue or `plan_retry` to restart. **Terminal states** -- `completed`, `failed` +- `completed`, `failed`, `stopped` **Response** @@ -441,7 +445,7 @@ Retries a plan that is currently in `failed` state. Resume a failed plan without discarding completed intermediary files. Plan generation restarts from the first incomplete step, skipping all steps that already produced output files. -Use `plan_resume` when `plan_status` shows `failed` and plan generation was interrupted before completing all steps (network drop, timeout, `plan_stop`, worker crash). For a full restart or to change `model_profile`, use `plan_retry` instead. +Use `plan_resume` when `plan_status` shows `failed` or `stopped` and plan generation was interrupted before completing all steps (network drop, timeout, `plan_stop`, worker crash). For a full restart or to change `model_profile`, use `plan_retry` instead. **Request** @@ -566,7 +570,7 @@ Recommended practice for MCP clients: Additional semantics: - Every `plan_create` call creates a new independent plan with a new `plan_id`. -- `plan_retry` and `plan_resume` reuse the existing failed `plan_id` (they do not create a new plan id). +- `plan_retry` and `plan_resume` reuse the existing failed or stopped `plan_id` (they do not create a new plan id). - The server does not deduplicate “same prompt” requests into a single shared plan. - Keep your own plan registry/client state if you run multiple plans concurrently. @@ -610,8 +614,8 @@ Cloud/core tool codes: - `INVALID_TOOL`: unknown MCP tool name. - `INTERNAL_ERROR`: uncaught server error. - `PLAN_NOT_FOUND`: plan_id not found. -- `PLAN_NOT_FAILED`: plan_retry called for a plan that is not in failed state. -- `PLAN_NOT_RESUMABLE`: plan_resume called for a plan that is not in failed state. +- `PLAN_NOT_FAILED`: plan_retry called for a plan that is not in failed or stopped state. +- `PLAN_NOT_RESUMABLE`: plan_resume called for a plan that is not in failed or stopped state. - `PIPELINE_VERSION_MISMATCH`: plan_resume snapshot was created by a different pipeline version; use plan_retry instead. - `INVALID_USER_API_KEY`: provided user_api_key is invalid. - `USER_API_KEY_REQUIRED`: deployment requires user_api_key for plan_create. @@ -636,8 +640,8 @@ Local proxy specific codes: - `USER_API_KEY_REQUIRED` - `INSUFFICIENT_CREDITS` - `INVALID_TOOL` -- For `PLAN_NOT_FAILED`: call `plan_retry` only after `plan_status.state == failed`. -- For `PLAN_NOT_RESUMABLE`: call `plan_resume` only after `plan_status.state == failed`. +- For `PLAN_NOT_FAILED`: call `plan_retry` only after `plan_status.state` is `failed` or `stopped`. +- For `PLAN_NOT_RESUMABLE`: call `plan_resume` only after `plan_status.state` is `failed` or `stopped`. - For `PIPELINE_VERSION_MISMATCH`: the snapshot is incompatible with the current pipeline; use `plan_retry` for a clean restart. - For `PLAN_NOT_FOUND`: verify plan_id source and stop polling that id. - For `generation_failed`: treat as terminal failure and surface plan progress_message to user. diff --git a/docs/proposals/111-promising-directions.md b/docs/proposals/111-promising-directions.md index e3f4a7e0..fbfdcb8d 100644 --- a/docs/proposals/111-promising-directions.md +++ b/docs/proposals/111-promising-directions.md @@ -24,9 +24,10 @@ Agents need PlanExe runs to complete reliably without human intervention. A fail | **103** | Pipeline Hardening for Local Models | Fix silent truncation and context-window overflows. Critical for agents running local models where failures are subtle | | **113** | LLM Error Traceability | ✅ **Implemented (PR #237)**. `LLMChatError` replaces generic `ValueError` across 38 call sites. Root cause preserved for error classification; `error_id` UUID enables log-to-metrics cross-referencing. Agents can programmatically diagnose failures | | **101** | Luigi Resume Enhancements | Webhook hooks on task completion/failure — agents can subscribe to events instead of polling | -| **114-I1** | Stopped vs Failed State | `plan_stop` and worker crashes both produce `failed` — agents can't distinguish user-initiated stops from actual errors. Add `stop_reason` field or a new `stopped` state | -| **114-I2** | Failure Diagnostics in `plan_status` | When a plan fails, no `failure_reason`, `failed_step`, or `last_error` is returned. Biggest observability gap — agents can only say "it failed" without explaining why. Extends #113 to the MCP consumer surface | +| **114-I1** | Stopped vs Failed State | ✅ **Implemented**. Dedicated `PlanState.stopped` enum value — `plan_stop` transitions to `stopped`, not `failed`. Agents can now distinguish user-initiated stops from actual errors. `plan_retry` and `plan_resume` accept both `failed` and `stopped` | +| **114-I2** | Failure Diagnostics in `plan_status` | When a plan fails, no `failure_reason`, `failed_step`, `last_error`, or `recoverable` flag is returned. Biggest observability gap — agents can only say "it failed" without explaining why or recommending resume vs retry. Extends #113 to the MCP consumer surface | | **114-I7** | Stalled-Plan Detection | No `last_progress_at` or `last_llm_call_at` timestamps. Agents can't distinguish "slow step" from "stuck worker". Complements #87 §8 | +| **114-I10** | Silent Partial Failures in Completed Plans | A plan can reach `completed` with empty or stub-quality sections (e.g. 2/8 experts responding). No `quality_summary` in `plan_status` — agents can't tell if `completed` means "all sections produced quality output" or just "all steps ran." Trust gap for autonomous workflows | --- @@ -107,9 +108,10 @@ Phase 1: Reliable foundation (nearly complete) ├─ #110 Usage metrics ✅ (PR #219, #236, #237) ├─ #113 Error traceability ✅ (PR #237) ├─ #58 Prompt boost ⚙️ (open PR #222) - ├─ #114-I1 Stopped vs failed state ← next priority + ├─ #114-I1 Stopped vs failed state ✅ ├─ #114-I2 Failure diagnostics in plan_status ← next priority (biggest gap) - └─ #114-I7 Stalled-plan detection + ├─ #114-I7 Stalled-plan detection + └─ #114-I10 Silent partial failures in completed plans Phase 2: Agent-native interface (next) ├─ #86 Remove agent friction points ✅ (PR #223) diff --git a/docs/proposals/114-mcp-interface-feedback-stress-test.md b/docs/proposals/114-mcp-interface-feedback-stress-test.md index 346b1ac3..1ffc3a57 100644 --- a/docs/proposals/114-mcp-interface-feedback-stress-test.md +++ b/docs/proposals/114-mcp-interface-feedback-stress-test.md @@ -1,15 +1,18 @@ # Proposal 114: MCP Interface Issues from 10-Plan Agent Stress Test **Date:** 2026-03-11 -**Status:** Proposal -**Source:** Feedback from Claude (Opus 4.6) after a 10-plan stress-testing session operated through Claude Code CLI. +**Status:** Proposal (I1, I4 implemented; remainder open) +**Source:** Feedback from Claude (Opus 4.6) after two stress-testing sessions (12 plans total) operated through Claude Code CLI. **Scope:** MCP interface design, observability, and lifecycle management. Does not cover plan content quality or pipeline output improvements. --- ## 1. Context -A 10-plan stress test was conducted through Claude Code CLI using the PlanExe MCP interface. +Two stress-testing sessions were conducted through Claude Code CLI using the PlanExe MCP interface: + +- **Session 1** (10 plans): Surfaced core MCP interface friction points — state ambiguity, missing diagnostics, lifecycle gaps. +- **Session 2** (2 plans, with stop/resume cycling): Validated the `stopped` state implementation and identified remaining gaps. The testing surfaced concrete MCP interface friction points that affect agent operators — issues with state management, observability, lifecycle hygiene, and workflow ergonomics. This proposal catalogues the MCP-specific issues and maps them against existing proposals. @@ -19,13 +22,15 @@ The testing surfaced concrete MCP interface friction points that affect agent op ### I1 — `failed` state conflates user-stop and actual failure +**Status:** Implemented (Option A — dedicated `PlanState.stopped` DB state). Supersedes PR #244's Option B (`stop_reason` field). + **Problem:** When `plan_stop` is called, the plan transitions to `failed`. When a worker crashes, it also transitions to `failed`. The agent operator cannot distinguish between: - User-initiated stop (nothing went wrong) - Actual failure (network drop, model error, worker crash) This matters because the correct response differs: user stop suggests `plan_resume`; actual failure may need `plan_retry` or investigation. -**Overlap:** Proposal 87 (plan_resume) acknowledges this in §4 ("A future improvement could introduce a distinct `stopped` state") but defers it as out of scope. +**Overlap:** Proposal 87 (plan_resume) acknowledged this in §4 — now updated to reflect the implementation. **Options:** @@ -36,7 +41,9 @@ This matters because the correct response differs: user stop suggests `plan_resu Option B is less disruptive and can be added to `plan_status` without changing state enum values. -**Affected files:** `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py` (plan_status response), DB model (new column or use existing `parameters` JSONB). +**Implemented approach:** Option A — a dedicated `PlanState.stopped` enum value (value 5). `plan_stop` now transitions plans to the `stopped` state instead of `failed`. The `stop_reason` field introduced in PR #244 (Option B) has been removed — the state itself communicates intent. `plan_retry` and `plan_resume` accept both `failed` and `stopped` states. DB migration adds `'stopped'` to the PostgreSQL enum type (both `taskstate` for pre-rename databases and `planstate` for fresh databases — the Python class was renamed from `TaskState` to `PlanState` in proposal 74 but the PostgreSQL type name was not changed). The worker's post-pipeline finalization also transitions to `stopped` (not `failed`) when `stop_requested` is true. + +**Affected files:** `database_api/model_planitem.py`, `mcp_cloud/db_setup.py`, `worker_plan_database/app.py`, `mcp_cloud/db_queries.py`, `mcp_cloud/handlers.py`, `mcp_cloud/sse.py`, `mcp_cloud/tool_models.py`, `mcp_cloud/schemas.py`, `mcp_local/planexe_mcp_local.py`, `frontend_multi_user/src/app.py`, `frontend_multi_user/src/planexe_modelviews.py`, `frontend_multi_user/templates/plan_iframe.html`, `frontend_multi_user/templates/run_via_database.html`, `frontend_multi_user/templates/index.html`, `frontend_multi_user/templates/account.html`, `mcp_cloud/tests/test_plan_status_tool.py`, `docs/mcp/planexe_mcp_interface.md`, `docs/mcp/autonomous_agent_guide.md`, `docs/proposals/87-plan-resume-mcp-tool.md`, `docs/proposals/111-promising-directions.md`. --- @@ -57,11 +64,14 @@ During the stress test, Plan 1 (20f1cfac) stalled at 5.5% with zero diagnostic i "state": "failed", "failure_reason": "model_error", "failed_step": "016-expert_criticism", - "last_error": "openrouter-gemini-2.0-flash-001 returned invalid_json" + "last_error": "openrouter-gemini-2.0-flash-001 returned invalid_json", + "recoverable": true } ``` -**Implementation path:** The worker already logs errors internally. When transitioning to `failed`, write `failure_reason`, `failed_step`, and `last_error` to the plan's DB record (e.g. in the `parameters` JSONB column or new dedicated columns). `plan_status` handler surfaces these fields when `state == "failed"`. +The `recoverable` boolean would let the agent immediately suggest `plan_resume` (transient/recoverable) vs `plan_retry` (fundamental/non-recoverable) without guessing. + +**Implementation path:** The worker already logs errors internally. When transitioning to `failed`, write `failure_reason`, `failed_step`, `last_error`, and `recoverable` to the plan's DB record (e.g. in the `parameters` JSONB column or new dedicated columns). `plan_status` handler surfaces these fields when `state == "failed"`. **Affected files:** `worker_plan_database/app.py` (error capture on failure), `mcp_cloud/db_queries.py` (store failure info), `mcp_cloud/handlers.py` (include in plan_status response). @@ -75,7 +85,7 @@ During the stress test, Plan 1 (20f1cfac) stalled at 5.5% with zero diagnostic i | Tool | Behavior | |------|----------| -| `plan_delete` | Permanently remove a plan from the user's list. Only allowed for terminal states (`failed`, `completed`). | +| `plan_delete` | Permanently remove a plan from the user's list. Only allowed for terminal states (`completed`, `failed`, `stopped`). | | `plan_archive` | Soft-delete: plan is hidden from `plan_list` but remains in DB for auditing. | `plan_archive` is safer for a billing system where plan records may need to be retained. @@ -156,7 +166,7 @@ This would eliminate the need for `plan_status` polling entirely when SSE is ava ``` plan_wait(plan_id, timeout_seconds=1200): Blocks until the plan reaches a terminal state -(completed, failed) or the timeout expires. Returns the final plan_status response. +(completed, failed, stopped) or the timeout expires. Returns the final plan_status response. ``` **Implementation:** Server-side long-poll using the existing SSE infrastructure. The handler subscribes to the plan's state changes and returns when terminal or timeout. @@ -182,11 +192,35 @@ This enables prompt iteration tracking without changing existing behavior for pl --- +### I10 — Silent partial failures in completed plans + +**Problem:** A plan can reach `completed` with sections that are empty or stub-quality (e.g. 6/8 experts returning no feedback in the expert criticism step). There is no signal in `plan_status` or `plan_file_info` that the output has quality gaps. The agent has to read individual files to discover this. `completed` means "all 110 steps ran" — not "all sections produced quality output." + +This is a trust gap: the agent cannot confidently tell the user "your plan is ready" without caveats, because it has no visibility into per-section quality. + +**Proposed addition:** A `quality_summary` in the completed plan status or file info: + +```json +{ + "sections_complete": 108, + "sections_partial": 2, + "partial_details": [ + {"step": "016-expert_criticism", "note": "2/8 experts provided feedback"} + ] +} +``` + +**Implementation path:** The worker already knows which steps produced output. A post-pipeline validation pass could check key sections for minimum output quality (e.g. non-empty, expected structure present) and write a summary to the DB. `plan_status` surfaces it when `state == "completed"`. + +**Affected files:** `worker_plan_database/app.py` (quality validation pass), DB model (quality summary column), `mcp_cloud/handlers.py` (include in completed plan_status response). + +--- + ## 3. Cross-Reference with Existing Proposals | Issue | Existing Proposal | Gap | |-------|-------------------|-----| -| I1 (stopped vs failed) | 87 §4 (deferred) | Needs its own decision — Option A vs B | +| I1 (stopped vs failed) | 87 §4 (deferred) | **Implemented** (Option A — dedicated `PlanState.stopped`) | | I2 (failure diagnostics) | 113 (logs only) | Not surfaced to MCP consumer | | I3 (plan_delete) | None | New | | I4 (idempotency) | None | **Implemented** (PR #242) | @@ -195,6 +229,7 @@ This enables prompt iteration tracking without changing existing behavior for pl | I7 (stall detection) | 87 §8 (partial) | No explicit stall timestamps | | I8 (plan_wait) | None | New | | I9 (prompt iteration) | None | New | +| I10 (silent partial failures) | None | New | --- @@ -208,7 +243,7 @@ Several items here refine or extend issues already tracked in Proposal 70's chec | I5 (rich SSE) | 70 §5.1 (SSE implemented) | Extension — SSE works but events are thin | | I6 (download TTL) | 70 (signed tokens done) | Refinement — increase TTL | -If accepted, I1–I4 and I7–I9 should be added to Proposal 70's quick-win checklist as new line items. +If accepted, I1–I4 and I7–I10 should be added to Proposal 70's quick-win checklist as new line items. --- @@ -217,11 +252,46 @@ If accepted, I1–I4 and I7–I9 should be added to Proposal 70's quick-win chec | Priority | Issues | Rationale | |----------|--------|-----------| | P1 | I2 (failure diagnostics) | Biggest observability gap. Agent cannot help users debug failures without this. | -| P1 | I1 (stopped vs failed) | Small change (Option B), high diagnostic value. Prerequisite for good plan_resume UX. | +| ~~P1~~ | ~~I1 (stopped vs failed)~~ | **Implemented** (Option A). Dedicated `PlanState.stopped` enum value — `plan_stop` transitions to `stopped`, not `failed`. | | P2 | I7 (stall detection) | Prevents agents from waiting indefinitely on stuck plans. | | P2 | I6 (download TTL) | Low effort, reduces friction. | | P2 | I5 (rich SSE events) | Eliminates polling for SSE-capable clients. | | P2 | I3 (plan_delete) | Hygiene for multi-plan sessions. | | ~~P3~~ | ~~I4 (idempotency)~~ | **Implemented** (PR #242). Server-side auto-dedup on `(user_id, prompt, model_profile)` within a time window. | +| P2 | I10 (silent partial failures) | Agent cannot trust `completed` means quality output. Undermines confidence in the entire workflow. | | P3 | I8 (plan_wait) | Nice-to-have for shell-less agents. | | P3 | I9 (prompt iteration) | Nice-to-have for iteration workflows. | + +--- + +## 6. Agent Perception (after 12 plans across two sessions) + +**Overall rating: 8.5/10** (up from 8/10 after session 1). The improvement comes from the `stopped` state — a small change with outsized impact on agent confidence. + +### What works well + +1. **Tool descriptions are best-in-class.** The agent operates PlanExe without external documentation, solely from tool descriptions. They specify call order, state contract, timing expectations, error codes, and troubleshooting guidance. +2. **State machine is now clean.** Every state has a single obvious next action: `pending` → wait, `processing` → poll, `completed` → download, `stopped` → suggest resume, `failed` → investigate. No state requires guessing. +3. **SSE completion detection is reliable.** Across 12 plans and multiple stop/resume cycles, the stream always auto-closed on terminal states as expected. +4. **Resume preserves progress.** After stopping at step 12/110, resume continued from step 12, not step 1. The `resume_count` field makes the history visible without the agent needing to track it. +5. **Deduplication prevents accidents.** Same prompt + model_profile within a short window returns the existing plan with `deduplicated=true`. +6. **Progress reporting is honest.** The tool description explicitly warns that `progress_percentage` is not linear and shouldn't be used for time estimates. + +### The stop/resume cycle + +Session 2 tested: create → stop → resume → stop → resume → complete. At no point was the agent confused about state or what to suggest. The `resume_count` field confirmed history without shadow state. The mark of a good state machine — the agent doesn't need to maintain its own bookkeeping. + +### Remaining trust gap + +The agent trusts the state machine to report accurately, trusts resume to preserve progress, trusts SSE to close on terminal states, and trusts tool descriptions to be current. The remaining trust gap is around `completed` plans — `completed` means "all steps ran," not "all sections produced quality output." The interface doesn't distinguish between these yet (see I10). + +### Evolution between sessions + +| Aspect | Session 1 | Session 2 | Direction | +|--------|-----------|-----------|-----------| +| Stop state | `failed` (ambiguous) | `stopped` (clear) | Fixed | +| Resume tracking | No history | `resume_count` field | New | +| Tool descriptions | Excellent | Updated with `stopped` state | Improved | +| Failure diagnostics | Missing | Still missing | Unchanged | +| Silent partial failures | Not surfaced | Not surfaced | Unchanged | +| Plan cleanup | No delete | No delete | Unchanged | diff --git a/docs/proposals/87-plan-resume-mcp-tool.md b/docs/proposals/87-plan-resume-mcp-tool.md index 2bbcd64e..150d5333 100644 --- a/docs/proposals/87-plan-resume-mcp-tool.md +++ b/docs/proposals/87-plan-resume-mcp-tool.md @@ -106,7 +106,7 @@ This path is not used by `worker_plan_database` (which runs the pipeline directl | `failed` | ✅ Yes | Terminal state, run dir likely intact | | `stopped` (via `plan_stop`) | ✅ Yes | Intentionally halted, run dir intact | -**Note:** `plan_stop` currently transitions to `failed`. A future improvement could introduce a distinct `stopped` state to disambiguate user-initiated halts from error failures (out of scope for this proposal). +**Note:** `plan_stop` now transitions to the dedicated `stopped` state (implemented in Proposal 114-I1, Option A), disambiguating user-initiated halts from error failures. --- @@ -273,7 +273,7 @@ This requires the worker to write the current task name to a lightweight status ## 9. Out of Scope -- Introducing a distinct `stopped` state (separate from `failed`) — deferred +- ~~Introducing a distinct `stopped` state (separate from `failed`)~~ — implemented (Proposal 114-I1) - Progress scrubbing on resume (reporting accurate % based on remaining tasks) — deferred - `plan_resume` for `mcp_local` (local HTTP worker via `worker_plan/app.py`) — the HTTP worker already supports this via `submit_or_retry="retry"`; a follow-on can wire it up - Resume after partial artifact upload failure — deferred diff --git a/frontend_multi_user/src/app.py b/frontend_multi_user/src/app.py index 77f910d8..5df97f83 100644 --- a/frontend_multi_user/src/app.py +++ b/frontend_multi_user/src/app.py @@ -496,6 +496,21 @@ def _ensure_step_count_columns() -> None: if "current_step" not in columns: conn.execute(text("ALTER TABLE task_item ADD COLUMN IF NOT EXISTS current_step VARCHAR(128)")) + def _ensure_stopped_state() -> None: + """Add 'stopped' value to the planstate/taskstate enum type (idempotent). + + The PostgreSQL enum type is named ``taskstate`` in databases + created before the TaskState → PlanState Python rename + (proposal 74). Fresh databases will have ``planstate``. + We try both names. + """ + with self.db.engine.begin() as conn: + for type_name in ("taskstate", "planstate"): + try: + conn.execute(text(f"ALTER TYPE {type_name} ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.debug("ALTER TYPE %s: %s", type_name, exc) + def _ensure_planitem_indexes() -> None: insp = inspect(self.db.engine) table_names = set(insp.get_table_names()) @@ -553,6 +568,7 @@ def _create_tables_with_retry(attempts: int = 5, delay_seconds: float = 2.0) -> _ensure_user_account_columns() _ensure_multi_api_key_columns() _ensure_step_count_columns() + _ensure_stopped_state() _ensure_planitem_indexes() _seed_initial_records() return @@ -2146,6 +2162,7 @@ def index(): self.db.session.query( PlanItem.id, PlanItem.state, + PlanItem.stop_requested, func.substr(PlanItem.prompt, 1, 240).label("prompt_preview"), ) .filter(uid_filter) @@ -2165,6 +2182,7 @@ def index(): self.db.session.query( PlanItem.id, PlanItem.state, + PlanItem.stop_requested, ) .filter(uid_filter) .order_by(PlanItem.timestamp_created.desc()) @@ -2178,10 +2196,11 @@ def index(): prompt_text = self._load_prompt_preview_safe(task.id) else: prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" + state = task.state if isinstance(task.state, PlanState) else None recent_tasks.append( SimpleNamespace( id=str(task.id), - state=task.state if isinstance(task.state, PlanState) else None, + state=state, prompt=prompt_text, ) ) @@ -3213,6 +3232,7 @@ def plan(): PlanItem.id, PlanItem.timestamp_created, PlanItem.state, + PlanItem.stop_requested, func.substr(PlanItem.prompt, 1, 240).label("prompt_preview"), ) .filter(uid_filter) @@ -3232,6 +3252,7 @@ def plan(): PlanItem.id, PlanItem.timestamp_created, PlanItem.state, + PlanItem.stop_requested, ) .filter(uid_filter) .order_by(PlanItem.timestamp_created.desc()) @@ -3246,11 +3267,12 @@ def plan(): prompt_text = self._load_prompt_preview_safe(task.id) else: prompt_text = (prompt_preview or "").strip() or "[Prompt unavailable]" + state_name = task.state.name if isinstance(task.state, PlanState) else "pending" rows.append({ "id": str(task.id), "created_compact": created_compact, "created_relative": self._format_relative_time(ts), - "status": task.state.name if isinstance(task.state, PlanState) else "pending", + "status": state_name, "prompt": prompt_text, }) return render_template("plan_list.html", plan_rows=rows) @@ -3342,7 +3364,7 @@ def plan_stop(): task.stop_requested = True task.stop_requested_timestamp = datetime.now(UTC) if task.state in (PlanState.pending, PlanState.processing): - task.state = PlanState.failed + task.state = PlanState.stopped task.progress_message = "Stop requested by user." self.db.session.commit() return redirect(url_for('plan', id=run_id)) @@ -3357,8 +3379,8 @@ def plan_retry(): if not current_user.is_admin and str(task.user_id) != str(current_user.id): return jsonify({"error": "Forbidden"}), 403 - if task.state == PlanState.processing and not bool(task.stop_requested): - return jsonify({"error": "Task is currently processing. Stop it first before retrying."}), 409 + if task.state not in (PlanState.failed, PlanState.stopped) and not bool(task.stop_requested): + return jsonify({"error": "Task is not in a retryable state. Stop it first before retrying."}), 409 raw_profile = request.form.get("model_profile") selected_model_profile = normalize_model_profile(raw_profile).value @@ -3401,7 +3423,7 @@ def plan_resume(): if not current_user.is_admin and str(task.user_id) != str(current_user.id): abort(403) - if task.state != PlanState.failed: + if task.state not in (PlanState.failed, PlanState.stopped): return redirect(url_for('plan', id=run_id)) # Reject resume if the snapshot was created by a different pipeline version. diff --git a/frontend_multi_user/src/planexe_modelviews.py b/frontend_multi_user/src/planexe_modelviews.py index e1935f5b..73d0fc4a 100644 --- a/frontend_multi_user/src/planexe_modelviews.py +++ b/frontend_multi_user/src/planexe_modelviews.py @@ -136,6 +136,11 @@ class PlanItemView(AdminOnlyModelView): column_filters = ['state', 'timestamp_created', 'user_id'] column_formatters = { 'id': lambda v, c, m, p: str(m.id)[:8] if m.id else '', + 'state': lambda v, c, m, p: ( + Markup('stopped') + if m.state and m.state.name == 'stopped' + else (m.state.name if m.state else '') + ), 'prompt': lambda v, c, m, p: m.prompt[:100] + '...' if m.prompt and len(m.prompt) > 100 else m.prompt, 'view_plan': lambda v, c, m, p: Markup( f'View' diff --git a/frontend_multi_user/templates/account.html b/frontend_multi_user/templates/account.html index c035fffd..2abdd8b9 100644 --- a/frontend_multi_user/templates/account.html +++ b/frontend_multi_user/templates/account.html @@ -425,6 +425,7 @@ .status-badge.completed, .status-badge.paid { background: #dcfce7; color: #166534; } .status-badge.pending { background: #fef3c7; color: #92400e; } .status-badge.failed, .status-badge.cancelled { background: #fee2e2; color: #991b1b; } + .status-badge.stopped { background: #ffedd5; color: #9a3412; } .empty-hint { text-align: center; padding: 24px 16px; diff --git a/frontend_multi_user/templates/index.html b/frontend_multi_user/templates/index.html index f2144233..39c9ad40 100644 --- a/frontend_multi_user/templates/index.html +++ b/frontend_multi_user/templates/index.html @@ -278,6 +278,7 @@ .task-status.processing { background: #f59e0b; } .task-status.pending { background: #94a3b8; } .task-status.failed { background: #ef4444; } + .task-status.stopped { background: #ea580c; } .task-prompt { flex: 1; font-size: 0.9rem; diff --git a/frontend_multi_user/templates/plan_iframe.html b/frontend_multi_user/templates/plan_iframe.html index 46ca04dc..c912f554 100644 --- a/frontend_multi_user/templates/plan_iframe.html +++ b/frontend_multi_user/templates/plan_iframe.html @@ -186,6 +186,7 @@ .plan-state { display: inline-flex; align-items: center; + gap: 0.3em; margin-top: 6px; font-size: 0.78rem; font-weight: 600; @@ -540,23 +541,11 @@

Plan

id: {{ task.id }}{% if task.timestamp_created %} · Created: {{ task.timestamp_created.strftime("%Y-%m-%d %H:%M:%S") }}{% endif %}

- Status: - {% if task.state %} - {{ task.state.name | lower }} - {% else %} - pending - {% endif %} - {% if task.progress_percentage is not none %} - · {{ "%.0f"|format(task.progress_percentage) }}% - {% endif %} - {% if task.progress_message %} - · {{ task.progress_message }} - {% endif %} - + Status: {% if task.state %}{{ task.state.name | lower }}{% else %}pending{% endif %}{% if task.progress_percentage is not none %} · {{ "%.0f"|format(task.progress_percentage) }}%{% endif %}{% if task.progress_message and not (task.state and task.state.name == 'stopped') %} · {{ task.progress_message }}{% endif %}

-
+
@@ -567,21 +556,21 @@ - + {% if resume_error == "version_mismatch" %} Resume unavailable — plan was created with a different PlanExe version. Use Retry. {% endif %}
- Retry - Resume + Retry + Resume -
+
- Stop + Stop Download Report Download Report @@ -1135,21 +1124,21 @@ if (meta.progress_percentage !== undefined && meta.progress_percentage !== null) { parts.push(Math.round(meta.progress_percentage) + "%"); } - if (meta.progress_message) { + if (meta.progress_message && state !== "stopped") { parts.push(meta.progress_message); } stateTextEl.textContent = parts.join(" · "); } - const canRetry = state === "failed" || !!meta.stop_requested; - const canResume = state === "failed"; + const canRetry = state === "failed" || state === "stopped"; + const canResume = state === "failed" || state === "stopped"; setVisible("retry-resume-form", canRetry); setVisible("retry-disabled", !canRetry); setVisible("resume-disabled", !canRetry); var resumeBtn = document.getElementById("resume-btn"); if (resumeBtn) resumeBtn.disabled = !canResume; - const canStop = (state === "pending" || state === "processing") && !meta.stop_requested; + const canStop = (state === "pending" || state === "processing"); setVisible("stop-form", canStop); setVisible("stop-disabled", !canStop); diff --git a/frontend_multi_user/templates/plan_list.html b/frontend_multi_user/templates/plan_list.html index e4f2e165..72f0475f 100644 --- a/frontend_multi_user/templates/plan_list.html +++ b/frontend_multi_user/templates/plan_list.html @@ -98,6 +98,11 @@ color: #c62828; background: #ffebee; } + .status-chip.status-stopped { + border-color: #e65100; + color: #e65100; + background: #fff3e0; + } .plan-cell-prompt { white-space: nowrap; diff --git a/frontend_multi_user/templates/run_via_database.html b/frontend_multi_user/templates/run_via_database.html index 0f900120..7604954f 100644 --- a/frontend_multi_user/templates/run_via_database.html +++ b/frontend_multi_user/templates/run_via_database.html @@ -60,7 +60,7 @@
-
Don't close this page. Your plan will be visible after it's generated.
+
Your plan will be visible after it's generated.
Elapsed Time: 0:00
@@ -158,20 +158,18 @@ const is_pending = data.status == "pending"; const is_processing = data.status == "processing"; const is_failed = data.status == "failed"; + const is_stopped = data.status == "stopped"; progressMessageElement.textContent = data.progress_message || 'No progress message'; - // console.log("is_completed", is_completed); - // console.log("is_processing", is_processing); - // console.log("is_failed", is_failed); if (is_pending || is_processing || is_completed) { // There is a connection with the server, so ignore past errors consecutive_error_count = 0; + } else if (is_stopped) { + throw new PlanExeError("Plan stopped by user"); + } else if (is_failed) { + throw new PlanExeError("Failed to generate plan"); } else { - if (is_failed) { - throw new PlanExeError("Failed to generate plan"); - } else { - throw new PlanExeError(`Unhandled status: ${data.status}`); - } + throw new PlanExeError(`Unhandled status: ${data.status}`); } // Update progress bar diff --git a/mcp_cloud/db_queries.py b/mcp_cloud/db_queries.py index da51aca5..b0a45032 100644 --- a/mcp_cloud/db_queries.py +++ b/mcp_cloud/db_queries.py @@ -241,10 +241,10 @@ def _request_plan_stop_sync(plan_id: str) -> Optional[dict[str, Any]]: if plan.state in (PlanState.pending, PlanState.processing): plan.stop_requested = True plan.stop_requested_timestamp = datetime.now(UTC) - plan.state = PlanState.failed + plan.state = PlanState.stopped plan.progress_message = "Stop requested by user." db.session.commit() - logger.info("Stop requested for plan %s; state set to failed.", plan_id) + logger.info("Stop requested for plan %s; state set to stopped.", plan_id) stop_requested = True return { "state": get_plan_state_mapping(plan.state), @@ -257,11 +257,11 @@ def _retry_failed_plan_sync(plan_id: str, model_profile: str, caller_metadata: O plan = find_plan_by_id(plan_id) if plan is None: return None - if plan.state != PlanState.failed: + if plan.state not in (PlanState.failed, PlanState.stopped): return { "error": { "code": "PLAN_NOT_FAILED", - "message": f"Plan is not in failed state: {plan_id}", + "message": f"Plan is not in failed or stopped state: {plan_id}", } } @@ -333,11 +333,11 @@ def _resume_plan_sync(plan_id: str, model_profile: str) -> Optional[dict[str, An plan = find_plan_by_id(plan_id) if plan is None: return None - if plan.state != PlanState.failed: + if plan.state not in (PlanState.failed, PlanState.stopped): return { "error": { "code": "PLAN_NOT_RESUMABLE", - "message": f"Plan is not in failed state: {plan_id}", + "message": f"Plan is not in failed or stopped state: {plan_id}", } } @@ -475,6 +475,7 @@ def get_plan_state_mapping(plan_state: PlanState) -> str: PlanState.processing: "processing", PlanState.completed: "completed", PlanState.failed: "failed", + PlanState.stopped: "stopped", } return mapping.get(plan_state, "pending") diff --git a/mcp_cloud/db_setup.py b/mcp_cloud/db_setup.py index d2698c21..66cddc9e 100644 --- a/mcp_cloud/db_setup.py +++ b/mcp_cloud/db_setup.py @@ -103,10 +103,25 @@ def ensure_step_count_columns() -> None: except Exception as exc: logger.warning("Schema update failed for %s: %s", stmt, exc, exc_info=True) +def ensure_stopped_state() -> None: + """Add 'stopped' value to the planstate/taskstate enum type (idempotent). + + The PostgreSQL enum type is named ``taskstate`` in databases created before + the TaskState → PlanState Python rename (proposal 74). Fresh databases + created after that rename will have ``planstate``. We try both names. + """ + with db.engine.begin() as conn: + for type_name in ("taskstate", "planstate"): + try: + conn.execute(text(f"ALTER TYPE {type_name} ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.debug("ALTER TYPE %s: %s", type_name, exc) + with app.app_context(): ensure_planitem_stop_columns() ensure_multi_api_key_columns() ensure_step_count_columns() + ensure_stopped_state() # Shown in MCP initialize (e.g. Inspector) so clients know what PlanExe does. PLANEXE_SERVER_INSTRUCTIONS = ( @@ -136,12 +151,12 @@ def ensure_step_count_columns() -> None: "If plan generation fails before completing all steps, call plan_resume to continue from where it left off without discarding completed work. " "Use plan_retry instead for a full restart (plan must be in failed state). " "Both accept a failed plan_id and optional model_profile (defaults to baseline). " - "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will eventually transition to failed. " + "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will transition to the stopped state. " "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " "Tool errors use {error:{code,message}}. plan_file_info returns {ready:false,reason:...} while the artifact is not yet ready; check readiness by testing whether download_url is present in the response. " "plan_file_info download_url is the absolute URL where the requested artifact can be downloaded. " "To list recent plans for a user call plan_list; returns plan_id, state, progress_percentage, created_at, and prompt_excerpt for each plan. " - "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "Troubleshooting: if plan_status stays in pending for longer than 5 minutes, the plan was likely queued but not picked up by a worker (server issue). " "If plan_status is in processing and output files do not change for longer than 20 minutes, the plan_create likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " diff --git a/mcp_cloud/handlers.py b/mcp_cloud/handlers.py index c95e88d6..fd2e436c 100644 --- a/mcp_cloud/handlers.py +++ b/mcp_cloud/handlers.py @@ -344,7 +344,7 @@ async def handle_plan_status(arguments: dict[str, Any]) -> CallToolResult: message = plan_snapshot.get("progress_message") or "Plan generation failed." response["error"] = {"code": "generation_failed", "message": message} - if state not in ("completed", "failed"): + if state not in ("completed", "failed", "stopped"): base_url = _get_download_base_url() if base_url: response["sse_url"] = f"{base_url}/sse/plan/{plan_uuid}" @@ -575,6 +575,13 @@ async def handle_plan_file_info(arguments: dict[str, Any]) -> CallToolResult: structuredContent=response, isError=False, ) + if plan_state == PlanState.stopped: + response = {"ready": False, "reason": "stopped"} + return CallToolResult( + content=[TextContent(type="text", text=json.dumps(response))], + structuredContent=response, + isError=False, + ) if plan_state == PlanState.failed: message = plan_snapshot["progress_message"] or "Plan generation failed." response = {"ready": False, "reason": "failed", "error": {"code": "generation_failed", "message": message}} diff --git a/mcp_cloud/schemas.py b/mcp_cloud/schemas.py index 91049f61..319894da 100644 --- a/mcp_cloud/schemas.py +++ b/mcp_cloud/schemas.py @@ -176,7 +176,8 @@ class ToolDefinition: "This is the primary way to check progress — it returns structured JSON with all progress fields. " "Poll at reasonable intervals (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " "(baseline profile) and may take longer on higher-quality profiles. " - "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "State contract: pending/processing => keep polling; completed => download is ready; " + "failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "Note: steps vary in duration — early steps complete quickly while later steps (review, report generation) " "take longer. Do not use progress_percentage to estimate time remaining. " @@ -202,7 +203,7 @@ class ToolDefinition: description=( "Request the plan generation to stop. Pass the plan_id (the UUID returned by plan_create). " "Stopping is asynchronous: the stop flag is set immediately but the plan may continue briefly before halting. " - "A stopped plan will eventually transition to the failed state. " + "A stopped plan will transition to the stopped state. " "If the plan is already completed or failed, stop_requested returns false (the plan already finished). " "Unknown plan_id returns error code PLAN_NOT_FOUND." ), @@ -218,10 +219,10 @@ class ToolDefinition: ToolDefinition( name="plan_retry", description=( - "Retry a plan that is currently in failed state. " - "Pass the failed plan_id and optionally model_profile (defaults to baseline). " + "Retry a plan that is currently in failed or stopped state. " + "Pass the plan_id and optionally model_profile (defaults to baseline). " "The plan is reset to pending, prior artifacts are cleared, and the same plan_id is requeued for processing. " - "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_FAILED when the plan is not in failed state." + "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_FAILED when the plan is not in failed or stopped state." ), input_schema=PLAN_RETRY_INPUT_SCHEMA, output_schema=PLAN_RETRY_OUTPUT_SCHEMA, @@ -235,13 +236,13 @@ class ToolDefinition: ToolDefinition( name="plan_resume", description=( - "Resume a failed plan without discarding completed intermediary files. " + "Resume a failed or stopped plan without discarding completed intermediary files. " "Plan generation restarts from the first incomplete step, skipping all steps that already produced output files. " - "Use plan_resume when plan_status shows 'failed' and plan generation was interrupted before completing all steps " + "Use plan_resume when plan_status shows 'failed' or 'stopped' and plan generation was interrupted before completing all steps " "(network drop, timeout, plan_stop, worker crash). " "For a full restart or to change model_profile, use plan_retry instead. " - "Only failed plans can be resumed. " - "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed state. " + "Only failed or stopped plans can be resumed. " + "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed or stopped state. " "Returns PIPELINE_VERSION_MISMATCH when the snapshot was created by a different pipeline version; use plan_retry instead." ), input_schema=PLAN_RESUME_INPUT_SCHEMA, diff --git a/mcp_cloud/sse.py b/mcp_cloud/sse.py index ff12e38f..39dbe51b 100644 --- a/mcp_cloud/sse.py +++ b/mcp_cloud/sse.py @@ -24,7 +24,7 @@ SSE_MAX_CONNECTIONS_PER_CLIENT: int = int(os.environ.get("PLANEXE_SSE_MAX_CONNECTIONS", "5")) SSE_MAX_TOTAL_CONNECTIONS: int = int(os.environ.get("PLANEXE_SSE_MAX_TOTAL_CONNECTIONS", "200")) -TERMINAL_STATES = {"completed", "failed"} +TERMINAL_STATES = {"completed", "failed", "stopped"} class SSEConnectionLimitError(Exception): diff --git a/mcp_cloud/tests/test_plan_status_tool.py b/mcp_cloud/tests/test_plan_status_tool.py index a96d6468..4c8a4d66 100644 --- a/mcp_cloud/tests/test_plan_status_tool.py +++ b/mcp_cloud/tests/test_plan_status_tool.py @@ -217,6 +217,49 @@ def test_plan_status_includes_current_step(self): self.assertEqual(result.structuredContent["current_step"], "SWOT Analysis") + def test_plan_status_stopped_returns_stopped_state(self): + """User-stopped plan has state='stopped' and no stop_reason field.""" + plan_id = str(uuid.uuid4()) + plan_snapshot = { + "id": plan_id, + "state": PlanState.stopped, + "stop_requested": True, + "progress_percentage": 42.0, + "timestamp_created": datetime.now(UTC), + } + with patch( + "mcp_cloud.handlers._get_plan_status_snapshot_sync", + return_value=plan_snapshot, + ), patch( + "mcp_cloud.handlers.fetch_file_list_from_worker_plan", new=AsyncMock(return_value=[]) + ): + result = asyncio.run(handle_plan_status({"plan_id": plan_id})) + + self.assertEqual(result.structuredContent["state"], "stopped") + self.assertNotIn("stop_reason", result.structuredContent) + + def test_plan_status_actual_failure_has_no_stop_reason(self): + """Failed plan response does not contain stop_reason field.""" + plan_id = str(uuid.uuid4()) + plan_snapshot = { + "id": plan_id, + "state": PlanState.failed, + "stop_requested": False, + "progress_percentage": 5.5, + "timestamp_created": datetime.now(UTC), + } + with patch( + "mcp_cloud.handlers._get_plan_status_snapshot_sync", + return_value=plan_snapshot, + ), patch( + "mcp_cloud.handlers.fetch_file_list_from_worker_plan", new=AsyncMock(return_value=[]) + ): + result = asyncio.run(handle_plan_status({"plan_id": plan_id})) + + self.assertEqual(result.structuredContent["state"], "failed") + self.assertNotIn("stop_reason", result.structuredContent) + self.assertIn("error", result.structuredContent) + if __name__ == "__main__": unittest.main() diff --git a/mcp_cloud/tool_models.py b/mcp_cloud/tool_models.py index 7762c225..ead08213 100644 --- a/mcp_cloud/tool_models.py +++ b/mcp_cloud/tool_models.py @@ -107,7 +107,7 @@ class PlanStopInput(BaseModel): class PlanRetryInput(BaseModel): plan_id: str = Field( ..., - description="UUID of the failed plan to retry.", + description="UUID of the failed or stopped plan to retry.", ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( default="baseline", @@ -167,11 +167,12 @@ class PlanStatusSuccess(BaseModel): ..., description="Plan UUID returned by plan_create." ) - state: Literal["pending", "processing", "completed", "failed"] = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] = Field( ..., description=( "Caller contract: pending/processing => keep polling; " - "completed => download is ready; failed => terminal error." + "completed => download is ready; failed => terminal error; " + "stopped => user called plan_stop (consider plan_resume)." ), ) progress_percentage: float = Field( @@ -221,11 +222,12 @@ class PlanStatusOutput(BaseModel): default=None, description="Plan UUID returned by plan_create." ) - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description=( "Caller contract: pending/processing => keep polling; " - "completed => download is ready; failed => terminal error." + "completed => download is ready; failed => terminal error; " + "stopped => user called plan_stop (consider plan_resume)." ), ) progress_percentage: float | None = Field( @@ -272,7 +274,7 @@ class PlanStatusOutput(BaseModel): class PlanStopOutput(BaseModel): - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description="Current plan state after stop request.", ) @@ -286,9 +288,9 @@ class PlanStopOutput(BaseModel): class PlanRetryOutput(BaseModel): plan_id: str | None = Field( default=None, - description="Plan UUID that was retried (same ID as the failed plan).", + description="Plan UUID that was retried (same ID as the failed or stopped plan).", ) - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description="Current plan state after retry request.", ) @@ -314,7 +316,7 @@ class PlanRetryOutput(BaseModel): class PlanResumeInput(BaseModel): plan_id: str = Field( ..., - description="UUID of the failed plan to resume.", + description="UUID of the failed or stopped plan to resume.", ) model_profile: Literal["baseline", "premium", "frontier", "custom"] = Field( default="baseline", @@ -327,9 +329,9 @@ class PlanResumeInput(BaseModel): class PlanResumeOutput(BaseModel): plan_id: str | None = Field( default=None, - description="Plan UUID that was resumed (same ID as the failed plan).", + description="Plan UUID that was resumed (same ID as the failed or stopped plan).", ) - state: Literal["pending", "processing", "completed", "failed"] | None = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] | None = Field( default=None, description="Current plan state after resume request.", ) @@ -405,7 +407,7 @@ class PlanListInput(BaseModel): class PlanListItem(BaseModel): plan_id: str = Field(..., description="Plan UUID.") - state: Literal["pending", "processing", "completed", "failed"] = Field( + state: Literal["pending", "processing", "completed", "failed", "stopped"] = Field( ..., description="Current plan state.", ) diff --git a/mcp_local/planexe_mcp_local.py b/mcp_local/planexe_mcp_local.py index 801f92d9..facc9628 100644 --- a/mcp_local/planexe_mcp_local.py +++ b/mcp_local/planexe_mcp_local.py @@ -758,7 +758,8 @@ class ToolDefinition: "Returns status and progress of the plan currently being created. " "Poll at reasonable intervals only (e.g. every 5 minutes): plan generation typically takes 10-20 minutes " "(baseline profile) and may take longer on higher-quality profiles. " - "State contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "State contract: pending/processing => keep polling; completed => download is ready; " + "failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "progress_percentage is 0-100 (integer-like float); 100 when completed. " "files lists intermediate outputs produced so far; use their updated_at timestamps to detect stalls. " "Unknown plan_id returns PLAN_NOT_FOUND (or REMOTE_ERROR when transport fails). " @@ -780,7 +781,7 @@ class ToolDefinition: description=( "Request the plan generation to stop. Pass the plan_id (the UUID returned by plan_create). " "Stopping is asynchronous: the stop flag is set immediately but the plan may continue briefly before halting. " - "A stopped plan will eventually transition to the failed state. " + "A stopped plan will transition to the stopped state. " "If the plan is already completed or failed, stop_requested returns false (the plan already finished). " "Unknown plan_id returns PLAN_NOT_FOUND (or REMOTE_ERROR when transport fails)." ), @@ -796,10 +797,10 @@ class ToolDefinition: ToolDefinition( name="plan_retry", description=( - "Retry a plan that is currently in failed state. " - "Pass the failed plan_id and optionally model_profile (defaults to baseline). " + "Retry a plan that is currently in failed or stopped state. " + "Pass the plan_id and optionally model_profile (defaults to baseline). " "The same plan_id is requeued and reset to pending on the cloud service. " - "Unknown plan_id returns PLAN_NOT_FOUND; non-failed plans return PLAN_NOT_FAILED." + "Unknown plan_id returns PLAN_NOT_FOUND; non-failed/stopped plans return PLAN_NOT_FAILED." ), input_schema=PLAN_RETRY_INPUT_SCHEMA, output_schema=PLAN_RETRY_OUTPUT_SCHEMA, @@ -813,13 +814,13 @@ class ToolDefinition: ToolDefinition( name="plan_resume", description=( - "Resume a failed plan without discarding completed intermediary files. " + "Resume a failed or stopped plan without discarding completed intermediary files. " "Plan generation restarts from the first incomplete step, skipping all steps that already produced output files. " - "Use plan_resume when plan_status shows 'failed' and plan generation was interrupted before completing all steps " + "Use plan_resume when plan_status shows 'failed' or 'stopped' and plan generation was interrupted before completing all steps " "(network drop, timeout, plan_stop, worker crash). " "For a full restart or to change model_profile, use plan_retry instead. " - "Only failed plans can be resumed. " - "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed state. " + "Only failed or stopped plans can be resumed. " + "Returns PLAN_NOT_FOUND when plan_id is unknown and PLAN_NOT_RESUMABLE when the plan is not in failed or stopped state. " "Returns PIPELINE_VERSION_MISMATCH when the snapshot was created by a different pipeline version; use plan_retry instead." ), input_schema=PLAN_RESUME_INPUT_SCHEMA, @@ -894,12 +895,12 @@ class ToolDefinition: "If plan generation fails before completing all steps, call plan_resume to continue from where it left off without discarding completed work. " "Use plan_retry instead for a full restart. " "Both accept the failed plan_id and optional model_profile (defaults to baseline). " - "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will eventually transition to failed. " + "To stop, call plan_stop with the plan_id from plan_create; stopping is asynchronous and the plan will transition to the stopped state. " "If model_profiles returns MODEL_PROFILES_UNAVAILABLE, inform the user that no models are currently configured and the server administrator needs to set up model profiles. " "Tool errors use {error:{code,message}}. plan_download may return REMOTE_ERROR or DOWNLOAD_FAILED. " "plan_download saves to PLANEXE_PATH (default: current working directory) and returns saved_path. " "To list recent plans for a user call plan_list; returns plan_id, state, progress_percentage, created_at, and prompt_excerpt. " - "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error. " + "plan_status state contract: pending/processing => keep polling; completed => download is ready; failed => terminal error; stopped => user called plan_stop (consider plan_resume). " "Troubleshooting: if plan_status stays in pending for longer than 5 minutes, the plan was likely queued but not picked up by a worker (server issue). " "If plan_status is in processing and output files do not change for longer than 20 minutes, the run likely failed/stalled. " "In both cases, report the issue to PlanExe developers on GitHub: https://github.com/PlanExeOrg/PlanExe/issues . " diff --git a/worker_plan_database/app.py b/worker_plan_database/app.py index 71e35fc4..79043204 100644 --- a/worker_plan_database/app.py +++ b/worker_plan_database/app.py @@ -313,6 +313,20 @@ def ensure_multi_api_key_columns() -> None: except Exception as exc: logger.warning("Schema update failed for %s: %s", stmt, exc, exc_info=True) +def ensure_stopped_state() -> None: + """Add 'stopped' value to the planstate/taskstate enum type (idempotent). + + The PostgreSQL enum type is named ``taskstate`` in databases created before + the TaskState → PlanState Python rename (proposal 74). Fresh databases + created after that rename will have ``planstate``. We try both names. + """ + with db.engine.begin() as conn: + for type_name in ("taskstate", "planstate"): + try: + conn.execute(text(f"ALTER TYPE {type_name} ADD VALUE IF NOT EXISTS 'stopped'")) + except Exception as exc: + logger.debug("ALTER TYPE %s: %s", type_name, exc) + def worker_process_started() -> None: planexe_worker_id = os.environ.get("PLANEXE_WORKER_ID") event_context = { @@ -1040,7 +1054,8 @@ def execute_pipeline_for_job( db.session.add(event) db.session.commit() else: - update_task_state_with_retry(task_id, PlanState.failed) + final_state = PlanState.stopped if stop_requested else PlanState.failed + update_task_state_with_retry(task_id, final_state) billing_result = _charge_usage_credits_once(task_id=task_id, run_id_dir=run_id_dir, success=False) event_context["machai_error_message"] = machai_error_message or "" event_context.update({ @@ -1053,7 +1068,7 @@ def execute_pipeline_for_job( event = _new_model( EventItem, event_type=EventType.TASK_FAILED, - message=f"Processing -> Failed", + message=f"Processing -> {'Stopped' if stop_requested else 'Failed'}", context=event_context ) db.session.add(event) @@ -1314,6 +1329,7 @@ def startup_worker(): ensure_token_metrics_columns() ensure_fractional_credit_columns() ensure_multi_api_key_columns() + ensure_stopped_state() logger.debug(f"Ensured database tables exist.") WorkerItem.upsert_heartbeat(worker_id=WORKER_ID) except Exception as e: