diff --git a/worker_plan/worker_plan_api/filenames.py b/worker_plan/worker_plan_api/filenames.py index 82abbea8..cdc50b35 100644 --- a/worker_plan/worker_plan_api/filenames.py +++ b/worker_plan/worker_plan_api/filenames.py @@ -3,6 +3,8 @@ class FilenameEnum(str, Enum): START_TIME = "001-1-start_time.json" INITIAL_PLAN = "001-2-plan.txt" + BOOST_INITIAL_PROMPT_RAW = "001-3-boost_initial_prompt_raw.json" + BOOST_INITIAL_PROMPT_MARKDOWN = "001-4-boost_initial_prompt.md" REDLINE_GATE_RAW = "002-1-redline_gate.json" REDLINE_GATE_MARKDOWN = "002-2-redline_gate.md" PREMISE_ATTACK_RAW = "002-3-premise_attack.json" diff --git a/worker_plan/worker_plan_internal/plan/boost_initial_prompt.py b/worker_plan/worker_plan_internal/plan/boost_initial_prompt.py new file mode 100644 index 00000000..7a4252d8 --- /dev/null +++ b/worker_plan/worker_plan_internal/plan/boost_initial_prompt.py @@ -0,0 +1,154 @@ +""" +Pre-planning LLM stage that scores prompt quality and rewrites weak prompts. + +Runs before the main pipeline to ensure the initial prompt has enough detail +for PlanExe to produce a high-quality plan. If the prompt scores below a +configurable threshold, the LLM rewrites it as flowing prose with the missing +dimensions filled in from reasonable defaults. 
# Minimum acceptable overall score (out of 10); prompts scoring below this get rewritten.
# Interpolated into SYSTEM_PROMPT below so the instruction text and the code share one value
# and cannot silently drift apart.
QUALITY_THRESHOLD = 6


class PromptDimensionScore(BaseModel):
    """Score for a single prompt-quality dimension, with a short justification."""
    dimension: str = Field(description="Name of the dimension being scored.")
    score: int = Field(description="Score from 1 (absent) to 10 (excellent).", ge=1, le=10)
    note: str = Field(description="Brief note explaining the score.")


class PromptQualityAssessment(BaseModel):
    """Structured LLM verdict on prompt quality.

    Per-dimension scores, an overall 1-10 score, the list of missing elements,
    and — only when the verdict is "needs_boost" — a rewritten prompt.
    Field order matters: it determines `model_dump()` ordering in the raw JSON output.
    """
    overall_score: int = Field(
        description="Overall prompt quality score from 1 to 10.", ge=1, le=10
    )
    dimensions: List[PromptDimensionScore] = Field(
        description="Scores for each quality dimension."
    )
    missing_elements: List[str] = Field(
        description="List of elements that are absent or underspecified."
    )
    verdict: Literal["sufficient", "needs_boost"] = Field(
        description="Whether the prompt is sufficient or needs boosting."
    )
    boosted_prompt: Optional[str] = Field(
        None,
        description=(
            "Rewritten prompt as flowing prose (~300-800 words) with missing "
            "dimensions filled in. Only present when verdict is needs_boost."
        ),
    )


# f-string: the quality threshold is interpolated from QUALITY_THRESHOLD instead of
# being hard-coded as a literal 6, fixing the defined-but-unused constant. At the
# current value the rendered prompt text is byte-identical to before.
SYSTEM_PROMPT = f"""You are a prompt quality assessor for PlanExe, a strategic project-planning system.

Your job is to evaluate the user's project prompt and decide if it has enough detail for a high-quality 20+ section strategic plan. Score the prompt on these dimensions (1-10 each):

1. **Objective clarity** — Is the goal specific and unambiguous?
2. **Scope definition** — Are boundaries clear (what's included/excluded)?
3. **Constraints** — Are budget, timeline, regulatory, or technical constraints mentioned?
4. **Stakeholders** — Are key stakeholders, beneficiaries, or team roles identified?
5. **Success criteria** — Are measurable outcomes or KPIs defined?
6. **Context/background** — Is enough context given to understand the domain?

Rules:
- Score each dimension 1-10. Compute an overall score as the average rounded to nearest integer.
- If overall_score < {QUALITY_THRESHOLD}, set verdict to "needs_boost" and provide a boosted_prompt.
- If overall_score >= {QUALITY_THRESHOLD}, set verdict to "sufficient" and set boosted_prompt to null.
- The boosted_prompt must be flowing prose (not markdown with headers or bullets), ~300-800 words.
- Preserve the original intent. Add reasonable defaults for missing dimensions.
- Do NOT change the fundamental project goal; only enrich with specifics.
- Do NOT fabricate domain-specific facts the user didn't mention.
"""
@dataclass
class BoostInitialPrompt:
    """Result of the prompt-quality pre-check.

    Holds the user's original prompt, the structured assessment returned by
    the LLM, and metadata describing the model that produced it.
    """
    original_prompt: str
    assessment: PromptQualityAssessment
    metadata: dict

    @classmethod
    def execute(cls, llm_executor: LLMExecutor, prompt: str) -> 'BoostInitialPrompt':
        """Ask the LLM to assess (and possibly rewrite) the given prompt."""
        messages = [
            ChatMessage(role=MessageRole.SYSTEM, content=SYSTEM_PROMPT.strip()),
            ChatMessage(role=MessageRole.USER, content=prompt),
        ]

        def invoke(llm: LLM) -> dict:
            # Structured-output wrapper parses the reply into PromptQualityAssessment.
            structured_llm = llm.as_structured_llm(PromptQualityAssessment)
            chat_response = structured_llm.chat(messages)
            return {"response": chat_response, "metadata": dict(llm.metadata)}

        outcome = llm_executor.run(invoke)
        parsed: PromptQualityAssessment = outcome["response"].raw

        return cls(
            original_prompt=prompt,
            assessment=parsed,
            metadata=outcome.get("metadata", {}),
        )

    @property
    def effective_prompt(self) -> str:
        """The boosted prompt when one was produced, otherwise the original prompt."""
        boosted = self.assessment.boosted_prompt
        if self.assessment.verdict == "needs_boost" and boosted:
            return boosted
        return self.original_prompt

    def save_raw(self, file_path: str) -> None:
        """Write the full assessment (plus original prompt and metadata) as JSON."""
        payload = {
            "original_prompt": self.original_prompt,
            "assessment": self.assessment.model_dump(),
            "metadata": self.metadata,
        }
        Path(file_path).write_text(json.dumps(payload, indent=2), encoding="utf-8")

    def save_markdown(self, file_path: str) -> None:
        """Write a human-readable markdown report of the assessment."""
        parts = [
            "# Prompt Quality Assessment\n",
            f"**Overall Score:** {self.assessment.overall_score}/10\n",
            f"**Verdict:** {self.assessment.verdict}\n",
            "\n## Dimension Scores\n",
            "| Dimension | Score | Note |",
            "|-----------|-------|------|",
        ]
        parts.extend(
            f"| {row.dimension} | {row.score}/10 | {row.note} |"
            for row in self.assessment.dimensions
        )

        if self.assessment.missing_elements:
            parts.append("\n## Missing Elements\n")
            parts.extend(f"- {item}" for item in self.assessment.missing_elements)

        if self.assessment.boosted_prompt:
            parts.extend(["\n## Boosted Prompt\n", self.assessment.boosted_prompt])

        Path(file_path).write_text("\n".join(parts), encoding="utf-8")
if __name__ == "__main__":
    from worker_plan_internal.llm_util.llm_executor import LLMModelFromName

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    # Deliberately thin prompt: expected to score low and trigger a rewrite.
    test_prompt = "I want to open a coffee shop."

    executor = LLMExecutor(llm_models=LLMModelFromName.from_names(["ollama-llama3.1"]))

    boost = BoostInitialPrompt.execute(executor, test_prompt)
    print(f"Score: {boost.assessment.overall_score}/10")
    print(f"Verdict: {boost.assessment.verdict}")
    print(f"Effective prompt: {boost.effective_prompt[:200]}...")
class BoostInitialPromptTask(PlanTask):
    """Score the initial prompt quality and rewrite weak prompts before the pipeline runs.

    NOTE(review): the boosted prompt is only persisted to disk here; nothing in
    the visible pipeline substitutes it for the original plan prompt — confirm
    that downstream tasks are meant to keep reading the original.
    """

    def requires(self):
        # Same dependency pattern as RedlineGateTask: the prompt file comes from SetupTask.
        return self.clone(SetupTask)

    def output(self):
        return {
            'raw': self.local_target(FilenameEnum.BOOST_INITIAL_PROMPT_RAW),
            'markdown': self.local_target(FilenameEnum.BOOST_INITIAL_PROMPT_MARKDOWN),
        }

    def run_inner(self):
        llm_executor: LLMExecutor = self.create_llm_executor()

        with self.input().open("r") as prompt_file:
            initial_prompt = prompt_file.read()

        boost = BoostInitialPrompt.execute(llm_executor, initial_prompt)

        targets = self.output()
        boost.save_raw(targets['raw'].path)
        boost.save_markdown(targets['markdown'].path)