From c96d16990c2590ad4f1c45858e53167635e7279a Mon Sep 17 00:00:00 2001
From: Br1an67 <932039080@qq.com>
Date: Mon, 2 Mar 2026 01:13:53 +0800
Subject: [PATCH] fix: strip markdown code fences from LLM JSON responses

Some LLM providers (notably Claude) wrap JSON responses in markdown
code fences even when JSON mode is requested. This causes json.loads
to fail with JSONDecodeError.

Add a _strip_markdown_json helper that removes ```json....``` fences
before parsing. Applied to generate_schema (sync+async) and the main
extraction methods.

Fixes #1663
---
 crawl4ai/extraction_strategy.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 6be1c7c7b..0dcc092ff 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -45,6 +45,16 @@
 from bs4 import BeautifulSoup
 from lxml import html, etree
 
+_MARKDOWN_JSON_FENCE_RE = re.compile(
+    r"^\s*```(?:json)?\s*\n(.*?)\n\s*```\s*$", re.DOTALL
+)
+
+
+def _strip_markdown_json(text: str) -> str:
+    """Strip markdown code fences from LLM JSON responses."""
+    m = _MARKDOWN_JSON_FENCE_RE.match(text.strip())
+    return m.group(1) if m else text
+
 
 class ExtractionStrategy(ABC):
     """
@@ -677,7 +687,7 @@ def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
                 blocks = None
 
                 if self.force_json_response:
-                    blocks = json.loads(content)
+                    blocks = json.loads(_strip_markdown_json(content))
                     if isinstance(blocks, dict):
                         # If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]}
                         if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
@@ -877,7 +887,7 @@ async def aextract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
                 blocks = None
 
                 if self.force_json_response:
-                    blocks = json.loads(content)
+                    blocks = json.loads(_strip_markdown_json(content))
                     if isinstance(blocks, dict):
                         if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
                             blocks = list(blocks.values())[0]
@@ -1382,7 +1392,7 @@ def generate_schema(
                 base_url=llm_config.base_url,
                 extra_args=kwargs
             )
-            return json.loads(response.choices[0].message.content)
+            return json.loads(_strip_markdown_json(response.choices[0].message.content))
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")
 
@@ -1429,7 +1439,7 @@ async def agenerate_schema(
                 base_url=llm_config.base_url,
                 extra_args=kwargs
             )
-            return json.loads(response.choices[0].message.content)
+            return json.loads(_strip_markdown_json(response.choices[0].message.content))
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")
 