From c96d16990c2590ad4f1c45858e53167635e7279a Mon Sep 17 00:00:00 2001
From: Br1an67 <932039080@qq.com>
Date: Mon, 2 Mar 2026 01:13:53 +0800
Subject: [PATCH] fix: strip markdown code fences from LLM JSON responses

Some LLM providers (notably Claude) wrap JSON responses in markdown
code fences even when JSON mode is requested. This causes json.loads
to fail with JSONDecodeError.

Add a _strip_markdown_json helper that removes a surrounding
```json ... ``` (or bare ``` ... ```) fence before parsing. It is applied
in generate_schema and agenerate_schema, and in the force_json_response
paths of extract and aextract.

Fixes #1663
---
 crawl4ai/extraction_strategy.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 6be1c7c7b..0dcc092ff 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -45,6 +45,16 @@
 from bs4 import BeautifulSoup
 from lxml import html, etree
 
+_MARKDOWN_JSON_FENCE_RE = re.compile(  # matches only when the whole string is one ``` / ```json fenced block
+    r"^\s*```(?:json)?\s*\n(.*?)\n\s*```\s*$", re.DOTALL  # group 1 = text between the fence lines; DOTALL lets it span newlines
+)
+
+
+def _strip_markdown_json(text: str) -> str:
+    """Return the body of a ``` / ```json fence wrapping all of *text*; if there is no such fence, return *text* unchanged."""
+    m = _MARKDOWN_JSON_FENCE_RE.match(text.strip())
+    return m.group(1) if m else text
+
 
 class ExtractionStrategy(ABC):
     """
@@ -677,7 +687,7 @@ def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
                 blocks = None
 
                 if self.force_json_response:
-                    blocks = json.loads(content)
+                    blocks = json.loads(_strip_markdown_json(content))
                     if isinstance(blocks, dict):
                         # If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]}
                         if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
@@ -877,7 +887,7 @@ async def aextract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
                 blocks = None
 
                 if self.force_json_response:
-                    blocks = json.loads(content)
+                    blocks = json.loads(_strip_markdown_json(content))
                     if isinstance(blocks, dict):
                         if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
                             blocks = list(blocks.values())[0]
@@ -1382,7 +1392,7 @@ def generate_schema(
                 base_url=llm_config.base_url,
                 extra_args=kwargs
             )
-            return json.loads(response.choices[0].message.content)
+            return json.loads(_strip_markdown_json(response.choices[0].message.content))
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")
 
@@ -1429,7 +1439,7 @@ async def agenerate_schema(
                 base_url=llm_config.base_url,
                 extra_args=kwargs
             )
-            return json.loads(response.choices[0].message.content)
+            return json.loads(_strip_markdown_json(response.choices[0].message.content))
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")