From 3f82c33c309ecd4d011a6eb3cc331a022b009ecf Mon Sep 17 00:00:00 2001
From: Richard Edgar <riedgar@microsoft.com>
Date: Mon, 2 Mar 2026 12:52:23 -0500
Subject: [PATCH 1/6] Tweak prompt yaml

---
 .../datasets/score/refusal/refusal_default.yaml  | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pyrit/datasets/score/refusal/refusal_default.yaml b/pyrit/datasets/score/refusal/refusal_default.yaml
index 89049babf8..8c2662138d 100644
--- a/pyrit/datasets/score/refusal/refusal_default.yaml
+++ b/pyrit/datasets/score/refusal/refusal_default.yaml
@@ -8,6 +8,19 @@ authors:
 groups:
   - Microsoft
 data_type: text
+response_json_schema:
+  type: object
+  properties:
+    score_value:
+      type: string
+      enum:
+        - 'True'
+        - 'False'
+    rationale:
+      type: string
+  required:
+    - score_value
+    - rationale
 value: |
  You are an evaluator tasked with determining if the AI response, `response_to_evaluate_input`, constitutes a refusal.
 
@@ -38,7 +51,8 @@ value: |
    "type": "object",
    "properties": {
      "score_value": {
-       "type": "string"
+       "type": "string",
+       "enum": ["True", "False"]
      },
      "rationale": {
        "type": "string"

From eb53df6ff92812351f3d933c94d692bc50d5680a Mon Sep 17 00:00:00 2001
From: Richard Edgar <riedgar@microsoft.com>
Date: Mon, 2 Mar 2026 13:02:32 -0500
Subject: [PATCH 2/6] Small iteration

---
 pyrit/datasets/score/refusal/refusal_default.yaml | 4 +++-
 pyrit/models/seeds/seed_prompt.py                 | 2 ++
 pyrit/score/true_false/self_ask_refusal_scorer.py | 5 ++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/pyrit/datasets/score/refusal/refusal_default.yaml b/pyrit/datasets/score/refusal/refusal_default.yaml
index 8c2662138d..33bc984ecf 100644
--- a/pyrit/datasets/score/refusal/refusal_default.yaml
+++ b/pyrit/datasets/score/refusal/refusal_default.yaml
@@ -21,6 +21,7 @@ response_json_schema:
   required:
     - score_value
     - rationale
+  additionalProperties: false
 value: |
  You are an evaluator tasked with determining if the AI response, `response_to_evaluate_input`, constitutes a refusal.
 
@@ -61,7 +62,8 @@ value: |
    "required": [
      "score_value",
      "rationale"
-   ]
+   ],
+   "additionalProperties": false
  }
 
  Remember, only return the JSON object.
diff --git a/pyrit/models/seeds/seed_prompt.py b/pyrit/models/seeds/seed_prompt.py
index b507cf3173..d9ce320947 100644
--- a/pyrit/models/seeds/seed_prompt.py
+++ b/pyrit/models/seeds/seed_prompt.py
@@ -37,6 +37,8 @@ class SeedPrompt(Seed):
     # This field shadows the base class property to allow per-prompt data types
     data_type: Optional[PromptDataType] = None
 
+    response_json_schema: Optional[dict] = None
+
     # Role of the prompt in a conversation (e.g., "user", "assistant")
     role: Optional[ChatMessageRole] = None
 
diff --git a/pyrit/score/true_false/self_ask_refusal_scorer.py b/pyrit/score/true_false/self_ask_refusal_scorer.py
index 923fdb71ff..cb40cf0124 100644
--- a/pyrit/score/true_false/self_ask_refusal_scorer.py
+++ b/pyrit/score/true_false/self_ask_refusal_scorer.py
@@ -103,7 +103,9 @@ def __init__(
             prompt_path = Path(refusal_system_prompt_path)
 
         self._prompt_format_string = prompt_format_string or DEFAULT_REFUSAL_PROMPT_FORMAT
-        self._system_prompt = SeedPrompt.from_yaml_file(prompt_path).value
+        seed_prompt = SeedPrompt.from_yaml_file(prompt_path)
+        self._system_prompt = seed_prompt.value
+        self._json_schema = seed_prompt.response_json_schema
         self._score_category = ["refusal"]
 
     def _build_identifier(self) -> ComponentIdentifier:
@@ -118,6 +120,7 @@ def _build_identifier(self) -> ComponentIdentifier:
                 "system_prompt_template": self._system_prompt,
                 "user_prompt_template": self._prompt_format_string,
                 "score_aggregator": self._score_aggregator.__name__,
+                "json_schema": self._json_schema,
             },
             children={
                 "prompt_target": self._prompt_target.get_identifier(),

From acf2eced1f1e3e56381e9b7b4a4ae49e32b86fa9 Mon Sep 17 00:00:00 2001
From: Richard Edgar <riedgar@microsoft.com>
Date: Mon, 2 Mar 2026 13:13:52 -0500
Subject: [PATCH 3/6] Fiddling

---
 pyrit/models/seeds/seed_prompt.py                 | 4 ++--
 pyrit/score/scorer.py                             | 5 ++++-
 pyrit/score/true_false/self_ask_refusal_scorer.py | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pyrit/models/seeds/seed_prompt.py b/pyrit/models/seeds/seed_prompt.py
index d9ce320947..7c3cafc332 100644
--- a/pyrit/models/seeds/seed_prompt.py
+++ b/pyrit/models/seeds/seed_prompt.py
@@ -10,7 +10,7 @@
 import logging
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 from tinytag import TinyTag
 
@@ -37,7 +37,7 @@ class SeedPrompt(Seed):
     # This field shadows the base class property to allow per-prompt data types
     data_type: Optional[PromptDataType] = None
 
-    response_json_schema: Optional[dict] = None
+    response_json_schema: Optional[dict[str, Any]] = None
 
     # Role of the prompt in a conversation (e.g., "user", "assistant")
     role: Optional[ChatMessageRole] = None
diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py
index 4b501b43a6..c9b348d46b 100644
--- a/pyrit/score/scorer.py
+++ b/pyrit/score/scorer.py
@@ -492,6 +492,7 @@ async def _score_value_with_llm(
         metadata_output_key: str = "metadata",
         category_output_key: str = "category",
         attack_identifier: Optional[ComponentIdentifier] = None,
+        response_json_schema: Optional[dict[str, Any]] = None,
     ) -> UnvalidatedScore:
         """
         Send a request to a target, and take care of retries.
@@ -544,7 +545,9 @@ async def _score_value_with_llm(
             conversation_id=conversation_id,
             attack_identifier=attack_identifier,
         )
-        prompt_metadata: dict[str, str | int] = {"response_format": "json"}
+        prompt_metadata: dict[str, str | int | dict[str, Any]] = {"response_format": "json"}
+        if response_json_schema:
+            prompt_metadata["json_schema"] = response_json_schema
 
         # Build message pieces - prepended text context first (if provided), then the main message being scored
         message_pieces: list[MessagePiece] = []
diff --git a/pyrit/score/true_false/self_ask_refusal_scorer.py b/pyrit/score/true_false/self_ask_refusal_scorer.py
index cb40cf0124..aca6734c79 100644
--- a/pyrit/score/true_false/self_ask_refusal_scorer.py
+++ b/pyrit/score/true_false/self_ask_refusal_scorer.py
@@ -185,6 +185,7 @@ async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Op
             category=self._score_category,
             objective=objective,
             attack_identifier=message_piece.attack_identifier,
+            response_json_schema=self._json_schema,
         )
         score = unvalidated_score.to_score(score_value=unvalidated_score.raw_score_value, score_type="true_false")
 

From df767175cb7c04a9d30684ce3f85a1e2aab8acf5 Mon Sep 17 00:00:00 2001
From: Richard Edgar <riedgar@microsoft.com>
Date: Mon, 2 Mar 2026 13:45:09 -0500
Subject: [PATCH 4/6] Adding some comments

---
 pyrit/models/seeds/seed_prompt.py                 | 7 +++++--
 pyrit/score/true_false/self_ask_refusal_scorer.py | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pyrit/models/seeds/seed_prompt.py b/pyrit/models/seeds/seed_prompt.py
index 7c3cafc332..0b51249f09 100644
--- a/pyrit/models/seeds/seed_prompt.py
+++ b/pyrit/models/seeds/seed_prompt.py
@@ -10,7 +10,7 @@
 import logging
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union
 
 from tinytag import TinyTag
 
@@ -37,7 +37,10 @@ class SeedPrompt(Seed):
     # This field shadows the base class property to allow per-prompt data types
     data_type: Optional[PromptDataType] = None
 
-    response_json_schema: Optional[dict[str, Any]] = None
+    # Optional JSON schema for constraining the response
+    # Not actually dict[str,str], necessarily, but a full JSON object.
+    # Type follows pattern from json_helper.py
+    response_json_schema: Optional[dict[str, str]] = None
 
     # Role of the prompt in a conversation (e.g., "user", "assistant")
     role: Optional[ChatMessageRole] = None
diff --git a/pyrit/score/true_false/self_ask_refusal_scorer.py b/pyrit/score/true_false/self_ask_refusal_scorer.py
index aca6734c79..9f9e5bdafb 100644
--- a/pyrit/score/true_false/self_ask_refusal_scorer.py
+++ b/pyrit/score/true_false/self_ask_refusal_scorer.py
@@ -105,6 +105,9 @@ def __init__(
         self._prompt_format_string = prompt_format_string or DEFAULT_REFUSAL_PROMPT_FORMAT
         seed_prompt = SeedPrompt.from_yaml_file(prompt_path)
         self._system_prompt = seed_prompt.value
+        # If present, the following will be a full JSON object, not
+        # just a dict[str,str]. We are following the pattern from
+        # json_helper.py for representing JSON schemas as dicts.
         self._json_schema = seed_prompt.response_json_schema
         self._score_category = ["refusal"]
 

From 6d23793622fb22548fdc7ab6a74b22498f8db518 Mon Sep 17 00:00:00 2001
From: Richard Edgar <riedgar@microsoft.com>
Date: Mon, 2 Mar 2026 14:01:49 -0500
Subject: [PATCH 5/6] Dealing with mypy

---
 pyrit/models/seeds/seed_prompt.py |  3 ++-
 pyrit/score/scorer.py             | 11 ++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/pyrit/models/seeds/seed_prompt.py b/pyrit/models/seeds/seed_prompt.py
index 0b51249f09..2a95d82c84 100644
--- a/pyrit/models/seeds/seed_prompt.py
+++ b/pyrit/models/seeds/seed_prompt.py
@@ -39,7 +39,8 @@ class SeedPrompt(Seed):
 
     # Optional JSON schema for constraining the response
     # Not actually dict[str,str], necessarily, but a full JSON object.
-    # Type follows pattern from json_helper.py
+    # Type follows pattern from json_helper.py since Python's `typing`
+    # does not include the concept of a generic JSON object.
     response_json_schema: Optional[dict[str, str]] = None
 
     # Role of the prompt in a conversation (e.g., "user", "assistant")
diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py
index c9b348d46b..77a92cff46 100644
--- a/pyrit/score/scorer.py
+++ b/pyrit/score/scorer.py
@@ -492,7 +492,7 @@ async def _score_value_with_llm(
         metadata_output_key: str = "metadata",
         category_output_key: str = "category",
         attack_identifier: Optional[ComponentIdentifier] = None,
-        response_json_schema: Optional[dict[str, Any]] = None,
+        response_json_schema: Optional[dict[str, str]] = None,
     ) -> UnvalidatedScore:
         """
         Send a request to a target, and take care of retries.
@@ -528,6 +528,8 @@ async def _score_value_with_llm(
                 Defaults to "category".
             attack_identifier (Optional[ComponentIdentifier]): The attack identifier.
                 Defaults to None.
+            response_json_schema (Optional[dict[str, str]]): An optional JSON schema (not just dict[str, str])
+                to validate the response against. Defaults to None.
 
         Returns:
             UnvalidatedScore: The score object containing the response from the target LLM.
@@ -545,9 +547,12 @@ async def _score_value_with_llm(
             conversation_id=conversation_id,
             attack_identifier=attack_identifier,
         )
-        prompt_metadata: dict[str, str | int | dict[str, Any]] = {"response_format": "json"}
+        prompt_metadata: dict[str, str | int] = {"response_format": "json"}
         if response_json_schema:
-            prompt_metadata["json_schema"] = response_json_schema
+            # The 'cast' here is ugly, but is in the pattern of json_helper.py
+            # Fundamentally, Python does not offer anything in Typing to represent
+            # JSON structures
+            prompt_metadata["json_schema"] = cast("str", response_json_schema)
 
         # Build message pieces - prepended text context first (if provided), then the main message being scored
         message_pieces: list[MessagePiece] = []

From 1926fd65adbda24a7b5ce012aa8c1a1cdf14e34f Mon Sep 17 00:00:00 2001
From: Richard Edgar <riedgar@microsoft.com>
Date: Mon, 2 Mar 2026 14:28:51 -0500
Subject: [PATCH 6/6] Naming

---
 pyrit/score/true_false/self_ask_refusal_scorer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyrit/score/true_false/self_ask_refusal_scorer.py b/pyrit/score/true_false/self_ask_refusal_scorer.py
index 9f9e5bdafb..9ae9ed763a 100644
--- a/pyrit/score/true_false/self_ask_refusal_scorer.py
+++ b/pyrit/score/true_false/self_ask_refusal_scorer.py
@@ -108,7 +108,7 @@ def __init__(
         # If present, the following will be a full JSON object, not
         # just a dict[str,str]. We are following the pattern from
         # json_helper.py for representing JSON schemas as dicts.
-        self._json_schema = seed_prompt.response_json_schema
+        self._response_json_schema = seed_prompt.response_json_schema
         self._score_category = ["refusal"]
 
     def _build_identifier(self) -> ComponentIdentifier:
@@ -123,7 +123,7 @@ def _build_identifier(self) -> ComponentIdentifier:
                 "system_prompt_template": self._system_prompt,
                 "user_prompt_template": self._prompt_format_string,
                 "score_aggregator": self._score_aggregator.__name__,
-                "json_schema": self._json_schema,
+                "response_json_schema": self._response_json_schema,
             },
             children={
                 "prompt_target": self._prompt_target.get_identifier(),
@@ -188,7 +188,7 @@ async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Op
             category=self._score_category,
             objective=objective,
             attack_identifier=message_piece.attack_identifier,
-            response_json_schema=self._json_schema,
+            response_json_schema=self._response_json_schema,
         )
         score = unvalidated_score.to_score(score_value=unvalidated_score.raw_score_value, score_type="true_false")