From 264aec8091f16707cc5c4e17203e6359aa0c5373 Mon Sep 17 00:00:00 2001
From: Roman Lutz
Date: Sun, 1 Mar 2026 05:57:43 -0800
Subject: [PATCH] Add OR-Bench dataset loader

Add remote dataset loader for OR-Bench (bench-llm/OR-Bench), an
over-refusal benchmark that tests whether language models wrongly refuse
safe prompts. Supports both or-bench-hard-1k and or-bench-toxic
configurations.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 doc/code/datasets/1_loading_datasets.ipynb    | 43 ++++----
 .../datasets/seed_datasets/remote/__init__.py |  4 +
 .../seed_datasets/remote/or_bench_dataset.py  | 98 +++++++++++++++++++
 tests/unit/datasets/test_or_bench_dataset.py  | 66 +++++++++++++
 4 files changed, 187 insertions(+), 24 deletions(-)
 create mode 100644 pyrit/datasets/seed_datasets/remote/or_bench_dataset.py
 create mode 100644 tests/unit/datasets/test_or_bench_dataset.py

diff --git a/doc/code/datasets/1_loading_datasets.ipynb b/doc/code/datasets/1_loading_datasets.ipynb
index e692089df..201c836ff 100644
--- a/doc/code/datasets/1_loading_datasets.ipynb
+++ b/doc/code/datasets/1_loading_datasets.ipynb
@@ -57,6 +57,7 @@
     " 'ml_vlsu',\n",
     " 'mlcommons_ailuminate',\n",
     " 'multilingual_vulnerability',\n",
+    " 'or_bench',\n",
     " 'pku_safe_rlhf',\n",
     " 'promptintel',\n",
     " 'psfuzz_steal_system_prompt',\n",
@@ -100,40 +101,32 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-    "\r\n",
-    "Loading datasets - this can take a few minutes:   0%|          | 0/49 [00:00
[... remainder of the notebook output hunk and the __init__.py additions not shown ...]
diff --git a/pyrit/datasets/seed_datasets/remote/or_bench_dataset.py b/pyrit/datasets/seed_datasets/remote/or_bench_dataset.py
new file mode 100644
--- /dev/null
+++ b/pyrit/datasets/seed_datasets/remote/or_bench_dataset.py
@@ -0,0 +1,98 @@
[... file header, imports, class declaration, and __init__ not shown ...]
+    @property
+    def dataset_name(self) -> str:
+        """Return the dataset name."""
+        return "or_bench"
+
+    async def fetch_dataset(self, *, cache: bool = True) -> SeedDataset:
+        """
+        Fetch OR-Bench dataset from HuggingFace and return as SeedDataset.
+
+        Args:
+            cache: Whether to cache the fetched dataset. Defaults to True.
+
+        Returns:
+            SeedDataset: A SeedDataset containing the OR-Bench prompts.
+        """
+        logger.info(f"Loading OR-Bench dataset from {self.hf_dataset_name} (config={self.config})")
+
+        data = await self._fetch_from_huggingface(
+            dataset_name=self.hf_dataset_name,
+            config=self.config,
+            split=self.split,
+            cache=cache,
+        )
+
+        authors = ["Justin Cui", "Wei-Lin Chiang", "Ion Stoica", "Cho-Jui Hsieh"]
+        description = (
+            "OR-Bench is an over-refusal benchmark that tests whether language models wrongly "
+            "refuse safe prompts. It provides prompts that models should handle without refusing."
+        )
+
+        seed_prompts = [
+            SeedPrompt(
+                value=item["prompt"],
+                data_type="text",
+                dataset_name=self.dataset_name,
+                harm_categories=[item["category"]],
+                description=description,
+                source=f"https://huggingface.co/datasets/{self.hf_dataset_name}",
+                authors=authors,
+                groups=["UCLA", "UC Berkeley"],
+                metadata={"or_bench_config": self.config},
+            )
+            for item in data
+        ]
+
+        logger.info(f"Successfully loaded {len(seed_prompts)} prompts from OR-Bench dataset")
+
+        return SeedDataset(seeds=seed_prompts, dataset_name=self.dataset_name)
diff --git a/tests/unit/datasets/test_or_bench_dataset.py b/tests/unit/datasets/test_or_bench_dataset.py
new file mode 100644
index 000000000..3c898a4ce
--- /dev/null
+++ b/tests/unit/datasets/test_or_bench_dataset.py
@@ -0,0 +1,66 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from unittest.mock import patch
+
+import pytest
+
+from pyrit.datasets.seed_datasets.remote.or_bench_dataset import _ORBenchDataset
+from pyrit.models import SeedDataset, SeedPrompt
+
+
+@pytest.fixture
+def mock_or_bench_data():
+    """Mock data for OR-Bench dataset."""
+    return [
+        {
+            "prompt": "What are some techniques to influence decisions?",
+            "category": "deception",
+        },
+        {
+            "prompt": "How can I craft a controversial headline?",
+            "category": "deception",
+        },
+    ]
+
+
+class TestORBenchDataset:
+    """Test the OR-Bench dataset loader."""
+
+    @pytest.mark.asyncio
+    async def test_fetch_dataset(self, mock_or_bench_data):
+        """Test fetching OR-Bench dataset."""
+        loader = _ORBenchDataset()
+
+        with patch.object(loader, "_fetch_from_huggingface", return_value=mock_or_bench_data):
+            dataset = await loader.fetch_dataset()
+
+        assert isinstance(dataset, SeedDataset)
+        assert len(dataset.seeds) == 2
+        assert all(isinstance(p, SeedPrompt) for p in dataset.seeds)
+
+        first_prompt = dataset.seeds[0]
+        assert first_prompt.value == "What are some techniques to influence decisions?"
+        assert first_prompt.harm_categories == ["deception"]
+        assert first_prompt.metadata["or_bench_config"] == "or-bench-hard-1k"
+
+    def test_dataset_name(self):
+        """Test dataset_name property."""
+        loader = _ORBenchDataset()
+        assert loader.dataset_name == "or_bench"
+
+    @pytest.mark.asyncio
+    async def test_fetch_dataset_with_toxic_config(self, mock_or_bench_data):
+        """Test fetching with toxic config."""
+        loader = _ORBenchDataset(config="or-bench-toxic")
+
+        with patch.object(loader, "_fetch_from_huggingface", return_value=mock_or_bench_data) as mock_fetch:
+            dataset = await loader.fetch_dataset()
+
+        assert len(dataset.seeds) == 2
+        mock_fetch.assert_called_once()
+        call_kwargs = mock_fetch.call_args.kwargs
+        assert call_kwargs["config"] == "or-bench-toxic"
+
+        first_prompt = dataset.seeds[0]
+        assert first_prompt.metadata["or_bench_config"] == "or-bench-toxic"
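
A quick, illustrative way to exercise the new loader outside the test suite (this snippet is not part of the patch): construct the private _ORBenchDataset class directly, the same way the unit tests do, and await fetch_dataset(). It assumes the patch is applied and that the bench-llm/OR-Bench dataset on HuggingFace is reachable; in normal PyRIT usage the dataset would presumably be loaded by its registered name 'or_bench' through the library's dataset-loading helpers, as in the 1_loading_datasets notebook.

# Illustrative sketch, not part of the patch: drive the new loader directly,
# mirroring the unit tests. Requires the patch to be applied and network
# access to the bench-llm/OR-Bench dataset on HuggingFace.
import asyncio

from pyrit.datasets.seed_datasets.remote.or_bench_dataset import _ORBenchDataset


async def main() -> None:
    # Default configuration is or-bench-hard-1k; pass config="or-bench-toxic"
    # to load the toxic prompts instead (see test_fetch_dataset_with_toxic_config).
    loader = _ORBenchDataset()
    dataset = await loader.fetch_dataset()

    print(f"{loader.dataset_name}: {len(dataset.seeds)} prompts")
    first = dataset.seeds[0]
    print(first.value)
    print(first.harm_categories, first.metadata["or_bench_config"])


if __name__ == "__main__":
    asyncio.run(main())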