From 65e8f7be5ef743b0c3b5e4d82fbeddad57fe3f84 Mon Sep 17 00:00:00 2001
From: Rahuldrabit <rahuldrabit@gmail.com>
Date: Sat, 21 Feb 2026 08:57:02 +0600
Subject: [PATCH 1/4] MNT: replace live server calls with mocks in
 tests/test_utils/test_utils.py

---
 tests/test_utils/test_utils.py | 106 +++++++++++++++++++++++++--------
 1 file changed, 81 insertions(+), 25 deletions(-)

diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 38e004bfb..208b94ca1 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -2,8 +2,13 @@
 
 import os
 import unittest.mock
+
+import pandas as pd
 import pytest
+
 import openml
+from openml.evaluations.evaluation import OpenMLEvaluation
+from openml.setups.setup import OpenMLSetup
 from openml.testing import _check_dataset
 
 
@@ -43,24 +48,49 @@ def min_number_evaluations_on_test_server() -> int:
     return 8
 
 
-def _mocked_perform_api_call(call, request_method):
-    url = openml.config.server + "/" + call
-    return openml._api_calls._download_text_file(url)
 
+def _create_mock_listing_call(total_items, item_factory, return_type="dataframe"):
+    def mock_listing_call(limit, offset, **kwargs):
+        if offset >= total_items:
+            return pd.DataFrame() if return_type == "dataframe" else []
+        size = min(limit, total_items - offset)
+        items = [item_factory(i) for i in range(offset, offset + size)]
+        return pd.DataFrame(items) if return_type == "dataframe" else items
+    return mock_listing_call
 
-@pytest.mark.test_server()
-def test_list_all():
+def _mocked_perform_api_call(call, request_method):
+    if call == "data/list/limit/1000/offset/0/data_name/iris/data_version/1":
+        return """<oml:data xmlns:oml="http://openml.org/openml">
+            <oml:dataset>
+                <oml:did>61</oml:did>
+                <oml:name>iris</oml:name>
+                <oml:version>1</oml:version>
+                <oml:status>active</oml:status>
+            </oml:dataset>
+        </oml:data>"""
+    raise ValueError(f"Unexpected call: {call}")
+
+
+@unittest.mock.patch("openml.tasks.functions._list_tasks")
+def test_list_all(mock_list_tasks):
+    mock_list_tasks.side_effect = _create_mock_listing_call(10, lambda i: {"tid": i})
     openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
 
 
-@pytest.mark.test_server()
-def test_list_all_for_tasks(min_number_tasks_on_test_server):
+@unittest.mock.patch("openml.tasks.functions._list_tasks")
+def test_list_all_for_tasks(mock_list_tasks, min_number_tasks_on_test_server):
+    mock_list_tasks.side_effect = _create_mock_listing_call(
+        min_number_tasks_on_test_server, lambda i: {"tid": i}
+    )
     tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server)
     assert min_number_tasks_on_test_server == len(tasks)
 
 
-@pytest.mark.test_server()
-def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
+@unittest.mock.patch("openml.tasks.functions._list_tasks")
+def test_list_all_with_multiple_batches(mock_list_tasks, min_number_tasks_on_test_server):
+    mock_list_tasks.side_effect = _create_mock_listing_call(
+        min_number_tasks_on_test_server, lambda i: {"tid": i}
+    )
     # By setting the batch size one lower than the minimum we guarantee at least two
     # batches and at the same time do as few batches (roundtrips) as possible.
     batch_size = min_number_tasks_on_test_server - 1
@@ -72,8 +102,11 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server):
     assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches)
 
 
-@pytest.mark.test_server()
-def test_list_all_for_datasets(min_number_datasets_on_test_server):
+@unittest.mock.patch("openml.datasets.functions._list_datasets")
+def test_list_all_for_datasets(mock_list_datasets, min_number_datasets_on_test_server):
+    mock_list_datasets.side_effect = _create_mock_listing_call(
+        min_number_datasets_on_test_server, lambda i: {"did": i, "status": "active"}
+    )
     datasets = openml.datasets.list_datasets(
         size=min_number_datasets_on_test_server,
     )
@@ -83,30 +116,47 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server):
         _check_dataset(dataset)
 
 
-@pytest.mark.test_server()
-def test_list_all_for_flows(min_number_flows_on_test_server):
+@unittest.mock.patch("openml.flows.functions._list_flows")
+def test_list_all_for_flows(mock_list_flows, min_number_flows_on_test_server):
+    mock_list_flows.side_effect = _create_mock_listing_call(
+        min_number_flows_on_test_server, lambda i: {"id": i}
+    )
     flows = openml.flows.list_flows(size=min_number_flows_on_test_server)
     assert min_number_flows_on_test_server == len(flows)
 
 
-@pytest.mark.flaky()  # Other tests might need to upload runs first
-@pytest.mark.test_server()
-def test_list_all_for_setups(min_number_setups_on_test_server):
+@unittest.mock.patch("openml.setups.functions._list_setups")
+def test_list_all_for_setups(mock_list_setups, min_number_setups_on_test_server):
+    mock_list_setups.side_effect = _create_mock_listing_call(
+        min_number_setups_on_test_server,
+        lambda i: OpenMLSetup(setup_id=i, flow_id=1, parameters={}),
+        return_type="list"
+    )
     # TODO apparently list_setups function does not support kwargs
     setups = openml.setups.list_setups(size=min_number_setups_on_test_server)
     assert min_number_setups_on_test_server == len(setups)
 
 
-@pytest.mark.flaky()  # Other tests might need to upload runs first
-@pytest.mark.test_server()
-def test_list_all_for_runs(min_number_runs_on_test_server):
+@unittest.mock.patch("openml.runs.functions._list_runs")
+def test_list_all_for_runs(mock_list_runs, min_number_runs_on_test_server):
+    mock_list_runs.side_effect = _create_mock_listing_call(
+        min_number_runs_on_test_server, lambda i: {"run_id": i}
+    )
     runs = openml.runs.list_runs(size=min_number_runs_on_test_server)
     assert min_number_runs_on_test_server == len(runs)
 
 
-@pytest.mark.flaky()  # Other tests might need to upload runs first
-@pytest.mark.test_server()
-def test_list_all_for_evaluations(min_number_evaluations_on_test_server):
+@unittest.mock.patch("openml.evaluations.functions._list_evaluations")
+def test_list_all_for_evaluations(mock_list_evaluations, min_number_evaluations_on_test_server):
+    mock_list_evaluations.side_effect = _create_mock_listing_call(
+        min_number_evaluations_on_test_server,
+        lambda i: OpenMLEvaluation(
+            run_id=i, task_id=1, setup_id=1, flow_id=1, flow_name="flow", data_id=1, data_name="data",
+            function="predictive_accuracy", upload_time="2020-01-01", uploader=1, uploader_name="user",
+            value=0.5, values=None
+        ),
+        return_type="list"
+    )
     # TODO apparently list_evaluations function does not support kwargs
     evaluations = openml.evaluations.list_evaluations(
         function="predictive_accuracy",
@@ -116,7 +166,6 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server):
 
 
 @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call)
-@pytest.mark.test_server()
 def test_list_all_few_results_available(_perform_api_call):
     datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1)
     assert len(datasets) == 1, "only one iris dataset version 1 should be present"
@@ -141,14 +190,21 @@ def test__create_cache_directory(config_mock, tmp_path):
         openml.utils._create_cache_directory("ghi")
 
 
-@pytest.mark.test_server()
-def test_correct_test_server_download_state():
+@unittest.mock.patch("openml.tasks.get_task")
+def test_correct_test_server_download_state(mock_get_task):
     """This test verifies that the test server downloads the data from the correct source.
 
     If this tests fails, it is highly likely that the test server is not configured correctly.
     Usually, this means that the test server is serving data from the task with the same ID from the production server.
     That is, it serves parquet files wrongly associated with the test server's task.
     """
+    mock_task = unittest.mock.Mock()
+    mock_dataset = unittest.mock.Mock()
+    mock_dataset.features = {0: "feature1", 1: "feature2"}
+    mock_dataset.get_data.return_value = (pd.DataFrame({"feature1": [1], "feature2": [2]}), None, None, None)
+    mock_task.get_dataset.return_value = mock_dataset
+    mock_get_task.return_value = mock_task
+
     task = openml.tasks.get_task(119)
     dataset = task.get_dataset()
     assert len(dataset.features) == dataset.get_data()[0].shape[1]

From 73d12dc8ec6305ce04ae2edafdf6b86e4cdd3203 Mon Sep 17 00:00:00 2001
From: Rahul Drabit Chowdhury <104688569+Rahuldrabit@users.noreply.github.com>
Date: Mon, 30 Mar 2026 09:34:07 +0600
Subject: [PATCH 2/4] Update tests/test_utils/test_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 tests/test_utils/test_utils.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 208b94ca1..0fd8039e2 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -151,9 +151,19 @@ def test_list_all_for_evaluations(mock_list_evaluations, min_number_evaluations_
     mock_list_evaluations.side_effect = _create_mock_listing_call(
         min_number_evaluations_on_test_server,
         lambda i: OpenMLEvaluation(
-            run_id=i, task_id=1, setup_id=1, flow_id=1, flow_name="flow", data_id=1, data_name="data",
-            function="predictive_accuracy", upload_time="2020-01-01", uploader=1, uploader_name="user",
-            value=0.5, values=None
+            run_id=i,
+            task_id=1,
+            setup_id=1,
+            flow_id=1,
+            flow_name="flow",
+            data_id=1,
+            data_name="data",
+            function="predictive_accuracy",
+            upload_time="2020-01-01",
+            uploader=1,
+            uploader_name="user",
+            value=0.5,
+            values=None,
         ),
         return_type="list"
     )

From c145567a61bc8698bcad285e0af41f2de248d70f Mon Sep 17 00:00:00 2001
From: Rahul Drabit Chowdhury <104688569+Rahuldrabit@users.noreply.github.com>
Date: Mon, 30 Mar 2026 09:34:25 +0600
Subject: [PATCH 3/4] Update tests/test_utils/test_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 tests/test_utils/test_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 0fd8039e2..7ad073a22 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -48,7 +48,6 @@ def min_number_evaluations_on_test_server() -> int:
     return 8
 
 
-
 def _create_mock_listing_call(total_items, item_factory, return_type="dataframe"):
     def mock_listing_call(limit, offset, **kwargs):
         if offset >= total_items:

From caec46647f48a1fd4ae661ce02c08c8c973c632c Mon Sep 17 00:00:00 2001
From: Rahul Drabit Chowdhury <104688569+Rahuldrabit@users.noreply.github.com>
Date: Mon, 30 Mar 2026 09:34:40 +0600
Subject: [PATCH 4/4] Update tests/test_utils/test_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 tests/test_utils/test_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 7ad073a22..27142324b 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -56,7 +56,6 @@ def mock_listing_call(limit, offset, **kwargs):
         items = [item_factory(i) for i in range(offset, offset + size)]
         return pd.DataFrame(items) if return_type == "dataframe" else items
     return mock_listing_call
-
 def _mocked_perform_api_call(call, request_method):
     if call == "data/list/limit/1000/offset/0/data_name/iris/data_version/1":
         return """<oml:data xmlns:oml="http://openml.org/openml">