From a4d309056e3c4b22aa58d2cf0ec94e2d90b13b70 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 9 Mar 2026 23:52:19 +0100 Subject: [PATCH] Support multi-cluster operation in Slurm backends --- docs/config_reference.rst | 18 +++++++++++++++++- reframe/core/schedulers/slurm.py | 18 +++++++++++++++--- reframe/schemas/config.json | 5 +++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 4299ca224..eb7fda441 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -431,6 +431,22 @@ System Partition Configuration List of hosts in a partition that uses the ``ssh`` scheduler. +.. py:attribute:: systems.partitions.sched_options.slurm_multi_cluster_mode + + :required: No + :default: ``[]`` + + List of Slurm clusters to poll for submitted jobs. + + If empty, only the local cluster is considered. + If the single value ``all`` is passed, then all clusters will be considered. + This is translated directly to Slurm's ``-M`` option passed to the ``sacct`` or ``squeue`` commands. + + This option is relevant only for the Slurm backends. + + .. versionadded:: 4.10 + + .. py:attribute:: systems.partitions.sched_options.ignore_reqnodenotavail :required: No @@ -1647,7 +1663,7 @@ The additional properties for the ``httpjson`` handler are the following: These may depend on the server configuration. .. note:: - If you specify an authorization header here, it will be evaluated at the start of the test session and potentially expire. + If you specify an authorization header here, it will be evaluated at the start of the test session and potentially expire. Consider using the :attr:`~config.logging.handlers_perflog..httpjson..authorization_header` parameter instead for dynamic authorization headers. .. versionadded:: 4.2 diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index ac18c2eda..9fc834693 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -147,6 +147,7 @@ def __init__(self): self._sched_access_in_submit = self.get_option( 'sched_access_in_submit' ) + self._multi_clusters = self.get_option('slurm_multi_cluster_mode') self._available_states = { 'ALLOCATED', 'COMPLETING', @@ -155,6 +156,15 @@ def __init__(self): 'RESERVED' } + # Define the base sacct and squeue commands to account for Slurm's + # multiple cluster mode if enabled + self._sacct = 'sacct' + self._squeue = 'squeue' + if self._multi_clusters: + clusters = ",".join(self._multi_clusters) + self._sacct += f' -M {clusters}' + self._squeue += f' -M {clusters}' + def make_job(self, *args, **kwargs): return _SlurmJob(*args, **kwargs) @@ -491,7 +501,7 @@ def poll(self, *jobs): ) try: completed = _run_strict( - f'sacct -S {t_start} -P ' + f'{self._sacct} -S {t_start} -P ' f'-j {",".join(job.jobid for job in jobs)} ' f'-o jobid,state,exitcode,end,nodelist' ) @@ -570,7 +580,9 @@ def _cancel_if_blocked(self, job, reasons=None): return if not reasons: - completed = osext.run_command('squeue -h -j %s -o %%r' % job.jobid) + completed = osext.run_command( + f'{self._squeue} -h -j {job.jobid} -o %r' + ) reasons = completed.stdout.splitlines() if not reasons: # Can't retrieve job's state. Perhaps it has finished already @@ -677,7 +689,7 @@ def poll(self, *jobs): # finished already, squeue might return an error about an invalid # job id. completed = osext.run_command( - f'squeue -h -j {",".join(job.jobid for job in jobs)} ' + f'{self._squeue} -h -j {",".join(job.jobid for job in jobs)} ' f'-o "%%i|%%T|%%N|%%r"' ) diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index fc774f8b6..51a8907d8 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -119,6 +119,10 @@ "type": "array", "items": {"type": "string"} }, + "slurm_multi_cluster_mode": { + "type": "array", + "items": {"type": "string"} + }, "sched_access_in_submit": {"type": "boolean"}, "unqualified_hostnames": {"type": "boolean"}, "use_nodes_option": {"type": "boolean"} @@ -708,6 +712,7 @@ "systems*/sched_options/job_submit_timeout": 60, "systems*/sched_options/max_sacct_failures": 3, "systems*/sched_options/sched_access_in_submit": false, + "systems*/sched_options/slurm_multi_cluster_mode": [], "systems*/sched_options/ssh_hosts": [], "systems*/sched_options/resubmit_on_errors": [], "systems*/sched_options/unqualified_hostnames": false,