diff --git a/src/VirtualClient/VirtualClient.Actions.FunctionalTests/SuperBenchmarkProfileTests.cs b/src/VirtualClient/VirtualClient.Actions.FunctionalTests/SuperBenchmarkProfileTests.cs
index 92cabfc783..96ea2a4a99 100644
--- a/src/VirtualClient/VirtualClient.Actions.FunctionalTests/SuperBenchmarkProfileTests.cs
+++ b/src/VirtualClient/VirtualClient.Actions.FunctionalTests/SuperBenchmarkProfileTests.cs
@@ -39,7 +39,7 @@ public void SuperBenchmarkWorkloadProfileParametersAreInlinedCorrectly(string pr
         }
 
         [Test]
-        [TestCase("PERF-GPU-SUPERBENCH.json")]
+        [TestCase("SETUP-NVIDIA-A100.json")]
         public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAndReboot(string profile)
         {
             List<string> expectedCommands = new List<string>
@@ -74,7 +74,7 @@ public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAn
         }
 
         [Test]
-        [TestCase("PERF-GPU-SUPERBENCH.json")]
+        [TestCase("SETUP-NVIDIA-A100.json")]
         public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAndWorkloadsAfterReboot(string profile)
         {
             IEnumerable<string> expectedCommands = this.GetProfileExpectedCommands(PlatformID.Unix);
@@ -123,12 +123,7 @@ private IEnumerable<string> GetProfileExpectedCommands(PlatformID platform)
                 $"sudo bash -c \"{setupCommand}\"",
                 $"sudo apt-get update",
                 $"sudo apt-get install -y nvidia-container-toolkit",
-                $"sudo systemctl restart docker",
-                $"sudo chmod -R 2777 \"/home/user/tools/VirtualClient\"",
-                $"sudo git clone -b v0.9.0 https://github.com/microsoft/superbenchmark",
-                $"sudo bash initialize.sh",
-                $"sb deploy --host-list localhost -i superbench/superbench:v0.9.0-cuda12.1",
-                $"sb run --host-list localhost -c default.yaml"
+                $"sudo systemctl restart docker"
             };
         }
     }
diff --git a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs
index ed727f84f8..92523d692c 100644
--- a/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs
+++ b/src/VirtualClient/VirtualClient.Actions.UnitTests/SuperBenchmark/SuperBenchmarkExecutorTests.cs
@@ -260,6 +260,60 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati
             Assert.IsTrue(processCount == 5);
         }
 
+        [Test]
+        public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallationAndDockerContainerPath()
+        {
+            this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
+            {
+                { nameof(SuperBenchmarkExecutor.Version), "0.0.1" },
+                { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" },
+                { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" },
+                { nameof(SuperBenchmarkExecutor.Username), "testuser" },
+                { nameof(SuperBenchmarkExecutor.DockerContainerPath), "/docker/path" }
+            };
+
+            List<string> expectedCommands = new List<string>
+            {
+                $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"",
+                $"sudo git clone -b v0.0.1 https://github.com/microsoft/superbenchmark",
+                // The docker container path is appended (quoted) as the second argument to initialize.sh.
+                $"sudo bash initialize.sh testuser \"/docker/path\"",
+                $"sb deploy --host-list localhost -i testContainer",
+                $"sb run --host-list localhost -c Test.yaml"
+            };
+
+            int processCount = 0;
+            this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
+            {
+                Assert.AreEqual(expectedCommands.ElementAt(processCount), $"{exe} {arguments}");
+                processCount++;
+
+                return new InMemoryProcess
+                {
+                    StartInfo = new ProcessStartInfo
+                    {
+                        FileName = exe,
+                        Arguments = arguments
+                    },
+                    ExitCode = 0,
+                    OnStart = () => true,
+                    OnHasExited = () => true
+                };
+            };
+
+            this.mockFixture.StateManager.OnGetState().ReturnsAsync(JObject.FromObject(new SuperBenchmarkExecutor.SuperBenchmarkState()
+            {
+                SuperBenchmarkInitialized = false
+            }));
+
+            using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
+            {
+                await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
+            }
+
+            Assert.AreEqual(expectedCommands.Count, processCount);
+        }
+
         [Test]
         public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRun()
         {
diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs
index afd8c2ea60..1010966216 100644
--- a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs
+++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/SuperBenchmarkExecutor.cs
@@ -7,14 +7,12 @@ namespace VirtualClient.Actions
     using System.Collections.Generic;
     using System.IO;
     using System.IO.Abstractions;
-    using System.Runtime.InteropServices;
     using System.Threading;
     using System.Threading.Tasks;
     using Microsoft.CodeAnalysis;
     using Microsoft.Extensions.DependencyInjection;
     using VirtualClient.Common;
     using VirtualClient.Common.Extensions;
-    using VirtualClient.Common.Platform;
     using VirtualClient.Common.Telemetry;
     using VirtualClient.Contracts;
     using VirtualClient.Contracts.Metadata;
@@ -121,6 +119,18 @@ public string OutputDirectory
             }
         }
 
+        /// <summary>
+        /// Optional path passed to initialize.sh as the Docker data-root. Null when the parameter is not defined.
+        /// </summary>
+        public string DockerContainerPath
+        {
+            get
+            {
+                this.Parameters.TryGetValue(nameof(SuperBenchmarkExecutor.DockerContainerPath), out IConvertible dockerContainerPath);
+                return dockerContainerPath?.ToString();
+            }
+        }
+
         /// <summary>
         /// Executes the SuperBenchmark workload.
         /// </summary>
@@ -173,7 +183,15 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
                         true);
                 }
 
-                await this.ExecuteSbCommandAsync("bash", $"initialize.sh {this.Username}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
+                string initializeArgs = $"initialize.sh {this.Username}";
+
+                if (!string.IsNullOrWhiteSpace(this.DockerContainerPath))
+                {
+                    // Quote the path so that a location containing spaces reaches the script as a single argument.
+                    initializeArgs += $" \"{this.DockerContainerPath}\"";
+                }
+
+                await this.ExecuteSbCommandAsync("bash", initializeArgs, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
                 await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false);
 
                 state.SuperBenchmarkInitialized = true;
diff --git a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh
index cfd7e68560..4e2d9262b1 100644
--- a/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh
+++ b/src/VirtualClient/VirtualClient.Actions/SuperBenchmark/initialize.sh
@@ -1,10 +1,50 @@
 # Ansible will use sudo which needs explicit password input. This command removes that step.
-echo '$1 ALL=(ALL) NOPASSWD:ALL' | (sudo EDITOR='tee -a' visudo)
-# sb binary might be in this path. This command adds this path to the PATH variable.
-export PATH=$PATH:/home/$1/.local/bin
+# Double quotes are required so that $1 (the username) is expanded before it is written to sudoers.
+echo "$1 ALL=(ALL) NOPASSWD:ALL" | (sudo EDITOR='tee -a' visudo)
+
+# Remove any existing system-installed Ansible to avoid version conflicts
+sudo apt remove -y ansible || true
+sudo pip3 uninstall -y ansible ansible-base ansible-core || true
+
+# Install ansible-core compatible with Python 3.8 (Ubuntu 20.04)
+python3 -m pip install --user "ansible-core>=2.12,<2.14"
+
+# Ensure the pip user-installed ansible is in PATH and takes precedence
+export PATH=/home/$1/.local/bin:$PATH
+
+# Configure Docker to use the data disk path given as the second argument, if provided
+if [[ -n "${2:-}" ]]; then
+    DOCKER_DATA_ROOT="$2"
+    echo "Configuring Docker data-root at ${DOCKER_DATA_ROOT} ..."
+
+    # Create target path and stop Docker cleanly
+    sudo mkdir -p "${DOCKER_DATA_ROOT}"
+    sudo systemctl stop docker || true
+
+    # Build the new daemon.json in a temp file first.
+    # If jq is present and a daemon.json already exists, merge to preserve other keys;
+    # otherwise (or if the merge fails, e.g. invalid JSON) fall back to a minimal file.
+    TMP_JSON=$(mktemp)
+    if ! { command -v jq >/dev/null 2>&1 && [[ -f /etc/docker/daemon.json ]] \
+        && sudo jq --arg dr "${DOCKER_DATA_ROOT}" '. + { "data-root": $dr }' /etc/docker/daemon.json > "${TMP_JSON}"; }; then
+        echo "{\"data-root\": \"${DOCKER_DATA_ROOT}\"}" > "${TMP_JSON}"
+    fi
+
+    # Install only a complete file so that a failed merge can never leave daemon.json empty.
+    sudo install -D -m 0644 "${TMP_JSON}" /etc/docker/daemon.json
+    rm -f "${TMP_JSON}"
+
+    # Start Docker back up
+    sudo systemctl start docker
+else
+    echo "No second argument provided; skipping Docker data-root configuration."
+fi
+
 # Command to install sb dependencies.
 python3 -m pip install .
+
 # Command to build sb.
 make postinstall
+
 # This command initiates /dev/nvidiactl and /dev/nvidia-uvm directories, which sb checks before running.
 sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base nvidia-smi
\ No newline at end of file
diff --git a/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json b/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json
index 42ce2f08dd..349cef0664 100644
--- a/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json
+++ b/src/VirtualClient/VirtualClient.Main/profiles/PERF-GPU-SUPERBENCH.json
@@ -5,14 +5,12 @@
         "RecommendedMinimumExecutionTime": "08:00:00",
         "SupportedPlatforms": "linux-x64",
         "SupportedOperatingSystems": "Ubuntu",
-        "SpecialRequirements": "This is an Nvidia GPU-specialized workload. It depends upon the system having an Nvidia GPU card/chip."
+        "SpecialRequirements": "This is an Nvidia GPU-specialized workload. It depends upon the system having an Nvidia GPU card/chip. Use a SETUP profile (e.g. SETUP-NVIDIA-A100.json) to install the GPU driver, Docker and the NVIDIA container toolkit before running this profile."
     },
     "Parameters": {
         "ConfigurationFile": "default.yaml",
         "Username": "",
-        "LinuxCudaVersion": "12.0",
-        "LinuxDriverVersion": "525",
-        "LinuxLocalRunFile": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run"
+        "DockerContainerPath": null
     },
     "Actions": [
         {
@@ -21,44 +19,10 @@
                 "Scenario": "Models",
                 "Username": "$.Parameters.Username",
                 "Version": "0.9.0",
+                "DockerContainerPath": "$.Parameters.DockerContainerPath",
                 "ConfigurationFile": "$.Parameters.ConfigurationFile",
                 "ContainerVersion": "superbench/superbench:v0.9.0-cuda12.1"
             }
         }
-    ],
-    "Dependencies": [
-        {
-            "Type": "NvidiaCudaInstallation",
-            "Parameters": {
-                "Scenario": "InstallNvidiaCuda",
-                "LinuxCudaVersion": "$.Parameters.LinuxCudaVersion",
-                "LinuxDriverVersion": "$.Parameters.LinuxDriverVersion",
-                "Username": "$.Parameters.Username",
-                "LinuxLocalRunFile": "$.Parameters.LinuxLocalRunFile"
-            }
-        },
-        {
-            "Type": "DockerInstallation",
-            "Parameters": {
-                "Scenario": "InstallDocker"
-            }
-        },
-        {
-            "Type": "NvidiaContainerToolkitInstallation",
-            "Parameters": {
-                "Scenario": "InstallNvidiaContainerToolkit"
-            }
-        },
-        {
-            "Type": "LinuxPackageInstallation",
-            "Parameters": {
-                "Scenario": "InstallLinuxPackages",
-                "Packages": "sshpass,python3-pip",
-                "Packages-Apt": "nvidia-common",
-                "Packages-Dnf": "nvidia-driver",
-                "Packages-Yum": "nvidia-driver",
-                "Packages-Zypper": ""
-            }
-        }
     ]
 }
diff --git a/src/VirtualClient/VirtualClient.Main/profiles/SETUP-NVIDIA-A100.json b/src/VirtualClient/VirtualClient.Main/profiles/SETUP-NVIDIA-A100.json
new file mode 100644
index 0000000000..2f42a941e1
--- /dev/null
+++ b/src/VirtualClient/VirtualClient.Main/profiles/SETUP-NVIDIA-A100.json
@@ -0,0 +1,52 @@
+{
+    "Description": "NVIDIA A100 GPU Driver Installation Dependency",
+    "Metadata": {
+        "RecommendedMinimumExecutionTime": "00:10:00",
+        "SupportedPlatforms": "linux-x64",
+        "SupportedOperatingSystems": "Linux",
+        "SupportedLinuxGpuModel": "NVIDIA A100",
+        "SupportedLinuxDistros": "Ubuntu20",
+        "SpecialRequirements": "This is an NVIDIA GPU Driver dependency. It can only be installed on the system having an NVIDIA A100 GPU card/chip."
+    },
+    "Parameters": {
+        "Username": "",
+        "LinuxCudaVersion": "12.0",
+        "LinuxDriverVersion": "525",
+        "LinuxLocalRunFile": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run"
+    },
+    "Dependencies": [
+        {
+            "Type": "DockerInstallation",
+            "Parameters": {
+                "Scenario": "InstallDocker"
+            }
+        },
+        {
+            "Type": "NvidiaCudaInstallation",
+            "Parameters": {
+                "Scenario": "InstallNvidiaCuda",
+                "LinuxCudaVersion": "$.Parameters.LinuxCudaVersion",
+                "LinuxDriverVersion": "$.Parameters.LinuxDriverVersion",
+                "Username": "$.Parameters.Username",
+                "LinuxLocalRunFile": "$.Parameters.LinuxLocalRunFile"
+            }
+        },
+        {
+            "Type": "NvidiaContainerToolkitInstallation",
+            "Parameters": {
+                "Scenario": "InstallNvidiaContainerToolkit"
+            }
+        },
+        {
+            "Type": "LinuxPackageInstallation",
+            "Parameters": {
+                "Scenario": "InstallLinuxPackages",
+                "Packages": "sshpass,python3-pip",
+                "Packages-Apt": "nvidia-common",
+                "Packages-Dnf": "nvidia-driver",
+                "Packages-Yum": "nvidia-driver",
+                "Packages-Zypper": ""
+            }
+        }
+    ]
+}
\ No newline at end of file