Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public void SuperBenchmarkWorkloadProfileParametersAreInlinedCorrectly(string pr
}

[Test]
[TestCase("PERF-GPU-SUPERBENCH.json")]
[TestCase("SETUP-NVIDIA-A100.json")]
public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAndReboot(string profile)
{
List<string> expectedCommands = new List<string>
Expand Down Expand Up @@ -74,7 +74,7 @@ public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAn
}

[Test]
[TestCase("PERF-GPU-SUPERBENCH.json")]
[TestCase("SETUP-NVIDIA-A100.json")]
public async Task SuperBenchmarkWorkloadProfileExecutesTheExpectedDependenciesAndWorkloadsAfterReboot(string profile)
{
IEnumerable<string> expectedCommands = this.GetProfileExpectedCommands(PlatformID.Unix);
Expand Down Expand Up @@ -123,12 +123,7 @@ private IEnumerable<string> GetProfileExpectedCommands(PlatformID platform)
$"sudo bash -c \"{setupCommand}\"",
$"sudo apt-get update",
$"sudo apt-get install -y nvidia-container-toolkit",
$"sudo systemctl restart docker",
$"sudo chmod -R 2777 \"/home/user/tools/VirtualClient\"",
$"sudo git clone -b v0.9.0 https://github.com/microsoft/superbenchmark",
$"sudo bash initialize.sh",
$"sb deploy --host-list localhost -i superbench/superbench:v0.9.0-cuda12.1",
$"sb run --host-list localhost -c default.yaml"
$"sudo systemctl restart docker"
};
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,60 @@ public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallati
Assert.IsTrue(processCount == 5);
}

[Test]
public async Task SuperBenchmarkExecutorExecutesTheCorrectCommandsWithInstallationAndDockerContainerPath()
{
    // Scenario: first run (state reports SuperBenchmarkInitialized = false) with a
    // DockerContainerPath parameter supplied. The path is expected to show up as the
    // second argument of the initialize.sh command.
    this.mockFixture.Parameters = new Dictionary<string, IConvertible>()
    {
        { nameof(SuperBenchmarkExecutor.Version), "0.0.1" },
        { nameof(SuperBenchmarkExecutor.ContainerVersion), "testContainer" },
        { nameof(SuperBenchmarkExecutor.ConfigurationFile), "Test.yaml" },
        { nameof(SuperBenchmarkExecutor.Username), "testuser" },
        { nameof(SuperBenchmarkExecutor.DockerContainerPath), "/docker/path" }
    };

    // Commands are verified in the exact order in which the processes are created.
    List<string> expectedCommands = new List<string>
    {
        $"sudo chmod -R 2777 \"{this.mockFixture.PlatformSpecifics.CurrentDirectory}\"",
        $"sudo git clone -b v0.0.1 https://github.com/microsoft/superbenchmark",
        $"sudo bash initialize.sh testuser /docker/path",
        $"sb deploy --host-list localhost -i testContainer",
        $"sb run --host-list localhost -c Test.yaml"
    };

    int processCount = 0;
    this.mockFixture.ProcessManager.OnCreateProcess = (exe, arguments, workingDir) =>
    {
        // Fail with a readable message (instead of an out-of-range exception) when the
        // executor launches more processes than the test expects.
        Assert.IsTrue(
            processCount < expectedCommands.Count,
            $"Unexpected additional command executed: {exe} {arguments}");

        Assert.AreEqual(expectedCommands[processCount], $"{exe} {arguments}");
        processCount++;

        return new InMemoryProcess
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = exe,
                Arguments = arguments
            },
            ExitCode = 0,
            OnStart = () => true,
            OnHasExited = () => true
        };
    };

    this.mockFixture.StateManager.OnGetState().ReturnsAsync(JObject.FromObject(new SuperBenchmarkExecutor.SuperBenchmarkState()
    {
        SuperBenchmarkInitialized = false
    }));

    using (TestSuperBenchmarkExecutor superBenchmarkExecutor = new TestSuperBenchmarkExecutor(this.mockFixture.Dependencies, this.mockFixture.Parameters))
    {
        await superBenchmarkExecutor.ExecuteAsync(CancellationToken.None).ConfigureAwait(false);
    }

    // AreEqual reports both the expected and the actual count when the executor stops early.
    Assert.AreEqual(expectedCommands.Count, processCount);
}

[Test]
public async Task SuperBenchmarkExecutorSkipsInitializationOfTheWorkloadForExecutionAfterTheFirstRun()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,12 @@ namespace VirtualClient.Actions
using System.Collections.Generic;
using System.IO;
using System.IO.Abstractions;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis;
using Microsoft.Extensions.DependencyInjection;
using VirtualClient.Common;
using VirtualClient.Common.Extensions;
using VirtualClient.Common.Platform;
using VirtualClient.Common.Telemetry;
using VirtualClient.Contracts;
using VirtualClient.Contracts.Metadata;
Expand Down Expand Up @@ -121,6 +119,18 @@ public string OutputDirectory
}
}

/// <summary>
/// Path to hold all docker container data. Evaluates to null when the
/// parameter is not defined (or is defined with a null value).
/// </summary>
public string DockerContainerPath
{
    get
    {
        bool isDefined = this.Parameters.TryGetValue(
            nameof(SuperBenchmarkExecutor.DockerContainerPath),
            out IConvertible configuredPath);

        if (!isDefined || configuredPath == null)
        {
            return null;
        }

        return configuredPath.ToString();
    }
}

/// <summary>
/// Executes the SuperBenchmark workload.
/// </summary>
Expand Down Expand Up @@ -173,7 +183,14 @@ protected override async Task InitializeAsync(EventContext telemetryContext, Can
true);
}

await this.ExecuteSbCommandAsync("bash", $"initialize.sh {this.Username}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
string initializeArgs = $"initialize.sh {this.Username}";

if (!string.IsNullOrEmpty(this.DockerContainerPath))
{
initializeArgs = $"initialize.sh {this.Username} {this.DockerContainerPath}";
}

await this.ExecuteSbCommandAsync("bash", initializeArgs, this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, true);
await this.ExecuteSbCommandAsync("sb", $"deploy --host-list localhost -i {this.ContainerVersion}", this.SuperBenchmarkDirectory, telemetryContext, cancellationToken, false);

state.SuperBenchmarkInitialized = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,49 @@
# Ansible will use sudo which needs explicit password input. This step removes that prompt by
# appending a NOPASSWD rule for the user named in $1 to sudoers (through visudo, with
# 'tee -a' standing in as the editor).
# The rule is double-quoted so that $1 expands to the user name; inside single quotes the
# literal text "$1" would be appended instead. The step is skipped when no user name is
# supplied, because a rule without a user is not a usable sudoers entry.
if [[ -n "${1:-}" ]]; then
    echo "$1 ALL=(ALL) NOPASSWD:ALL" | (sudo EDITOR='tee -a' visudo)
fi
# sb binary might be in this path. This command adds this path to the PATH variable.
# $1 is the first positional argument and is used as the user name under /home.
export PATH=$PATH:/home/$1/.local/bin

# Remove any existing system-installed Ansible to avoid version conflicts.
# '|| true' forces a zero exit status even when there is nothing to remove.
sudo apt remove -y ansible || true
sudo pip3 uninstall -y ansible ansible-base ansible-core || true

# Install ansible-core compatible with Python 3.8 (Ubuntu 20.04).
# --user places the install under the invoking user's home rather than system-wide.
python3 -m pip install --user "ansible-core>=2.12,<2.14"

# Ensure the pip user-installed ansible is in PATH and takes precedence:
# the directory is prepended here, so it is searched before the other PATH entries.
export PATH=/home/$1/.local/bin:$PATH

# Configure Docker to use the data disk at path, unless not provided.
# $2 (optional) is the directory Docker should use as its data-root.
if [[ -n "${2:-}" ]]; then
    DOCKER_DATA_ROOT="$2"
    DAEMON_JSON=/etc/docker/daemon.json
    echo "Configuring Docker data-root at ${DOCKER_DATA_ROOT} ..."

    # Create target path and stop Docker cleanly
    sudo mkdir -p "${DOCKER_DATA_ROOT}"
    sudo systemctl stop docker || true

    # Build the new daemon.json in a scratch file first, so that a failed merge can never
    # leave an empty or truncated configuration in place.
    TMP_JSON=$(mktemp)

    # If jq is present and an existing file exists, merge to preserve other keys. Otherwise,
    # or when the merge itself fails (e.g. the existing file is not valid JSON), write a
    # minimal file that only sets data-root.
    if ! { command -v jq >/dev/null 2>&1 && [[ -f "${DAEMON_JSON}" ]] \
        && sudo jq --arg dr "${DOCKER_DATA_ROOT}" '. + { "data-root": $dr }' "${DAEMON_JSON}" > "${TMP_JSON}"; }; then
        echo "{\"data-root\": \"${DOCKER_DATA_ROOT}\"}" > "${TMP_JSON}"
    fi

    # Use install rather than mv: mv would carry over the scratch file's owner (the invoking
    # user) and its 0600 mode, leaving the daemon configuration writable by a non-root user.
    sudo install -o root -g root -m 0644 "${TMP_JSON}" "${DAEMON_JSON}"
    rm -f "${TMP_JSON}"

    # Start Docker back up
    sudo systemctl start docker

    # (Optional) Warm-up/check NVIDIA devices as you had in the commented section
    # sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base nvidia-smi
else
    echo "No second argument provided; skipping Docker data-root configuration."
fi

# Command to install sb dependencies.
# Installs the package located in the current working directory (".") through pip.
python3 -m pip install .

# Command to build sb.
# Runs the 'postinstall' make target from the current working directory.
make postinstall

# This command initiates /dev/nvidiactl and /dev/nvidia-uvm directories, which sb checks before running.
# The container only runs nvidia-smi and is removed again once it exits (--rm).
sudo docker run --rm --gpus all nvidia/cuda:11.0.3-base nvidia-smi
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
"RecommendedMinimumExecutionTime": "08:00:00",
"SupportedPlatforms": "linux-x64",
"SupportedOperatingSystems": "Ubuntu",
"SpecialRequirements": "This is an Nvidia GPU-specialized workload. It depends upon the system having an Nvidia GPU card/chip."
"SpecialRequirements": "This is an Nvidia GPU-specialized workload. It depends upon the system having an Nvidia GPU card/chip. Use a SETUP profile for GPU setup."
},
"Parameters": {
"ConfigurationFile": "default.yaml",
"Username": "",
"LinuxCudaVersion": "12.0",
"LinuxDriverVersion": "525",
"LinuxLocalRunFile": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run"
"DockerContainerPath": null
},
"Actions": [
{
Expand All @@ -21,44 +19,10 @@
"Scenario": "Models",
"Username": "$.Parameters.Username",
"Version": "0.9.0",
"DockerContainerPath": "$.Parameters.DockerContainerPath",
"ConfigurationFile": "$.Parameters.ConfigurationFile",
"ContainerVersion": "superbench/superbench:v0.9.0-cuda12.1"
}
}
],
"Dependencies": [
{
"Type": "NvidiaCudaInstallation",
"Parameters": {
"Scenario": "InstallNvidiaCuda",
"LinuxCudaVersion": "$.Parameters.LinuxCudaVersion",
"LinuxDriverVersion": "$.Parameters.LinuxDriverVersion",
"Username": "$.Parameters.Username",
"LinuxLocalRunFile": "$.Parameters.LinuxLocalRunFile"
}
},
{
"Type": "DockerInstallation",
"Parameters": {
"Scenario": "InstallDocker"
}
},
{
"Type": "NvidiaContainerToolkitInstallation",
"Parameters": {
"Scenario": "InstallNvidiaContainerToolkit"
}
},
{
"Type": "LinuxPackageInstallation",
"Parameters": {
"Scenario": "InstallLinuxPackages",
"Packages": "sshpass,python3-pip",
"Packages-Apt": "nvidia-common",
"Packages-Dnf": "nvidia-driver",
"Packages-Yum": "nvidia-driver",
"Packages-Zypper": ""
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"Description": "NVIDIA A100 GPU Driver Installation Dependency",
"Metadata": {
"RecommendedMinimumExecutionTime": "00:10:00",
"SupportedPlatforms": "linux-x64",
"SupportedOperatingSystems": "Linux",
"SupportedLinuxGpuModel": "NVIDIA A100",
"SupportedLinuxDistros": "Ubuntu20",
"SpecialRequirements": "This is an NVIDIA GPU Driver dependency. It can only be installed on a system that has an NVIDIA A100 GPU card/chip."
},
"Parameters": {
"ConfigurationFile": "default.yaml",
"Username": "",
"LinuxCudaVersion": "12.0",
"LinuxDriverVersion": "525",
"LinuxLocalRunFile": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run"
},
"Dependencies": [
{
"Type": "DockerInstallation",
"Parameters": {
"Scenario": "InstallDocker"
}
},
{
"Type": "NvidiaCudaInstallation",
"Parameters": {
"Scenario": "InstallNvidiaCuda",
"LinuxCudaVersion": "$.Parameters.LinuxCudaVersion",
"LinuxDriverVersion": "$.Parameters.LinuxDriverVersion",
"Username": "$.Parameters.Username",
"LinuxLocalRunFile": "$.Parameters.LinuxLocalRunFile"
}
},
{
"Type": "NvidiaContainerToolkitInstallation",
"Parameters": {
"Scenario": "InstallNvidiaContainerToolkit"
}
},
{
"Type": "LinuxPackageInstallation",
"Parameters": {
"Scenario": "InstallLinuxPackages",
"Packages": "sshpass,python3-pip",
"Packages-Apt": "nvidia-common",
"Packages-Dnf": "nvidia-driver",
"Packages-Yum": "nvidia-driver",
"Packages-Zypper": ""
}
}
]
}