Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions cmd/amd-ctk/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ package main

import (
"fmt"
"log/slog"
"os"

"github.com/ROCm/container-toolkit/cmd/amd-ctk/cdi"
"github.com/ROCm/container-toolkit/cmd/amd-ctk/gpu-tracker"
gpuTracker "github.com/ROCm/container-toolkit/cmd/amd-ctk/gpu-tracker"
"github.com/ROCm/container-toolkit/cmd/amd-ctk/runtime"
"github.com/ROCm/container-toolkit/internal/logger"
"github.com/urfave/cli/v2"
)

Expand All @@ -33,6 +33,10 @@ var (
GitCommit = "none"
)

// options holds the values parsed from the top-level amd-ctk command-line
// flags.
type options struct {
// debug is populated by the --debug / -d flag; when true, the default
// slog logger is configured at debug level instead of info.
debug bool
}

func showVersion() *cli.Command {
showVersionCmd := cli.Command{
Name: "version",
Expand All @@ -48,14 +52,33 @@ func showVersion() *cli.Command {
}

func main() {
logger.Init(false)
opts := options{}

// Create the top-level CLI tree
amdCtkCli := &cli.App{
Name: "AMD Container Toolkit CLI",
EnableBashCompletion: true,
Usage: "Tool to configure AMD Container Toolkit",
UsageText: "amd-ctk [command] [options]",
Flags: []cli.Flag{
&cli.BoolFlag{
Name: "debug",
Aliases: []string{"d"},
Usage: "Enable debug output",
Destination: &opts.debug,
},
},
// Before configures the process-wide slog default logger from the parsed
// global flags. NOTE(review): relies on urfave/cli invoking Before after
// flag parsing and ahead of the selected subcommand — confirm against the
// urfave/cli v2 documentation.
Before: func(c *cli.Context) error {
// Log at info level unless --debug was supplied.
level := slog.LevelInfo
if opts.debug {
level = slog.LevelDebug
}
// Emit text-formatted records on stderr, filtered at the chosen level.
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
})
// Install as the default so package-level slog calls use this handler.
slog.SetDefault(slog.New(handler))
return nil
},
}

// Add subcommands
Expand Down
15 changes: 8 additions & 7 deletions cmd/container-runtime/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,36 +17,37 @@
package main

import (
"log/slog"
"os"

"github.com/ROCm/container-toolkit/internal/gpu-tracker"
gpuTracker "github.com/ROCm/container-toolkit/internal/gpu-tracker"
"github.com/ROCm/container-toolkit/internal/logger"
"github.com/ROCm/container-toolkit/internal/runtime"
)

func main() {
logger.Init(false)
logger.Log.Printf("Creating ROCm container runtime with args %v", os.Args)
slog.Info("Creating ROCm container runtime", "args", os.Args)

rt, err := runtime.New(os.Args)
if err != nil {
logger.Log.Printf("Failed to create container runtime, err = %v", err)
slog.Error("Failed to create container runtime", "error", err)
gpuTracker, err := gpuTracker.New()
if err != nil {
logger.Log.Printf("Failed to create GPU tracker, err = %v", err)
slog.Error("Failed to create GPU tracker", "error", err)
os.Exit(1)
}
gpuTracker.ReleaseGPUs(os.Args[len(os.Args)-1])
os.Exit(1)
}

logger.Log.Printf("Running ROCm container runtime")
slog.Info("Running ROCm container runtime")
err = rt.Run()
if err != nil {
logger.Log.Printf("Failed to run container runtime, err = %v", err)
slog.Error("Failed to run container runtime", "error", err)
gpuTracker, err := gpuTracker.New()
if err != nil {
logger.Log.Printf("Failed to create GPU tracker, err = %v", err)
slog.Error("Failed to create GPU tracker", "error", err)
os.Exit(1)
}
gpuTracker.ReleaseGPUs(os.Args[len(os.Args)-1])
Expand Down
5 changes: 5 additions & 0 deletions docs/container-runtime/cdi-guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,8 @@ If ``amd-ctk cdi validate`` reports errors:
* Check that GPU devices are properly detected by the system (verify with ``rocm-smi``, ``amd-smi`` or similar tools)
* Ensure GPU drivers are correctly installed
* Regenerate the specification to reflect the current system state
* Use the ``--debug`` flag for verbose output to help diagnose the issue:

.. code-block:: bash

amd-ctk --debug cdi validate
28 changes: 23 additions & 5 deletions docs/container-runtime/gpu-tracker.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,13 @@ Device Node IDs Temp Power Partitions SCLK MCLK

> docker run --runtime=amd -itd -e AMD_VISIBLE_DEVICES=0-2 rocm/rocm-terminal bash
d23ff3dce1839cbf8ce7ad362641ab85e80b315c319edf73b269c460e348053a
docker: Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: unable to retrieve OCI runtime error (open /run/containerd/io.containerd.runtime.v2.task/moby/d23ff3dce1839cbf8ce7ad362641ab85e80b315c319edf73b269c460e348053a/log.json: no such file or directory): amd-container-runtime did not terminate successfully: exit status 1: GPUs [0 2] allocated
GPUs [1] are exclusive and already in use
Released GPUs [2 0] used by container d23ff3dce1839cbf8ce7ad362641ab85e80b315c319edf73b269c460e348053a
: unknown.
docker: Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: unable to retrieve OCI runtime error: amd-container-runtime did not terminate successfully: exit status 1

The runtime log at /var/log/amd-container-runtime.log will contain details about the failure:

> grep -E "allocated|exclusive" /var/log/amd-container-runtime.log
time=... level=INFO msg="amd-container-runtime GPUs allocated" gpus=[0 2]
time=... level=ERROR msg="amd-container-runtime Failed to run container runtime" error="update OCI spec (add GPU devices): GPUs [1] are exclusive and already in use"

> amd-ctk gpu-tracker status
------------------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -285,7 +288,7 @@ Device Node IDs Temp Power Partitions SCLK MCLK

```text
> amd-ctk gpu-tracker status
GPUs info is invalid. Please reset GPU Tracker.
showing GPU status: GPU info mismatch: please reset GPU Tracker

> amd-ctk gpu-tracker reset
GPU Tracker has been reset
Expand All @@ -307,3 +310,18 @@ Device Node IDs Temp Power Partitions SCLK MCLK
3 0x12FE4F7FDAF06B9 Shared -
```

## Debugging

For verbose debug output when troubleshooting GPU Tracker issues, use the `--debug` (or `-d`) flag:

```text
> amd-ctk --debug gpu-tracker status
```

This prints debug-level log messages to stderr, which can help diagnose GPU enumeration or tracker state issues.

For container runtime errors (e.g. exclusive GPU enforcement failures), check the runtime log:

```text
> sudo tail -f /var/log/amd-container-runtime.log
```
19 changes: 15 additions & 4 deletions docs/container-runtime/troubleshooting.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ This applies to any run that relies on host GPU devices (e.g. ``docker run --dev
Log File Reference
------------------

The AMD Container Toolkit logs runtime events and errors to the following location:
The AMD container runtime (``amd-container-runtime``) logs events and errors to the following location:

**/var/log/amd-container-runtime.log**

Expand All @@ -157,11 +157,22 @@ You can view logs in real-time using:
This log captures detailed interactions between Docker and the AMD container runtime, including:

- Runtime initialization
- GPU device injection
- GPU device injection and allocation
- OCI specification modifications
- CDI specification usage
- Exclusive GPU enforcement errors

If you experience issues that are not easily diagnosed, refer to this log file for real-time insights and deeper debugging.
If a container fails to start with the AMD runtime, this log will contain the specific error (e.g. ``GPUs [0] are exclusive and already in use``), even when Docker only shows a generic runtime failure message.

.. note::

The ``amd-ctk`` CLI tool prints errors directly to the terminal (not to a log file). For verbose debug output from ``amd-ctk``, use the ``--debug`` (or ``-d``) flag:

.. code-block:: bash

amd-ctk --debug gpu-tracker status
amd-ctk --debug cdi validate

This prints debug-level messages to stderr, which can help diagnose GPU enumeration, tracker state, or CDI specification issues.

Diagnostic Commands
-------------------
Expand Down
16 changes: 8 additions & 8 deletions internal/amdgpu/amdgpu.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
/**
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the \"License\");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an \"AS IS\" BASIS,
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Expand All @@ -20,6 +20,7 @@ import (
"bufio"
"fmt"
"io/ioutil"
"log/slog"
"math"
"os"
"os/exec"
Expand All @@ -28,8 +29,6 @@ import (
"sort"
"strconv"
"strings"

"github.com/ROCm/container-toolkit/internal/logger"
)

type DeviceInfo struct {
Expand Down Expand Up @@ -215,6 +214,7 @@ func GetAMDGPUWithFS(fs FileSystem, dev string) (AMDGPU, error) {
ret, err = strconv.ParseInt(out, base, width)
}
if err != nil {
slog.Debug("Failed to parse device stat", "device", dev, "value", out, "base", base, "error", err)
return 0
}

Expand Down Expand Up @@ -253,15 +253,15 @@ func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]strin
renderDevIds := make(map[int]string)
nodeFiles, err := fs.Glob(topoRoot + "/topology/nodes/*/properties")
if err != nil {
logger.Log.Printf("glob error: %s", err)
slog.Warn("Failed to glob topology nodes", "error", err)
return renderDevIds
}

for _, nodeFile := range nodeFiles {
logger.Log.Printf("Parsing %s", nodeFile)
slog.Debug("Parsing topology node file", "file", nodeFile)
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
if err != nil {
logger.Log.Printf("Error parsing render minor: %v", err)
slog.Debug("Error parsing render minor", "file", nodeFile, "error", err)
continue
}

Expand All @@ -271,7 +271,7 @@ func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]strin

devID, err := ParseTopologyPropertiesString(fs, nodeFile, topoUniqueIdRe)
if err != nil {
logger.Log.Printf("Error parsing unique_id: %v", err)
slog.Debug("Error parsing unique_id", "file", nodeFile, "error", err)
continue
}

Expand Down
7 changes: 0 additions & 7 deletions internal/amdgpu/amdgpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,16 @@ package amdgpu

import (
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"testing"
"time"

"github.com/ROCm/container-toolkit/internal/logger"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
)

func init() {
// Initialize logger for tests
logger.Log = log.New(os.Stderr, "", log.LstdFlags)
}

// Mock filesystem operations
type mockFS struct {
mock.Mock
Expand Down
9 changes: 0 additions & 9 deletions internal/cdi/cdi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"testing"

"github.com/ROCm/container-toolkit/internal/amdgpu"
"github.com/ROCm/container-toolkit/internal/logger"
"github.com/stretchr/testify/assert"
"tags.cncf.io/container-device-interface/specs-go"
)
Expand Down Expand Up @@ -49,13 +48,7 @@ func mockGetAMDGPU(dev string) (amdgpu.AMDGPU, error) {
return gpu, nil
}

func setup(t *testing.T) {
logger.Init(true)
}

func TestInterface(t *testing.T) {
setup(t)

spec := specs.Spec{
Version: "0.6.0",
Kind: "amd.com/gpu",
Expand Down Expand Up @@ -89,7 +82,6 @@ var dummySpec = specs.Spec{
}

func TestWriteSpec(t *testing.T) {
setup(t)

tests := []struct {
name string
Expand Down Expand Up @@ -159,7 +151,6 @@ func TestWriteSpec(t *testing.T) {
}

func TestWriteSpec_OverwritesExisting(t *testing.T) {
setup(t)
dir := t.TempDir()
specPath := filepath.Join(dir, "amd.json")

Expand Down
13 changes: 6 additions & 7 deletions internal/gpu-tracker/gpu-tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"encoding/json"
"errors"
"fmt"
"log/slog"
"os"
"reflect"
"sort"
Expand All @@ -29,7 +30,6 @@ import (
"time"

"github.com/ROCm/container-toolkit/internal/amdgpu"
"github.com/ROCm/container-toolkit/internal/logger"
"github.com/gofrs/flock"
)

Expand Down Expand Up @@ -439,7 +439,6 @@ func (gpuTracker *gpu_tracker_t) Init() error {
return err
}

logger.Log.Printf("GPU Tracker has been initialized")
return nil
}

Expand Down Expand Up @@ -756,14 +755,14 @@ func (gpuTracker *gpu_tracker_t) ReserveGPUs(gpus string, containerId string) ([
return []int{}, err
}
if len(invalidGPUsRange) > 0 {
logger.Log.Printf("Ignoring %v GPUs Ranges as they are invalid", invalidGPUsRange)
slog.Warn("Ignoring GPUs Ranges as they are invalid", "ranges", invalidGPUsRange)
}
if len(invalidGPUs) > 0 {
logger.Log.Printf("Ignoring %v GPUs as they are invalid", invalidGPUs)
slog.Warn("Ignoring GPUs as they are invalid", "gpus", invalidGPUs)
}

if !gpusTrackerData.Enabled {
logger.Log.Printf("GPU Tracker is disabled")
slog.Debug("GPU Tracker is disabled")
return validGPUs, nil
}

Expand Down Expand Up @@ -798,7 +797,7 @@ func (gpuTracker *gpu_tracker_t) ReserveGPUs(gpus string, containerId string) ([
}

if len(allocatedGPUs) > 0 {
logger.Log.Printf("GPUs %v allocated", allocatedGPUs)
slog.Info("GPUs allocated", "gpus", allocatedGPUs)
}
if len(unavailableGPUs) > 0 {
return []int{}, fmt.Errorf("GPUs %v are exclusive and already in use", unavailableGPUs)
Expand Down Expand Up @@ -852,7 +851,7 @@ func (gpuTracker *gpu_tracker_t) ReleaseGPUs(containerId string) error {
return err
}

logger.Log.Printf("Released GPUs %v used by container %v", releasedGPUs, containerId)
slog.Info("Released GPUs used by container", "gpus", releasedGPUs, "container", containerId)
}

return nil
Expand Down
Loading
Loading