From 26aef509e7f9264d890053139a0282098173ca0d Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Sun, 8 Mar 2026 15:29:50 -0700
Subject: [PATCH] feat(env_config): fix ROCm GPU detection for ROCm 7.2+

- Add amd-smi static --json as primary detection strategy (ROCm 6.3+/7.x)
- Keep rocm-smi as fallback for legacy ROCm <6.3
- Add multi-GPU selection: pick GPU with most VRAM, set HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES
- Add _check_rocm_runtime() to verify ROCmExecutionProvider in onnxruntime
  (prevents CPU-only onnxruntime from shadowing onnxruntime-rocm)
- Update deploy.sh heuristic to also check for amd-smi
- Document GPU package conflict in requirements.txt
- Add 14 unit tests for all ROCm detection paths
---
 .../detection/yolo-detection-2026/deploy.sh   |   2 +-
 .../yolo-detection-2026/requirements.txt      |   4 +
 .../yolo-detection-2026/scripts/env_config.py |  63 +++-
 skills/lib/env_config.py                      |  63 +++-
 skills/lib/test_env_config_rocm.py            | 283 ++++++++++++++++++
 5 files changed, 404 insertions(+), 11 deletions(-)
 create mode 100644 skills/lib/test_env_config_rocm.py

diff --git a/skills/detection/yolo-detection-2026/deploy.sh b/skills/detection/yolo-detection-2026/deploy.sh
index 894a30d..a1e4771 100755
--- a/skills/detection/yolo-detection-2026/deploy.sh
+++ b/skills/detection/yolo-detection-2026/deploy.sh
@@ -136,7 +136,7 @@ else
             BACKEND="cuda"
             log "Detected NVIDIA GPU (driver: $cuda_ver)"
         fi
-    elif command -v rocm-smi &>/dev/null || [ -d "/opt/rocm" ]; then
+    elif command -v amd-smi &>/dev/null || command -v rocm-smi &>/dev/null || [ -d "/opt/rocm" ]; then
         BACKEND="rocm"
         log "Detected AMD ROCm"
     elif [ "$(uname)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then
diff --git a/skills/detection/yolo-detection-2026/requirements.txt b/skills/detection/yolo-detection-2026/requirements.txt
index a8fa34a..77ef373 100644
--- a/skills/detection/yolo-detection-2026/requirements.txt
+++ b/skills/detection/yolo-detection-2026/requirements.txt
@@ -5,3 +5,7 @@ ultralytics>=8.3.0        # YOLOv11/v10/v8 inference
 numpy>=1.24.0,<2.0.0
 opencv-python-headless>=4.8.0
 Pillow>=10.0.0
+# GPU inference — install ONE of these (not both!):
+# AMD ROCm:  pip install onnxruntime-rocm  (do NOT install onnxruntime alongside)
+# NVIDIA:    handled by ultralytics (tensorrt)
+# CPU only:  pip install onnxruntime
diff --git a/skills/detection/yolo-detection-2026/scripts/env_config.py b/skills/detection/yolo-detection-2026/scripts/env_config.py
index 4386c53..49935b1 100644
--- a/skills/detection/yolo-detection-2026/scripts/env_config.py
+++ b/skills/detection/yolo-detection-2026/scripts/env_config.py
@@ -156,17 +156,60 @@ def _try_cuda(self) -> bool:
         return False
 
     def _try_rocm(self) -> bool:
-        """Detect AMD GPU via rocm-smi or /opt/rocm."""
+        """Detect AMD GPU via amd-smi (preferred) or rocm-smi."""
+        has_amd_smi = shutil.which("amd-smi") is not None
         has_rocm_smi = shutil.which("rocm-smi") is not None
         has_rocm_dir = Path("/opt/rocm").is_dir()
 
-        if not (has_rocm_smi or has_rocm_dir):
+        if not (has_amd_smi or has_rocm_smi or has_rocm_dir):
             return False
 
         self.backend = "rocm"
         self.device = "cuda"  # ROCm exposes as CUDA in PyTorch
 
-        if has_rocm_smi:
+        # Strategy 1: amd-smi static --json (ROCm 6.3+/7.x, richest output)
+        if has_amd_smi:
+            try:
+                result = subprocess.run(
+                    ["amd-smi", "static", "--json"],
+                    capture_output=True, text=True, timeout=10,
+                )
+                if result.returncode == 0:
+                    import json as _json
+                    data = _json.loads(result.stdout)
+                    # amd-smi may return {"gpu_data": [...]} or a bare list
+                    gpu_list = data.get("gpu_data", data) if isinstance(data, dict) else data
+                    if isinstance(gpu_list, list) and len(gpu_list) > 0:
+                        def _vram_mb(g):
+                            # A missing or non-numeric VRAM field counts as 0 instead of aborting the probe
+                            try:
+                                return int(g["vram"]["size"]["value"])
+                            except (KeyError, TypeError, ValueError):
+                                return 0
+
+                        # Pick GPU with most VRAM (discrete > iGPU); ties keep the first entry
+                        best_idx, best_gpu = max(enumerate(gpu_list), key=lambda p: _vram_mb(p[1]))
+                        asic = best_gpu.get("asic", {})
+
+                        self.gpu_name = asic.get("market_name", "AMD GPU")
+                        self.gpu_memory_mb = _vram_mb(best_gpu)
+                        self.detection_details["amd_smi"] = {
+                            "gpu_index": best_idx,
+                            "gfx_version": asic.get("target_graphics_version", ""),
+                            "total_gpus": len(gpu_list),
+                        }
+
+                        # Pin to the chosen GPU. ROCR_VISIBLE_DEVICES filters first, so HIP then
+                        # enumerates the survivor as device 0; reusing best_idx there would hide it.
+                        if len(gpu_list) > 1:
+                            os.environ["ROCR_VISIBLE_DEVICES"] = str(best_idx)
+                            os.environ["HIP_VISIBLE_DEVICES"] = "0"
+                            _log(f"Multi-GPU: pinned to GPU {best_idx} ({self.gpu_name})")
+            except Exception as e:  # best-effort probe: a failure here must not abort detection
+                _log(f"amd-smi probe failed: {e}")
+
+        # Strategy 2: rocm-smi fallback (legacy ROCm <6.3)
+        if not self.gpu_name and has_rocm_smi:
             try:
                 result = subprocess.run(
                     ["rocm-smi", "--showproductname", "--csv"],
@@ -186,7 +229,6 @@ def _try_rocm(self) -> bool:
                     capture_output=True, text=True, timeout=10,
                 )
                 if result.returncode == 0:
-                    # Parse total VRAM
                     for line in result.stdout.strip().split("\n")[1:]:
                         parts = line.split(",")
                         if len(parts) >= 2:
@@ -296,11 +338,22 @@ def _fallback_cpu(self):
 
         _log("No GPU detected, using CPU backend")
 
+    def _check_rocm_runtime(self):
+        """Return True if onnxruntime exposes a ROCm or MIGraphX provider; raise ImportError if not."""
+        import onnxruntime
+        available = onnxruntime.get_available_providers()
+        if not any(ep in available for ep in ("ROCmExecutionProvider", "MIGraphXExecutionProvider")):
+            _log(f"onnxruntime providers: {available} — ROCmExecutionProvider not found")
+            _log("Fix: pip uninstall onnxruntime && pip install onnxruntime-rocm")
+            raise ImportError("ROCmExecutionProvider not available")
+        _log(f"onnxruntime ROCm providers: {available}")
+        return True
+
     def _check_framework(self) -> bool:
         """Check if the optimized inference runtime is importable."""
         checks = {
             "cuda": lambda: __import__("tensorrt"),
-            "rocm": lambda: __import__("onnxruntime"),
+            "rocm": lambda: self._check_rocm_runtime(),
             "mps": lambda: __import__("coremltools"),
             "intel": lambda: __import__("openvino"),
             "cpu": lambda: __import__("onnxruntime"),
diff --git a/skills/lib/env_config.py b/skills/lib/env_config.py
index 4386c53..49935b1 100644
--- a/skills/lib/env_config.py
+++ b/skills/lib/env_config.py
@@ -156,17 +156,60 @@ def _try_cuda(self) -> bool:
         return False
 
     def _try_rocm(self) -> bool:
-        """Detect AMD GPU via rocm-smi or /opt/rocm."""
+        """Detect AMD GPU via amd-smi (preferred) or rocm-smi."""
+        has_amd_smi = shutil.which("amd-smi") is not None
         has_rocm_smi = shutil.which("rocm-smi") is not None
         has_rocm_dir = Path("/opt/rocm").is_dir()
 
-        if not (has_rocm_smi or has_rocm_dir):
+        if not (has_amd_smi or has_rocm_smi or has_rocm_dir):
             return False
 
         self.backend = "rocm"
         self.device = "cuda"  # ROCm exposes as CUDA in PyTorch
 
-        if has_rocm_smi:
+        # Strategy 1: amd-smi static --json (ROCm 6.3+/7.x, richest output)
+        if has_amd_smi:
+            try:
+                result = subprocess.run(
+                    ["amd-smi", "static", "--json"],
+                    capture_output=True, text=True, timeout=10,
+                )
+                if result.returncode == 0:
+                    import json as _json
+                    data = _json.loads(result.stdout)
+                    # amd-smi may return {"gpu_data": [...]} or a bare list
+                    gpu_list = data.get("gpu_data", data) if isinstance(data, dict) else data
+                    if isinstance(gpu_list, list) and len(gpu_list) > 0:
+                        def _vram_mb(g):
+                            # A missing or non-numeric VRAM field counts as 0 instead of aborting the probe
+                            try:
+                                return int(g["vram"]["size"]["value"])
+                            except (KeyError, TypeError, ValueError):
+                                return 0
+
+                        # Pick GPU with most VRAM (discrete > iGPU); ties keep the first entry
+                        best_idx, best_gpu = max(enumerate(gpu_list), key=lambda p: _vram_mb(p[1]))
+                        asic = best_gpu.get("asic", {})
+
+                        self.gpu_name = asic.get("market_name", "AMD GPU")
+                        self.gpu_memory_mb = _vram_mb(best_gpu)
+                        self.detection_details["amd_smi"] = {
+                            "gpu_index": best_idx,
+                            "gfx_version": asic.get("target_graphics_version", ""),
+                            "total_gpus": len(gpu_list),
+                        }
+
+                        # Pin to the chosen GPU. ROCR_VISIBLE_DEVICES filters first, so HIP then
+                        # enumerates the survivor as device 0; reusing best_idx there would hide it.
+                        if len(gpu_list) > 1:
+                            os.environ["ROCR_VISIBLE_DEVICES"] = str(best_idx)
+                            os.environ["HIP_VISIBLE_DEVICES"] = "0"
+                            _log(f"Multi-GPU: pinned to GPU {best_idx} ({self.gpu_name})")
+            except Exception as e:  # best-effort probe: a failure here must not abort detection
+                _log(f"amd-smi probe failed: {e}")
+
+        # Strategy 2: rocm-smi fallback (legacy ROCm <6.3)
+        if not self.gpu_name and has_rocm_smi:
             try:
                 result = subprocess.run(
                     ["rocm-smi", "--showproductname", "--csv"],
@@ -186,7 +229,6 @@ def _try_rocm(self) -> bool:
                     capture_output=True, text=True, timeout=10,
                 )
                 if result.returncode == 0:
-                    # Parse total VRAM
                     for line in result.stdout.strip().split("\n")[1:]:
                         parts = line.split(",")
                         if len(parts) >= 2:
@@ -296,11 +338,22 @@ def _fallback_cpu(self):
 
         _log("No GPU detected, using CPU backend")
 
+    def _check_rocm_runtime(self):
+        """Return True if onnxruntime exposes a ROCm or MIGraphX provider; raise ImportError if not."""
+        import onnxruntime
+        available = onnxruntime.get_available_providers()
+        if not any(ep in available for ep in ("ROCmExecutionProvider", "MIGraphXExecutionProvider")):
+            _log(f"onnxruntime providers: {available} — ROCmExecutionProvider not found")
+            _log("Fix: pip uninstall onnxruntime && pip install onnxruntime-rocm")
+            raise ImportError("ROCmExecutionProvider not available")
+        _log(f"onnxruntime ROCm providers: {available}")
+        return True
+
     def _check_framework(self) -> bool:
         """Check if the optimized inference runtime is importable."""
         checks = {
             "cuda": lambda: __import__("tensorrt"),
-            "rocm": lambda: __import__("onnxruntime"),
+            "rocm": lambda: self._check_rocm_runtime(),
             "mps": lambda: __import__("coremltools"),
             "intel": lambda: __import__("openvino"),
             "cpu": lambda: __import__("onnxruntime"),
diff --git a/skills/lib/test_env_config_rocm.py b/skills/lib/test_env_config_rocm.py
new file mode 100644
index 0000000..76021c3
--- /dev/null
+++ b/skills/lib/test_env_config_rocm.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""
+Unit tests for ROCm GPU detection in env_config.py.
+
+Tests amd-smi parsing, rocm-smi fallback, provider verification,
+and multi-GPU selection — all mocked, no ROCm hardware required.
+
+Run:  python -m pytest skills/lib/test_env_config_rocm.py -v
+"""
+
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+from unittest import mock
+
+import pytest
+
+# Ensure env_config is importable from skills/lib/
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from env_config import HardwareEnv, _log  # noqa: E402
+
+
+# ── Sample amd-smi JSON (dual-GPU: discrete R9700 + iGPU) ─────────────────
+
+AMD_SMI_DUAL_GPU = json.dumps([
+    {
+        "asic": {
+            "market_name": "AMD Radeon AI PRO R9700",
+            "vendor_id": "0x1002",
+            "target_graphics_version": "gfx1201",
+        },
+        "vram": {
+            "size": {"value": 32624, "unit": "MB"},
+        },
+    },
+    {
+        "asic": {
+            "market_name": "AMD Radeon Graphics",
+            "vendor_id": "0x1002",
+            "target_graphics_version": "gfx1036",
+        },
+        "vram": {
+            "size": {"value": 2048, "unit": "MB"},
+        },
+    },
+])
+
+AMD_SMI_SINGLE_GPU = json.dumps([
+    {
+        "asic": {
+            "market_name": "AMD Radeon RX 7900 XTX",
+            "target_graphics_version": "gfx1100",
+        },
+        "vram": {
+            "size": {"value": 24576, "unit": "MB"},
+        },
+    },
+])
+
+# Wrapped in gpu_data key (some amd-smi versions do this)
+AMD_SMI_WRAPPED = json.dumps({
+    "gpu_data": json.loads(AMD_SMI_SINGLE_GPU),
+})
+
+ROCM_SMI_PRODUCTNAME = "device,Card Series\ncard0,AMD Radeon RX 7900 XTX\n"
+ROCM_SMI_MEMINFO = "GPU,vram Total Memory (B)\n25769803776,25769803776\n"
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────
+
+def _make_run_result(stdout="", returncode=0):
+    return subprocess.CompletedProcess(args=[], returncode=returncode, stdout=stdout, stderr="")
+
+
+def _mock_which(available_tools):
+    """Return a shutil.which mock that only finds tools in available_tools."""
+    def _which(name):
+        return f"/usr/bin/{name}" if name in available_tools else None
+    return _which
+
+
+# ── Tests: _try_rocm ──────────────────────────────────────────────────────
+
+class TestTryRocmAmdSmi:
+    """amd-smi primary strategy."""
+
+    @mock.patch("env_config.shutil.which", _mock_which({"amd-smi"}))
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    @mock.patch("env_config.subprocess.run")
+    def test_dual_gpu_picks_discrete(self, mock_run, _mock_dir):
+        """With 2 GPUs, picks the R9700 (32 GB) over iGPU (2 GB)."""
+        mock_run.return_value = _make_run_result(AMD_SMI_DUAL_GPU)
+
+        env = HardwareEnv()
+        result = env._try_rocm()
+
+        assert result is True
+        assert env.backend == "rocm"
+        assert env.device == "cuda"
+        assert env.gpu_name == "AMD Radeon AI PRO R9700"
+        assert env.gpu_memory_mb == 32624
+        assert env.detection_details["amd_smi"]["gpu_index"] == 0
+        assert env.detection_details["amd_smi"]["gfx_version"] == "gfx1201"
+        assert env.detection_details["amd_smi"]["total_gpus"] == 2
+
+    @mock.patch("env_config.shutil.which", _mock_which({"amd-smi"}))
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    @mock.patch("env_config.subprocess.run")
+    def test_dual_gpu_sets_env_vars(self, mock_run, _mock_dir):
+        """Multi-GPU: HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES are set."""
+        mock_run.return_value = _make_run_result(AMD_SMI_DUAL_GPU)
+
+        # patch.dict snapshots os.environ and restores it on exit, even when an
+        # assertion fails, so neither the pinning done by _try_rocm nor the pops
+        # below leak into other tests or clobber the developer's own settings.
+        with mock.patch.dict(os.environ):
+            # Start from a clean slate so the values can only come from _try_rocm
+            for var in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"):
+                os.environ.pop(var, None)
+
+            env = HardwareEnv()
+            env._try_rocm()
+
+            assert os.environ.get("HIP_VISIBLE_DEVICES") == "0"
+            assert os.environ.get("ROCR_VISIBLE_DEVICES") == "0"
+
+    @mock.patch("env_config.shutil.which", _mock_which({"amd-smi"}))
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    @mock.patch("env_config.subprocess.run")
+    def test_single_gpu_no_env_vars(self, mock_run, _mock_dir):
+        """Single GPU: neither HIP_VISIBLE_DEVICES nor ROCR_VISIBLE_DEVICES is set."""
+        mock_run.return_value = _make_run_result(AMD_SMI_SINGLE_GPU)
+
+        with mock.patch.dict(os.environ):  # restored on exit, so the pops below do not leak
+            for var in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"):
+                os.environ.pop(var, None)
+            env = HardwareEnv()
+            env._try_rocm()
+            assert "HIP_VISIBLE_DEVICES" not in os.environ
+            assert "ROCR_VISIBLE_DEVICES" not in os.environ
+        assert env.gpu_name == "AMD Radeon RX 7900 XTX"
+        assert env.gpu_memory_mb == 24576
+
+    @mock.patch("env_config.shutil.which", _mock_which({"amd-smi"}))
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    @mock.patch("env_config.subprocess.run")
+    def test_wrapped_gpu_data_format(self, mock_run, _mock_dir):
+        """amd-smi returning {\"gpu_data\": [...]} wrapper."""
+        mock_run.return_value = _make_run_result(AMD_SMI_WRAPPED)
+
+        env = HardwareEnv()
+        env._try_rocm()
+
+        assert env.gpu_name == "AMD Radeon RX 7900 XTX"
+        assert env.gpu_memory_mb == 24576
+
+    @mock.patch("env_config.shutil.which", _mock_which({"amd-smi"}))
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    @mock.patch("env_config.subprocess.run")
+    def test_amd_smi_failure_returns_true_with_defaults(self, mock_run, _mock_dir):
+        """amd-smi fails → still returns True (ROCm detected), empty gpu_name."""
+        mock_run.return_value = _make_run_result("", returncode=1)
+
+        env = HardwareEnv()
+        result = env._try_rocm()
+
+        assert result is True
+        assert env.backend == "rocm"
+        assert env.gpu_name == ""  # No name parsed, but backend detected
+
+
+class TestTryRocmFallback:
+    """rocm-smi fallback (amd-smi not available)."""
+
+    @mock.patch("env_config.shutil.which", _mock_which({"rocm-smi"}))
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    @mock.patch("env_config.subprocess.run")
+    def test_rocm_smi_parses_name_and_vram(self, mock_run, _mock_dir):
+        """Legacy rocm-smi fallback parses product name and VRAM."""
+        def side_effect(cmd, **kwargs):
+            if "--showproductname" in cmd:
+                return _make_run_result(ROCM_SMI_PRODUCTNAME)
+            elif "--showmeminfo" in cmd:
+                return _make_run_result(ROCM_SMI_MEMINFO)
+            return _make_run_result("", returncode=1)
+
+        mock_run.side_effect = side_effect
+
+        env = HardwareEnv()
+        result = env._try_rocm()
+
+        assert result is True
+        # NOTE: rocm-smi --showproductname CSV puts device ID in col 0 ("card0"),
+        # which is why amd-smi is the preferred strategy.  This is the known
+        # limitation documented in the original bug report.
+        assert env.gpu_name == "card0"
+        # 25769803776 / (1024*1024) = 24576
+        assert env.gpu_memory_mb == 24576
+
+    @mock.patch("env_config.shutil.which", _mock_which(set()))
+    @mock.patch("env_config.Path.is_dir", return_value=True)
+    def test_only_opt_rocm_dir(self, _mock_dir):
+        """Only /opt/rocm exists — detects ROCm with no GPU info."""
+        env = HardwareEnv()
+        result = env._try_rocm()
+
+        assert result is True
+        assert env.backend == "rocm"
+        assert env.gpu_name == ""
+
+    @mock.patch("env_config.shutil.which", _mock_which(set()))
+    @mock.patch("env_config.Path.is_dir", return_value=False)
+    def test_no_rocm_at_all(self, _mock_dir):
+        """No amd-smi, no rocm-smi, no /opt/rocm → returns False."""
+        env = HardwareEnv()
+        result = env._try_rocm()
+
+        assert result is False
+        assert env.backend == "cpu"  # unchanged default
+
+
+# ── Tests: _check_rocm_runtime ────────────────────────────────────────────
+
+class TestCheckRocmRuntime:
+    """Verify ONNX Runtime provider check."""
+
+    def test_rocm_provider_present(self):
+        """ROCmExecutionProvider in list → returns True."""
+        env = HardwareEnv()
+        with mock.patch.dict("sys.modules", {"onnxruntime": mock.MagicMock()}):
+            ort = sys.modules["onnxruntime"]
+            ort.get_available_providers.return_value = [
+                "ROCmExecutionProvider", "CPUExecutionProvider",
+            ]
+            assert env._check_rocm_runtime() is True
+
+    def test_migraphx_provider_present(self):
+        """MIGraphXExecutionProvider also accepted."""
+        env = HardwareEnv()
+        with mock.patch.dict("sys.modules", {"onnxruntime": mock.MagicMock()}):
+            ort = sys.modules["onnxruntime"]
+            ort.get_available_providers.return_value = [
+                "MIGraphXExecutionProvider", "CPUExecutionProvider",
+            ]
+            assert env._check_rocm_runtime() is True
+
+    def test_cpu_only_raises(self):
+        """CPU-only onnxruntime → raises ImportError."""
+        env = HardwareEnv()
+        with mock.patch.dict("sys.modules", {"onnxruntime": mock.MagicMock()}):
+            ort = sys.modules["onnxruntime"]
+            ort.get_available_providers.return_value = [
+                "AzureExecutionProvider", "CPUExecutionProvider",
+            ]
+            with pytest.raises(ImportError, match="ROCmExecutionProvider not available"):
+                env._check_rocm_runtime()
+
+    def test_onnxruntime_missing_raises(self):
+        """onnxruntime not installed → ImportError from import."""
+        env = HardwareEnv()
+        with mock.patch.dict("sys.modules", {"onnxruntime": None}):
+            with pytest.raises((ImportError, ModuleNotFoundError)):
+                env._check_rocm_runtime()
+
+
+# ── Tests: _check_framework integration ───────────────────────────────────
+
+class TestCheckFrameworkRocm:
+    """_check_framework uses _check_rocm_runtime for ROCm backend."""
+
+    def test_rocm_framework_ok_when_provider_present(self):
+        env = HardwareEnv()
+        env.backend = "rocm"
+        with mock.patch.object(env, "_check_rocm_runtime", return_value=True):
+            assert env._check_framework() is True
+
+    def test_rocm_framework_not_ok_when_provider_missing(self):
+        env = HardwareEnv()
+        env.backend = "rocm"
+        with mock.patch.object(env, "_check_rocm_runtime", side_effect=ImportError("no ROCm")):
+            assert env._check_framework() is False