From fe1f66a08492782d07992a40cb171349f115628b Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 17:55:21 -0800 Subject: [PATCH 1/9] feat(yolo-2026): CoreML auto-conversion on Apple Silicon for ~2x faster inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - detect.py: auto-export .pt → .mlpackage when device=mps, cached for reuse - deploy.sh: pre-convert default nano model to CoreML during installation - requirements_mps.txt: added coremltools>=8.0 - config.yaml: added use_coreml toggle (default: true) - Falls back to PyTorch MPS if CoreML export or loading fails --- .../detection/yolo-detection-2026/config.yaml | 7 ++ .../detection/yolo-detection-2026/deploy.sh | 24 +++++- .../yolo-detection-2026/requirements_mps.txt | 2 + .../yolo-detection-2026/scripts/detect.py | 74 +++++++++++++++++-- 4 files changed, 101 insertions(+), 6 deletions(-) diff --git a/skills/detection/yolo-detection-2026/config.yaml b/skills/detection/yolo-detection-2026/config.yaml index d37254b..3c0c9e6 100644 --- a/skills/detection/yolo-detection-2026/config.yaml +++ b/skills/detection/yolo-detection-2026/config.yaml @@ -56,3 +56,10 @@ params: - { value: cuda, label: "NVIDIA CUDA" } - { value: mps, label: "Apple Silicon (MPS)" } - { value: rocm, label: "AMD ROCm" } + + - key: use_coreml + label: CoreML Acceleration + type: boolean + default: true + description: "Convert model to CoreML for ~2x faster inference on Apple Silicon (ANE)" + diff --git a/skills/detection/yolo-detection-2026/deploy.sh b/skills/detection/yolo-detection-2026/deploy.sh index 9ba2bc6..35e62d7 100755 --- a/skills/detection/yolo-detection-2026/deploy.sh +++ b/skills/detection/yolo-detection-2026/deploy.sh @@ -142,7 +142,28 @@ emit "{\"event\": \"progress\", \"stage\": \"install\", \"message\": \"Installin "$PIP" install -r "$REQ_FILE" -q 2>&1 | tail -5 >&2 -# ─── Step 5: Verify installation ──────────────────────────────────────────── +# ─── Step 5: CoreML 
pre-conversion (MPS only) ─────────────────────────────── + +if [ "$BACKEND" = "mps" ]; then + log "Pre-converting default model to CoreML for ANE acceleration..." + emit '{"event": "progress", "stage": "coreml", "message": "Converting model to CoreML (~30s)..."}' + + "$VENV_DIR/bin/python" -c " +from ultralytics import YOLO +model = YOLO('yolo26n.pt') +exported = model.export(format='coreml', half=True, nms=False) +print(f'CoreML model exported: {exported}') +" 2>&1 | while read -r line; do log "$line"; done + + if [ $? -eq 0 ]; then + emit '{"event": "progress", "stage": "coreml", "message": "CoreML conversion complete"}' + else + log "WARNING: CoreML conversion failed, will use PyTorch MPS at runtime" + emit '{"event": "progress", "stage": "coreml", "message": "CoreML conversion failed — PyTorch MPS fallback"}' + fi +fi + +# ─── Step 6: Verify installation ──────────────────────────────────────────── log "Verifying installation..." "$VENV_DIR/bin/python" -c " @@ -156,3 +177,4 @@ print(f'OK: ultralytics loaded, torch device={device}') emit "{\"event\": \"complete\", \"backend\": \"$BACKEND\", \"message\": \"YOLO 2026 skill installed ($BACKEND backend)\"}" log "Done! 
Backend: $BACKEND" + diff --git a/skills/detection/yolo-detection-2026/requirements_mps.txt b/skills/detection/yolo-detection-2026/requirements_mps.txt index 5498200..eb018ea 100644 --- a/skills/detection/yolo-detection-2026/requirements_mps.txt +++ b/skills/detection/yolo-detection-2026/requirements_mps.txt @@ -3,6 +3,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +coremltools>=8.0 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/scripts/detect.py b/skills/detection/yolo-detection-2026/scripts/detect.py index 903a434..7191285 100644 --- a/skills/detection/yolo-detection-2026/scripts/detect.py +++ b/skills/detection/yolo-detection-2026/scripts/detect.py @@ -6,6 +6,8 @@ stdin: {"event": "frame", "frame_id": N, "camera_id": "...", "frame_path": "...", ...} stdout: {"event": "detections", "frame_id": N, "camera_id": "...", "objects": [...]} +On Apple Silicon (MPS), auto-converts to CoreML for ~2x faster inference via ANE. + Usage: python detect.py --config config.json python detect.py --model-size nano --confidence 0.5 --device auto @@ -90,6 +92,64 @@ def emit(event: dict): print(json.dumps(event), flush=True) +def log(msg: str): + """Write a log message to stderr (visible in Aegis deploy console).""" + print(f"[YOLO-2026] {msg}", file=sys.stderr, flush=True) + + +def try_coreml_export(model, model_name: str) -> "Path | None": + """Export PyTorch model to CoreML. 
Returns path to .mlpackage or None on failure.""" + coreml_path = Path(f"{model_name}.mlpackage") + + # Already exported + if coreml_path.exists(): + log(f"CoreML model found: {coreml_path}") + return coreml_path + + try: + log(f"Exporting {model_name}.pt → CoreML (one-time, ~30s)...") + exported = model.export(format="coreml", half=True, nms=False) + exported_path = Path(exported) + if exported_path.exists(): + log(f"CoreML export complete: {exported_path}") + return exported_path + log(f"CoreML export returned path {exported} but file not found") + except Exception as e: + log(f"CoreML export failed: {e}") + + return None + + +def load_model(model_name: str, device: str, use_coreml: bool): + """Load YOLO model — CoreML on MPS if available, PyTorch otherwise.""" + from ultralytics import YOLO + + model_format = "pytorch" + + # Try CoreML on Apple Silicon + if device == "mps" and use_coreml: + pt_model = YOLO(f"{model_name}.pt") + coreml_path = try_coreml_export(pt_model, model_name) + + if coreml_path: + try: + model = YOLO(str(coreml_path)) + model_format = "coreml" + log(f"Loaded CoreML model ({coreml_path})") + return model, model_format + except Exception as e: + log(f"CoreML load failed, falling back to PyTorch MPS: {e}") + + # Fallback: use the already-loaded PyTorch model on MPS + pt_model.to(device) + return pt_model, model_format + + # Non-CoreML path: standard PyTorch + model = YOLO(f"{model_name}.pt") + model.to(device) + return model, model_format + + def main(): args = parse_args() config = load_config(args) @@ -99,24 +159,28 @@ def main(): device = select_device(config.get("device", "auto")) confidence = config.get("confidence", 0.5) fps = config.get("fps", 5) + use_coreml = config.get("use_coreml", True) + + # Coerce use_coreml from string "true"/"false" if passed via env + if isinstance(use_coreml, str): + use_coreml = use_coreml.lower() in ("true", "1", "yes") # Map size to ultralytics model name - model_name = MODEL_SIZE_MAP.get(model_size, 
"yolo11n") + model_name = MODEL_SIZE_MAP.get(model_size, "yolo26n") target_classes = config.get("classes", ["person", "car", "dog", "cat"]) if isinstance(target_classes, str): target_classes = [c.strip() for c in target_classes.split(",")] - # Load YOLO model + # Load YOLO model (with CoreML auto-conversion on MPS) try: - from ultralytics import YOLO - model = YOLO(f"{model_name}.pt") - model.to(device) + model, model_format = load_model(model_name, device, use_coreml) emit({ "event": "ready", "model": f"yolo2026{model_size[0]}", "model_size": model_size, "device": device, + "format": model_format, "classes": len(model.names), "fps": fps, "available_sizes": list(MODEL_SIZE_MAP.keys()), From b8e73046a20a14a3849bb9aa50072b599a64acfa Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 17:58:14 -0800 Subject: [PATCH 2/9] feat(yolo-2026): add runtime performance instrumentation with aggregate stats PerfTracker instruments every pipeline stage: - file_read: frame path check - inference: model prediction - postprocess: bbox extraction + class filtering - emit: JSON serialization Emits perf_stats event every 50 frames with avg/min/max/p50/p95/p99. Also tracks one-time model_load_ms and coreml_export_ms. --- .../yolo-detection-2026/scripts/detect.py | 144 ++++++++++++++++-- 1 file changed, 135 insertions(+), 9 deletions(-) diff --git a/skills/detection/yolo-detection-2026/scripts/detect.py b/skills/detection/yolo-detection-2026/scripts/detect.py index 7191285..1aba9e5 100644 --- a/skills/detection/yolo-detection-2026/scripts/detect.py +++ b/skills/detection/yolo-detection-2026/scripts/detect.py @@ -7,6 +7,7 @@ stdout: {"event": "detections", "frame_id": N, "camera_id": "...", "objects": [...]} On Apple Silicon (MPS), auto-converts to CoreML for ~2x faster inference via ANE. +Emits periodic performance statistics via "perf_stats" events. 
Usage: python detect.py --config config.json @@ -17,6 +18,7 @@ import json import argparse import signal +import time from pathlib import Path @@ -28,6 +30,92 @@ "large": "yolo26l", } +# How often to emit aggregate perf stats (every N frames) +PERF_STATS_INTERVAL = 50 + + +# ─────────────────────────────────────────────────────────────────────────────── +# Performance tracker — collects per-frame timings, emits aggregate stats +# ─────────────────────────────────────────────────────────────────────────────── + +class PerfTracker: + """Tracks timing for each pipeline stage and emits periodic statistics.""" + + def __init__(self, interval: int = PERF_STATS_INTERVAL): + self.interval = interval + self.frame_count = 0 + self.total_frames = 0 + self.error_count = 0 + + # One-time timings (ms) + self.model_load_ms = 0.0 + self.coreml_export_ms = 0.0 + + # Per-frame accumulators (ms) + self._timings: dict[str, list[float]] = { + "file_read": [], # frame_path existence check + file I/O + "inference": [], # model(frame_path, ...) 
+ "postprocess": [], # bbox extraction + filtering + "emit": [], # JSON serialization + print + "total": [], # end-to-end per frame + } + + def record(self, stage: str, duration_ms: float): + """Record a timing for a pipeline stage.""" + if stage in self._timings: + self._timings[stage].append(duration_ms) + + def record_frame(self): + """Increment frame counter and emit stats if interval reached.""" + self.frame_count += 1 + self.total_frames += 1 + if self.frame_count >= self.interval: + self.emit_stats() + self.frame_count = 0 + + def emit_stats(self): + """Emit aggregate statistics as a JSONL event.""" + stats = { + "event": "perf_stats", + "total_frames": self.total_frames, + "window_size": len(self._timings["total"]) or 1, + "errors": self.error_count, + "model_load_ms": round(self.model_load_ms, 1), + "timings_ms": {}, + } + + if self.coreml_export_ms > 0: + stats["coreml_export_ms"] = round(self.coreml_export_ms, 1) + + for stage, values in self._timings.items(): + if not values: + continue + sorted_v = sorted(values) + n = len(sorted_v) + stats["timings_ms"][stage] = { + "avg": round(sum(sorted_v) / n, 2), + "min": round(sorted_v[0], 2), + "max": round(sorted_v[-1], 2), + "p50": round(sorted_v[n // 2], 2), + "p95": round(sorted_v[int(n * 0.95)], 2), + "p99": round(sorted_v[int(n * 0.99)], 2), + } + + emit(stats) + + # Reset per-frame accumulators for next window + for key in self._timings: + self._timings[key].clear() + + def emit_final(self): + """Emit remaining stats on shutdown.""" + if self._timings["total"]: + self.emit_stats() + + +# ─────────────────────────────────────────────────────────────────────────────── +# Helpers +# ─────────────────────────────────────────────────────────────────────────────── def parse_args(): parser = argparse.ArgumentParser(description="YOLO 2026 Detection Skill") @@ -81,7 +169,6 @@ def select_device(preference: str) -> str: return "cuda" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): return 
"mps" - # ROCm exposes as CUDA in PyTorch with ROCm builds except ImportError: pass return "cpu" @@ -97,8 +184,8 @@ def log(msg: str): print(f"[YOLO-2026] {msg}", file=sys.stderr, flush=True) -def try_coreml_export(model, model_name: str) -> "Path | None": - """Export PyTorch model to CoreML. Returns path to .mlpackage or None on failure.""" +def try_coreml_export(model, model_name: str, perf: PerfTracker) -> "Path | None": + """Export PyTorch model to CoreML. Returns path to .mlpackage or None.""" coreml_path = Path(f"{model_name}.mlpackage") # Already exported @@ -108,10 +195,12 @@ def try_coreml_export(model, model_name: str) -> "Path | None": try: log(f"Exporting {model_name}.pt → CoreML (one-time, ~30s)...") + t0 = time.perf_counter() exported = model.export(format="coreml", half=True, nms=False) + perf.coreml_export_ms = (time.perf_counter() - t0) * 1000 exported_path = Path(exported) if exported_path.exists(): - log(f"CoreML export complete: {exported_path}") + log(f"CoreML export complete: {exported_path} ({perf.coreml_export_ms:.0f}ms)") return exported_path log(f"CoreML export returned path {exported} but file not found") except Exception as e: @@ -120,36 +209,44 @@ def try_coreml_export(model, model_name: str) -> "Path | None": return None -def load_model(model_name: str, device: str, use_coreml: bool): +def load_model(model_name: str, device: str, use_coreml: bool, perf: PerfTracker): """Load YOLO model — CoreML on MPS if available, PyTorch otherwise.""" from ultralytics import YOLO model_format = "pytorch" + t0 = time.perf_counter() # Try CoreML on Apple Silicon if device == "mps" and use_coreml: pt_model = YOLO(f"{model_name}.pt") - coreml_path = try_coreml_export(pt_model, model_name) + coreml_path = try_coreml_export(pt_model, model_name, perf) if coreml_path: try: model = YOLO(str(coreml_path)) model_format = "coreml" - log(f"Loaded CoreML model ({coreml_path})") + perf.model_load_ms = (time.perf_counter() - t0) * 1000 + log(f"Loaded CoreML model 
({coreml_path}) in {perf.model_load_ms:.0f}ms") return model, model_format except Exception as e: log(f"CoreML load failed, falling back to PyTorch MPS: {e}") # Fallback: use the already-loaded PyTorch model on MPS pt_model.to(device) + perf.model_load_ms = (time.perf_counter() - t0) * 1000 return pt_model, model_format # Non-CoreML path: standard PyTorch model = YOLO(f"{model_name}.pt") model.to(device) + perf.model_load_ms = (time.perf_counter() - t0) * 1000 return model, model_format +# ─────────────────────────────────────────────────────────────────────────────── +# Main loop +# ─────────────────────────────────────────────────────────────────────────────── + def main(): args = parse_args() config = load_config(args) @@ -172,9 +269,12 @@ def main(): if isinstance(target_classes, str): target_classes = [c.strip() for c in target_classes.split(",")] + # Performance tracker + perf = PerfTracker(interval=PERF_STATS_INTERVAL) + # Load YOLO model (with CoreML auto-conversion on MPS) try: - model, model_format = load_model(model_name, device, use_coreml) + model, model_format = load_model(model_name, device, use_coreml, perf) emit({ "event": "ready", "model": f"yolo2026{model_size[0]}", @@ -183,6 +283,7 @@ def main(): "format": model_format, "classes": len(model.names), "fps": fps, + "model_load_ms": round(perf.model_load_ms, 1), "available_sizes": list(MODEL_SIZE_MAP.keys()), }) except Exception as e: @@ -215,11 +316,15 @@ def handle_signal(signum, frame): break if msg.get("event") == "frame": + t_frame_start = time.perf_counter() + frame_path = msg.get("frame_path") frame_id = msg.get("frame_id") camera_id = msg.get("camera_id", "unknown") timestamp = msg.get("timestamp", "") + # ── File check ── + t0 = time.perf_counter() if not frame_path or not Path(frame_path).exists(): emit({ "event": "error", @@ -227,11 +332,18 @@ def handle_signal(signum, frame): "message": f"Frame not found: {frame_path}", "retriable": True, }) + perf.error_count += 1 continue + 
perf.record("file_read", (time.perf_counter() - t0) * 1000) - # Run inference + # ── Inference ── try: + t0 = time.perf_counter() results = model(frame_path, conf=confidence, verbose=False) + perf.record("inference", (time.perf_counter() - t0) * 1000) + + # ── Postprocess ── + t0 = time.perf_counter() objects = [] for r in results: for box in r.boxes: @@ -244,7 +356,10 @@ def handle_signal(signum, frame): "confidence": round(float(box.conf[0]), 3), "bbox": [int(x1), int(y1), int(x2), int(y2)], }) + perf.record("postprocess", (time.perf_counter() - t0) * 1000) + # ── Emit ── + t0 = time.perf_counter() emit({ "event": "detections", "frame_id": frame_id, @@ -252,6 +367,8 @@ def handle_signal(signum, frame): "timestamp": timestamp, "objects": objects, }) + perf.record("emit", (time.perf_counter() - t0) * 1000) + except Exception as e: emit({ "event": "error", @@ -259,6 +376,15 @@ def handle_signal(signum, frame): "message": f"Inference error: {e}", "retriable": True, }) + perf.error_count += 1 + continue + + # ── Total frame time ── + perf.record("total", (time.perf_counter() - t_frame_start) * 1000) + perf.record_frame() + + # Emit final stats on shutdown + perf.emit_final() if __name__ == "__main__": From a37e2b72eb91b115689b491182350d1b107235b7 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 19:52:55 -0800 Subject: [PATCH 3/9] feat: shared hardware environment layer (env_config.py) for multi-backend inference New shared library at skills/lib/env_config.py: - HardwareEnv.detect(): auto-detect CUDA, ROCm, MPS, Intel, CPU - export_model(): auto-export to TensorRT/ONNX/CoreML/OpenVINO - load_optimized(): load cached model with PyTorch fallback detect.py: replaced inline CoreML logic with HardwareEnv (2 lines) deploy.sh: uses env_config for hardware detection + generalized pre-conversion New: requirements_intel.txt (OpenVINO) Updated: requirements_cuda (tensorrt), rocm (onnxruntime-rocm), cpu (onnxruntime) --- .../detection/yolo-detection-2026/deploy.sh 
| 114 +++-- .../yolo-detection-2026/requirements_cpu.txt | 2 + .../yolo-detection-2026/requirements_cuda.txt | 2 + .../requirements_intel.txt | 9 + .../yolo-detection-2026/requirements_rocm.txt | 2 + .../yolo-detection-2026/scripts/detect.py | 143 ++---- skills/lib/__init__.py | 1 + skills/lib/env_config.py | 432 ++++++++++++++++++ 8 files changed, 540 insertions(+), 165 deletions(-) create mode 100644 skills/detection/yolo-detection-2026/requirements_intel.txt create mode 100644 skills/lib/__init__.py create mode 100644 skills/lib/env_config.py diff --git a/skills/detection/yolo-detection-2026/deploy.sh b/skills/detection/yolo-detection-2026/deploy.sh index 35e62d7..a56dfdf 100755 --- a/skills/detection/yolo-detection-2026/deploy.sh +++ b/skills/detection/yolo-detection-2026/deploy.sh @@ -4,6 +4,8 @@ # Probes the system for Python, GPU backends, and installs the minimum # viable stack. Called by Aegis skill-runtime-manager during installation. # +# Uses skills/lib/env_config.py for hardware detection and model optimization. +# # Exit codes: # 0 = success # 1 = fatal error (no Python found and cannot install) @@ -13,6 +15,7 @@ set -euo pipefail SKILL_DIR="$(cd "$(dirname "$0")" && pwd)" VENV_DIR="$SKILL_DIR/.venv" +LIB_DIR="$(cd "$SKILL_DIR/../../lib" 2>/dev/null && pwd || echo "")" LOG_PREFIX="[YOLO-2026-deploy]" log() { echo "$LOG_PREFIX $*" >&2; } @@ -21,7 +24,6 @@ emit() { echo "$1"; } # JSON to stdout for Aegis to parse # ─── Step 1: Find or install Python ───────────────────────────────────────── find_python() { - # Check common Python 3 locations for cmd in python3.12 python3.11 python3.10 python3.9 python3; do if command -v "$cmd" &>/dev/null; then local ver @@ -36,7 +38,6 @@ find_python() { fi done - # Check conda if command -v conda &>/dev/null; then log "No system Python >=3.9 found, but conda is available" log "Creating conda environment..." 
@@ -48,7 +49,6 @@ find_python() { return 0 fi - # Check pyenv if command -v pyenv &>/dev/null; then log "No system Python >=3.9 found, using pyenv..." pyenv install -s 3.11.9 @@ -76,55 +76,60 @@ if [ ! -d "$VENV_DIR" ]; then "$PYTHON_CMD" -m venv "$VENV_DIR" fi -# Activate venv # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" PIP="$VENV_DIR/bin/pip" -# Upgrade pip "$PIP" install --upgrade pip -q 2>/dev/null || true emit '{"event": "progress", "stage": "venv", "message": "Virtual environment ready"}' -# ─── Step 3: Detect compute backend ───────────────────────────────────────── +# ─── Step 3: Detect hardware via env_config ───────────────────────────────── BACKEND="cpu" -detect_gpu() { - # NVIDIA CUDA +if [ -n "$LIB_DIR" ] && [ -f "$LIB_DIR/env_config.py" ]; then + log "Detecting hardware via env_config.py..." + DETECT_OUTPUT=$("$VENV_DIR/bin/python" -c " +import sys +sys.path.insert(0, '$LIB_DIR') +from env_config import HardwareEnv +env = HardwareEnv.detect() +print(env.backend) +" 2>&1) || true + + # The last line of output is the backend name + BACKEND=$(echo "$DETECT_OUTPUT" | tail -1) + + # Validate backend value + case "$BACKEND" in + cuda|rocm|mps|intel|cpu) ;; + *) + log "env_config returned unexpected backend '$BACKEND', falling back to heuristic" + BACKEND="cpu" + ;; + esac + + log "env_config detected backend: $BACKEND" +else + log "env_config.py not found, using heuristic detection..." 
+ + # Fallback: inline GPU detection (same as before) if command -v nvidia-smi &>/dev/null; then - local cuda_ver cuda_ver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1) if [ -n "$cuda_ver" ]; then BACKEND="cuda" log "Detected NVIDIA GPU (driver: $cuda_ver)" - return 0 fi - fi - - # AMD ROCm - if command -v rocm-smi &>/dev/null || [ -d "/opt/rocm" ]; then + elif command -v rocm-smi &>/dev/null || [ -d "/opt/rocm" ]; then BACKEND="rocm" log "Detected AMD ROCm" - return 0 - fi - - # Apple Silicon MPS - if [ "$(uname)" = "Darwin" ]; then - local arch - arch=$(uname -m) - if [ "$arch" = "arm64" ]; then - BACKEND="mps" - log "Detected Apple Silicon (MPS)" - return 0 - fi + elif [ "$(uname)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then + BACKEND="mps" + log "Detected Apple Silicon (MPS)" fi +fi - log "No GPU detected, using CPU backend" - return 0 -} - -detect_gpu emit "{\"event\": \"progress\", \"stage\": \"gpu\", \"backend\": \"$BACKEND\", \"message\": \"Compute backend: $BACKEND\"}" # ─── Step 4: Install requirements ──────────────────────────────────────────── @@ -142,24 +147,35 @@ emit "{\"event\": \"progress\", \"stage\": \"install\", \"message\": \"Installin "$PIP" install -r "$REQ_FILE" -q 2>&1 | tail -5 >&2 -# ─── Step 5: CoreML pre-conversion (MPS only) ─────────────────────────────── +# ─── Step 5: Pre-convert model to optimized format ─────────────────────────── -if [ "$BACKEND" = "mps" ]; then - log "Pre-converting default model to CoreML for ANE acceleration..." - emit '{"event": "progress", "stage": "coreml", "message": "Converting model to CoreML (~30s)..."}' +if [ "$BACKEND" != "cpu" ] || [ -f "$SKILL_DIR/requirements_cpu.txt" ]; then + log "Pre-converting model to optimized format for $BACKEND..." 
+ emit "{\"event\": \"progress\", \"stage\": \"optimize\", \"message\": \"Converting model for $BACKEND (~30-120s)...\"}" "$VENV_DIR/bin/python" -c " -from ultralytics import YOLO -model = YOLO('yolo26n.pt') -exported = model.export(format='coreml', half=True, nms=False) -print(f'CoreML model exported: {exported}') +import sys +sys.path.insert(0, '$LIB_DIR') +from env_config import HardwareEnv +env = HardwareEnv.detect() + +if env.framework_ok: + from ultralytics import YOLO + model = YOLO('yolo26n.pt') + result = env.export_model(model, 'yolo26n') + if result: + print(f'Optimized model exported: {result}') + else: + print('Export skipped or failed — will use PyTorch at runtime') +else: + print(f'Optimized runtime not available for {env.backend} — will use PyTorch') " 2>&1 | while read -r line; do log "$line"; done if [ $? -eq 0 ]; then - emit '{"event": "progress", "stage": "coreml", "message": "CoreML conversion complete"}' + emit "{\"event\": \"progress\", \"stage\": \"optimize\", \"message\": \"Model optimization complete\"}" else - log "WARNING: CoreML conversion failed, will use PyTorch MPS at runtime" - emit '{"event": "progress", "stage": "coreml", "message": "CoreML conversion failed — PyTorch MPS fallback"}' + log "WARNING: Model optimization failed, will use PyTorch at runtime" + emit "{\"event\": \"progress\", \"stage\": \"optimize\", \"message\": \"Optimization failed — PyTorch fallback\"}" fi fi @@ -167,14 +183,14 @@ fi log "Verifying installation..." 
"$VENV_DIR/bin/python" -c " -from ultralytics import YOLO -import torch -device = 'cpu' -if torch.cuda.is_available(): device = 'cuda' -elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): device = 'mps' -print(f'OK: ultralytics loaded, torch device={device}') +import sys +sys.path.insert(0, '$LIB_DIR') +from env_config import HardwareEnv +import json + +env = HardwareEnv.detect() +print(json.dumps(env.to_dict(), indent=2)) " 2>&1 | while read -r line; do log "$line"; done emit "{\"event\": \"complete\", \"backend\": \"$BACKEND\", \"message\": \"YOLO 2026 skill installed ($BACKEND backend)\"}" log "Done! Backend: $BACKEND" - diff --git a/skills/detection/yolo-detection-2026/requirements_cpu.txt b/skills/detection/yolo-detection-2026/requirements_cpu.txt index cdb172f..98c60bb 100644 --- a/skills/detection/yolo-detection-2026/requirements_cpu.txt +++ b/skills/detection/yolo-detection-2026/requirements_cpu.txt @@ -4,6 +4,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +onnxruntime>=1.18 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/requirements_cuda.txt b/skills/detection/yolo-detection-2026/requirements_cuda.txt index 0240bd7..d08623e 100644 --- a/skills/detection/yolo-detection-2026/requirements_cuda.txt +++ b/skills/detection/yolo-detection-2026/requirements_cuda.txt @@ -4,6 +4,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +tensorrt>=10.0 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/requirements_intel.txt b/skills/detection/yolo-detection-2026/requirements_intel.txt new file mode 100644 index 0000000..06ae3d2 --- /dev/null +++ b/skills/detection/yolo-detection-2026/requirements_intel.txt @@ -0,0 +1,9 @@ +# YOLO 2026 — Intel (OpenVINO) requirements +# Supports Intel CPUs, iGPUs, and NPUs (Meteor Lake+) +torch>=2.4.0 +torchvision>=0.19.0 +ultralytics>=8.3.0 +openvino>=2024.0 +numpy>=1.24.0 
+opencv-python-headless>=4.8.0 +Pillow>=10.0.0 diff --git a/skills/detection/yolo-detection-2026/requirements_rocm.txt b/skills/detection/yolo-detection-2026/requirements_rocm.txt index e665dff..0d0ca7f 100644 --- a/skills/detection/yolo-detection-2026/requirements_rocm.txt +++ b/skills/detection/yolo-detection-2026/requirements_rocm.txt @@ -4,6 +4,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +onnxruntime-rocm>=1.18 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/scripts/detect.py b/skills/detection/yolo-detection-2026/scripts/detect.py index 1aba9e5..fa31b31 100644 --- a/skills/detection/yolo-detection-2026/scripts/detect.py +++ b/skills/detection/yolo-detection-2026/scripts/detect.py @@ -6,8 +6,8 @@ stdin: {"event": "frame", "frame_id": N, "camera_id": "...", "frame_path": "...", ...} stdout: {"event": "detections", "frame_id": N, "camera_id": "...", "objects": [...]} -On Apple Silicon (MPS), auto-converts to CoreML for ~2x faster inference via ANE. -Emits periodic performance statistics via "perf_stats" events. +Uses env_config.py for automatic hardware detection and model optimization +(TensorRT, ONNX, CoreML, OpenVINO) with PyTorch fallback. Usage: python detect.py --config config.json @@ -21,6 +21,10 @@ import time from pathlib import Path +# Add skills/lib to path for shared modules +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent / "lib")) +from env_config import HardwareEnv # noqa: E402 + # Model size → ultralytics model name mapping (YOLO26, released Jan 2026) MODEL_SIZE_MAP = { @@ -49,24 +53,22 @@ def __init__(self, interval: int = PERF_STATS_INTERVAL): # One-time timings (ms) self.model_load_ms = 0.0 - self.coreml_export_ms = 0.0 + self.export_ms = 0.0 # Per-frame accumulators (ms) self._timings: dict[str, list[float]] = { - "file_read": [], # frame_path existence check + file I/O - "inference": [], # model(frame_path, ...) 
- "postprocess": [], # bbox extraction + filtering - "emit": [], # JSON serialization + print - "total": [], # end-to-end per frame + "file_read": [], + "inference": [], + "postprocess": [], + "emit": [], + "total": [], } def record(self, stage: str, duration_ms: float): - """Record a timing for a pipeline stage.""" if stage in self._timings: self._timings[stage].append(duration_ms) def record_frame(self): - """Increment frame counter and emit stats if interval reached.""" self.frame_count += 1 self.total_frames += 1 if self.frame_count >= self.interval: @@ -74,7 +76,6 @@ def record_frame(self): self.frame_count = 0 def emit_stats(self): - """Emit aggregate statistics as a JSONL event.""" stats = { "event": "perf_stats", "total_frames": self.total_frames, @@ -83,9 +84,8 @@ def emit_stats(self): "model_load_ms": round(self.model_load_ms, 1), "timings_ms": {}, } - - if self.coreml_export_ms > 0: - stats["coreml_export_ms"] = round(self.coreml_export_ms, 1) + if self.export_ms > 0: + stats["export_ms"] = round(self.export_ms, 1) for stage, values in self._timings.items(): if not values: @@ -100,15 +100,11 @@ def emit_stats(self): "p95": round(sorted_v[int(n * 0.95)], 2), "p99": round(sorted_v[int(n * 0.99)], 2), } - emit(stats) - - # Reset per-frame accumulators for next window for key in self._timings: self._timings[key].clear() def emit_final(self): - """Emit remaining stats on shutdown.""" if self._timings["total"]: self.emit_stats() @@ -134,7 +130,6 @@ def load_config(args): """Load config from JSON file, CLI args, or AEGIS_SKILL_PARAMS env var.""" import os - # Priority 1: AEGIS_SKILL_PARAMS env var (set by Aegis skill-runtime-manager) env_params = os.environ.get("AEGIS_SKILL_PARAMS") if env_params: try: @@ -142,14 +137,12 @@ def load_config(args): except json.JSONDecodeError: pass - # Priority 2: Config file if args.config: config_path = Path(args.config) if config_path.exists(): with open(config_path) as f: return json.load(f) - # Priority 3: CLI args return { 
"model_size": args.model_size, "confidence": args.confidence, @@ -159,90 +152,14 @@ def load_config(args): } -def select_device(preference: str) -> str: - """Select the best available inference device.""" - if preference not in ("auto", ""): - return preference - try: - import torch - if torch.cuda.is_available(): - return "cuda" - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return "mps" - except ImportError: - pass - return "cpu" - - def emit(event: dict): - """Write a JSON line to stdout.""" print(json.dumps(event), flush=True) def log(msg: str): - """Write a log message to stderr (visible in Aegis deploy console).""" print(f"[YOLO-2026] {msg}", file=sys.stderr, flush=True) -def try_coreml_export(model, model_name: str, perf: PerfTracker) -> "Path | None": - """Export PyTorch model to CoreML. Returns path to .mlpackage or None.""" - coreml_path = Path(f"{model_name}.mlpackage") - - # Already exported - if coreml_path.exists(): - log(f"CoreML model found: {coreml_path}") - return coreml_path - - try: - log(f"Exporting {model_name}.pt → CoreML (one-time, ~30s)...") - t0 = time.perf_counter() - exported = model.export(format="coreml", half=True, nms=False) - perf.coreml_export_ms = (time.perf_counter() - t0) * 1000 - exported_path = Path(exported) - if exported_path.exists(): - log(f"CoreML export complete: {exported_path} ({perf.coreml_export_ms:.0f}ms)") - return exported_path - log(f"CoreML export returned path {exported} but file not found") - except Exception as e: - log(f"CoreML export failed: {e}") - - return None - - -def load_model(model_name: str, device: str, use_coreml: bool, perf: PerfTracker): - """Load YOLO model — CoreML on MPS if available, PyTorch otherwise.""" - from ultralytics import YOLO - - model_format = "pytorch" - t0 = time.perf_counter() - - # Try CoreML on Apple Silicon - if device == "mps" and use_coreml: - pt_model = YOLO(f"{model_name}.pt") - coreml_path = try_coreml_export(pt_model, model_name, perf) - - 
if coreml_path: - try: - model = YOLO(str(coreml_path)) - model_format = "coreml" - perf.model_load_ms = (time.perf_counter() - t0) * 1000 - log(f"Loaded CoreML model ({coreml_path}) in {perf.model_load_ms:.0f}ms") - return model, model_format - except Exception as e: - log(f"CoreML load failed, falling back to PyTorch MPS: {e}") - - # Fallback: use the already-loaded PyTorch model on MPS - pt_model.to(device) - perf.model_load_ms = (time.perf_counter() - t0) * 1000 - return pt_model, model_format - - # Non-CoreML path: standard PyTorch - model = YOLO(f"{model_name}.pt") - model.to(device) - perf.model_load_ms = (time.perf_counter() - t0) * 1000 - return model, model_format - - # ─────────────────────────────────────────────────────────────────────────────── # Main loop # ─────────────────────────────────────────────────────────────────────────────── @@ -251,39 +168,39 @@ def main(): args = parse_args() config = load_config(args) - # Resolve config values model_size = config.get("model_size", "nano") - device = select_device(config.get("device", "auto")) confidence = config.get("confidence", 0.5) fps = config.get("fps", 5) - use_coreml = config.get("use_coreml", True) - - # Coerce use_coreml from string "true"/"false" if passed via env - if isinstance(use_coreml, str): - use_coreml = use_coreml.lower() in ("true", "1", "yes") + use_optimized = config.get("use_coreml", True) # legacy key, now covers all backends + if isinstance(use_optimized, str): + use_optimized = use_optimized.lower() in ("true", "1", "yes") - # Map size to ultralytics model name model_name = MODEL_SIZE_MAP.get(model_size, "yolo26n") target_classes = config.get("classes", ["person", "car", "dog", "cat"]) if isinstance(target_classes, str): target_classes = [c.strip() for c in target_classes.split(",")] - # Performance tracker + # ── Hardware detection & optimized model loading ── + env = HardwareEnv.detect() perf = PerfTracker(interval=PERF_STATS_INTERVAL) - # Load YOLO model (with CoreML 
auto-conversion on MPS) try: - model, model_format = load_model(model_name, device, use_coreml, perf) + model, model_format = env.load_optimized(model_name, use_optimized=use_optimized) + perf.model_load_ms = env.load_ms + perf.export_ms = env.export_ms + emit({ "event": "ready", "model": f"yolo2026{model_size[0]}", "model_size": model_size, - "device": device, + "device": env.device, + "backend": env.backend, "format": model_format, + "gpu": env.gpu_name, "classes": len(model.names), "fps": fps, - "model_load_ms": round(perf.model_load_ms, 1), + "model_load_ms": round(env.load_ms, 1), "available_sizes": list(MODEL_SIZE_MAP.keys()), }) except Exception as e: @@ -323,7 +240,6 @@ def handle_signal(signum, frame): camera_id = msg.get("camera_id", "unknown") timestamp = msg.get("timestamp", "") - # ── File check ── t0 = time.perf_counter() if not frame_path or not Path(frame_path).exists(): emit({ @@ -336,13 +252,11 @@ def handle_signal(signum, frame): continue perf.record("file_read", (time.perf_counter() - t0) * 1000) - # ── Inference ── try: t0 = time.perf_counter() results = model(frame_path, conf=confidence, verbose=False) perf.record("inference", (time.perf_counter() - t0) * 1000) - # ── Postprocess ── t0 = time.perf_counter() objects = [] for r in results: @@ -358,7 +272,6 @@ def handle_signal(signum, frame): }) perf.record("postprocess", (time.perf_counter() - t0) * 1000) - # ── Emit ── t0 = time.perf_counter() emit({ "event": "detections", @@ -379,11 +292,9 @@ def handle_signal(signum, frame): perf.error_count += 1 continue - # ── Total frame time ── perf.record("total", (time.perf_counter() - t_frame_start) * 1000) perf.record_frame() - # Emit final stats on shutdown perf.emit_final() diff --git a/skills/lib/__init__.py b/skills/lib/__init__.py new file mode 100644 index 0000000..33a0dde --- /dev/null +++ b/skills/lib/__init__.py @@ -0,0 +1 @@ +# DeepCamera Skills — Shared Library diff --git a/skills/lib/env_config.py b/skills/lib/env_config.py new file mode 
100644 index 0000000..1676e21 --- /dev/null +++ b/skills/lib/env_config.py @@ -0,0 +1,432 @@ +""" +env_config.py — Shared hardware environment detection and model optimization. + +Provides a single entry point for any DeepCamera skill to: + 1. Detect available compute hardware (NVIDIA, AMD, Apple, Intel, CPU) + 2. Auto-export models to the optimal inference format + 3. Load cached optimized models with PyTorch fallback + +Usage: + from lib.env_config import HardwareEnv + + env = HardwareEnv.detect() + model, fmt = env.load_optimized("yolo26n") +""" + +import json +import os +import platform +import shutil +import subprocess +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + + +def _log(msg: str): + """Log to stderr.""" + print(f"[env_config] {msg}", file=sys.stderr, flush=True) + + +# ─── Backend definitions ──────────────────────────────────────────────────── + +@dataclass +class BackendSpec: + """Specification for a compute backend's optimized export.""" + name: str # "cuda", "rocm", "mps", "intel", "cpu" + export_format: str # ultralytics export format string + model_suffix: str # file extension/dir to look for cached model + half: bool = True # use FP16 + extra_export_args: dict = field(default_factory=dict) + + +BACKEND_SPECS = { + "cuda": BackendSpec( + name="cuda", + export_format="engine", + model_suffix=".engine", + half=True, + ), + "rocm": BackendSpec( + name="rocm", + export_format="onnx", + model_suffix=".onnx", + half=False, # ONNX Runtime ROCm handles precision internally + ), + "mps": BackendSpec( + name="mps", + export_format="coreml", + model_suffix=".mlpackage", + half=True, + extra_export_args={"nms": False}, + ), + "intel": BackendSpec( + name="intel", + export_format="openvino", + model_suffix="_openvino_model", + half=True, + ), + "cpu": BackendSpec( + name="cpu", + export_format="onnx", + model_suffix=".onnx", + half=False, + ), +} + + +# ─── Hardware detection 
────────────────────────────────────────────────────── + +@dataclass +class HardwareEnv: + """Detected hardware environment with model optimization capabilities.""" + + backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu" + device: str = "cpu" # torch device string + export_format: str = "onnx" # optimal export format + gpu_name: str = "" # human-readable GPU name + gpu_memory_mb: int = 0 # GPU memory in MB + driver_version: str = "" # GPU driver version + framework_ok: bool = False # True if optimized runtime is importable + detection_details: dict = field(default_factory=dict) # raw detection info + + # Timing (populated by export/load) + export_ms: float = 0.0 + load_ms: float = 0.0 + + @staticmethod + def detect() -> "HardwareEnv": + """Probe the system and return a populated HardwareEnv.""" + env = HardwareEnv() + + # Try each backend in priority order + if env._try_cuda(): + pass + elif env._try_rocm(): + pass + elif env._try_mps(): + pass + elif env._try_intel(): + pass + else: + env._fallback_cpu() + + # Set export format from backend spec + spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"]) + env.export_format = spec.export_format + + # Check if optimized runtime is available + env.framework_ok = env._check_framework() + + _log(f"Detected: backend={env.backend}, device={env.device}, " + f"gpu={env.gpu_name or 'none'}, " + f"format={env.export_format}, " + f"framework_ok={env.framework_ok}") + + return env + + def _try_cuda(self) -> bool: + """Detect NVIDIA GPU via nvidia-smi and torch.""" + if not shutil.which("nvidia-smi"): + return False + try: + result = subprocess.run( + ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode != 0: + return False + + line = result.stdout.strip().split("\n")[0] + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 3: + self.backend = "cuda" + self.device = "cuda" + 
self.gpu_name = parts[0] + self.gpu_memory_mb = int(float(parts[1])) + self.driver_version = parts[2] + self.detection_details["nvidia_smi"] = line + _log(f"NVIDIA GPU: {self.gpu_name} ({self.gpu_memory_mb}MB, driver {self.driver_version})") + return True + except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e: + _log(f"nvidia-smi probe failed: {e}") + return False + + def _try_rocm(self) -> bool: + """Detect AMD GPU via rocm-smi or /opt/rocm.""" + has_rocm_smi = shutil.which("rocm-smi") is not None + has_rocm_dir = Path("/opt/rocm").is_dir() + + if not (has_rocm_smi or has_rocm_dir): + return False + + self.backend = "rocm" + self.device = "cuda" # ROCm exposes as CUDA in PyTorch + + if has_rocm_smi: + try: + result = subprocess.run( + ["rocm-smi", "--showproductname", "--csv"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + lines = result.stdout.strip().split("\n") + if len(lines) > 1: + self.gpu_name = lines[1].split(",")[0].strip() + self.detection_details["rocm_smi"] = result.stdout.strip() + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + try: + result = subprocess.run( + ["rocm-smi", "--showmeminfo", "vram", "--csv"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + # Parse total VRAM + for line in result.stdout.strip().split("\n")[1:]: + parts = line.split(",") + if len(parts) >= 2: + try: + self.gpu_memory_mb = int(float(parts[0].strip()) / (1024 * 1024)) + except ValueError: + pass + break + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + _log(f"AMD ROCm GPU: {self.gpu_name or 'detected'} ({self.gpu_memory_mb}MB)") + return True + + def _try_mps(self) -> bool: + """Detect Apple Silicon via uname + sysctl.""" + if platform.system() != "Darwin" or platform.machine() != "arm64": + return False + + self.backend = "mps" + self.device = "mps" + + # Get chip name + try: + result = subprocess.run( + ["sysctl", "-n", "machdep.cpu.brand_string"], + 
capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + self.gpu_name = result.stdout.strip() + except (subprocess.TimeoutExpired, FileNotFoundError): + self.gpu_name = "Apple Silicon" + + # Get total memory (shared with GPU on Apple Silicon) + try: + result = subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + self.gpu_memory_mb = int(int(result.stdout.strip()) / (1024 * 1024)) + except (subprocess.TimeoutExpired, FileNotFoundError, ValueError): + pass + + _log(f"Apple Silicon: {self.gpu_name} ({self.gpu_memory_mb}MB unified)") + return True + + def _try_intel(self) -> bool: + """Detect Intel OpenVINO-capable hardware.""" + # Check for OpenVINO installation + has_openvino = False + try: + import openvino # noqa: F401 + has_openvino = True + except ImportError: + # Check for system install + has_openvino = Path("/opt/intel/openvino").is_dir() + + if not has_openvino: + # Check CPU flags for Intel-specific features (AVX-512, AMX) + try: + if platform.system() == "Linux": + with open("/proc/cpuinfo") as f: + cpuinfo = f.read() + if "GenuineIntel" in cpuinfo: + self.backend = "intel" + self.device = "cpu" + self.gpu_name = "Intel CPU" + _log("Intel CPU detected (no OpenVINO installed)") + return True + except FileNotFoundError: + pass + return False + + self.backend = "intel" + self.device = "cpu" # OpenVINO handles device selection internally + self.gpu_name = "Intel (OpenVINO)" + + # Check for Intel GPU / NPU + try: + from openvino.runtime import Core + core = Core() + devices = core.available_devices + self.detection_details["openvino_devices"] = devices + if "GPU" in devices: + self.gpu_name = "Intel GPU (OpenVINO)" + if "NPU" in devices: + self.gpu_name = "Intel NPU (OpenVINO)" + _log(f"OpenVINO devices: {devices}") + except Exception: + pass + + _log(f"Intel: {self.gpu_name}") + return True + + def _fallback_cpu(self): + """CPU-only fallback.""" + self.backend = 
"cpu" + self.device = "cpu" + self.gpu_name = "" + + # Report CPU info + try: + self.detection_details["cpu"] = platform.processor() or "unknown" + except Exception: + pass + + _log("No GPU detected, using CPU backend") + + def _check_framework(self) -> bool: + """Check if the optimized inference runtime is importable.""" + checks = { + "cuda": lambda: __import__("tensorrt"), + "rocm": lambda: __import__("onnxruntime"), + "mps": lambda: __import__("coremltools"), + "intel": lambda: __import__("openvino"), + "cpu": lambda: __import__("onnxruntime"), + } + + check = checks.get(self.backend) + if not check: + return False + try: + check() + return True + except ImportError: + _log(f"Optimized runtime not installed for {self.backend}, " + f"will use PyTorch fallback") + return False + + # ─── Model export & loading ────────────────────────────────────────── + + def get_optimized_path(self, model_name: str) -> Path: + """Get the expected path for the optimized model.""" + spec = BACKEND_SPECS.get(self.backend, BACKEND_SPECS["cpu"]) + return Path(f"{model_name}{spec.model_suffix}") + + def export_model(self, model, model_name: str) -> Optional[Path]: + """Export PyTorch model to optimal format. 
Returns path or None.""" + if not self.framework_ok: + _log(f"Skipping export — {self.backend} runtime not available") + return None + + spec = BACKEND_SPECS.get(self.backend, BACKEND_SPECS["cpu"]) + optimized_path = self.get_optimized_path(model_name) + + # Already exported + if optimized_path.exists(): + _log(f"Cached model found: {optimized_path}") + return optimized_path + + try: + _log(f"Exporting {model_name}.pt → {spec.export_format} " + f"(one-time, may take 30-120s)...") + t0 = time.perf_counter() + + export_kwargs = { + "format": spec.export_format, + "half": spec.half, + } + export_kwargs.update(spec.extra_export_args) + + exported = model.export(**export_kwargs) + self.export_ms = (time.perf_counter() - t0) * 1000 + + exported_path = Path(exported) + if exported_path.exists(): + _log(f"Export complete: {exported_path} ({self.export_ms:.0f}ms)") + return exported_path + + _log(f"Export returned {exported} but path not found") + except Exception as e: + _log(f"Export failed ({spec.export_format}): {e}") + + return None + + def load_optimized(self, model_name: str, use_optimized: bool = True): + """ + Load the best available model for this hardware. 
+ + Returns: + (model, format_str) — the YOLO model and its format name + """ + from ultralytics import YOLO + + t0 = time.perf_counter() + + if use_optimized and self.framework_ok: + # Try loading from cache first (no export needed) + optimized_path = self.get_optimized_path(model_name) + if optimized_path.exists(): + try: + model = YOLO(str(optimized_path)) + self.load_ms = (time.perf_counter() - t0) * 1000 + _log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)") + return model, self.export_format + except Exception as e: + _log(f"Failed to load cached model: {e}") + + # Try exporting then loading + pt_model = YOLO(f"{model_name}.pt") + exported = self.export_model(pt_model, model_name) + if exported: + try: + model = YOLO(str(exported)) + self.load_ms = (time.perf_counter() - t0) * 1000 + _log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)") + return model, self.export_format + except Exception as e: + _log(f"Failed to load exported model: {e}") + + # Fallback: use the PT model we already loaded + _log("Falling back to PyTorch model") + pt_model.to(self.device) + self.load_ms = (time.perf_counter() - t0) * 1000 + return pt_model, "pytorch" + + # No optimization requested or framework missing + model = YOLO(f"{model_name}.pt") + model.to(self.device) + self.load_ms = (time.perf_counter() - t0) * 1000 + return model, "pytorch" + + def to_dict(self) -> dict: + """Serialize environment info for JSON output.""" + return { + "backend": self.backend, + "device": self.device, + "export_format": self.export_format, + "gpu_name": self.gpu_name, + "gpu_memory_mb": self.gpu_memory_mb, + "driver_version": self.driver_version, + "framework_ok": self.framework_ok, + "export_ms": round(self.export_ms, 1), + "load_ms": round(self.load_ms, 1), + } + + +# ─── CLI: run standalone for diagnostics ───────────────────────────────────── + +if __name__ == "__main__": + env = HardwareEnv.detect() + print(json.dumps(env.to_dict(), indent=2)) From 
753257e6a2c6c5c427a52e3e4a479fbf5b67296e Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 21:20:16 -0800 Subject: [PATCH 4/9] docs: comprehensive SKILL.md v2.0 + README update for multi-backend hardware acceleration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKILL.md v2.0: - Hardware acceleration section (TensorRT/CoreML/OpenVINO/ONNX) - env_config.py integration documentation - Auto start parameter documentation - Performance monitoring (perf_stats) documentation - Requirements files reference table - Updated protocol examples with backend/format/gpu fields README.md: - Updated yolo-detection-2026 description with acceleration formats - Added hardware environment layer to roadmap (completed) config.yaml: renamed use_coreml → use_optimized (all backends) detect.py: backward-compatible config key migration --- README.md | 3 +- skills/detection/yolo-detection-2026/SKILL.md | 86 ++++++++++++++++--- .../detection/yolo-detection-2026/config.yaml | 7 +- .../yolo-detection-2026/scripts/detect.py | 2 +- 4 files changed, 80 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 0648ee1..e37c29e 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Each skill is a self-contained module with its own model, parameters, and [commu | Category | Skill | What It Does | Status | |----------|-------|--------------|:------:| -| **Detection** | [`yolo-detection-2026`](skills/detection/yolo-detection-2026/) | Real-time 80+ class object detection | ✅| +| **Detection** | [`yolo-detection-2026`](skills/detection/yolo-detection-2026/) | Real-time 80+ class detection — auto-accelerated via TensorRT / CoreML / OpenVINO / ONNX | ✅| | | [`dinov3-grounding`](skills/detection/dinov3-grounding/) | Open-vocabulary detection — describe what to find | 📐 | | | [`person-recognition`](skills/detection/person-recognition/) | Re-identify individuals across cameras | 📐 | | **Analysis** | 
[`home-security-benchmark`](skills/analysis/home-security-benchmark/) | [131-test evaluation suite](#-homesec-bench--how-secure-is-your-local-ai) for LLM & VLM security performance | ✅ | @@ -52,6 +52,7 @@ Each skill is a self-contained module with its own model, parameters, and [commu - [x] **Skill architecture** — pluggable `SKILL.md` interface for all capabilities - [x] **Full skill catalog** — 18 skills across 9 categories with working scripts +- [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU - [ ] **Skill Store UI** — browse, install, and configure skills from Aegis - [ ] **Custom skill packaging** — community-contributed skills via GitHub - [ ] **GPU-optimized containers** — one-click Docker deployment per skill diff --git a/skills/detection/yolo-detection-2026/SKILL.md b/skills/detection/yolo-detection-2026/SKILL.md index 60677f9..95ff307 100644 --- a/skills/detection/yolo-detection-2026/SKILL.md +++ b/skills/detection/yolo-detection-2026/SKILL.md @@ -1,11 +1,19 @@ --- name: yolo-detection-2026 description: "YOLO 2026 — state-of-the-art real-time object detection" -version: 1.0.0 +version: 2.0.0 icon: assets/icon.png entry: scripts/detect.py +deploy: deploy.sh parameters: + - name: auto_start + label: "Auto Start" + type: boolean + default: false + description: "Start this skill automatically when Aegis launches" + group: Lifecycle + - name: model_size label: "Model Size" type: select @@ -45,6 +53,13 @@ parameters: description: "auto = best available GPU, else CPU" group: Performance + - name: use_optimized + label: "Hardware Acceleration" + type: boolean + default: true + description: "Auto-convert model to optimized format for faster inference" + group: Performance + capabilities: live_detection: script: scripts/detect.py @@ -64,6 +79,50 @@ Real-time object detection using the latest YOLO 2026 models. 
Detects 80+ COCO o | medium | Moderate | High | Accuracy-focused deployments | | large | Slower | Highest | Maximum detection quality | +## Hardware Acceleration + +The skill uses [`env_config.py`](../../lib/env_config.py) to **automatically detect hardware** and convert the model to the fastest format for your platform. Conversion happens once during deployment and is cached. + +| Platform | Backend | Optimized Format | Expected Speedup | +|----------|---------|------------------|:----------------:| +| NVIDIA GPU | CUDA | TensorRT `.engine` | ~3-5x | +| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | ~2x | +| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | ~2-3x | +| AMD GPU | ROCm | ONNX Runtime | ~1.5-2x | +| CPU (any) | CPU | ONNX Runtime | ~1.5x | + +### How It Works + +1. `deploy.sh` detects your hardware via `env_config.HardwareEnv.detect()` +2. Installs the matching `requirements_{backend}.txt` (e.g. CUDA → includes `tensorrt`) +3. Pre-converts the default model to the optimal format +4. At runtime, `detect.py` loads the cached optimized model automatically +5. Falls back to PyTorch if optimization fails + +Set `use_optimized: false` to disable auto-conversion and use raw PyTorch. + +## Auto Start + +Set `auto_start: true` in the skill config to start detection automatically when Aegis launches. The skill will begin processing frames from the selected camera immediately. + +```yaml +auto_start: true +model_size: nano +fps: 5 +``` + +## Performance Monitoring + +The skill emits `perf_stats` events every 50 frames with aggregate timing: + +```jsonl +{"event": "perf_stats", "total_frames": 50, "timings_ms": { + "inference": {"avg": 3.4, "p50": 3.2, "p95": 5.1}, + "postprocess": {"avg": 0.15, "p50": 0.12, "p95": 0.31}, + "total": {"avg": 3.6, "p50": 3.4, "p95": 5.5} +}} +``` + ## Protocol Communicates via **JSON lines** over stdin/stdout. @@ -75,10 +134,11 @@ Communicates via **JSON lines** over stdin/stdout. 
### Skill → Aegis (stdout) ```jsonl -{"event": "ready", "model": "yolo2026n", "device": "mps", "classes": 80, "fps": 5} +{"event": "ready", "model": "yolo2026n", "device": "mps", "backend": "mps", "format": "coreml", "gpu": "Apple M3", "classes": 80, "fps": 5} {"event": "detections", "frame_id": 42, "camera_id": "front_door", "timestamp": "...", "objects": [ {"class": "person", "confidence": 0.92, "bbox": [100, 50, 300, 400]} ]} +{"event": "perf_stats", "total_frames": 50, "timings_ms": {"inference": {"avg": 3.4}}} {"event": "error", "message": "...", "retriable": true} ``` @@ -90,20 +150,20 @@ Communicates via **JSON lines** over stdin/stdout. {"command": "stop"} ``` -## Hardware Support - -| Platform | Backend | Performance | -|----------|---------|-------------| -| Apple Silicon (M1+) | MPS | 20-30 FPS | -| NVIDIA GPU | CUDA | 25-60 FPS | -| AMD GPU | ROCm | 15-40 FPS | -| CPU (modern x86) | CPU | 5-15 FPS | -| Raspberry Pi 5 | CPU | 2-5 FPS | - ## Installation -The `deploy.sh` bootstrapper handles everything — Python environment, GPU backend detection, and dependency installation. No manual setup required. +The `deploy.sh` bootstrapper handles everything — Python environment, GPU backend detection, dependency installation, and model optimization. No manual setup required. 
```bash ./deploy.sh ``` + +### Requirements Files + +| File | Backend | Key Deps | +|------|---------|----------| +| `requirements_cuda.txt` | NVIDIA | `torch` (cu124), `tensorrt` | +| `requirements_mps.txt` | Apple | `torch`, `coremltools` | +| `requirements_intel.txt` | Intel | `torch`, `openvino` | +| `requirements_rocm.txt` | AMD | `torch` (rocm6.2), `onnxruntime-rocm` | +| `requirements_cpu.txt` | CPU | `torch` (cpu), `onnxruntime` | diff --git a/skills/detection/yolo-detection-2026/config.yaml b/skills/detection/yolo-detection-2026/config.yaml index 3c0c9e6..742146f 100644 --- a/skills/detection/yolo-detection-2026/config.yaml +++ b/skills/detection/yolo-detection-2026/config.yaml @@ -57,9 +57,10 @@ params: - { value: mps, label: "Apple Silicon (MPS)" } - { value: rocm, label: "AMD ROCm" } - - key: use_coreml - label: CoreML Acceleration + - key: use_optimized + label: Hardware Acceleration type: boolean default: true - description: "Convert model to CoreML for ~2x faster inference on Apple Silicon (ANE)" + description: "Auto-convert model to optimized format (TensorRT/CoreML/OpenVINO/ONNX) for faster inference" + diff --git a/skills/detection/yolo-detection-2026/scripts/detect.py b/skills/detection/yolo-detection-2026/scripts/detect.py index fa31b31..5608b64 100644 --- a/skills/detection/yolo-detection-2026/scripts/detect.py +++ b/skills/detection/yolo-detection-2026/scripts/detect.py @@ -171,7 +171,7 @@ def main(): model_size = config.get("model_size", "nano") confidence = config.get("confidence", 0.5) fps = config.get("fps", 5) - use_optimized = config.get("use_coreml", True) # legacy key, now covers all backends + use_optimized = config.get("use_optimized", config.get("use_coreml", True)) if isinstance(use_optimized, str): use_optimized = use_optimized.lower() in ("true", "1", "yes") From cc637243abfa03385029987a9adda6da11179fd2 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 21:26:01 -0800 Subject: [PATCH 5/9] 
=?UTF-8?q?docs:=20highlight=20YOLO=202026=20detection?= =?UTF-8?q?=20in=20README=20=E2=80=94=20YOLO26=20models,=20hardware=20acce?= =?UTF-8?q?l,=20AI-driven=20skill=20integration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/README.md b/README.md index e37c29e..fe14c34 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,53 @@ The easiest way to run DeepCamera's AI skills. Aegis connects everything — cam +## 🎯 YOLO 2026 — Real-Time Object Detection + +State-of-the-art detection running locally on **any hardware**, fully integrated as a [DeepCamera skill](skills/detection/yolo-detection-2026/). + +### YOLO26 Models + +YOLO26 (Jan 2026) eliminates NMS and DFL for cleaner exports and lower latency. Pick the size that fits your hardware: + +| Model | Params | Latency (optimized) | Use Case | +|-------|--------|:-------------------:|----------| +| **yolo26n** (nano) | 2.6M | ~2ms | Edge devices, real-time on CPU | +| **yolo26s** (small) | 11.2M | ~5ms | Balanced speed & accuracy | +| **yolo26m** (medium) | 25.4M | ~12ms | Accuracy-focused | +| **yolo26l** (large) | 52.3M | ~25ms | Maximum detection quality | + +All models detect **80+ COCO classes**: people, vehicles, animals, everyday objects. 
+ +### Hardware Acceleration + +The shared [`env_config.py`](skills/lib/env_config.py) **auto-detects your GPU** and converts the model to the fastest native format — zero manual setup: + +| Your Hardware | Optimized Format | Runtime | Speedup vs PyTorch | +|---------------|-----------------|---------|:------------------:| +| **NVIDIA GPU** (RTX, Jetson) | TensorRT `.engine` | CUDA | **3-5x** | +| **Apple Silicon** (M1–M4) | CoreML `.mlpackage` | ANE + GPU | **~2x** | +| **Intel** (CPU, iGPU, NPU) | OpenVINO IR `.xml` | OpenVINO | **2-3x** | +| **AMD GPU** (RX, MI) | ONNX Runtime | ROCm | **1.5-2x** | +| **Any CPU** | ONNX Runtime | CPU | **~1.5x** | + +### Aegis Skill Integration + +Detection runs as a **parallel pipeline** alongside VLM analysis — never blocks your AI agent: + +``` +Camera → Frame Governor → detect.py (JSONL) → Aegis IPC → Live Overlay + 5 FPS ↓ + perf_stats (p50/p95/p99 latency) +``` + +- 🖱️ **Click to set up** — one button in Aegis installs everything, no terminal needed +- 🤖 **AI-driven environment config** — autonomous agent detects your GPU, installs the right framework (CUDA/ROCm/CoreML/OpenVINO), converts models, and verifies the setup +- 📺 **Live bounding boxes** — detection results rendered as overlays on RTSP camera streams +- 📊 **Built-in performance profiling** — aggregate latency stats (p50/p95/p99) emitted every 50 frames +- ⚡ **Auto start** — set `auto_start: true` to begin detecting when Aegis launches + +📖 [Full Skill Documentation →](skills/detection/yolo-detection-2026/SKILL.md) + ## 📊 HomeSec-Bench — How Secure Is Your Local AI? **HomeSec-Bench** is a 131-test security benchmark that measures how well your local AI performs as a security guard. It tests what matters: Can it detect a person in fog? Classify a break-in vs. a delivery? Resist prompt injection? Route alerts correctly at 3 AM? 
From cc1fd412230f8c061a3f60eafc55a25efc4294ac Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 21:28:11 -0800 Subject: [PATCH 6/9] =?UTF-8?q?docs:=20update=20roadmap=20=E2=80=94=20Skil?= =?UTF-8?q?l=20Store=20done,=20hardware-aware=20install=20done,=20add=20AI?= =?UTF-8?q?-assisted=20install=20+=20skill=20dev=20guide?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fe14c34..e45040f 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,10 @@ Each skill is a self-contained module with its own model, parameters, and [commu - [x] **Skill architecture** — pluggable `SKILL.md` interface for all capabilities - [x] **Full skill catalog** — 18 skills across 9 categories with working scripts - [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU -- [ ] **Skill Store UI** — browse, install, and configure skills from Aegis -- [ ] **Custom skill packaging** — community-contributed skills via GitHub -- [ ] **GPU-optimized containers** — one-click Docker deployment per skill +- [x] **Skill Store UI** — browse, install, and configure skills from Aegis +- [x] **GPU / NPU / CPU (AIPC) aware installation** — auto-detect hardware, install matching frameworks, convert models to optimal format +- [ ] **AI/LLM-assisted skill installation** — community-contributed skills installed and configured via AI agent +- [ ] **Skill development guide** — documentation for building custom skills ## 🚀 Getting Started with [SharpAI Aegis](https://www.sharpai.org) From f476ee40470cd21eb7ec273b66a9a218de0e6cec Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 21:29:54 -0800 Subject: [PATCH 7/9] docs: mark AI/LLM-assisted skill installation as done --- README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e45040f..4820480 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Each skill is a self-contained module with its own model, parameters, and [commu - [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU - [x] **Skill Store UI** — browse, install, and configure skills from Aegis - [x] **GPU / NPU / CPU (AIPC) aware installation** — auto-detect hardware, install matching frameworks, convert models to optimal format -- [ ] **AI/LLM-assisted skill installation** — community-contributed skills installed and configured via AI agent +- [x] **AI/LLM-assisted skill installation** — community-contributed skills installed and configured via AI agent - [ ] **Skill development guide** — documentation for building custom skills ## 🚀 Getting Started with [SharpAI Aegis](https://www.sharpai.org) From cd6b7c5c9109ebe61671f50a2ec4e0242cd7a34c Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 21:30:58 -0800 Subject: [PATCH 8/9] =?UTF-8?q?docs:=20roadmap=20=E2=80=94=20change=20skil?= =?UTF-8?q?l=20catalog=20to=20ongoing=20skill=20development?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4820480..04a7a8d 100644 --- a/README.md +++ b/README.md @@ -51,12 +51,11 @@ Each skill is a self-contained module with its own model, parameters, and [commu ### 🗺️ Roadmap - [x] **Skill architecture** — pluggable `SKILL.md` interface for all capabilities -- [x] **Full skill catalog** — 18 skills across 9 categories with working scripts -- [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU - [x] **Skill Store 
UI** — browse, install, and configure skills from Aegis -- [x] **GPU / NPU / CPU (AIPC) aware installation** — auto-detect hardware, install matching frameworks, convert models to optimal format - [x] **AI/LLM-assisted skill installation** — community-contributed skills installed and configured via AI agent -- [ ] **Skill development guide** — documentation for building custom skills +- [x] **GPU / NPU / CPU (AIPC) aware installation** — auto-detect hardware, install matching frameworks, convert models to optimal format +- [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU +- [ ] **Skill development** — 18 skills across 9 categories, actively expanding with community contributions ## 🚀 Getting Started with [SharpAI Aegis](https://www.sharpai.org) From 8b19435b5a31e569afb7a59957697ce06e80eb61 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Sat, 7 Mar 2026 21:31:55 -0800 Subject: [PATCH 9/9] docs: move roadmap above skill catalog for better hierarchy --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 04a7a8d..b821942 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,15 @@ --- +## 🗺️ Roadmap + +- [x] **Skill architecture** — pluggable `SKILL.md` interface for all capabilities +- [x] **Skill Store UI** — browse, install, and configure skills from Aegis +- [x] **AI/LLM-assisted skill installation** — community-contributed skills installed and configured via AI agent +- [x] **GPU / NPU / CPU (AIPC) aware installation** — auto-detect hardware, install matching frameworks, convert models to optimal format +- [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU +- [ ] **Skill development** — 18 skills across 9 categories, actively expanding with 
community contributions + ## 🧩 Skill Catalog Each skill is a self-contained module with its own model, parameters, and [communication protocol](docs/skill-development.md). See the [Skill Development Guide](docs/skill-development.md) and [Platform Parameters](docs/skill-params.md) to build your own. @@ -48,14 +57,6 @@ Each skill is a self-contained module with its own model, parameters, and [commu > **Registry:** All skills are indexed in [`skills.json`](skills.json) for programmatic discovery. -### 🗺️ Roadmap - -- [x] **Skill architecture** — pluggable `SKILL.md` interface for all capabilities -- [x] **Skill Store UI** — browse, install, and configure skills from Aegis -- [x] **AI/LLM-assisted skill installation** — community-contributed skills installed and configured via AI agent -- [x] **GPU / NPU / CPU (AIPC) aware installation** — auto-detect hardware, install matching frameworks, convert models to optimal format -- [x] **Hardware environment layer** — shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU -- [ ] **Skill development** — 18 skills across 9 categories, actively expanding with community contributions ## 🚀 Getting Started with [SharpAI Aegis](https://www.sharpai.org)