diff --git a/README.md b/README.md index 0648ee1..b821942 100644 --- a/README.md +++ b/README.md @@ -24,13 +24,22 @@ --- +## πŸ—ΊοΈ Roadmap + +- [x] **Skill architecture** β€” pluggable `SKILL.md` interface for all capabilities +- [x] **Skill Store UI** β€” browse, install, and configure skills from Aegis +- [x] **AI/LLM-assisted skill installation** β€” community-contributed skills installed and configured via AI agent +- [x] **GPU / NPU / CPU (AIPC) aware installation** β€” auto-detect hardware, install matching frameworks, convert models to optimal format +- [x] **Hardware environment layer** β€” shared [`env_config.py`](skills/lib/env_config.py) for auto-detection + model optimization across NVIDIA, AMD, Apple Silicon, Intel, and CPU +- [ ] **Skill development** β€” 18 skills across 9 categories, actively expanding with community contributions + ## 🧩 Skill Catalog Each skill is a self-contained module with its own model, parameters, and [communication protocol](docs/skill-development.md). See the [Skill Development Guide](docs/skill-development.md) and [Platform Parameters](docs/skill-params.md) to build your own. 
| Category | Skill | What It Does | Status | |----------|-------|--------------|:------:| -| **Detection** | [`yolo-detection-2026`](skills/detection/yolo-detection-2026/) | Real-time 80+ class object detection | βœ…| +| **Detection** | [`yolo-detection-2026`](skills/detection/yolo-detection-2026/) | Real-time 80+ class detection β€” auto-accelerated via TensorRT / CoreML / OpenVINO / ONNX | βœ…| | | [`dinov3-grounding`](skills/detection/dinov3-grounding/) | Open-vocabulary detection β€” describe what to find | πŸ“ | | | [`person-recognition`](skills/detection/person-recognition/) | Re-identify individuals across cameras | πŸ“ | | **Analysis** | [`home-security-benchmark`](skills/analysis/home-security-benchmark/) | [131-test evaluation suite](#-homesec-bench--how-secure-is-your-local-ai) for LLM & VLM security performance | βœ… | @@ -48,13 +57,6 @@ Each skill is a self-contained module with its own model, parameters, and [commu > **Registry:** All skills are indexed in [`skills.json`](skills.json) for programmatic discovery. -### πŸ—ΊοΈ Roadmap - -- [x] **Skill architecture** β€” pluggable `SKILL.md` interface for all capabilities -- [x] **Full skill catalog** β€” 18 skills across 9 categories with working scripts -- [ ] **Skill Store UI** β€” browse, install, and configure skills from Aegis -- [ ] **Custom skill packaging** β€” community-contributed skills via GitHub -- [ ] **GPU-optimized containers** β€” one-click Docker deployment per skill ## πŸš€ Getting Started with [SharpAI Aegis](https://www.sharpai.org) @@ -89,6 +91,53 @@ The easiest way to run DeepCamera's AI skills. Aegis connects everything β€” cam +## 🎯 YOLO 2026 β€” Real-Time Object Detection + +State-of-the-art detection running locally on **any hardware**, fully integrated as a [DeepCamera skill](skills/detection/yolo-detection-2026/). + +### YOLO26 Models + +YOLO26 (Jan 2026) eliminates NMS and DFL for cleaner exports and lower latency. 
Pick the size that fits your hardware: + +| Model | Params | Latency (optimized) | Use Case | +|-------|--------|:-------------------:|----------| +| **yolo26n** (nano) | 2.6M | ~2ms | Edge devices, real-time on CPU | +| **yolo26s** (small) | 11.2M | ~5ms | Balanced speed & accuracy | +| **yolo26m** (medium) | 25.4M | ~12ms | Accuracy-focused | +| **yolo26l** (large) | 52.3M | ~25ms | Maximum detection quality | + +All models detect **80+ COCO classes**: people, vehicles, animals, everyday objects. + +### Hardware Acceleration + +The shared [`env_config.py`](skills/lib/env_config.py) **auto-detects your GPU** and converts the model to the fastest native format β€” zero manual setup: + +| Your Hardware | Optimized Format | Runtime | Speedup vs PyTorch | +|---------------|-----------------|---------|:------------------:| +| **NVIDIA GPU** (RTX, Jetson) | TensorRT `.engine` | CUDA | **3-5x** | +| **Apple Silicon** (M1–M4) | CoreML `.mlpackage` | ANE + GPU | **~2x** | +| **Intel** (CPU, iGPU, NPU) | OpenVINO IR `.xml` | OpenVINO | **2-3x** | +| **AMD GPU** (RX, MI) | ONNX Runtime | ROCm | **1.5-2x** | +| **Any CPU** | ONNX Runtime | CPU | **~1.5x** | + +### Aegis Skill Integration + +Detection runs as a **parallel pipeline** alongside VLM analysis β€” never blocks your AI agent: + +``` +Camera β†’ Frame Governor β†’ detect.py (JSONL) β†’ Aegis IPC β†’ Live Overlay + 5 FPS ↓ + perf_stats (p50/p95/p99 latency) +``` + +- πŸ–±οΈ **Click to setup** β€” one button in Aegis installs everything, no terminal needed +- πŸ€– **AI-driven environment config** β€” autonomous agent detects your GPU, installs the right framework (CUDA/ROCm/CoreML/OpenVINO), converts models, and verifies the setup +- πŸ“Ί **Live bounding boxes** β€” detection results rendered as overlays on RTSP camera streams +- πŸ“Š **Built-in performance profiling** β€” aggregate latency stats (p50/p95/p99) emitted every 50 frames +- ⚑ **Auto start** β€” set `auto_start: true` to begin detecting when Aegis launches 
+ +πŸ“– [Full Skill Documentation β†’](skills/detection/yolo-detection-2026/SKILL.md) + ## πŸ“Š HomeSec-Bench β€” How Secure Is Your Local AI? **HomeSec-Bench** is a 131-test security benchmark that measures how well your local AI performs as a security guard. It tests what matters: Can it detect a person in fog? Classify a break-in vs. a delivery? Resist prompt injection? Route alerts correctly at 3 AM? diff --git a/skills/detection/yolo-detection-2026/SKILL.md b/skills/detection/yolo-detection-2026/SKILL.md index 60677f9..95ff307 100644 --- a/skills/detection/yolo-detection-2026/SKILL.md +++ b/skills/detection/yolo-detection-2026/SKILL.md @@ -1,11 +1,19 @@ --- name: yolo-detection-2026 description: "YOLO 2026 β€” state-of-the-art real-time object detection" -version: 1.0.0 +version: 2.0.0 icon: assets/icon.png entry: scripts/detect.py +deploy: deploy.sh parameters: + - name: auto_start + label: "Auto Start" + type: boolean + default: false + description: "Start this skill automatically when Aegis launches" + group: Lifecycle + - name: model_size label: "Model Size" type: select @@ -45,6 +53,13 @@ parameters: description: "auto = best available GPU, else CPU" group: Performance + - name: use_optimized + label: "Hardware Acceleration" + type: boolean + default: true + description: "Auto-convert model to optimized format for faster inference" + group: Performance + capabilities: live_detection: script: scripts/detect.py @@ -64,6 +79,50 @@ Real-time object detection using the latest YOLO 2026 models. Detects 80+ COCO o | medium | Moderate | High | Accuracy-focused deployments | | large | Slower | Highest | Maximum detection quality | +## Hardware Acceleration + +The skill uses [`env_config.py`](../../lib/env_config.py) to **automatically detect hardware** and convert the model to the fastest format for your platform. Conversion happens once during deployment and is cached. 
+ +| Platform | Backend | Optimized Format | Expected Speedup | +|----------|---------|------------------|:----------------:| +| NVIDIA GPU | CUDA | TensorRT `.engine` | ~3-5x | +| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | ~2x | +| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | ~2-3x | +| AMD GPU | ROCm | ONNX Runtime | ~1.5-2x | +| CPU (any) | CPU | ONNX Runtime | ~1.5x | + +### How It Works + +1. `deploy.sh` detects your hardware via `env_config.HardwareEnv.detect()` +2. Installs the matching `requirements_{backend}.txt` (e.g. CUDA β†’ includes `tensorrt`) +3. Pre-converts the default model to the optimal format +4. At runtime, `detect.py` loads the cached optimized model automatically +5. Falls back to PyTorch if optimization fails + +Set `use_optimized: false` to disable auto-conversion and use raw PyTorch. + +## Auto Start + +Set `auto_start: true` in the skill config to start detection automatically when Aegis launches. The skill will begin processing frames from the selected camera immediately. + +```yaml +auto_start: true +model_size: nano +fps: 5 +``` + +## Performance Monitoring + +The skill emits `perf_stats` events every 50 frames with aggregate timing: + +```jsonl +{"event": "perf_stats", "total_frames": 50, "timings_ms": { + "inference": {"avg": 3.4, "p50": 3.2, "p95": 5.1}, + "postprocess": {"avg": 0.15, "p50": 0.12, "p95": 0.31}, + "total": {"avg": 3.6, "p50": 3.4, "p95": 5.5} +}} +``` + ## Protocol Communicates via **JSON lines** over stdin/stdout. @@ -75,10 +134,11 @@ Communicates via **JSON lines** over stdin/stdout. 
### Skill β†’ Aegis (stdout) ```jsonl -{"event": "ready", "model": "yolo2026n", "device": "mps", "classes": 80, "fps": 5} +{"event": "ready", "model": "yolo2026n", "device": "mps", "backend": "mps", "format": "coreml", "gpu": "Apple M3", "classes": 80, "fps": 5} {"event": "detections", "frame_id": 42, "camera_id": "front_door", "timestamp": "...", "objects": [ {"class": "person", "confidence": 0.92, "bbox": [100, 50, 300, 400]} ]} +{"event": "perf_stats", "total_frames": 50, "timings_ms": {"inference": {"avg": 3.4}}} {"event": "error", "message": "...", "retriable": true} ``` @@ -90,20 +150,20 @@ Communicates via **JSON lines** over stdin/stdout. {"command": "stop"} ``` -## Hardware Support - -| Platform | Backend | Performance | -|----------|---------|-------------| -| Apple Silicon (M1+) | MPS | 20-30 FPS | -| NVIDIA GPU | CUDA | 25-60 FPS | -| AMD GPU | ROCm | 15-40 FPS | -| CPU (modern x86) | CPU | 5-15 FPS | -| Raspberry Pi 5 | CPU | 2-5 FPS | - ## Installation -The `deploy.sh` bootstrapper handles everything β€” Python environment, GPU backend detection, and dependency installation. No manual setup required. +The `deploy.sh` bootstrapper handles everything β€” Python environment, GPU backend detection, dependency installation, and model optimization. No manual setup required. 
```bash ./deploy.sh ``` + +### Requirements Files + +| File | Backend | Key Deps | +|------|---------|----------| +| `requirements_cuda.txt` | NVIDIA | `torch` (cu124), `tensorrt` | +| `requirements_mps.txt` | Apple | `torch`, `coremltools` | +| `requirements_intel.txt` | Intel | `torch`, `openvino` | +| `requirements_rocm.txt` | AMD | `torch` (rocm6.2), `onnxruntime-rocm` | +| `requirements_cpu.txt` | CPU | `torch` (cpu), `onnxruntime` | diff --git a/skills/detection/yolo-detection-2026/config.yaml b/skills/detection/yolo-detection-2026/config.yaml index d37254b..742146f 100644 --- a/skills/detection/yolo-detection-2026/config.yaml +++ b/skills/detection/yolo-detection-2026/config.yaml @@ -56,3 +56,11 @@ params: - { value: cuda, label: "NVIDIA CUDA" } - { value: mps, label: "Apple Silicon (MPS)" } - { value: rocm, label: "AMD ROCm" } + + - key: use_optimized + label: Hardware Acceleration + type: boolean + default: true + description: "Auto-convert model to optimized format (TensorRT/CoreML/OpenVINO/ONNX) for faster inference" + + diff --git a/skills/detection/yolo-detection-2026/deploy.sh b/skills/detection/yolo-detection-2026/deploy.sh index 9ba2bc6..a56dfdf 100755 --- a/skills/detection/yolo-detection-2026/deploy.sh +++ b/skills/detection/yolo-detection-2026/deploy.sh @@ -4,6 +4,8 @@ # Probes the system for Python, GPU backends, and installs the minimum # viable stack. Called by Aegis skill-runtime-manager during installation. # +# Uses skills/lib/env_config.py for hardware detection and model optimization. 
+# # Exit codes: # 0 = success # 1 = fatal error (no Python found and cannot install) @@ -13,6 +15,7 @@ set -euo pipefail SKILL_DIR="$(cd "$(dirname "$0")" && pwd)" VENV_DIR="$SKILL_DIR/.venv" +LIB_DIR="$(cd "$SKILL_DIR/../../lib" 2>/dev/null && pwd || echo "")" LOG_PREFIX="[YOLO-2026-deploy]" log() { echo "$LOG_PREFIX $*" >&2; } @@ -21,7 +24,6 @@ emit() { echo "$1"; } # JSON to stdout for Aegis to parse # ─── Step 1: Find or install Python ───────────────────────────────────────── find_python() { - # Check common Python 3 locations for cmd in python3.12 python3.11 python3.10 python3.9 python3; do if command -v "$cmd" &>/dev/null; then local ver @@ -36,7 +38,6 @@ find_python() { fi done - # Check conda if command -v conda &>/dev/null; then log "No system Python >=3.9 found, but conda is available" log "Creating conda environment..." @@ -48,7 +49,6 @@ find_python() { return 0 fi - # Check pyenv if command -v pyenv &>/dev/null; then log "No system Python >=3.9 found, using pyenv..." pyenv install -s 3.11.9 @@ -76,55 +76,60 @@ if [ ! -d "$VENV_DIR" ]; then "$PYTHON_CMD" -m venv "$VENV_DIR" fi -# Activate venv # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" PIP="$VENV_DIR/bin/pip" -# Upgrade pip "$PIP" install --upgrade pip -q 2>/dev/null || true emit '{"event": "progress", "stage": "venv", "message": "Virtual environment ready"}' -# ─── Step 3: Detect compute backend ───────────────────────────────────────── +# ─── Step 3: Detect hardware via env_config ───────────────────────────────── BACKEND="cpu" -detect_gpu() { - # NVIDIA CUDA +if [ -n "$LIB_DIR" ] && [ -f "$LIB_DIR/env_config.py" ]; then + log "Detecting hardware via env_config.py..." 
+ DETECT_OUTPUT=$("$VENV_DIR/bin/python" -c " +import sys +sys.path.insert(0, '$LIB_DIR') +from env_config import HardwareEnv +env = HardwareEnv.detect() +print(env.backend) +" 2>&1) || true + + # The last line of output is the backend name + BACKEND=$(echo "$DETECT_OUTPUT" | tail -1) + + # Validate backend value + case "$BACKEND" in + cuda|rocm|mps|intel|cpu) ;; + *) + log "env_config returned unexpected backend '$BACKEND', falling back to heuristic" + BACKEND="cpu" + ;; + esac + + log "env_config detected backend: $BACKEND" +else + log "env_config.py not found, using heuristic detection..." + + # Fallback: inline GPU detection (same as before) if command -v nvidia-smi &>/dev/null; then - local cuda_ver cuda_ver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1) if [ -n "$cuda_ver" ]; then BACKEND="cuda" log "Detected NVIDIA GPU (driver: $cuda_ver)" - return 0 fi - fi - - # AMD ROCm - if command -v rocm-smi &>/dev/null || [ -d "/opt/rocm" ]; then + elif command -v rocm-smi &>/dev/null || [ -d "/opt/rocm" ]; then BACKEND="rocm" log "Detected AMD ROCm" - return 0 + elif [ "$(uname)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then + BACKEND="mps" + log "Detected Apple Silicon (MPS)" fi +fi - # Apple Silicon MPS - if [ "$(uname)" = "Darwin" ]; then - local arch - arch=$(uname -m) - if [ "$arch" = "arm64" ]; then - BACKEND="mps" - log "Detected Apple Silicon (MPS)" - return 0 - fi - fi - - log "No GPU detected, using CPU backend" - return 0 -} - -detect_gpu emit "{\"event\": \"progress\", \"stage\": \"gpu\", \"backend\": \"$BACKEND\", \"message\": \"Compute backend: $BACKEND\"}" # ─── Step 4: Install requirements ──────────────────────────────────────────── @@ -142,16 +147,49 @@ emit "{\"event\": \"progress\", \"stage\": \"install\", \"message\": \"Installin "$PIP" install -r "$REQ_FILE" -q 2>&1 | tail -5 >&2 -# ─── Step 5: Verify installation ──────────────────────────────────────────── +# ─── Step 5: Pre-convert model to optimized 
format ─────────────────────────── + +if [ "$BACKEND" != "cpu" ] || [ -f "$SKILL_DIR/requirements_cpu.txt" ]; then + log "Pre-converting model to optimized format for $BACKEND..." + emit "{\"event\": \"progress\", \"stage\": \"optimize\", \"message\": \"Converting model for $BACKEND (~30-120s)...\"}" + + "$VENV_DIR/bin/python" -c " +import sys +sys.path.insert(0, '$LIB_DIR') +from env_config import HardwareEnv +env = HardwareEnv.detect() + +if env.framework_ok: + from ultralytics import YOLO + model = YOLO('yolo26n.pt') + result = env.export_model(model, 'yolo26n') + if result: + print(f'Optimized model exported: {result}') + else: + print('Export skipped or failed β€” will use PyTorch at runtime') +else: + print(f'Optimized runtime not available for {env.backend} β€” will use PyTorch') +" 2>&1 | while read -r line; do log "$line"; done + + if [ $? -eq 0 ]; then + emit "{\"event\": \"progress\", \"stage\": \"optimize\", \"message\": \"Model optimization complete\"}" + else + log "WARNING: Model optimization failed, will use PyTorch at runtime" + emit "{\"event\": \"progress\", \"stage\": \"optimize\", \"message\": \"Optimization failed β€” PyTorch fallback\"}" + fi +fi + +# ─── Step 6: Verify installation ──────────────────────────────────────────── log "Verifying installation..." 
"$VENV_DIR/bin/python" -c " -from ultralytics import YOLO -import torch -device = 'cpu' -if torch.cuda.is_available(): device = 'cuda' -elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): device = 'mps' -print(f'OK: ultralytics loaded, torch device={device}') +import sys +sys.path.insert(0, '$LIB_DIR') +from env_config import HardwareEnv +import json + +env = HardwareEnv.detect() +print(json.dumps(env.to_dict(), indent=2)) " 2>&1 | while read -r line; do log "$line"; done emit "{\"event\": \"complete\", \"backend\": \"$BACKEND\", \"message\": \"YOLO 2026 skill installed ($BACKEND backend)\"}" diff --git a/skills/detection/yolo-detection-2026/requirements_cpu.txt b/skills/detection/yolo-detection-2026/requirements_cpu.txt index cdb172f..98c60bb 100644 --- a/skills/detection/yolo-detection-2026/requirements_cpu.txt +++ b/skills/detection/yolo-detection-2026/requirements_cpu.txt @@ -4,6 +4,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +onnxruntime>=1.18 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/requirements_cuda.txt b/skills/detection/yolo-detection-2026/requirements_cuda.txt index 0240bd7..d08623e 100644 --- a/skills/detection/yolo-detection-2026/requirements_cuda.txt +++ b/skills/detection/yolo-detection-2026/requirements_cuda.txt @@ -4,6 +4,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +tensorrt>=10.0 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/requirements_intel.txt b/skills/detection/yolo-detection-2026/requirements_intel.txt new file mode 100644 index 0000000..06ae3d2 --- /dev/null +++ b/skills/detection/yolo-detection-2026/requirements_intel.txt @@ -0,0 +1,9 @@ +# YOLO 2026 β€” Intel (OpenVINO) requirements +# Supports Intel CPUs, iGPUs, and NPUs (Meteor Lake+) +torch>=2.4.0 +torchvision>=0.19.0 +ultralytics>=8.3.0 +openvino>=2024.0 +numpy>=1.24.0 +opencv-python-headless>=4.8.0 
+Pillow>=10.0.0 diff --git a/skills/detection/yolo-detection-2026/requirements_mps.txt b/skills/detection/yolo-detection-2026/requirements_mps.txt index 5498200..eb018ea 100644 --- a/skills/detection/yolo-detection-2026/requirements_mps.txt +++ b/skills/detection/yolo-detection-2026/requirements_mps.txt @@ -3,6 +3,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +coremltools>=8.0 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/requirements_rocm.txt b/skills/detection/yolo-detection-2026/requirements_rocm.txt index e665dff..0d0ca7f 100644 --- a/skills/detection/yolo-detection-2026/requirements_rocm.txt +++ b/skills/detection/yolo-detection-2026/requirements_rocm.txt @@ -4,6 +4,8 @@ torch>=2.4.0 torchvision>=0.19.0 ultralytics>=8.3.0 +onnxruntime-rocm>=1.18 numpy>=1.24.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 + diff --git a/skills/detection/yolo-detection-2026/scripts/detect.py b/skills/detection/yolo-detection-2026/scripts/detect.py index 903a434..5608b64 100644 --- a/skills/detection/yolo-detection-2026/scripts/detect.py +++ b/skills/detection/yolo-detection-2026/scripts/detect.py @@ -6,6 +6,9 @@ stdin: {"event": "frame", "frame_id": N, "camera_id": "...", "frame_path": "...", ...} stdout: {"event": "detections", "frame_id": N, "camera_id": "...", "objects": [...]} +Uses env_config.py for automatic hardware detection and model optimization +(TensorRT, ONNX, CoreML, OpenVINO) with PyTorch fallback. 
+ Usage: python detect.py --config config.json python detect.py --model-size nano --confidence 0.5 --device auto @@ -15,8 +18,13 @@ import json import argparse import signal +import time from pathlib import Path +# Add skills/lib to path for shared modules +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent / "lib")) +from env_config import HardwareEnv # noqa: E402 + # Model size β†’ ultralytics model name mapping (YOLO26, released Jan 2026) MODEL_SIZE_MAP = { @@ -26,6 +34,84 @@ "large": "yolo26l", } +# How often to emit aggregate perf stats (every N frames) +PERF_STATS_INTERVAL = 50 + + +# ─────────────────────────────────────────────────────────────────────────────── +# Performance tracker β€” collects per-frame timings, emits aggregate stats +# ─────────────────────────────────────────────────────────────────────────────── + +class PerfTracker: + """Tracks timing for each pipeline stage and emits periodic statistics.""" + + def __init__(self, interval: int = PERF_STATS_INTERVAL): + self.interval = interval + self.frame_count = 0 + self.total_frames = 0 + self.error_count = 0 + + # One-time timings (ms) + self.model_load_ms = 0.0 + self.export_ms = 0.0 + + # Per-frame accumulators (ms) + self._timings: dict[str, list[float]] = { + "file_read": [], + "inference": [], + "postprocess": [], + "emit": [], + "total": [], + } + + def record(self, stage: str, duration_ms: float): + if stage in self._timings: + self._timings[stage].append(duration_ms) + + def record_frame(self): + self.frame_count += 1 + self.total_frames += 1 + if self.frame_count >= self.interval: + self.emit_stats() + self.frame_count = 0 + + def emit_stats(self): + stats = { + "event": "perf_stats", + "total_frames": self.total_frames, + "window_size": len(self._timings["total"]) or 1, + "errors": self.error_count, + "model_load_ms": round(self.model_load_ms, 1), + "timings_ms": {}, + } + if self.export_ms > 0: + stats["export_ms"] = round(self.export_ms, 1) + + for stage, 
values in self._timings.items(): + if not values: + continue + sorted_v = sorted(values) + n = len(sorted_v) + stats["timings_ms"][stage] = { + "avg": round(sum(sorted_v) / n, 2), + "min": round(sorted_v[0], 2), + "max": round(sorted_v[-1], 2), + "p50": round(sorted_v[n // 2], 2), + "p95": round(sorted_v[int(n * 0.95)], 2), + "p99": round(sorted_v[int(n * 0.99)], 2), + } + emit(stats) + for key in self._timings: + self._timings[key].clear() + + def emit_final(self): + if self._timings["total"]: + self.emit_stats() + + +# ─────────────────────────────────────────────────────────────────────────────── +# Helpers +# ─────────────────────────────────────────────────────────────────────────────── def parse_args(): parser = argparse.ArgumentParser(description="YOLO 2026 Detection Skill") @@ -44,7 +130,6 @@ def load_config(args): """Load config from JSON file, CLI args, or AEGIS_SKILL_PARAMS env var.""" import os - # Priority 1: AEGIS_SKILL_PARAMS env var (set by Aegis skill-runtime-manager) env_params = os.environ.get("AEGIS_SKILL_PARAMS") if env_params: try: @@ -52,14 +137,12 @@ def load_config(args): except json.JSONDecodeError: pass - # Priority 2: Config file if args.config: config_path = Path(args.config) if config_path.exists(): with open(config_path) as f: return json.load(f) - # Priority 3: CLI args return { "model_size": args.model_size, "confidence": args.confidence, @@ -69,56 +152,55 @@ def load_config(args): } -def select_device(preference: str) -> str: - """Select the best available inference device.""" - if preference not in ("auto", ""): - return preference - try: - import torch - if torch.cuda.is_available(): - return "cuda" - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return "mps" - # ROCm exposes as CUDA in PyTorch with ROCm builds - except ImportError: - pass - return "cpu" - - def emit(event: dict): - """Write a JSON line to stdout.""" print(json.dumps(event), flush=True) +def log(msg: str): + print(f"[YOLO-2026] 
{msg}", file=sys.stderr, flush=True) + + +# ─────────────────────────────────────────────────────────────────────────────── +# Main loop +# ─────────────────────────────────────────────────────────────────────────────── + def main(): args = parse_args() config = load_config(args) - # Resolve config values model_size = config.get("model_size", "nano") - device = select_device(config.get("device", "auto")) confidence = config.get("confidence", 0.5) fps = config.get("fps", 5) + use_optimized = config.get("use_optimized", config.get("use_coreml", True)) + if isinstance(use_optimized, str): + use_optimized = use_optimized.lower() in ("true", "1", "yes") - # Map size to ultralytics model name - model_name = MODEL_SIZE_MAP.get(model_size, "yolo11n") + model_name = MODEL_SIZE_MAP.get(model_size, "yolo26n") target_classes = config.get("classes", ["person", "car", "dog", "cat"]) if isinstance(target_classes, str): target_classes = [c.strip() for c in target_classes.split(",")] - # Load YOLO model + # ── Hardware detection & optimized model loading ── + env = HardwareEnv.detect() + perf = PerfTracker(interval=PERF_STATS_INTERVAL) + try: - from ultralytics import YOLO - model = YOLO(f"{model_name}.pt") - model.to(device) + model, model_format = env.load_optimized(model_name, use_optimized=use_optimized) + perf.model_load_ms = env.load_ms + perf.export_ms = env.export_ms + emit({ "event": "ready", "model": f"yolo2026{model_size[0]}", "model_size": model_size, - "device": device, + "device": env.device, + "backend": env.backend, + "format": model_format, + "gpu": env.gpu_name, "classes": len(model.names), "fps": fps, + "model_load_ms": round(env.load_ms, 1), "available_sizes": list(MODEL_SIZE_MAP.keys()), }) except Exception as e: @@ -151,11 +233,14 @@ def handle_signal(signum, frame): break if msg.get("event") == "frame": + t_frame_start = time.perf_counter() + frame_path = msg.get("frame_path") frame_id = msg.get("frame_id") camera_id = msg.get("camera_id", "unknown") 
timestamp = msg.get("timestamp", "") + t0 = time.perf_counter() if not frame_path or not Path(frame_path).exists(): emit({ "event": "error", @@ -163,11 +248,16 @@ def handle_signal(signum, frame): "message": f"Frame not found: {frame_path}", "retriable": True, }) + perf.error_count += 1 continue + perf.record("file_read", (time.perf_counter() - t0) * 1000) - # Run inference try: + t0 = time.perf_counter() results = model(frame_path, conf=confidence, verbose=False) + perf.record("inference", (time.perf_counter() - t0) * 1000) + + t0 = time.perf_counter() objects = [] for r in results: for box in r.boxes: @@ -180,7 +270,9 @@ def handle_signal(signum, frame): "confidence": round(float(box.conf[0]), 3), "bbox": [int(x1), int(y1), int(x2), int(y2)], }) + perf.record("postprocess", (time.perf_counter() - t0) * 1000) + t0 = time.perf_counter() emit({ "event": "detections", "frame_id": frame_id, @@ -188,6 +280,8 @@ def handle_signal(signum, frame): "timestamp": timestamp, "objects": objects, }) + perf.record("emit", (time.perf_counter() - t0) * 1000) + except Exception as e: emit({ "event": "error", @@ -195,6 +289,13 @@ def handle_signal(signum, frame): "message": f"Inference error: {e}", "retriable": True, }) + perf.error_count += 1 + continue + + perf.record("total", (time.perf_counter() - t_frame_start) * 1000) + perf.record_frame() + + perf.emit_final() if __name__ == "__main__": diff --git a/skills/lib/__init__.py b/skills/lib/__init__.py new file mode 100644 index 0000000..33a0dde --- /dev/null +++ b/skills/lib/__init__.py @@ -0,0 +1 @@ +# DeepCamera Skills β€” Shared Library diff --git a/skills/lib/env_config.py b/skills/lib/env_config.py new file mode 100644 index 0000000..1676e21 --- /dev/null +++ b/skills/lib/env_config.py @@ -0,0 +1,432 @@ +""" +env_config.py β€” Shared hardware environment detection and model optimization. + +Provides a single entry point for any DeepCamera skill to: + 1. Detect available compute hardware (NVIDIA, AMD, Apple, Intel, CPU) + 2. 
Auto-export models to the optimal inference format + 3. Load cached optimized models with PyTorch fallback + +Usage: + from lib.env_config import HardwareEnv + + env = HardwareEnv.detect() + model, fmt = env.load_optimized("yolo26n") +""" + +import json +import os +import platform +import shutil +import subprocess +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + + +def _log(msg: str): + """Log to stderr.""" + print(f"[env_config] {msg}", file=sys.stderr, flush=True) + + +# ─── Backend definitions ──────────────────────────────────────────────────── + +@dataclass +class BackendSpec: + """Specification for a compute backend's optimized export.""" + name: str # "cuda", "rocm", "mps", "intel", "cpu" + export_format: str # ultralytics export format string + model_suffix: str # file extension/dir to look for cached model + half: bool = True # use FP16 + extra_export_args: dict = field(default_factory=dict) + + +BACKEND_SPECS = { + "cuda": BackendSpec( + name="cuda", + export_format="engine", + model_suffix=".engine", + half=True, + ), + "rocm": BackendSpec( + name="rocm", + export_format="onnx", + model_suffix=".onnx", + half=False, # ONNX Runtime ROCm handles precision internally + ), + "mps": BackendSpec( + name="mps", + export_format="coreml", + model_suffix=".mlpackage", + half=True, + extra_export_args={"nms": False}, + ), + "intel": BackendSpec( + name="intel", + export_format="openvino", + model_suffix="_openvino_model", + half=True, + ), + "cpu": BackendSpec( + name="cpu", + export_format="onnx", + model_suffix=".onnx", + half=False, + ), +} + + +# ─── Hardware detection ────────────────────────────────────────────────────── + +@dataclass +class HardwareEnv: + """Detected hardware environment with model optimization capabilities.""" + + backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu" + device: str = "cpu" # torch device string + export_format: str = "onnx" # optimal export 
format + gpu_name: str = "" # human-readable GPU name + gpu_memory_mb: int = 0 # GPU memory in MB + driver_version: str = "" # GPU driver version + framework_ok: bool = False # True if optimized runtime is importable + detection_details: dict = field(default_factory=dict) # raw detection info + + # Timing (populated by export/load) + export_ms: float = 0.0 + load_ms: float = 0.0 + + @staticmethod + def detect() -> "HardwareEnv": + """Probe the system and return a populated HardwareEnv.""" + env = HardwareEnv() + + # Try each backend in priority order + if env._try_cuda(): + pass + elif env._try_rocm(): + pass + elif env._try_mps(): + pass + elif env._try_intel(): + pass + else: + env._fallback_cpu() + + # Set export format from backend spec + spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"]) + env.export_format = spec.export_format + + # Check if optimized runtime is available + env.framework_ok = env._check_framework() + + _log(f"Detected: backend={env.backend}, device={env.device}, " + f"gpu={env.gpu_name or 'none'}, " + f"format={env.export_format}, " + f"framework_ok={env.framework_ok}") + + return env + + def _try_cuda(self) -> bool: + """Detect NVIDIA GPU via nvidia-smi and torch.""" + if not shutil.which("nvidia-smi"): + return False + try: + result = subprocess.run( + ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode != 0: + return False + + line = result.stdout.strip().split("\n")[0] + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 3: + self.backend = "cuda" + self.device = "cuda" + self.gpu_name = parts[0] + self.gpu_memory_mb = int(float(parts[1])) + self.driver_version = parts[2] + self.detection_details["nvidia_smi"] = line + _log(f"NVIDIA GPU: {self.gpu_name} ({self.gpu_memory_mb}MB, driver {self.driver_version})") + return True + except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e: + 
_log(f"nvidia-smi probe failed: {e}") + return False + + def _try_rocm(self) -> bool: + """Detect AMD GPU via rocm-smi or /opt/rocm.""" + has_rocm_smi = shutil.which("rocm-smi") is not None + has_rocm_dir = Path("/opt/rocm").is_dir() + + if not (has_rocm_smi or has_rocm_dir): + return False + + self.backend = "rocm" + self.device = "cuda" # ROCm exposes as CUDA in PyTorch + + if has_rocm_smi: + try: + result = subprocess.run( + ["rocm-smi", "--showproductname", "--csv"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + lines = result.stdout.strip().split("\n") + if len(lines) > 1: + self.gpu_name = lines[1].split(",")[0].strip() + self.detection_details["rocm_smi"] = result.stdout.strip() + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + try: + result = subprocess.run( + ["rocm-smi", "--showmeminfo", "vram", "--csv"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + # Parse total VRAM + for line in result.stdout.strip().split("\n")[1:]: + parts = line.split(",") + if len(parts) >= 2: + try: + self.gpu_memory_mb = int(float(parts[0].strip()) / (1024 * 1024)) + except ValueError: + pass + break + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + _log(f"AMD ROCm GPU: {self.gpu_name or 'detected'} ({self.gpu_memory_mb}MB)") + return True + + def _try_mps(self) -> bool: + """Detect Apple Silicon via uname + sysctl.""" + if platform.system() != "Darwin" or platform.machine() != "arm64": + return False + + self.backend = "mps" + self.device = "mps" + + # Get chip name + try: + result = subprocess.run( + ["sysctl", "-n", "machdep.cpu.brand_string"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + self.gpu_name = result.stdout.strip() + except (subprocess.TimeoutExpired, FileNotFoundError): + self.gpu_name = "Apple Silicon" + + # Get total memory (shared with GPU on Apple Silicon) + try: + result = subprocess.run( + ["sysctl", "-n", 
"hw.memsize"],
+                capture_output=True, text=True, timeout=5,
+            )
+            if result.returncode == 0:
+                # hw.memsize is bytes; convert to MB
+                self.gpu_memory_mb = int(int(result.stdout.strip()) / (1024 * 1024))
+        except (subprocess.TimeoutExpired, FileNotFoundError, ValueError):
+            pass
+
+        _log(f"Apple Silicon: {self.gpu_name} ({self.gpu_memory_mb}MB unified)")
+        return True
+
+    def _try_intel(self) -> bool:
+        """Detect Intel OpenVINO-capable hardware."""
+        # Check for OpenVINO installation
+        has_openvino = False
+        try:
+            import openvino  # noqa: F401
+            has_openvino = True
+        except ImportError:
+            # Check for system install
+            has_openvino = Path("/opt/intel/openvino").is_dir()
+
+        if not has_openvino:
+            # Check CPU flags for Intel-specific features (AVX-512, AMX)
+            try:
+                if platform.system() == "Linux":
+                    with open("/proc/cpuinfo") as f:
+                        cpuinfo = f.read()
+                    if "GenuineIntel" in cpuinfo:
+                        self.backend = "intel"
+                        self.device = "cpu"
+                        self.gpu_name = "Intel CPU"
+                        _log("Intel CPU detected (no OpenVINO installed)")
+                        return True
+            except FileNotFoundError:
+                pass
+            return False
+
+        self.backend = "intel"
+        self.device = "cpu"  # OpenVINO handles device selection internally
+        self.gpu_name = "Intel (OpenVINO)"
+
+        # Check for Intel GPU / NPU.
+        # NOTE(review): `openvino.runtime.Core` is the legacy import path
+        # (newer releases expose `openvino.Core`); an ImportError here is
+        # swallowed by the broad except, leaving the generic name — confirm
+        # against the pinned OpenVINO version.
+        try:
+            from openvino.runtime import Core
+            core = Core()
+            devices = core.available_devices
+            self.detection_details["openvino_devices"] = devices
+            if "GPU" in devices:
+                self.gpu_name = "Intel GPU (OpenVINO)"
+            # NPU label wins when both GPU and NPU are present (checked last)
+            if "NPU" in devices:
+                self.gpu_name = "Intel NPU (OpenVINO)"
+            _log(f"OpenVINO devices: {devices}")
+        except Exception:
+            pass
+
+        _log(f"Intel: {self.gpu_name}")
+        return True
+
+    def _fallback_cpu(self):
+        """CPU-only fallback: no accelerator detected."""
+        self.backend = "cpu"
+        self.device = "cpu"
+        self.gpu_name = ""
+
+        # Report CPU info
+        try:
+            self.detection_details["cpu"] = platform.processor() or "unknown"
+        except Exception:
+            pass
+
+        _log("No GPU detected, using CPU backend")
+
+    def _check_framework(self) -> bool:
+        """Check if the optimized inference runtime is 
importable."""
+        # Backend → importer; __import__ in a lambda defers the import until
+        # the selected backend's check actually runs.
+        checks = {
+            "cuda": lambda: __import__("tensorrt"),
+            "rocm": lambda: __import__("onnxruntime"),
+            "mps": lambda: __import__("coremltools"),
+            "intel": lambda: __import__("openvino"),
+            "cpu": lambda: __import__("onnxruntime"),
+        }
+
+        check = checks.get(self.backend)
+        if not check:
+            return False
+        try:
+            check()
+            return True
+        except ImportError:
+            _log(f"Optimized runtime not installed for {self.backend}, "
+                 f"will use PyTorch fallback")
+            return False
+
+    # ─── Model export & loading ──────────────────────────────────────────
+
+    def get_optimized_path(self, model_name: str) -> Path:
+        """Get the expected path for the optimized model (name + backend suffix)."""
+        spec = BACKEND_SPECS.get(self.backend, BACKEND_SPECS["cpu"])
+        return Path(f"{model_name}{spec.model_suffix}")
+
+    def export_model(self, model, model_name: str) -> Optional[Path]:
+        """Export PyTorch model to optimal format. Returns path or None.
+
+        Returns the cached path when the export already exists; None when the
+        runtime is missing or the export fails (callers fall back to PyTorch).
+        """
+        if not self.framework_ok:
+            _log(f"Skipping export — {self.backend} runtime not available")
+            return None
+
+        spec = BACKEND_SPECS.get(self.backend, BACKEND_SPECS["cpu"])
+        optimized_path = self.get_optimized_path(model_name)
+
+        # Already exported
+        if optimized_path.exists():
+            _log(f"Cached model found: {optimized_path}")
+            return optimized_path
+
+        try:
+            _log(f"Exporting {model_name}.pt → {spec.export_format} "
+                 f"(one-time, may take 30-120s)...")
+            t0 = time.perf_counter()
+
+            export_kwargs = {
+                "format": spec.export_format,
+                "half": spec.half,
+            }
+            export_kwargs.update(spec.extra_export_args)
+
+            exported = model.export(**export_kwargs)
+            self.export_ms = (time.perf_counter() - t0) * 1000
+
+            # export() returns a path-like; verify it actually exists on disk
+            exported_path = Path(exported)
+            if exported_path.exists():
+                _log(f"Export complete: {exported_path} ({self.export_ms:.0f}ms)")
+                return exported_path
+
+            _log(f"Export returned {exported} but path not found")
+        except Exception as e:
+            _log(f"Export failed ({spec.export_format}): {e}")
+
+        return None
+
+    def load_optimized(self, model_name: str, 
use_optimized: bool = True):
+        """
+        Load the best available model for this hardware.
+
+        Fallback ladder when use_optimized and the runtime is available:
+        cached optimized model → freshly exported model → plain PyTorch .pt.
+        With use_optimized=False (or no runtime) the .pt model is loaded
+        directly and moved to self.device.
+
+        Returns:
+            (model, format_str) — the YOLO model and its format name
+        """
+        from ultralytics import YOLO
+
+        t0 = time.perf_counter()
+
+        if use_optimized and self.framework_ok:
+            # Try loading from cache first (no export needed)
+            optimized_path = self.get_optimized_path(model_name)
+            if optimized_path.exists():
+                try:
+                    model = YOLO(str(optimized_path))
+                    self.load_ms = (time.perf_counter() - t0) * 1000
+                    _log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)")
+                    return model, self.export_format
+                except Exception as e:
+                    _log(f"Failed to load cached model: {e}")
+
+            # Try exporting then loading
+            pt_model = YOLO(f"{model_name}.pt")
+            exported = self.export_model(pt_model, model_name)
+            if exported:
+                try:
+                    model = YOLO(str(exported))
+                    self.load_ms = (time.perf_counter() - t0) * 1000
+                    _log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)")
+                    return model, self.export_format
+                except Exception as e:
+                    _log(f"Failed to load exported model: {e}")
+
+            # Fallback: use the PT model we already loaded
+            _log("Falling back to PyTorch model")
+            pt_model.to(self.device)
+            self.load_ms = (time.perf_counter() - t0) * 1000
+            return pt_model, "pytorch"
+
+        # No optimization requested or framework missing
+        model = YOLO(f"{model_name}.pt")
+        model.to(self.device)
+        self.load_ms = (time.perf_counter() - t0) * 1000
+        return model, "pytorch"
+
+    def to_dict(self) -> dict:
+        """Serialize environment info for JSON output."""
+        return {
+            "backend": self.backend,
+            "device": self.device,
+            "export_format": self.export_format,
+            "gpu_name": self.gpu_name,
+            "gpu_memory_mb": self.gpu_memory_mb,
+            "driver_version": self.driver_version,
+            "framework_ok": self.framework_ok,
+            "export_ms": round(self.export_ms, 1),
+            "load_ms": round(self.load_ms, 1),
+        }
+
+
+# ─── CLI: run standalone for diagnostics ─────────────────────────────────────
+
+if __name__ == "__main__":
+    env = HardwareEnv.detect()
+    print(json.dumps(env.to_dict(), indent=2))