From 2d1ba04a2675ae40f86d0a4acef07bcb5c6c8335 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Mon, 12 Jan 2026 12:26:49 +0400 Subject: [PATCH 01/15] chore: upgrade pinned dependency versions in obolup.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update dependency versions to latest stable releases: - kubectl: 1.31.0 → 1.35.0 - helm: 3.19.1 → 3.19.4 - helmfile: 1.2.2 → 1.2.3 - k9s: 0.32.5 → 0.50.18 - helm-diff: 3.9.11 → 3.14.1 k3d remains at 5.8.3 (already current). --- obolup.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/obolup.sh b/obolup.sh index 2741a53..f6430ab 100755 --- a/obolup.sh +++ b/obolup.sh @@ -49,12 +49,12 @@ fi # Pinned dependency versions # Update these versions to upgrade dependencies across all installations -readonly KUBECTL_VERSION="1.31.0" -readonly HELM_VERSION="3.19.1" +readonly KUBECTL_VERSION="1.35.0" +readonly HELM_VERSION="3.19.4" readonly K3D_VERSION="5.8.3" -readonly HELMFILE_VERSION="1.2.2" -readonly K9S_VERSION="0.32.5" -readonly HELM_DIFF_VERSION="3.9.11" +readonly HELMFILE_VERSION="1.2.3" +readonly K9S_VERSION="0.50.18" +readonly HELM_DIFF_VERSION="3.14.1" # Repository URL for building from source readonly OBOL_REPO_URL="git@github.com:ObolNetwork/obol-stack.git" From d1101042a2baf5a589bcc8a31c0cf0ba228f1d1c Mon Sep 17 00:00:00 2001 From: bussyjd Date: Wed, 14 Jan 2026 00:21:46 +0400 Subject: [PATCH 02/15] feat: replace nginx-ingress with Traefik and Gateway API Replace nginx-ingress controller with Traefik 38.0.2 using Kubernetes Gateway API for routing. This addresses the nginx-ingress deprecation (end of maintenance March 2026). Changes: - Remove --disable=traefik from k3d config to use k3s built-in Traefik - Replace nginx-ingress helm release with Traefik 38.0.2 in infrastructure - Configure Gateway API provider with cross-namespace routing support - Add GatewayClass and Gateway resources via Traefik helm chart - Convert all Ingress resources to HTTPRoute format: - eRPC: /rpc path routing - obol-frontend: / path routing - ethereum: /execution and /beacon path routing with URL rewrite - aztec: namespace-based path routing with URL rewrite - helios: namespace-based path routing with URL rewrite - Disable legacy Ingress in service helm values Closes #125 --- internal/embed/infrastructure/helmfile.yaml | 155 +++++++++++++++--- .../infrastructure/values/erpc.yaml.gotmpl | 9 +- .../values/obol-frontend.yaml.gotmpl | 9 +- internal/embed/k3d-config.yaml | 4 - .../networks/aztec/templates/ingress.yaml | 38 +++-- .../networks/ethereum/templates/ingress.yaml | 75 ++++++--- .../networks/helios/helmfile.yaml.gotmpl | 45 +++-- 7 files changed, 243 insertions(+), 92 deletions(-) diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index 9f49d09..c2751a1 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -1,17 +1,20 @@ # Helmfile for Obol Stack default infrastructure # Orchestrates core infrastructure components deployed with every stack +# Uses Traefik with Gateway API for routing (replaces nginx-ingress) repositories: - - name: ingress-nginx - url: https://kubernetes.github.io/ingress-nginx + - name: traefik + url: https://traefik.github.io/charts - name: obol url: https://obolnetwork.github.io/helm-charts/ - name: ethereum url: https://ethpandaops.github.io/ethereum-helm-charts + - name: bedag + url: https://bedag.github.io/helm-charts/ # Single source of truth: change this to 
switch networks values: - - network: mainnet + - network: mainnet releases: # Local storage provisioner (raw manifests wrapped as chart) @@ -22,45 +25,149 @@ releases: - dataDir: /data - network: "{{ .Values.network }}" - # Nginx ingress controller (upstream chart) - - name: ingress-nginx - namespace: ingress-nginx - chart: ingress-nginx/ingress-nginx - version: 4.13.3 + # Traefik ingress controller with Gateway API support + - name: traefik + namespace: traefik + createNamespace: true + chart: traefik/traefik + version: 38.0.2 values: - - controller: - replicaCount: 1 - service: - type: LoadBalancer - externalTrafficPolicy: Local - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 100m - memory: 128Mi - tolerations: [] - admissionWebhooks: + # Gateway API provider configuration + - providers: + kubernetesGateway: + enabled: true + namespaces: [] # Watch all namespaces + kubernetesCRD: + enabled: true + kubernetesIngress: + enabled: false # Disable legacy Ingress support + # GatewayClass configuration + - gatewayClass: + enabled: true + name: traefik + # Gateway configuration (main entry point) + - gateway: + enabled: true + name: traefik-gateway + namespace: traefik + listeners: + web: + port: 8000 + protocol: HTTP + namespacePolicy: + from: All + # Ports configuration + - ports: + web: + port: 8000 + expose: + default: true + exposedPort: 80 + protocol: TCP + websecure: + port: 8443 + expose: + default: true + exposedPort: 443 + protocol: TCP + tls: + enabled: false # TLS termination disabled for local dev + # Service configuration + - service: + type: LoadBalancer + externalTrafficPolicy: Local + # Resource limits + - resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + # Disable dashboard by default + - ingressRoute: + dashboard: enabled: false # eRPC - name: erpc namespace: erpc + createNamespace: true chart: ethereum/erpc needs: - kube-system/base - - ingress-nginx/ingress-nginx + - traefik/traefik values: - ./values/erpc.yaml.gotmpl + # eRPC HTTPRoute + - name: erpc-httproute + namespace: erpc + chart: bedag/raw + needs: + - traefik/traefik + - erpc/erpc + values: + - resources: + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: erpc + namespace: erpc + spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack + rules: + - matches: + - path: + type: PathPrefix + value: /rpc + backendRefs: + - name: erpc + port: 4000 + # Obol Stack frontend - name: obol-frontend namespace: obol-frontend + createNamespace: true chart: obol/obol-app version: 0.1.0 needs: - - ingress-nginx/ingress-nginx + - traefik/traefik - erpc/erpc values: - ./values/obol-frontend.yaml.gotmpl + + # Obol Frontend HTTPRoute + - name: obol-frontend-httproute + namespace: obol-frontend + chart: bedag/raw + needs: + - traefik/traefik + - obol-frontend/obol-frontend + values: + - resources: + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: obol-frontend + namespace: obol-frontend + spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - name: obol-frontend + port: 3000 diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl index fdedc69..6799332 100644 --- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl +++ 
b/internal/embed/infrastructure/values/erpc.yaml.gotmpl @@ -87,14 +87,9 @@ extraArgs: [] # Command replacement for the erpc container customCommand: [] +# Disable legacy Ingress - using Gateway API HTTPRoute instead ingress: - enabled: true - className: nginx - hosts: - - host: obol.stack - paths: - - path: /rpc - pathType: Prefix + enabled: false service: type: ClusterIP diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index 3301156..08aa9e0 100644 --- a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -19,11 +19,6 @@ service: type: ClusterIP port: 3000 +# Disable legacy Ingress - using Gateway API HTTPRoute instead ingress: - enabled: true - className: "nginx" - hosts: - - host: obol.stack - paths: - - path: / - pathType: Prefix + enabled: false diff --git a/internal/embed/k3d-config.yaml b/internal/embed/k3d-config.yaml index 563d697..0acd911 100644 --- a/internal/embed/k3d-config.yaml +++ b/internal/embed/k3d-config.yaml @@ -35,10 +35,6 @@ options: - arg: --kube-apiserver-arg=feature-gates=KubeletInUserNamespace=true nodeFilters: - server:* - # Disable Traefik to use nginx instead - - arg: --disable=traefik - nodeFilters: - - server:* # Disable local-storage addon (we provide our own config) - arg: --disable=local-storage nodeFilters: diff --git a/internal/embed/networks/aztec/templates/ingress.yaml b/internal/embed/networks/aztec/templates/ingress.yaml index 1e8ddd3..cdd664c 100644 --- a/internal/embed/networks/aztec/templates/ingress.yaml +++ b/internal/embed/networks/aztec/templates/ingress.yaml @@ -1,23 +1,29 @@ {{- if eq .Release.Name "aztec-ingress" }} -apiVersion: networking.k8s.io/v1 -kind: Ingress +# HTTPRoute for Aztec sequencer node RPC +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute metadata: name: aztec namespace: {{ .Release.Namespace }} - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" spec: - ingressClassName: nginx + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack rules: - - host: obol.stack - http: - paths: - - path: /{{ .Release.Namespace }}(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: l2-sequencer-node-{{ .Values.id }}-node - port: - number: 8080 + - matches: + - path: + type: PathPrefix + value: /{{ .Release.Namespace }} + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: l2-sequencer-node-{{ .Values.id }}-node + port: 8080 {{- end }} diff --git a/internal/embed/networks/ethereum/templates/ingress.yaml b/internal/embed/networks/ethereum/templates/ingress.yaml index 75a39a6..a8cda39 100644 --- a/internal/embed/networks/ethereum/templates/ingress.yaml +++ b/internal/embed/networks/ethereum/templates/ingress.yaml @@ -1,30 +1,57 @@ {{- if eq .Release.Name "ethereum-ingress" }} -apiVersion: networking.k8s.io/v1 -kind: Ingress +# HTTPRoute for Ethereum execution client RPC +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute metadata: - name: ethereum + name: ethereum-execution namespace: {{ .Release.Namespace }} - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" spec: - ingressClassName: nginx + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - 
obol.stack rules: - - host: obol.stack - http: - paths: - - path: /{{ .Release.Namespace }}/execution(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: ethereum-execution - port: - number: 8545 - - path: /{{ .Release.Namespace }}/beacon(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: ethereum-beacon - port: - number: 5052 + - matches: + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/execution + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: ethereum-execution + port: 8545 +--- +# HTTPRoute for Ethereum beacon client RPC +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: ethereum-beacon + namespace: {{ .Release.Namespace }} +spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack + rules: + - matches: + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/beacon + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: ethereum-beacon + port: 5052 {{- end }} diff --git a/internal/embed/networks/helios/helmfile.yaml.gotmpl b/internal/embed/networks/helios/helmfile.yaml.gotmpl index 2be4293..7fbbf53 100644 --- a/internal/embed/networks/helios/helmfile.yaml.gotmpl +++ b/internal/embed/networks/helios/helmfile.yaml.gotmpl @@ -28,17 +28,42 @@ releases: size: 10Gi storageClass: local-path + # Disable legacy Ingress - using Gateway API HTTPRoute instead - ingress: - enabled: true - className: nginx - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" - hosts: - - host: obol.stack - paths: - - path: /helios-{{ .Values.id }}(/|$)(.*) - pathType: ImplementationSpecific + enabled: false + + # HTTPRoute for Helios RPC endpoint + - name: helios-httproute + namespace: helios-{{ .Values.id }} + chart: bedag/raw + values: + - resources: + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: helios + namespace: helios-{{ .Values.id }} + spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack + rules: + - matches: + - path: + type: PathPrefix + value: /helios-{{ .Values.id }} + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: helios-{{ .Values.network }} + port: 8545 # Metadata ConfigMap for frontend discovery - name: helios-metadata From ba54ea5b01d45105f36e6b7252e114b849fa3c31 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Wed, 14 Jan 2026 14:10:04 +0400 Subject: [PATCH 03/15] feat: add monitoring stack and gateway updates --- CLAUDE.md | 13 ++++++++ internal/embed/infrastructure/helmfile.yaml | 32 +++++++++++++++++- .../values/monitoring.yaml.gotmpl | 33 +++++++++++++++++++ .../values/obol-frontend.yaml.gotmpl | 2 +- internal/embed/k3d-config.yaml | 4 +++ .../networks/aztec/templates/ingress.yaml | 5 ++- .../networks/ethereum/templates/ingress.yaml | 10 ++++-- .../networks/helios/helmfile.yaml.gotmpl | 5 ++- renovate.json | 13 ++++++++ 9 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 internal/embed/infrastructure/values/monitoring.yaml.gotmpl diff --git a/CLAUDE.md b/CLAUDE.md index bc40752..a6f3ba8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -20,6 +20,19 @@ The Obol Stack is a local Kubernetes-based framework for running blockchain netw 5. 
**Two-stage templating**: CLI flags → Go templates → Helmfile → Kubernetes resources 6. **Development mode**: Local `.workspace/` directory with `go run` wrapper for rapid development +### Routing and Gateway API + +Obol Stack uses Traefik with the Kubernetes Gateway API for HTTP routing. + +- Controller: Traefik Helm chart (`traefik` namespace) +- GatewayClass: `traefik` +- Gateway: `traefik-gateway` in `traefik` namespace +- HTTPRoute patterns: + - `/` → `obol-frontend` + - `/rpc` → `erpc` + - `/ethereum-/execution` and `/ethereum-/beacon` + - `/aztec-` and `/helios-` + ## Bootstrap Installer: obolup.sh ### Purpose diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index c2751a1..6f4d2b5 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -5,6 +5,8 @@ repositories: - name: traefik url: https://traefik.github.io/charts + - name: prometheus-community + url: https://prometheus-community.github.io/helm-charts - name: obol url: https://obolnetwork.github.io/helm-charts/ - name: ethereum @@ -15,6 +17,7 @@ repositories: # Single source of truth: change this to switch networks values: - network: mainnet + - gatewayApiVersion: v1.4.1 releases: # Local storage provisioner (raw manifests wrapped as chart) @@ -25,12 +28,39 @@ releases: - dataDir: /data - network: "{{ .Values.network }}" + # Monitoring stack (Prometheus operator + Prometheus) + - name: monitoring + namespace: monitoring + createNamespace: true + chart: prometheus-community/kube-prometheus-stack + version: 79.5.0 + values: + - ./values/monitoring.yaml.gotmpl + + # Gateway API CRDs (applied from upstream release) + - name: gateway-api-crds + namespace: gateway-system + createNamespace: true + chart: bedag/raw + values: + - resources: [] + hooks: + - events: ["presync"] + showlogs: true + command: kubectl + args: + - apply + - -f + - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ .Values.gatewayApiVersion }}/standard-install.yaml + # Traefik ingress controller with Gateway API support - name: traefik namespace: traefik createNamespace: true chart: traefik/traefik version: 38.0.2 + needs: + - gateway-system/gateway-api-crds values: # Gateway API provider configuration - providers: @@ -169,5 +199,5 @@ releases: type: PathPrefix value: / backendRefs: - - name: obol-frontend + - name: obol-frontend-obol-app port: 3000 diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl new file mode 100644 index 0000000..d7a0dc1 --- /dev/null +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -0,0 +1,33 @@ +prometheus: + enabled: true + prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + serviceMonitorSelector: + matchLabels: + release: monitoring + serviceMonitorNamespaceSelector: {} + podMonitorSelectorNilUsesHelmValues: false + podMonitorSelector: + matchLabels: + release: monitoring + podMonitorNamespaceSelector: {} + retention: 6h + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 1Gi + +grafana: + enabled: false # Enable when we want UI access + +alertmanager: + enabled: false # Disable to keep the local stack lean + +kubeStateMetrics: + enabled: true + +nodeExporter: + enabled: true diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index 08aa9e0..92aab95 100644 --- 
a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -13,7 +13,7 @@ image: repository: obolnetwork/obol-stack-front-end pullPolicy: Always - tag: "v0.1.1" + tag: "latest" service: type: ClusterIP diff --git a/internal/embed/k3d-config.yaml b/internal/embed/k3d-config.yaml index 0acd911..9a97c5d 100644 --- a/internal/embed/k3d-config.yaml +++ b/internal/embed/k3d-config.yaml @@ -35,6 +35,10 @@ options: - arg: --kube-apiserver-arg=feature-gates=KubeletInUserNamespace=true nodeFilters: - server:* + # Disable bundled Traefik (we install Traefik via Helm) + - arg: --disable=traefik + nodeFilters: + - server:* # Disable local-storage addon (we provide our own config) - arg: --disable=local-storage nodeFilters: diff --git a/internal/embed/networks/aztec/templates/ingress.yaml b/internal/embed/networks/aztec/templates/ingress.yaml index cdd664c..821537d 100644 --- a/internal/embed/networks/aztec/templates/ingress.yaml +++ b/internal/embed/networks/aztec/templates/ingress.yaml @@ -15,8 +15,11 @@ spec: rules: - matches: - path: - type: PathPrefix + type: Exact value: /{{ .Release.Namespace }} + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/ filters: - type: URLRewrite urlRewrite: diff --git a/internal/embed/networks/ethereum/templates/ingress.yaml b/internal/embed/networks/ethereum/templates/ingress.yaml index a8cda39..76c745e 100644 --- a/internal/embed/networks/ethereum/templates/ingress.yaml +++ b/internal/embed/networks/ethereum/templates/ingress.yaml @@ -15,8 +15,11 @@ spec: rules: - matches: - path: - type: PathPrefix + type: Exact value: /{{ .Release.Namespace }}/execution + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/execution/ filters: - type: URLRewrite urlRewrite: @@ -43,8 +46,11 @@ spec: rules: - matches: - path: - type: PathPrefix + type: Exact value: /{{ .Release.Namespace }}/beacon + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/beacon/ filters: - type: URLRewrite urlRewrite: diff --git a/internal/embed/networks/helios/helmfile.yaml.gotmpl b/internal/embed/networks/helios/helmfile.yaml.gotmpl index 7fbbf53..c0a5d96 100644 --- a/internal/embed/networks/helios/helmfile.yaml.gotmpl +++ b/internal/embed/networks/helios/helmfile.yaml.gotmpl @@ -53,8 +53,11 @@ releases: rules: - matches: - path: - type: PathPrefix + type: Exact value: /helios-{{ .Values.id }} + - path: + type: PathPrefix + value: /helios-{{ .Values.id }}/ filters: - type: URLRewrite urlRewrite: diff --git a/renovate.json b/renovate.json index 6932b83..afab9bf 100644 --- a/renovate.json +++ b/renovate.json @@ -20,6 +20,19 @@ "datasourceTemplate": "github-releases", "depNameTemplate": "ObolNetwork/obol-stack-front-end", "versioningTemplate": "semver" + }, + { + "customType": "regex", + "description": "Update Gateway API release version", + "matchStrings": [ + "gatewayApiVersion:\\s*[\"']?(?v[0-9]+\\.[0-9]+\\.[0-9]+)[\"']?" + ], + "fileMatch": [ + "^internal/embed/infrastructure/helmfile\\.yaml$" + ], + "datasourceTemplate": "github-releases", + "depNameTemplate": "kubernetes-sigs/gateway-api", + "versioningTemplate": "semver" } ], "packageRules": [ From ccfef5553074d82d0468d988f76bb17a8ec669c5 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Wed, 14 Jan 2026 17:39:57 +0400 Subject: [PATCH 04/15] feat: add cloudflared tunnel for public service exposure Add Cloudflare Tunnel integration to expose obol-stack services publicly without port forwarding or static IPs. Uses quick tunnel mode for MVP. 
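For reference, a usage sketch of the new CLI commands introduced in this patch:

    obol stack up          # cloudflared deploys alongside the rest of the infrastructure
    obol tunnel status     # show pod status and the public trycloudflare.com URL
    obol tunnel restart    # cycle the deployment to obtain a fresh URL
    obol tunnel logs -f    # follow cloudflared logs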
Changes: - Add cloudflared Helm chart (internal/embed/infrastructure/cloudflared/) - Add tunnel management package (internal/tunnel/) - Add CLI commands: obol tunnel status/restart/logs - Integrate cloudflared into infrastructure helmfile The tunnel deploys automatically with `obol stack up` and provides a random trycloudflare.com URL accessible via `obol tunnel status`. Future: Named tunnel support for persistent URLs (obol tunnel login) --- cmd/obol/main.go | 43 +++++ .../infrastructure/cloudflared/Chart.yaml | 6 + .../cloudflared/templates/deployment.yaml | 44 +++++ internal/embed/infrastructure/helmfile.yaml | 7 + internal/tunnel/tunnel.go | 177 ++++++++++++++++++ 5 files changed, 277 insertions(+) create mode 100644 internal/embed/infrastructure/cloudflared/Chart.yaml create mode 100644 internal/embed/infrastructure/cloudflared/templates/deployment.yaml create mode 100644 internal/tunnel/tunnel.go diff --git a/cmd/obol/main.go b/cmd/obol/main.go index cde6626..69f92c5 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -12,6 +12,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/app" "github.com/ObolNetwork/obol-stack/internal/config" "github.com/ObolNetwork/obol-stack/internal/stack" + "github.com/ObolNetwork/obol-stack/internal/tunnel" "github.com/ObolNetwork/obol-stack/internal/version" "github.com/urfave/cli/v2" ) @@ -57,6 +58,11 @@ COMMANDS: app sync Deploy application to cluster app delete Remove application and cluster resources + Tunnel Management: + tunnel status Show tunnel status and public URL + tunnel restart Restart tunnel to get a new URL + tunnel logs View cloudflared logs + Kubernetes Tools (with auto-configured KUBECONFIG): kubectl Run kubectl with stack kubeconfig (passthrough) helm Run helm with stack kubeconfig (passthrough) @@ -157,6 +163,43 @@ GLOBAL OPTIONS: }, }, // ============================================================ + // Tunnel Management Commands + // ============================================================ + { + Name: "tunnel", + Usage: "Manage Cloudflare tunnel for public access", + Subcommands: []*cli.Command{ + { + Name: "status", + Usage: "Show tunnel status and public URL", + Action: func(c *cli.Context) error { + return tunnel.Status(cfg) + }, + }, + { + Name: "restart", + Usage: "Restart the tunnel to get a new URL", + Action: func(c *cli.Context) error { + return tunnel.Restart(cfg) + }, + }, + { + Name: "logs", + Usage: "View cloudflared logs", + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "follow", + Aliases: []string{"f"}, + Usage: "Follow log output", + }, + }, + Action: func(c *cli.Context) error { + return tunnel.Logs(cfg, c.Bool("follow")) + }, + }, + }, + }, + // ============================================================ // Kubernetes Tool Passthroughs (with auto-configured KUBECONFIG) // ============================================================ { diff --git a/internal/embed/infrastructure/cloudflared/Chart.yaml b/internal/embed/infrastructure/cloudflared/Chart.yaml new file mode 100644 index 0000000..894505e --- /dev/null +++ b/internal/embed/infrastructure/cloudflared/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: cloudflared +description: Cloudflare Tunnel for public access +type: application +version: 0.1.0 +appVersion: "2024.12.2" diff --git a/internal/embed/infrastructure/cloudflared/templates/deployment.yaml b/internal/embed/infrastructure/cloudflared/templates/deployment.yaml new file mode 100644 index 0000000..212556d --- /dev/null +++ 
b/internal/embed/infrastructure/cloudflared/templates/deployment.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cloudflared + labels: + app.kubernetes.io/name: cloudflared + app.kubernetes.io/part-of: obol-stack +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: cloudflared + template: + metadata: + labels: + app.kubernetes.io/name: cloudflared + spec: + containers: + - name: cloudflared + image: cloudflare/cloudflared:2024.12.2 + args: + - tunnel + - --no-autoupdate + - --metrics + - 0.0.0.0:2000 + - --url + - http://traefik.traefik.svc.cluster.local:80 + ports: + - name: metrics + containerPort: 2000 + livenessProbe: + httpGet: + path: /ready + port: metrics + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + restartPolicy: Always diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index 6f4d2b5..310ff46 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -119,6 +119,13 @@ releases: dashboard: enabled: false + # Cloudflare Tunnel (quick tunnel mode for public access) + - name: cloudflared + namespace: traefik + chart: ./cloudflared + needs: + - traefik/traefik + # eRPC - name: erpc namespace: erpc diff --git a/internal/tunnel/tunnel.go b/internal/tunnel/tunnel.go new file mode 100644 index 0000000..355e9ea --- /dev/null +++ b/internal/tunnel/tunnel.go @@ -0,0 +1,177 @@ +package tunnel + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + "time" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +const ( + tunnelNamespace = "traefik" + tunnelLabelSelector = "app.kubernetes.io/name=cloudflared" +) + +// Status displays the current tunnel status and URL +func Status(cfg *config.Config) error { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + // Check if kubeconfig exists + if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { + return fmt.Errorf("stack not running, use 'obol stack up' first") + } + + // Check pod status first + podStatus, err := getPodStatus(kubectlPath, kubeconfigPath) + if err != nil { + printStatusBox("quick", "not deployed", "", time.Now()) + fmt.Println("\nTroubleshooting:") + fmt.Println(" - Start the stack: obol stack up") + return nil + } + + // Try to get tunnel URL from logs + url, err := GetTunnelURL(cfg) + if err != nil { + printStatusBox("quick", podStatus, "(not available)", time.Now()) + fmt.Println("\nTroubleshooting:") + fmt.Println(" - Check logs: obol tunnel logs") + fmt.Println(" - Restart tunnel: obol tunnel restart") + return nil + } + + printStatusBox("quick", "active", url, time.Now()) + fmt.Printf("\nTest with: curl %s/\n", url) + + return nil +} + +// GetTunnelURL parses cloudflared logs to extract the quick tunnel URL +func GetTunnelURL(cfg *config.Config) (string, error) { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + cmd := exec.Command(kubectlPath, + "--kubeconfig", kubeconfigPath, + "logs", "-n", tunnelNamespace, + "-l", tunnelLabelSelector, + "--tail=100", + ) + + output, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("failed to get tunnel logs: %w", err) + } + + // Parse URL from logs (quick tunnel uses cfargotunnel.com) + re := 
regexp.MustCompile(`https://[a-z0-9-]+\.cfargotunnel\.com`) + matches := re.FindString(string(output)) + if matches == "" { + // Also try trycloudflare.com as fallback + re = regexp.MustCompile(`https://[a-z0-9-]+\.trycloudflare\.com`) + matches = re.FindString(string(output)) + } + if matches == "" { + return "", fmt.Errorf("tunnel URL not found in logs") + } + + return matches, nil +} + +// Restart restarts the cloudflared deployment to get a new tunnel URL +func Restart(cfg *config.Config) error { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + // Check if kubeconfig exists + if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { + return fmt.Errorf("stack not running, use 'obol stack up' first") + } + + fmt.Println("Restarting cloudflared tunnel...") + + cmd := exec.Command(kubectlPath, + "--kubeconfig", kubeconfigPath, + "rollout", "restart", "deployment/cloudflared", + "-n", tunnelNamespace, + ) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to restart tunnel: %w", err) + } + + fmt.Println("\nTunnel restarting...") + fmt.Println("Run 'obol tunnel status' to see the new URL once ready (may take 10-30 seconds).") + + return nil +} + +// Logs displays cloudflared logs +func Logs(cfg *config.Config, follow bool) error { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + // Check if kubeconfig exists + if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { + return fmt.Errorf("stack not running, use 'obol stack up' first") + } + + args := []string{ + "--kubeconfig", kubeconfigPath, + "logs", "-n", tunnelNamespace, + "-l", tunnelLabelSelector, + } + + if follow { + args = append(args, "-f") + } + + cmd := exec.Command(kubectlPath, args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Stdin = os.Stdin + + return cmd.Run() +} + +// getPodStatus returns the status of the cloudflared pod +func getPodStatus(kubectlPath, kubeconfigPath string) (string, error) { + cmd := exec.Command(kubectlPath, + "--kubeconfig", kubeconfigPath, + "get", "pods", "-n", tunnelNamespace, + "-l", tunnelLabelSelector, + "-o", "jsonpath={.items[0].status.phase}", + ) + + output, err := cmd.Output() + if err != nil { + return "", err + } + + status := strings.TrimSpace(string(output)) + if status == "" { + return "", fmt.Errorf("no pods found") + } + + return strings.ToLower(status), nil +} + +// printStatusBox prints a formatted status box +func printStatusBox(mode, status, url string, lastUpdated time.Time) { + fmt.Println() + fmt.Println("Cloudflare Tunnel Status") + fmt.Println(strings.Repeat("─", 50)) + fmt.Printf("Mode: %s\n", mode) + fmt.Printf("Status: %s\n", status) + fmt.Printf("URL: %s\n", url) + fmt.Printf("Last Updated: %s\n", lastUpdated.Format(time.RFC3339)) + fmt.Println(strings.Repeat("─", 50)) +} From bd21826697ab5e0150de8d727188c76c5359e499 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 27 Jan 2026 12:47:15 +0100 Subject: [PATCH 05/15] docs: update CLAUDE.md with new dependency versions Update documentation to reflect the upgraded dependency versions in obolup.sh. This keeps the documentation in sync with the actual pinned versions used by the bootstrap installer. 
--- CLAUDE.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index bc40752..8aa79e8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -58,12 +58,12 @@ Uses local workspace: **Pinned versions** (lines 50-57): ```bash -KUBECTL_VERSION="1.31.0" -HELM_VERSION="3.16.2" +KUBECTL_VERSION="1.35.0" +HELM_VERSION="3.19.4" K3D_VERSION="5.8.3" -HELMFILE_VERSION="0.169.1" -K9S_VERSION="0.32.5" -HELM_DIFF_VERSION="3.9.11" +HELMFILE_VERSION="1.2.3" +K9S_VERSION="0.50.18" +HELM_DIFF_VERSION="3.14.1" ``` **Smart installation logic**: @@ -811,12 +811,12 @@ obol network delete ethereum- --force - Go 1.21+ (for building from source) **Installed by obolup.sh**: -- kubectl 1.31.0 -- helm 3.16.2 +- kubectl 1.35.0 +- helm 3.19.4 - k3d 5.8.3 -- helmfile 0.169.1 -- k9s 0.32.5 -- helm-diff plugin 3.9.11 +- helmfile 1.2.3 +- k9s 0.50.18 +- helm-diff plugin 3.14.1 **Go dependencies** (key packages): - `github.com/urfave/cli/v2` - CLI framework From d5e5ccd6be7d65197d9e52b5b56e686ca758a4b3 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Mon, 2 Feb 2026 17:02:03 +0100 Subject: [PATCH 06/15] feat(auth): add dashboard auth and nodecore token refresh --- README.md | 29 +++ .../base/templates/oauth-token.yaml | 176 ++++++++++++++++++ internal/embed/infrastructure/helmfile.yaml | 14 +- .../infrastructure/values/erpc.yaml.gotmpl | 21 ++- .../values/obol-frontend.yaml.gotmpl | 27 ++- 5 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 internal/embed/infrastructure/base/templates/oauth-token.yaml diff --git a/README.md b/README.md index 0f24b0d..e525dca 100644 --- a/README.md +++ b/README.md @@ -394,6 +394,35 @@ obol stack purge -f > [!WARNING] > The `purge` command permanently deletes all cluster data and configuration. The `-f` flag is required to remove persistent volume claims (PVCs) owned by root. Use with caution. +### Dashboard Authentication (Better Auth) + +The dashboard UI is protected behind login when configured. RPC endpoints under `/rpc/*` remain unauthenticated (the x402 payment flow is handled separately). + +**Required environment variables (set before `obol stack up`):** + +- `STACK_PUBLIC_DOMAIN` (defaults to `obol.stack`; set to your Cloudflare tunnel hostname for internet exposure) +- `BETTER_AUTH_SECRET` (min 32 chars) +- `OBOL_GOOGLE_CLIENT_ID` +- `OBOL_GOOGLE_CLIENT_SECRET` + +**Google OAuth redirect URI:** + +Register this in Google Cloud Console: + +```text +https:///api/auth/callback/google +``` + +**Nodecore token refresh (for eRPC upstream header injection):** + +Create/update the Secret `erpc/nodecore-oauth-refresh` with: + +- `client_id` +- `client_secret` +- `refresh_token` + +The in-cluster CronJob refreshes a short-lived Google `id_token` and writes it into `erpc/obol-oauth-token`, which eRPC uses to inject `X-Nodecore-Token` on upstream requests. + ### Working with Kubernetes The `obol` CLI includes convenient wrappers for common Kubernetes tools. 
These automatically use the correct cluster configuration: diff --git a/internal/embed/infrastructure/base/templates/oauth-token.yaml b/internal/embed/infrastructure/base/templates/oauth-token.yaml new file mode 100644 index 0000000..d5baf56 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/oauth-token.yaml @@ -0,0 +1,176 @@ +--- +# Nodecore OAuth token plumbing for eRPC upstream auth (issue #124) +apiVersion: v1 +kind: Namespace +metadata: + name: erpc + +--- +apiVersion: v1 +kind: Secret +metadata: + name: obol-oauth-token + namespace: erpc +type: Opaque +stringData: + # Google `id_token` (JWT). CronJob refreshes and writes into this Secret. + token: "" + +--- +apiVersion: v1 +kind: Secret +metadata: + name: nodecore-oauth-refresh + namespace: erpc +type: Opaque +stringData: + # Google OAuth client credentials + refresh token. + # This is intentionally stored separately from the ID token written to `obol-oauth-token`. + client_id: "" + client_secret: "" + refresh_token: "" + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nodecore-token-writer + namespace: erpc +rules: + - apiGroups: [""] + resources: ["secrets"] + resourceNames: ["obol-oauth-token"] + verbs: ["get", "update", "patch"] + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nodecore-token-refresher + namespace: erpc + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nodecore-token-writer + namespace: erpc +subjects: + - kind: ServiceAccount + name: nodecore-token-refresher + namespace: erpc +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: nodecore-token-writer + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: nodecore-token-refresher + namespace: erpc +spec: + # Refresh every 45 minutes to stay ahead of typical 1h ID token expiry. 
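+  # ("0,45 * * * *" fires at minutes 0 and 45 of every hour, so the refreshed token is never more than ~45 minutes old.)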
+ schedule: "0,45 * * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + template: + spec: + serviceAccountName: nodecore-token-refresher + restartPolicy: OnFailure + containers: + - name: refresh + image: python:3.12-alpine + imagePullPolicy: IfNotPresent + env: + - name: GOOGLE_CLIENT_ID + valueFrom: + secretKeyRef: + name: nodecore-oauth-refresh + key: client_id + - name: GOOGLE_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: nodecore-oauth-refresh + key: client_secret + - name: GOOGLE_REFRESH_TOKEN + valueFrom: + secretKeyRef: + name: nodecore-oauth-refresh + key: refresh_token + command: + - python + - -c + - | + import base64 + import json + import os + import ssl + import urllib.parse + import urllib.request + + client_id = os.environ.get("GOOGLE_CLIENT_ID") + client_secret = os.environ.get("GOOGLE_CLIENT_SECRET") + refresh_token = os.environ.get("GOOGLE_REFRESH_TOKEN") + + if not client_id or not client_secret or not refresh_token: + raise SystemExit("Missing GOOGLE_CLIENT_ID/GOOGLE_CLIENT_SECRET/GOOGLE_REFRESH_TOKEN in Secret erpc/nodecore-oauth-refresh") + + token_url = "https://oauth2.googleapis.com/token" + body = urllib.parse.urlencode({ + "client_id": client_id, + "client_secret": client_secret, + "refresh_token": refresh_token, + "grant_type": "refresh_token", + }).encode("utf-8") + + req = urllib.request.Request( + token_url, + data=body, + method="POST", + headers={"Content-Type": "application/x-www-form-urlencoded"}, + ) + + with urllib.request.urlopen(req, timeout=20) as resp: + payload = json.loads(resp.read().decode("utf-8")) + + id_token = payload.get("id_token") + if not id_token: + raise SystemExit(f"Google token endpoint response missing id_token: {payload}") + + token_b64 = base64.b64encode(id_token.encode("utf-8")).decode("utf-8") + + namespace = "erpc" + secret_name = "obol-oauth-token" + api_server = "https://kubernetes.default.svc" + + sa_token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + sa_ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + + with open(sa_token_path, "r", encoding="utf-8") as f: + sa_token = f.read().strip() + + patch = json.dumps({"data": {"token": token_b64}}).encode("utf-8") + patch_url = f"{api_server}/api/v1/namespaces/{namespace}/secrets/{secret_name}" + + ctx = ssl.create_default_context(cafile=sa_ca_path) + patch_req = urllib.request.Request( + patch_url, + data=patch, + method="PATCH", + headers={ + "Authorization": f"Bearer {sa_token}", + "Content-Type": "application/merge-patch+json", + "Accept": "application/json", + }, + ) + + with urllib.request.urlopen(patch_req, timeout=20, context=ctx) as resp: + if resp.status < 200 or resp.status >= 300: + raise SystemExit(f"Failed to patch Secret {namespace}/{secret_name}: HTTP {resp.status} {resp.read().decode('utf-8')}") + + print("Updated Secret erpc/obol-oauth-token") diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index 310ff46..e3ce9a3 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -1,6 +1,7 @@ # Helmfile for Obol Stack default infrastructure # Orchestrates core infrastructure components deployed with every stack # Uses Traefik with Gateway API for routing (replaces nginx-ingress) +{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} repositories: - name: traefik @@ -13,6 +14,8 @@ repositories: url: 
https://ethpandaops.github.io/ethereum-helm-charts - name: bedag url: https://bedag.github.io/helm-charts/ + - name: stakater + url: https://stakater.github.io/stakater-charts # Single source of truth: change this to switch networks values: @@ -126,6 +129,13 @@ releases: needs: - traefik/traefik + # Stakater Reloader (restarts workloads on Secret/ConfigMap change) + - name: reloader + namespace: reloader + createNamespace: true + chart: stakater/reloader + version: 2.2.7 + # eRPC - name: erpc namespace: erpc @@ -157,7 +167,7 @@ releases: namespace: traefik sectionName: web hostnames: - - obol.stack + - "{{ $publicDomain }}" rules: - matches: - path: @@ -199,7 +209,7 @@ releases: namespace: traefik sectionName: web hostnames: - - obol.stack + - "{{ $publicDomain }}" rules: - matches: - path: diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl index 6799332..b7c07f8 100644 --- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl +++ b/internal/embed/infrastructure/values/erpc.yaml.gotmpl @@ -1,4 +1,5 @@ {{- $network := .Values.network -}} +{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} {{- $chainId := 1 -}} {{/* Default: mainnet */}} {{- if eq $network "hoodi" -}} {{- $chainId = 560048 -}} @@ -48,6 +49,14 @@ config: |- projects: - id: rpc + upstreams: + - id: nodecore + endpoint: https://rpc.nodecore.io + evm: + chainId: {{ $chainId }} + jsonRpc: + headers: + X-Nodecore-Token: "${OBOL_OAUTH_TOKEN}" networks: - architecture: evm evm: @@ -79,7 +88,11 @@ config: |- maxAge: 3600 # Secret env variables -secretEnv: {} +secretEnv: + OBOL_OAUTH_TOKEN: + secretKeyRef: + name: obol-oauth-token + key: token # Extra args for the erpc container extraArgs: [] @@ -101,7 +114,8 @@ affinity: {} imagePullSecrets: [] # Annotations for the Deployment -annotations: {} +annotations: + secret.reloader.stakater.com/reload: "obol-oauth-token" # Liveness probe livenessProbe: @@ -126,7 +140,8 @@ nodeSelector: {} podLabels: {} # Pod annotations -podAnnotations: {} +podAnnotations: + secret.reloader.stakater.com/reload: "obol-oauth-token" # Pod management policy podManagementPolicy: OrderedReady diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index 92aab95..b3c0d56 100644 --- a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -1,15 +1,29 @@ {{- $network := .Values.network -}} +{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} replicaCount: 1 +serviceAccount: + name: obol-frontend + image: environment: - name: NEXT_PUBLIC_HELIOS_CLIENT_URL value: "http://helios-{{ $network }}.helios.svc.cluster.local:8545" - name: NEXT_PUBLIC_ERPC_URL - value: "http://erpc.default.svc.cluster.local:4000/rpc" + value: "{{ printf \"https://%s/rpc\" $publicDomain }}" - name: NEXT_PUBLIC_AZTEC_SEQUENCER_URL value: "http://l2-sequencer-node-mainnet-node.aztec.svc.cluster.local:8080" + - name: BETTER_AUTH_SECRET + value: "{{ env \"BETTER_AUTH_SECRET\" }}" + - name: BETTER_AUTH_URL + value: "{{ printf \"https://%s\" $publicDomain }}" + - name: OBOL_GOOGLE_CLIENT_ID + value: "{{ env \"OBOL_GOOGLE_CLIENT_ID\" }}" + - name: OBOL_GOOGLE_CLIENT_SECRET + value: "{{ env \"OBOL_GOOGLE_CLIENT_SECRET\" }}" + - name: OBOL_AUTH_DB_PATH + value: "/data/auth.sqlite" repository: obolnetwork/obol-stack-front-end pullPolicy: Always @@ -19,6 +33,17 @@ service: 
type: ClusterIP port: 3000 +podSecurityContext: + fsGroup: 1001 + +volumes: + - name: auth-db + emptyDir: {} + +volumeMounts: + - name: auth-db + mountPath: /data + # Disable legacy Ingress - using Gateway API HTTPRoute instead ingress: enabled: false From 09356aa167a7aa05d2de62136b98318eda1ca203 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 3 Feb 2026 14:21:04 +0100 Subject: [PATCH 07/15] feat(llm): add ollama cloud + llmspy foundation --- .../infrastructure/base/templates/llm.yaml | 266 ++++++++++++++++++ .../base/templates/obol-agent.yaml | 20 +- .../values/obol-frontend.yaml.gotmpl | 10 + plans/okr1-llmspy-integration.md | 263 +++++++++++++++++ 4 files changed, 558 insertions(+), 1 deletion(-) create mode 100644 internal/embed/infrastructure/base/templates/llm.yaml create mode 100644 plans/okr1-llmspy-integration.md diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml new file mode 100644 index 0000000..4ad413a --- /dev/null +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -0,0 +1,266 @@ +--- +# LLM foundation services (OKR-1) +# +# This deploys: +# - Ollama (as the upstream LLM runtime) +# - llms.py (LLMSpy) as an OpenAI-compatible gateway / router over providers +# +# Design notes: +# - We default to Ollama Cloud (`glm-4.7:cloud`) to avoid requiring local GPU/VRAM. +# - We persist Ollama's identity keypair at `/root/.ollama/id_ed25519` so the +# Ollama Cloud "connect" binding survives pod restarts/upgrades. +# - Model cache is kept on `emptyDir` (ephemeral) per product decision. +apiVersion: v1 +kind: Namespace +metadata: + name: llm + +--- +# Persist Ollama identity (Ollama Cloud connect uses the public key derived from this keypair). +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ollama-home + namespace: llm +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 256Mi + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ollama + namespace: llm + labels: + app: ollama +spec: + replicas: 1 + # Ollama uses a ReadWriteOnce PVC; avoid surging a second pod during updates. + strategy: + type: Recreate + selector: + matchLabels: + app: ollama + template: + metadata: + labels: + app: ollama + spec: + containers: + - name: ollama + image: ollama/ollama:latest + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 11434 + protocol: TCP + env: + # Store model blobs (including any cloud model stubs/cache) in an ephemeral volume. + - name: OLLAMA_MODELS + value: /models + # Explicitly bind the HTTP API to all interfaces in-cluster. + - name: OLLAMA_HOST + value: 0.0.0.0:11434 + volumeMounts: + # Persist identity + config (e.g. ~/.ollama/id_ed25519) for Ollama Cloud connect. 
+ - name: ollama-home + mountPath: /root/.ollama + - name: ollama-models + mountPath: /models + readinessProbe: + httpGet: + path: /api/version + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + livenessProbe: + httpGet: + path: /api/version + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 2 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 2000m + memory: 4Gi + volumes: + - name: ollama-home + persistentVolumeClaim: + claimName: ollama-home + - name: ollama-models + emptyDir: {} + +--- +apiVersion: v1 +kind: Service +metadata: + name: ollama + namespace: llm + labels: + app: ollama +spec: + type: ClusterIP + selector: + app: ollama + ports: + - name: http + port: 11434 + targetPort: http + protocol: TCP + +--- +# llms.py configuration for Obol Stack: +# - Only enable the Ollama provider +# - Default model is `glm-4.7:cloud` (cloud-first) +apiVersion: v1 +kind: ConfigMap +metadata: + name: llmspy-config + namespace: llm +data: + llms.json: | + { + "defaults": { + "headers": { + "Content-Type": "application/json" + }, + "text": { + "model": "glm-4.7:cloud", + "messages": [ + { "role": "user", "content": "" } + ] + } + }, + "providers": { + "ollama": { + "enabled": true, + "type": "OllamaProvider", + "base_url": "http://ollama.llm.svc.cluster.local:11434", + "models": {}, + "all_models": true + } + } + } + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llmspy + namespace: llm + labels: + app: llmspy +spec: + replicas: 1 + selector: + matchLabels: + app: llmspy + template: + metadata: + labels: + app: llmspy + spec: + initContainers: + # Seed ~/.llms/llms.json from the ConfigMap. llms.py also writes runtime + # state (e.g. analytics) under ~/.llms, so we keep the directory writable. + - name: seed-config + image: busybox:1.36.1 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + set -eu + mkdir -p /data + cp /config/llms.json /data/llms.json + volumeMounts: + - name: llmspy-config + mountPath: /config + readOnly: true + - name: llmspy-home + mountPath: /data + containers: + - name: llmspy + # NOTE: We install `llms.py` at runtime to avoid coupling Obol Stack to a + # specific upstream container image. If/when llmspy publishes an official + # image, we can switch to it for faster cold starts. + image: python:3.12-slim + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8000 + protocol: TCP + # llms.py uses `~/.llms/llms.json` by default; in the container, that's + # /home/llms/.llms/llms.json (from upstream docker docs). + command: + - sh + - -c + - | + set -eu + python -m pip install --no-cache-dir --upgrade pip + python -m pip install --no-cache-dir llms + llms --config /home/llms/.llms/llms.json --serve 8000 + env: + # Avoid surprises if the image changes its default HOME. 
+ - name: HOME + value: /home/llms + volumeMounts: + - name: llmspy-home + mountPath: /home/llms/.llms + readinessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + livenessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 2 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 1000m + memory: 1Gi + volumes: + - name: llmspy-config + configMap: + name: llmspy-config + items: + - key: llms.json + path: llms.json + - name: llmspy-home + emptyDir: {} + +--- +apiVersion: v1 +kind: Service +metadata: + name: llmspy + namespace: llm + labels: + app: llmspy +spec: + type: ClusterIP + selector: + app: llmspy + ports: + - name: http + port: 8000 + targetPort: http + protocol: TCP diff --git a/internal/embed/infrastructure/base/templates/obol-agent.yaml b/internal/embed/infrastructure/base/templates/obol-agent.yaml index f73dda7..7220dbf 100644 --- a/internal/embed/infrastructure/base/templates/obol-agent.yaml +++ b/internal/embed/infrastructure/base/templates/obol-agent.yaml @@ -139,6 +139,24 @@ spec: - name: PUBLIC_MODE value: "false" + # OKR-1: Default LLM backend via llms.py + Ollama Cloud + # + # The Obol Stack agent is provider-agnostic: + # - `llms.py` (LLMSpy) exposes an OpenAI-compatible API at /v1 + # - LLMSpy forwards to Ollama (in-cluster), which can run `*:cloud` models + # + # Important: Ollama Cloud requires a one-time "connect" of the pod identity + # (public key derived from /root/.ollama/id_ed25519). We persist that key + # in the `llm/ollama-home` PVC so upgrades/restarts don't require re-connect. + - name: LLM_BACKEND + value: "llmspy" + - name: LLM_MODEL + value: "glm-4.7:cloud" + - name: OPENAI_API_BASE + value: "http://llmspy.llm.svc.cluster.local:8000/v1" + - name: OPENAI_API_KEY + value: "ollama" + # Health checks ensure the pod is ready to receive traffic livenessProbe: httpGet: @@ -179,4 +197,4 @@ spec: protocol: TCP name: http selector: - app: obol-agent # Routes traffic to pods with this label \ No newline at end of file + app: obol-agent # Routes traffic to pods with this label diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index b3c0d56..caff157 100644 --- a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -25,6 +25,16 @@ image: - name: OBOL_AUTH_DB_PATH value: "/data/auth.sqlite" + # Obol Agent (ADK) in-cluster URL for CopilotKit runtime + - name: ADK_AGENT_URL + value: "http://obol-agent.agent.svc.cluster.local:8000/" + - name: NEXT_PUBLIC_ADK_AGENT_URL + value: "http://obol-agent.agent.svc.cluster.local:8000/" + + # Ollama in-cluster URL (used by dashboard to surface Ollama Cloud connect URL) + - name: OLLAMA_URL + value: "http://ollama.llm.svc.cluster.local:11434" + repository: obolnetwork/obol-stack-front-end pullPolicy: Always tag: "latest" diff --git a/plans/okr1-llmspy-integration.md b/plans/okr1-llmspy-integration.md new file mode 100644 index 0000000..250378d --- /dev/null +++ b/plans/okr1-llmspy-integration.md @@ -0,0 +1,263 @@ +# OKR-1 Integration Plan: LLMSpy (`llms.py`) for Keyless, Multi-Provider LLM Access + +Date: 2026-02-03 + +## Goal (Objective 1) +Make Obol Stack the easiest way to spin up and use an on-chain AI agent. + +**Key Results** +1. Median time from install to first successful agent query ≤ **10 minutes** +2. 
Agent setup requires ≤ **5 user actions** (**no manual API key copy/paste in default flow**) +3. **100 Monthly Active Returning Users (MAUs)** interacting with the agent at least once per month +4. ≥ **60% of new Stack installs** complete agent setup successfully + +## Scope of this integration +Integrate **LLMSpy (`llms.py`)** as an **in-cluster OpenAI-compatible LLM gateway** that can route requests to: +- **Local LLMs** (default path to satisfy “no API key”) +- **Remote providers** (optional, later; keys or OAuth-derived tokens) + +This enables Obol Agent (ADK/FastAPI) to become **provider-agnostic**, while keeping the Dashboard UX simple. + +## Non-goals (for this iteration) +- Building a hosted “Obol-managed” LLM key/service (would change threat model/cost structure) +- Exposing LLMSpy publicly by default (we keep it internal unless explicitly enabled) +- Replacing ADK/AG-UI or refactoring the agent’s tool system +- Adding x402 payment to LLM calls (future candidate; not required for LLMSpy integration) + +--- + +## Current state (baseline) +### User experience bottleneck +- `obol agent init` currently requires a **manually created Google AI Studio API key** (copy/paste) before the agent works. +- Dashboard agent sidebar shows “Initialize your Obol Agent by running `obol agent init`…” when the agent is unavailable. + +### System architecture (today) +``` +Browser + -> Dashboard (Next.js, Better Auth) + -> POST /api/copilotkit (server route) + -> HttpAgent -> obol-agent (FastAPI / Google ADK) + -> Gemini via GOOGLE_API_KEY (direct) +``` + +--- + +## Proposed target architecture (with LLMSpy + Ollama; cloud-first) + +### Runtime request flow (agent query) +``` +Browser (signed-in) + -> Dashboard (Next.js) + -> /api/copilotkit (server; auth-gated) + -> obol-agent (FastAPI/ADK, AG-UI) + -> LiteLLM client (OpenAI-compatible) + -> LLMSpy (llms.py) [cluster-internal service] + -> Provider A: Local (Ollama) [no keys, default] + -> Provider B+: Remote (optional; keys/OAuth later) +``` + +### Deployment topology (Kubernetes) +Namespaces: +- `agent` + - `obol-agent` Deployment (existing) +- `llm` (new) + - **`llmspy`** (`llms.py`) Deployment + ClusterIP Service + - **`ollama`** Deployment + ClusterIP Service (default provider) + - Optional model warmup Job (`ollama pull `) + +Storage: +- Ollama runtime + model cache uses `emptyDir` (ephemeral). +- **Ollama Cloud auth key**: + - Minimum viable: also `emptyDir` (user reconnects after pod restart). + - Recommended: mount a small PVC or Secret-backed volume for `/root/.ollama/id_ed25519` so reconnect isn’t needed after upgrades/restarts. + +--- + +## UX: “≤5 actions” and “≤10 minutes” target + +### Default flow (no API keys) +**Default provider:** Ollama (in-cluster) via LLMSpy, using **Ollama Cloud models** (e.g. `glm-4.7:cloud`). + +Target action count: +1. Install Obol Stack CLI (existing flow) +2. `obol stack init` (if required by current UX) +3. `obol stack up` +4. Open Dashboard URL and sign in +5. Send first message in agent sidebar + +Notes: +- Remove the **mandatory** `obol agent init` step from the default path. +- Replace the “paste an API key” step with an **Ollama Cloud connect** step: + - If Ollama isn’t signed in, show a “Connect Ollama Cloud” action in the dashboard. + - Clicking it surfaces the `https://ollama.com/connect?...` URL returned by the Ollama API and guides the user through login. 
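Once the stack is up, the keyless path can be smoke-tested end to end from inside the cluster. A minimal sketch (assumptions: `llms.py` serves the standard OpenAI-compatible `/v1/chat/completions` route on port 8000 as in the in-cluster manifests, and the Ollama Cloud connect step has been completed):

```bash
# Hypothetical smoke test of the LLMSpy -> Ollama Cloud path; no API keys involved.
kubectl run llm-smoke-test --rm -i --restart=Never --image=curlimages/curl --command -- \
  curl -s http://llmspy.llm.svc.cluster.local:8000/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "glm-4.7:cloud", "messages": [{"role": "user", "content": "Say hello in one word."}]}'
```

If this hangs or errors, the most likely cause in the default flow is that the Ollama Cloud connect step has not been completed yet.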
+ +### Time-to-first-query tactics +- Default to a **cloud model** to avoid GPU/VRAM constraints: + - `glm-4.7:cloud` is explicitly supported as a cloud model in Ollama. +- Add a lightweight warmup/prefetch mechanism: + - Post-install Job: `ollama pull glm-4.7:cloud` (downloads the stub/metadata so first chat is faster) + - Readiness gate: “ready” once Ollama is connected and the model is pullable +- Ensure agent readiness checks are reliable and fast: + - Keep `/api/copilotkit/health` public (already required) + - Add `llmspy` and `ollama` readiness checks and surface status in the UI + +--- + +## Configuration model + +### LLMSpy +LLMSpy is configured by `~/.llms/llms.json` (in-container: `/home/llms/.llms/llms.json`). + +We will manage this in-cluster using: +- ConfigMap for `llms.json` +- Volume mount to `/home/llms/.llms` (likely `emptyDir`; no secrets required for Ollama) + +Key config points (concrete based on llms.py docs): +- Only one enabled provider: `ollama` +- `providers.ollama.type = "OllamaProvider"` +- `providers.ollama.base_url = "http://ollama.llm.svc.cluster.local:11434"` +- `providers.ollama.all_models = true` (or restrict to `glm-4.7:cloud`) +- `defaults.text.model = "glm-4.7:cloud"` + +### Obol Agent +Make the agent model/backend configurable: +- `LLM_BACKEND`: + - `gemini` (existing path, requires `GOOGLE_API_KEY`) + - `llmspy` (new default path) +- `LLM_MODEL` (default to the cloud model) +- `OPENAI_API_BASE` set to `http://llmspy.llm.svc.cluster.local:/v1` +- `OPENAI_API_KEY` set to a dummy value (LiteLLM/OpenAI provider compatibility) + +NOTE: With `llmspy` as backend, the agent sends OpenAI-style requests to LLMSpy and LLMSpy forwards to Ollama. + +## Default model choice +Use `glm-4.7:cloud` by default to maximize quality and avoid local GPU requirements. + +This keeps the “no manual API key copy/paste” OKR achievable because Ollama supports a browser-based connect flow (user signs in; Ollama authenticates subsequent cloud requests). + +## OpenClaw tie-in (validation + reuse) +We can validate “tool-calling robustness” of the chosen Ollama model in two ways: + +1) **Direct OpenClaw + Ollama** (matches Ollama’s built-in `openclaw` integration) + - OpenClaw already supports an Ollama provider using the OpenAI-compatible `/v1` API. + - Ollama’s own code includes an integration that edits `~/.openclaw/openclaw.json` to point at Ollama and set `agents.defaults.model.primary`. + +2) **OpenClaw + LLMSpy (preferred for consistency)** + - Configure OpenClaw’s “OpenAI” provider baseUrl to LLMSpy (`http://llmspy.llm.svc.cluster.local:/v1`) + - This ensures OpenClaw and Obol Agent exercise the same gateway path. + +We should treat OpenClaw as: +- A **validation harness** for model/tool behavior (pre-flight testing + regression checks) +- Potential future **multi-channel UX** (WhatsApp/Telegram/etc) once dashboard MVP is stable + +### Obol Stack CLI changes (user-facing) +Reframe `obol agent init` into a provider configuration command: +- Default: **no command needed** +- Optional: `obol agent configure --provider <...>` or `obol agent set-llm --provider <...>` + - Writes K8s secrets/configmaps and triggers rollout restart of `obol-agent` and/or `llmspy` + +--- + +## Security & exposure +- Dashboard remains protected by Better Auth (Google now; GitHub later). +- `/rpc/*` remains public/unprotected (x402 responsibility). +- `/api/copilotkit/health` remains public for monitoring. 
+- **LLMSpy and Ollama remain cluster-internal by default**: + - No HTTPRoute for them + - ClusterIP only + - (Optional later) expose behind dashboard auth for debugging + +Threat model considerations: +- Ensure LLMSpy cannot be used as an open relay from the internet. +- Ensure remote provider keys (if configured) never get logged or surfaced in UI. + +--- + +## Observability + OKR measurement plan + +### Metrics we can measure in-product (self-hosted) +- `agent_query_success_total` / `agent_query_error_total` +- `agent_query_latency_seconds` histogram +- `agent_first_success_timestamp` (per install) – used for “time to first query” +- `agent_provider_backend` label (gemini vs llmspy; local vs remote) + +### MAU / “install success rate” (cross-install aggregation) +This requires centralized telemetry. Options: +- Opt-in telemetry to an Obol endpoint (privacy-preserving, hashed install id) +- Or a “bring your own analytics” integration (PostHog/Amplitude) + +Proposed approach for this OKR: +- Add **opt-in** telemetry flag at install time +- Emit minimal events: + - `stack_install_completed` + - `agent_ready` + - `agent_first_query_success` + - `agent_returning_user_monthly` (count only) + +--- + +## Implementation workstreams (by repo) + +### 1) `obol-stack` (installer + infra) +- Add `llmspy` Deployment/Service manifest under `internal/embed/infrastructure/base/templates/` +- Add `ollama` Deployment/Service (or allow external Ollama endpoint) +- Add “model warmup” Job (optional but recommended for ≤10 min) +- Add values/env wiring to configure: + - LLMSpy port, config map, and secret mounts + - Obol Agent env vars (`LLM_BACKEND`, `LLM_MODEL`, `OPENAI_API_BASE`, etc.) +- Update CLI: + - Make `obol agent init` optional or replace with `obol agent configure` + - Provide a keyless default; ensure docs and errors reflect new flow +- Update README (agent quickstart + troubleshooting) + +### 2) `obol-agent` (runtime changes) +- Read `LLM_MODEL` from env (remove hard-coded model) +- Add `LLM_BACKEND` switch: + - `gemini` (current) + - `llmspy` using ADK’s `LiteLlm` wrapper + OpenAI-compatible base URL +- Add health diagnostics: + - Include provider status in `/health` (e.g., “llm backend reachable”) +- Add unit/integration tests: + - Mock LLMSpy OpenAI endpoint + - Verify tool calling works with chosen default local model + +### 3) `obol-stack-front-end` (onboarding UX) +- Replace “run `obol agent init`” message with: + - “Agent is initializing” / “Model downloading” (with helpful tips) + - A “Retry health check” action + - A link to agent setup docs for optional remote providers +- Add an “Agent Setup” panel: + - Shows current backend (local/remote) + - Shows readiness status (agent/llmspy/ollama) + +### 4) `helm-charts` (if needed) +- Only if we decide to migrate these new services into charts instead of raw manifests. +- Otherwise, keep in `base/templates/` for speed. 
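+
+As a reference for the wiring called out in workstreams 1 and 2, here is a rough sketch of the two config pieces: the `llms.json` ConfigMap consumed by LLMSpy and the env vars injected into `obol-agent`. The JSON nests the dotted config paths listed in the configuration model above; resource names, the namespace, and the LLMSpy port shown (8000) are assumptions to be confirmed against the actual manifests:
+
+```yaml
+# Sketch only — LLMSpy config, mounted so it ends up at /home/llms/.llms/llms.json
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: llmspy-config
+  namespace: llm
+data:
+  llms.json: |
+    {
+      "defaults": { "text": { "model": "glm-4.7:cloud" } },
+      "providers": {
+        "ollama": {
+          "type": "OllamaProvider",
+          "base_url": "http://ollama.llm.svc.cluster.local:11434",
+          "all_models": true
+        }
+      }
+    }
+---
+# Sketch only — env fragment for the obol-agent Deployment (not a standalone manifest)
+env:
+  - name: LLM_BACKEND
+    value: "llmspy"
+  - name: LLM_MODEL
+    value: "glm-4.7:cloud"
+  - name: OPENAI_API_BASE
+    value: "http://llmspy.llm.svc.cluster.local:8000/v1"   # assumes the llmspy Service exposes port 8000
+  - name: OPENAI_API_KEY
+    value: "unused"                                        # dummy value for OpenAI-client compatibility
+```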
+ +--- + +## Milestones + +### Milestone A — “Keyless Agent Works Locally” +Acceptance: +- Fresh install: no API keys required +- Agent responds from dashboard +- Median time to first response ≤ 10 min in test environment + +### Milestone B — “Provider Choice” +Acceptance: +- Optional remote providers via secrets/config (still no copy/paste required in default) +- Failover behavior works (local first, remote fallback if configured) + +### Milestone C — “OKR Instrumentation” +Acceptance: +- Prometheus metrics available +- Optional telemetry pipeline documented and implemented (if approved) + +--- + +## Open questions (needs product decision) +1. Do we persist `/root/.ollama/id_ed25519` so the Ollama Cloud connection survives pod restarts/upgrades? +2. Do we want to expose a “Connect Ollama Cloud” UX in the dashboard (recommended) or require a CLI step? +3. Telemetry: opt-in vs opt-out; where is the endpoint; privacy guarantees. +4. Do we expose LLMSpy UI behind auth for debugging, or keep it internal-only? From 5328fc6b38317662a818e21e2e7f88d1b9ee3bd0 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 3 Feb 2026 15:04:18 +0100 Subject: [PATCH 08/15] docs: note llmspy + ollama cloud default --- notes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notes.md b/notes.md index 025b7ef..6550e6a 100644 --- a/notes.md +++ b/notes.md @@ -6,7 +6,7 @@ - obol agent - skeleton out the cmd - this should have a dummy manifest which templates a config map secret - - obol agent init, gets the secret from google account + - OKR-1: default LLM flow is llms.py -> Ollama Cloud (no API key copy/paste) - frontend (default) - erpc, helios (default) From 9e4b885cd7dbd670c0536aea6a3741d34dbafcdb Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 3 Feb 2026 17:19:10 +0100 Subject: [PATCH 09/15] chore(llm): use official llmspy image and tcp probes --- .../infrastructure/base/templates/llm.yaml | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index 4ad413a..4f367c9 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -190,24 +190,19 @@ spec: mountPath: /data containers: - name: llmspy - # NOTE: We install `llms.py` at runtime to avoid coupling Obol Stack to a - # specific upstream container image. If/when llmspy publishes an official - # image, we can switch to it for faster cold starts. - image: python:3.12-slim + # Official LLMSpy container image (published by upstream). + # Pin a specific version for reproducibility. + image: ghcr.io/servicestack/llms:v2.0.30 imagePullPolicy: IfNotPresent ports: - name: http containerPort: 8000 protocol: TCP - # llms.py uses `~/.llms/llms.json` by default; in the container, that's - # /home/llms/.llms/llms.json (from upstream docker docs). command: - sh - -c - | set -eu - python -m pip install --no-cache-dir --upgrade pip - python -m pip install --no-cache-dir llms llms --config /home/llms/.llms/llms.json --serve 8000 env: # Avoid surprises if the image changes its default HOME. 
@@ -217,15 +212,13 @@ spec: - name: llmspy-home mountPath: /home/llms/.llms readinessProbe: - httpGet: - path: / + tcpSocket: port: http initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 2 livenessProbe: - httpGet: - path: / + tcpSocket: port: http initialDelaySeconds: 30 periodSeconds: 10 From 8e8767b79b7d8fdd362c99df6fe069e45a47afa7 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 3 Feb 2026 17:20:35 +0100 Subject: [PATCH 10/15] docs(okr1): note official llmspy image --- plans/okr1-llmspy-integration.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plans/okr1-llmspy-integration.md b/plans/okr1-llmspy-integration.md index 250378d..a6f1fc7 100644 --- a/plans/okr1-llmspy-integration.md +++ b/plans/okr1-llmspy-integration.md @@ -112,6 +112,10 @@ We will manage this in-cluster using: - ConfigMap for `llms.json` - Volume mount to `/home/llms/.llms` (likely `emptyDir`; no secrets required for Ollama) +Runtime: +- Prefer the upstream-published container image for reproducibility: + - `ghcr.io/servicestack/llms:v2.0.30` (pinned) + Key config points (concrete based on llms.py docs): - Only one enabled provider: `ollama` - `providers.ollama.type = "OllamaProvider"` From 9b98def7dbbaaee8ea3b33780af8ccb94220cc34 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 3 Feb 2026 17:21:40 +0100 Subject: [PATCH 11/15] fix(llm): run llmspy via llms entrypoint --- internal/embed/infrastructure/base/templates/llm.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index 4f367c9..b9d00b4 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -199,11 +199,12 @@ spec: containerPort: 8000 protocol: TCP command: - - sh - - -c - - | - set -eu - llms --config /home/llms/.llms/llms.json --serve 8000 + - llms + args: + - --config + - /home/llms/.llms/llms.json + - --serve + - "8000" env: # Avoid surprises if the image changes its default HOME. - name: HOME From 8798d0712e8dc5740af83dbe4e9e20c552dc5a99 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Tue, 3 Feb 2026 17:31:24 +0100 Subject: [PATCH 12/15] fix(llm): use http probes for llmspy --- internal/embed/infrastructure/base/templates/llm.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index b9d00b4..5633866 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -213,13 +213,15 @@ spec: - name: llmspy-home mountPath: /home/llms/.llms readinessProbe: - tcpSocket: + httpGet: + path: / port: http initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 2 livenessProbe: - tcpSocket: + httpGet: + path: / port: http initialDelaySeconds: 30 periodSeconds: 10 From a05bce5460c3cb659d59238a6fd7d847b84d0380 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 7 Feb 2026 01:16:23 +0400 Subject: [PATCH 13/15] feat(stack): add pluggable backend system with native k3s support Introduce a Backend interface that abstracts cluster lifecycle management, enabling both k3d (Docker-based, default) and k3s (native bare-metal) backends. This is a prerequisite for TEE/Confidential Computing workloads which require direct hardware access that k3d cannot provide. 
Changes: - Add Backend interface (Init, Up, Down, Destroy, IsRunning, DataDir) - Extract k3d logic into K3dBackend with backward-compatible fallback - Add K3sBackend with sudo process management, PID tracking, and API server readiness checks - Convert helmfile.yaml to helmfile.yaml.gotmpl using env vars instead of .Values references (fixes first-pass template rendering) - Fix eRPC secretEnv type mismatch (map vs string for b64enc) - Fix obol-frontend escaped quotes in gotmpl expressions - Add KUBECONFIG env var to helmfile command for hook compatibility - Add 26 unit tests and 10 integration test scenarios Closes #134 --- cmd/obol/bootstrap.go | 2 +- cmd/obol/main.go | 7 +- internal/embed/embed.go | 3 + .../base/templates/local-path.yaml | 2 +- .../{helmfile.yaml => helmfile.yaml.gotmpl} | 16 +- .../infrastructure/values/erpc.yaml.gotmpl | 20 +- .../values/obol-frontend.yaml.gotmpl | 12 +- internal/embed/k3s-config.yaml | 24 ++ internal/stack/backend.go | 77 +++++ internal/stack/backend_k3d.go | 164 +++++++++ internal/stack/backend_k3s.go | 320 +++++++++++++++++ internal/stack/backend_k3s_test.go | 97 ++++++ internal/stack/backend_test.go | 321 ++++++++++++++++++ internal/stack/integration_test.go | 255 ++++++++++++++ internal/stack/stack.go | 258 +++++--------- 15 files changed, 1375 insertions(+), 203 deletions(-) rename internal/embed/infrastructure/{helmfile.yaml => helmfile.yaml.gotmpl} (94%) create mode 100644 internal/embed/k3s-config.yaml create mode 100644 internal/stack/backend.go create mode 100644 internal/stack/backend_k3d.go create mode 100644 internal/stack/backend_k3s.go create mode 100644 internal/stack/backend_k3s_test.go create mode 100644 internal/stack/backend_test.go create mode 100644 internal/stack/integration_test.go diff --git a/cmd/obol/bootstrap.go b/cmd/obol/bootstrap.go index f2d3eb2..60683d3 100644 --- a/cmd/obol/bootstrap.go +++ b/cmd/obol/bootstrap.go @@ -27,7 +27,7 @@ func bootstrapCommand(cfg *config.Config) *cli.Command { // Step 1: Initialize stack fmt.Println("Initializing stack configuration...") - if err := stack.Init(cfg, false); err != nil { + if err := stack.Init(cfg, false, ""); err != nil { // Check if it's an "already exists" error - that's okay if !strings.Contains(err.Error(), "already exists") { return fmt.Errorf("bootstrap init failed: %w", err) diff --git a/cmd/obol/main.go b/cmd/obol/main.go index 69f92c5..871eb07 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -102,9 +102,14 @@ GLOBAL OPTIONS: Aliases: []string{"f"}, Usage: "Force overwrite existing configuration", }, + &cli.StringFlag{ + Name: "backend", + Usage: "Cluster backend: k3d (Docker-based) or k3s (bare-metal)", + EnvVars: []string{"OBOL_BACKEND"}, + }, }, Action: func(c *cli.Context) error { - return stack.Init(cfg, c.Bool("force")) + return stack.Init(cfg, c.Bool("force"), c.String("backend")) }, }, { diff --git a/internal/embed/embed.go b/internal/embed/embed.go index 2c189eb..7a0d723 100644 --- a/internal/embed/embed.go +++ b/internal/embed/embed.go @@ -15,6 +15,9 @@ import ( //go:embed k3d-config.yaml var K3dConfig string +//go:embed k3s-config.yaml +var K3sConfig string + //go:embed all:infrastructure var infrastructureFS embed.FS diff --git a/internal/embed/infrastructure/base/templates/local-path.yaml b/internal/embed/infrastructure/base/templates/local-path.yaml index 77713e9..2547c50 100644 --- a/internal/embed/infrastructure/base/templates/local-path.yaml +++ b/internal/embed/infrastructure/base/templates/local-path.yaml @@ -11,7 +11,7 @@ data: 
"nodePathMap":[ { "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES", - "paths":["/data"] + "paths":["{{ .Values.dataDir }}"] } ] } diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml.gotmpl similarity index 94% rename from internal/embed/infrastructure/helmfile.yaml rename to internal/embed/infrastructure/helmfile.yaml.gotmpl index e3ce9a3..d5b1d8a 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml.gotmpl @@ -1,7 +1,10 @@ # Helmfile for Obol Stack default infrastructure # Orchestrates core infrastructure components deployed with every stack # Uses Traefik with Gateway API for routing (replaces nginx-ingress) -{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} +{{ $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} +{{- $dataDir := env "STACK_DATA_DIR" | default "/data" -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} +{{- $gatewayApiVersion := "v1.4.1" }} repositories: - name: traefik @@ -17,19 +20,14 @@ repositories: - name: stakater url: https://stakater.github.io/stakater-charts -# Single source of truth: change this to switch networks -values: - - network: mainnet - - gatewayApiVersion: v1.4.1 - releases: # Local storage provisioner (raw manifests wrapped as chart) - name: base namespace: kube-system chart: ./base values: - - dataDir: /data - - network: "{{ .Values.network }}" + - dataDir: '{{ $dataDir }}' + - network: "{{ $network }}" # Monitoring stack (Prometheus operator + Prometheus) - name: monitoring @@ -54,7 +52,7 @@ releases: args: - apply - -f - - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ .Values.gatewayApiVersion }}/standard-install.yaml + - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ $gatewayApiVersion }}/standard-install.yaml # Traefik ingress controller with Gateway API support - name: traefik diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl index b7c07f8..78274e9 100644 --- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl +++ b/internal/embed/infrastructure/values/erpc.yaml.gotmpl @@ -1,4 +1,4 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} {{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} {{- $chainId := 1 -}} {{/* Default: mainnet */}} {{- if eq $network "hoodi" -}} @@ -87,12 +87,18 @@ config: |- allowCredentials: true maxAge: 3600 -# Secret env variables -secretEnv: - OBOL_OAUTH_TOKEN: - secretKeyRef: - name: obol-oauth-token - key: token +# Secret env variables (chart expects flat string map, e.g. 
KEY: "value") +# The OBOL_OAUTH_TOKEN is injected from a Kubernetes secret via extraEnv instead +secretEnv: {} + +# Inject the OAuth token from the Kubernetes secret +extraEnv: + - name: OBOL_OAUTH_TOKEN + valueFrom: + secretKeyRef: + name: obol-oauth-token + key: token + optional: true # Extra args for the erpc container extraArgs: [] diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index caff157..66f068b 100644 --- a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -1,4 +1,4 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} {{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} replicaCount: 1 @@ -11,17 +11,17 @@ image: - name: NEXT_PUBLIC_HELIOS_CLIENT_URL value: "http://helios-{{ $network }}.helios.svc.cluster.local:8545" - name: NEXT_PUBLIC_ERPC_URL - value: "{{ printf \"https://%s/rpc\" $publicDomain }}" + value: "https://{{ $publicDomain }}/rpc" - name: NEXT_PUBLIC_AZTEC_SEQUENCER_URL value: "http://l2-sequencer-node-mainnet-node.aztec.svc.cluster.local:8080" - name: BETTER_AUTH_SECRET - value: "{{ env \"BETTER_AUTH_SECRET\" }}" + value: '{{ env "BETTER_AUTH_SECRET" }}' - name: BETTER_AUTH_URL - value: "{{ printf \"https://%s\" $publicDomain }}" + value: "https://{{ $publicDomain }}" - name: OBOL_GOOGLE_CLIENT_ID - value: "{{ env \"OBOL_GOOGLE_CLIENT_ID\" }}" + value: '{{ env "OBOL_GOOGLE_CLIENT_ID" }}' - name: OBOL_GOOGLE_CLIENT_SECRET - value: "{{ env \"OBOL_GOOGLE_CLIENT_SECRET\" }}" + value: '{{ env "OBOL_GOOGLE_CLIENT_SECRET" }}' - name: OBOL_AUTH_DB_PATH value: "/data/auth.sqlite" diff --git a/internal/embed/k3s-config.yaml b/internal/embed/k3s-config.yaml new file mode 100644 index 0000000..1c75e5a --- /dev/null +++ b/internal/embed/k3s-config.yaml @@ -0,0 +1,24 @@ +# k3s server configuration for Obol Stack +# Generated by: obol stack init --backend k3s + +# Disable components we manage ourselves (matching k3d config) +disable: + - traefik + - local-storage + +# Data directory for k3s internal state +data-dir: {{DATA_DIR}}/k3s + +# Bind to all interfaces for local access +bind-address: 0.0.0.0 +https-listen-port: 6443 + +# TLS SANs for local access +tls-san: + - "127.0.0.1" + - "localhost" + - "obol.stack" + +# Node labels +node-label: + - "obol.cluster-id={{STACK_ID}}" diff --git a/internal/stack/backend.go b/internal/stack/backend.go new file mode 100644 index 0000000..f26014d --- /dev/null +++ b/internal/stack/backend.go @@ -0,0 +1,77 @@ +package stack + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +const ( + // BackendK3d is the k3d backend (Docker-based, default) + BackendK3d = "k3d" + // BackendK3s is the standalone k3s backend (bare-metal) + BackendK3s = "k3s" + + stackBackendFile = ".stack-backend" +) + +// Backend abstracts the Kubernetes cluster runtime (k3d, k3s) +type Backend interface { + // Name returns the backend identifier (e.g., "k3d", "k3s") + Name() string + + // Init generates backend-specific cluster configuration files + Init(cfg *config.Config, stackID string) error + + // Up creates or starts the cluster and returns kubeconfig contents + Up(cfg *config.Config, stackID string) (kubeconfigData []byte, err error) + + // IsRunning returns true if the cluster is currently running + IsRunning(cfg *config.Config, stackID string) (bool, error) + 
+ // Down stops the cluster without destroying configuration or data + Down(cfg *config.Config, stackID string) error + + // Destroy removes the cluster entirely (containers/processes) + Destroy(cfg *config.Config, stackID string) error + + // DataDir returns the storage path for the local-path-provisioner. + // For k3d this is "/data" (Docker volume mount point). + // For k3s this is the absolute host path to cfg.DataDir. + DataDir(cfg *config.Config) string + + // Prerequisites checks that required software/permissions are available + Prerequisites(cfg *config.Config) error +} + +// NewBackend creates a Backend by name +func NewBackend(name string) (Backend, error) { + switch name { + case BackendK3d: + return &K3dBackend{}, nil + case BackendK3s: + return &K3sBackend{}, nil + default: + return nil, fmt.Errorf("unknown backend: %s (supported: k3d, k3s)", name) + } +} + +// LoadBackend reads the persisted backend choice from .stack-backend file. +// Falls back to k3d if no file exists (backward compatibility). +func LoadBackend(cfg *config.Config) (Backend, error) { + path := filepath.Join(cfg.ConfigDir, stackBackendFile) + data, err := os.ReadFile(path) + if err != nil { + return &K3dBackend{}, nil + } + return NewBackend(strings.TrimSpace(string(data))) +} + +// SaveBackend persists the backend choice +func SaveBackend(cfg *config.Config, name string) error { + path := filepath.Join(cfg.ConfigDir, stackBackendFile) + return os.WriteFile(path, []byte(name), 0644) +} diff --git a/internal/stack/backend_k3d.go b/internal/stack/backend_k3d.go new file mode 100644 index 0000000..8fdd3de --- /dev/null +++ b/internal/stack/backend_k3d.go @@ -0,0 +1,164 @@ +package stack + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" +) + +const ( + k3dConfigFile = "k3d.yaml" +) + +// K3dBackend manages clusters via k3d (k3s inside Docker containers) +type K3dBackend struct{} + +func (b *K3dBackend) Name() string { return BackendK3d } + +func (b *K3dBackend) Prerequisites(cfg *config.Config) error { + // Check Docker is running + cmd := exec.Command("docker", "info") + cmd.Stdout = nil + cmd.Stderr = nil + if err := cmd.Run(); err != nil { + return fmt.Errorf("Docker is not running. 
k3d backend requires Docker.\nStart Docker and try again") + } + + // Check k3d binary exists + k3dPath := filepath.Join(cfg.BinDir, "k3d") + if _, err := os.Stat(k3dPath); os.IsNotExist(err) { + return fmt.Errorf("k3d not found at %s\nRun obolup.sh to install dependencies", k3dPath) + } + return nil +} + +func (b *K3dBackend) Init(cfg *config.Config, stackID string) error { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + + absConfigDir, err := filepath.Abs(cfg.ConfigDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for config directory: %w", err) + } + + // Template k3d config with actual values + k3dConfig := embed.K3dConfig + k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID) + k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir) + k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir) + + k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) + if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil { + return fmt.Errorf("failed to write k3d config: %w", err) + } + + fmt.Printf("K3d config saved to: %s\n", k3dConfigPath) + return nil +} + +func (b *K3dBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers") + output, err := listCmd.Output() + if err != nil { + return false, fmt.Errorf("k3d list command failed: %w", err) + } + return strings.Contains(string(output), stackName), nil +} + +func (b *K3dBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) + + running, err := b.IsRunning(cfg, stackID) + if err != nil { + return nil, err + } + + if running { + fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID) + startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName) + startCmd.Stdout = os.Stdout + startCmd.Stderr = os.Stderr + if err := startCmd.Run(); err != nil { + return nil, fmt.Errorf("failed to start existing cluster: %w", err) + } + } else { + // Create data directory if it doesn't exist + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + if err := os.MkdirAll(absDataDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + fmt.Println("Creating k3d cluster...") + createCmd := exec.Command( + filepath.Join(cfg.BinDir, "k3d"), + "cluster", "create", stackName, + "--config", k3dConfigPath, + "--kubeconfig-update-default=false", + ) + createCmd.Stdout = os.Stdout + createCmd.Stderr = os.Stderr + if err := createCmd.Run(); err != nil { + return nil, fmt.Errorf("failed to create cluster: %w", err) + } + } + + // Export kubeconfig + kubeconfigCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName) + kubeconfigData, err := kubeconfigCmd.Output() + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig: %w", err) + } + + return kubeconfigData, nil +} + +func (b *K3dBackend) Down(cfg *config.Config, stackID string) error { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + + fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID) 
+ + stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) + stopCmd.Stdout = os.Stdout + stopCmd.Stderr = os.Stderr + if err := stopCmd.Run(); err != nil { + fmt.Println("Graceful stop timed out or failed, forcing cluster deletion") + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + return fmt.Errorf("failed to stop cluster: %w", err) + } + } + + return nil +} + +func (b *K3dBackend) Destroy(cfg *config.Config, stackID string) error { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + + fmt.Printf("Deleting cluster containers: %s\n", stackName) + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) + } + + return nil +} + +func (b *K3dBackend) DataDir(cfg *config.Config) string { + return "/data" +} diff --git a/internal/stack/backend_k3s.go b/internal/stack/backend_k3s.go new file mode 100644 index 0000000..482d7e8 --- /dev/null +++ b/internal/stack/backend_k3s.go @@ -0,0 +1,320 @@ +package stack + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + "time" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" +) + +const ( + k3sConfigFile = "k3s-config.yaml" + k3sPidFile = ".k3s.pid" + k3sLogFile = "k3s.log" +) + +// K3sBackend manages a standalone k3s cluster (bare-metal) +type K3sBackend struct{} + +func (b *K3sBackend) Name() string { return BackendK3s } + +func (b *K3sBackend) Prerequisites(cfg *config.Config) error { + if runtime.GOOS != "linux" { + return fmt.Errorf("k3s backend is only supported on Linux") + } + + // Check sudo access (allow interactive password prompt) + cmd := exec.Command("sudo", "-v") + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("k3s backend requires root/sudo access") + } + + // Check k3s binary exists + k3sPath := filepath.Join(cfg.BinDir, "k3s") + if _, err := os.Stat(k3sPath); os.IsNotExist(err) { + return fmt.Errorf("k3s not found at %s\nRun obolup.sh to install dependencies", k3sPath) + } + + return nil +} + +func (b *K3sBackend) Init(cfg *config.Config, stackID string) error { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + + // Template k3s config with actual values + k3sConfig := embed.K3sConfig + k3sConfig = strings.ReplaceAll(k3sConfig, "{{STACK_ID}}", stackID) + k3sConfig = strings.ReplaceAll(k3sConfig, "{{DATA_DIR}}", absDataDir) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if err := os.WriteFile(k3sConfigPath, []byte(k3sConfig), 0644); err != nil { + return fmt.Errorf("failed to write k3s config: %w", err) + } + + fmt.Printf("K3s config saved to: %s\n", k3sConfigPath) + return nil +} + +func (b *K3sBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) { + pid, err := b.readPid(cfg) + if err != nil { + return false, nil + } + + return b.isProcessAlive(pid), nil +} + +func (b *K3sBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { + running, _ := b.IsRunning(cfg, stackID) + if running { + fmt.Println("k3s is 
already running") + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + data, err := os.ReadFile(kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("k3s is running but kubeconfig not found: %w", err) + } + return data, nil + } + + // Clean up stale PID file if it exists (QA R6) + b.cleanStalePid(cfg) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if _, err := os.Stat(k3sConfigPath); os.IsNotExist(err) { + return nil, fmt.Errorf("k3s config not found at %s\nRun 'obol stack init --backend k3s' first", k3sConfigPath) + } + + // Create data directory + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + if err := os.MkdirAll(absDataDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + k3sBinary := filepath.Join(cfg.BinDir, "k3s") + logPath := filepath.Join(cfg.ConfigDir, k3sLogFile) + + // Remove stale kubeconfig so we wait for k3s to write a fresh one + os.Remove(kubeconfigPath) + + // Open log file for k3s output + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + return nil, fmt.Errorf("failed to create k3s log file: %w", err) + } + + fmt.Println("Starting k3s server...") + + // Start k3s server as background process via sudo + cmd := exec.Command("sudo", + k3sBinary, "server", + "--config", k3sConfigPath, + "--write-kubeconfig", kubeconfigPath, + "--write-kubeconfig-mode", "0600", + ) + cmd.Stdout = logFile + cmd.Stderr = logFile + // Set process group so we can clean up child processes + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + if err := cmd.Start(); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to start k3s: %w", err) + } + + // Save PID before releasing the process handle + pid := cmd.Process.Pid + + // Write PID file + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(pid)), 0600); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to write k3s PID file: %w", err) + } + + // Detach the process + cmd.Process.Release() + logFile.Close() + + fmt.Printf("k3s started (pid: %d)\n", pid) + fmt.Printf("Logs: %s\n", logPath) + + // Wait for kubeconfig to be written by k3s + fmt.Println("Waiting for kubeconfig...") + deadline := time.Now().Add(2 * time.Minute) + for time.Now().Before(deadline) { + if info, err := os.Stat(kubeconfigPath); err == nil && info.Size() > 0 { + // Fix ownership: k3s writes kubeconfig as root via sudo + exec.Command("sudo", "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), kubeconfigPath).Run() + + data, err := os.ReadFile(kubeconfigPath) + if err == nil && len(data) > 0 { + fmt.Println("Kubeconfig ready, waiting for API server...") + + // Wait for the API server to actually respond + apiDeadline := time.Now().Add(90 * time.Second) + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + for time.Now().Before(apiDeadline) { + probe := exec.Command(kubectlPath, "--kubeconfig", kubeconfigPath, + "get", "nodes", "--no-headers") + if out, err := probe.Output(); err == nil && len(out) > 0 { + fmt.Println("API server ready") + return data, nil + } + time.Sleep(3 * time.Second) + } + + // Return kubeconfig even if API isn't fully ready yet + fmt.Println("Warning: API server not fully ready, proceeding anyway") + return data, nil + } + } + time.Sleep(2 * time.Second) + } 
+ + return nil, fmt.Errorf("k3s did not write kubeconfig within timeout\nCheck logs: %s", logPath) +} + +func (b *K3sBackend) Down(cfg *config.Config, stackID string) error { + pid, err := b.readPid(cfg) + if err != nil { + fmt.Println("k3s PID file not found, may not be running") + return nil + } + + if !b.isProcessAlive(pid) { + fmt.Println("k3s process not running, cleaning up PID file") + b.removePidFile(cfg) + return nil + } + + fmt.Printf("Stopping k3s (pid: %d)...\n", pid) + + // Send SIGTERM to the process group for clean shutdown (negative PID = process group) + pgid := fmt.Sprintf("-%d", pid) + stopCmd := exec.Command("sudo", "kill", "-TERM", pgid) + stopCmd.Stdout = os.Stdout + stopCmd.Stderr = os.Stderr + if err := stopCmd.Run(); err != nil { + fmt.Printf("SIGTERM to process group failed, sending SIGKILL: %v\n", err) + exec.Command("sudo", "kill", "-9", pgid).Run() + } + + // Wait for process to exit (up to 30 seconds) + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + if !b.isProcessAlive(pid) { + break + } + time.Sleep(1 * time.Second) + } + + // Run k3s-killall.sh if available (cleans up containerd/iptables) + killallPath := "/usr/local/bin/k3s-killall.sh" + if _, err := os.Stat(killallPath); err == nil { + fmt.Println("Running k3s cleanup...") + cleanCmd := exec.Command("sudo", killallPath) + cleanCmd.Stdout = os.Stdout + cleanCmd.Stderr = os.Stderr + cleanCmd.Run() + } + + b.removePidFile(cfg) + fmt.Println("k3s stopped") + return nil +} + +func (b *K3sBackend) Destroy(cfg *config.Config, stackID string) error { + // Stop if running + b.Down(cfg, stackID) + + // Clean up k3s state directories (default + custom data-dir) + absDataDir, _ := filepath.Abs(cfg.DataDir) + cleanDirs := []string{ + "/var/lib/rancher/k3s", + "/etc/rancher/k3s", + filepath.Join(absDataDir, "k3s"), + } + for _, dir := range cleanDirs { + if _, err := os.Stat(dir); err == nil { + fmt.Printf("Cleaning up: %s\n", dir) + exec.Command("sudo", "rm", "-rf", dir).Run() + } + } + + // Run uninstall script if available + uninstallPath := "/usr/local/bin/k3s-uninstall.sh" + if _, err := os.Stat(uninstallPath); err == nil { + fmt.Println("Running k3s uninstall...") + uninstallCmd := exec.Command("sudo", uninstallPath) + uninstallCmd.Stdout = os.Stdout + uninstallCmd.Stderr = os.Stderr + uninstallCmd.Run() + } + + return nil +} + +func (b *K3sBackend) DataDir(cfg *config.Config) string { + absDataDir, _ := filepath.Abs(cfg.DataDir) + return absDataDir +} + +// readPid reads the k3s PID from the PID file +func (b *K3sBackend) readPid(cfg *config.Config) (int, error) { + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + data, err := os.ReadFile(pidPath) + if err != nil { + return 0, err + } + pid, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil { + return 0, fmt.Errorf("invalid PID in %s: %w", pidPath, err) + } + if pid <= 0 { + return 0, fmt.Errorf("invalid PID in %s: %d", pidPath, pid) + } + return pid, nil +} + +// cleanStalePid removes the PID file if the process is no longer running +func (b *K3sBackend) cleanStalePid(cfg *config.Config) { + pid, err := b.readPid(cfg) + if err != nil { + return + } + if !b.isProcessAlive(pid) { + fmt.Printf("Cleaning up stale PID file (pid %d no longer running)\n", pid) + b.removePidFile(cfg) + } +} + +// isProcessAlive checks if a root-owned process is still running. +// Uses sudo kill -0 since the k3s process runs as root and direct +// signal(0) from an unprivileged user returns EPERM. 
+func (b *K3sBackend) isProcessAlive(pid int) bool { + return exec.Command("sudo", "kill", "-0", strconv.Itoa(pid)).Run() == nil +} + +// removePidFile removes the k3s PID file +func (b *K3sBackend) removePidFile(cfg *config.Config) { + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + os.Remove(pidPath) +} diff --git a/internal/stack/backend_k3s_test.go b/internal/stack/backend_k3s_test.go new file mode 100644 index 0000000..e7a09ba --- /dev/null +++ b/internal/stack/backend_k3s_test.go @@ -0,0 +1,97 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +func TestK3sReadPid(t *testing.T) { + tests := []struct { + name string + content string + wantPid int + wantErr bool + errContains string + }{ + {name: "valid pid", content: "12345", wantPid: 12345}, + {name: "with trailing newline", content: "12345\n", wantPid: 12345}, + {name: "with whitespace", content: " 12345 ", wantPid: 12345}, + {name: "pid 1", content: "1", wantPid: 1}, + {name: "large pid", content: "4194304", wantPid: 4194304}, + {name: "not a number", content: "not-a-number", wantErr: true, errContains: "invalid PID"}, + {name: "empty content", content: "", wantErr: true, errContains: "invalid PID"}, + {name: "float", content: "123.45", wantErr: true, errContains: "invalid PID"}, + {name: "negative", content: "-1", wantErr: true, errContains: "invalid PID"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(tt.content), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + pid, err := b.readPid(cfg) + if tt.wantErr { + if err == nil { + t.Fatalf("readPid() = %d, nil error; want error containing %q", pid, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("readPid() error = %q, want containing %q", err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("readPid() unexpected error: %v", err) + } + if pid != tt.wantPid { + t.Errorf("readPid() = %d, want %d", pid, tt.wantPid) + } + }) + } + + t.Run("missing file", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + _, err := b.readPid(cfg) + if err == nil { + t.Fatal("readPid() with no file should return error") + } + }) +} + +func TestK3sRemovePidFile(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte("12345"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + b.removePidFile(cfg) + + if _, err := os.Stat(pidPath); !os.IsNotExist(err) { + t.Error("PID file should have been removed") + } +} + +func TestK3sRemovePidFileNoop(t *testing.T) { + // Removing a non-existent PID file should not panic or error + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + b.removePidFile(cfg) // should not panic +} diff --git a/internal/stack/backend_test.go b/internal/stack/backend_test.go new file mode 100644 index 0000000..e59836c --- /dev/null +++ b/internal/stack/backend_test.go @@ -0,0 +1,321 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +// Compile-time interface compliance checks +var ( + _ 
Backend = (*K3dBackend)(nil) + _ Backend = (*K3sBackend)(nil) +) + +func TestNewBackend(t *testing.T) { + tests := []struct { + name string + input string + wantName string + wantErr bool + errContains string + }{ + {name: "k3d backend", input: "k3d", wantName: "k3d"}, + {name: "k3s backend", input: "k3s", wantName: "k3s"}, + {name: "unknown backend", input: "docker", wantErr: true, errContains: "unknown backend"}, + {name: "empty string", input: "", wantErr: true, errContains: "unknown backend"}, + {name: "case sensitive", input: "K3D", wantErr: true, errContains: "unknown backend"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + backend, err := NewBackend(tt.input) + if tt.wantErr { + if err == nil { + t.Fatalf("NewBackend(%q) = nil error, want error containing %q", tt.input, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("NewBackend(%q) error = %q, want containing %q", tt.input, err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("NewBackend(%q) unexpected error: %v", tt.input, err) + } + if backend.Name() != tt.wantName { + t.Errorf("NewBackend(%q).Name() = %q, want %q", tt.input, backend.Name(), tt.wantName) + } + }) + } +} + +func TestK3dBackendName(t *testing.T) { + b := &K3dBackend{} + if got := b.Name(); got != BackendK3d { + t.Errorf("K3dBackend.Name() = %q, want %q", got, BackendK3d) + } +} + +func TestK3sBackendName(t *testing.T) { + b := &K3sBackend{} + if got := b.Name(); got != BackendK3s { + t.Errorf("K3sBackend.Name() = %q, want %q", got, BackendK3s) + } +} + +func TestK3dBackendDataDir(t *testing.T) { + // k3d DataDir must always return "/data" regardless of cfg.DataDir, + // because k3d mounts the host data dir to /data inside the container. + tests := []struct { + name string + dataDir string + }{ + {name: "absolute path", dataDir: "/home/user/.local/share/obol"}, + {name: "relative path", dataDir: ".workspace/data"}, + {name: "empty string", dataDir: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := &K3dBackend{} + cfg := &config.Config{DataDir: tt.dataDir} + if got := b.DataDir(cfg); got != "/data" { + t.Errorf("K3dBackend.DataDir() = %q, want %q (must always be /data for Docker mount)", got, "/data") + } + }) + } +} + +func TestK3sBackendDataDir(t *testing.T) { + // k3s DataDir must return an absolute version of cfg.DataDir, + // because k3s runs directly on the host. 
+ b := &K3sBackend{} + + t.Run("absolute path passthrough", func(t *testing.T) { + cfg := &config.Config{DataDir: "/home/user/.local/share/obol"} + got := b.DataDir(cfg) + if got != "/home/user/.local/share/obol" { + t.Errorf("K3sBackend.DataDir() = %q, want %q", got, "/home/user/.local/share/obol") + } + }) + + t.Run("relative path resolved to absolute", func(t *testing.T) { + cfg := &config.Config{DataDir: "relative/path"} + got := b.DataDir(cfg) + if !filepath.IsAbs(got) { + t.Errorf("K3sBackend.DataDir() = %q, want absolute path", got) + } + if !strings.HasSuffix(got, "relative/path") { + t.Errorf("K3sBackend.DataDir() = %q, want suffix %q", got, "relative/path") + } + }) +} + +func TestSaveAndLoadBackend(t *testing.T) { + tests := []struct { + name string + backend string + wantName string + }{ + {name: "save k3s load k3s", backend: "k3s", wantName: "k3s"}, + {name: "save k3d load k3d", backend: "k3d", wantName: "k3d"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + if err := SaveBackend(cfg, tt.backend); err != nil { + t.Fatalf("SaveBackend() error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != tt.wantName { + t.Errorf("LoadBackend().Name() = %q, want %q", backend.Name(), tt.wantName) + } + }) + } +} + +func TestLoadBackendFallsBackToK3d(t *testing.T) { + // When no .stack-backend file exists, LoadBackend must return k3d + // for backward compatibility with existing stacks. + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3d { + t.Errorf("LoadBackend() with no file = %q, want %q (backward compat)", backend.Name(), BackendK3d) + } +} + +func TestLoadBackendWithWhitespace(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + // Write file with trailing newline and whitespace + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("k3s\n "), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3s { + t.Errorf("LoadBackend() = %q, want %q", backend.Name(), BackendK3s) + } +} + +func TestLoadBackendInvalidName(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("docker-swarm"), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + _, err := LoadBackend(cfg) + if err == nil { + t.Fatal("LoadBackend() with invalid backend name should return error") + } + if !strings.Contains(err.Error(), "unknown backend") { + t.Errorf("LoadBackend() error = %q, want containing %q", err.Error(), "unknown backend") + } +} + +func TestK3dBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3dBackend{} + if err := b.Init(cfg, "test-stack"); err != nil { + t.Fatalf("K3dBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3dConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // 
Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + if strings.Contains(content, "{{CONFIG_DIR}}") { + t.Error("Config still contains {{CONFIG_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "test-stack") { + t.Error("Config does not contain stack ID 'test-stack'") + } + + // Verify paths are absolute + if !strings.Contains(content, tmpDir) { + t.Errorf("Config does not contain absolute data dir path %q", tmpDir) + } +} + +func TestK3sBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3sBackend{} + if err := b.Init(cfg, "my-cluster"); err != nil { + t.Fatalf("K3sBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3sConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "my-cluster") { + t.Error("Config does not contain stack ID 'my-cluster'") + } + + // Verify data-dir uses absolute path + absDataDir, _ := filepath.Abs(filepath.Join(tmpDir, "data")) + expectedDataDir := absDataDir + "/k3s" + if !strings.Contains(content, expectedDataDir) { + t.Errorf("Config does not contain absolute data-dir %q", expectedDataDir) + } +} + +func TestGetStackID(t *testing.T) { + tests := []struct { + name string + content string + want string + }{ + {name: "simple id", content: "happy-panda", want: "happy-panda"}, + {name: "with trailing newline", content: "happy-panda\n", want: "happy-panda"}, + {name: "with whitespace", content: " happy-panda \n", want: "happy-panda"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackIDFile) + if err := os.WriteFile(path, []byte(tt.content), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + got := getStackID(cfg) + if got != tt.want { + t.Errorf("getStackID() = %q, want %q", got, tt.want) + } + }) + } + + t.Run("missing file returns empty", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + got := getStackID(cfg) + if got != "" { + t.Errorf("getStackID() with no file = %q, want empty string", got) + } + }) +} diff --git a/internal/stack/integration_test.go b/internal/stack/integration_test.go new file mode 100644 index 0000000..66088bc --- /dev/null +++ b/internal/stack/integration_test.go @@ -0,0 +1,255 @@ +//go:build integration + +package stack_test + +import ( + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" +) + +// Integration tests for the k3s backend user flows. +// Requires: sudo access, k3s binary, OBOL_DEVELOPMENT=true. 
+// +// Run with: +// go test -tags integration -timeout 15m -v ./internal/stack/ + +func TestK3sUserFlows(t *testing.T) { + if os.Getenv("OBOL_DEVELOPMENT") != "true" { + t.Skip("OBOL_DEVELOPMENT not set, skipping integration test") + } + + projectRoot := findProjectRoot(t) + obol := filepath.Join(projectRoot, ".workspace", "bin", "obol") + if _, err := os.Stat(obol); os.IsNotExist(err) { + t.Fatalf("obol binary not found at %s — build it first", obol) + } + + configDir := filepath.Join(projectRoot, ".workspace", "config") + binDir := filepath.Join(projectRoot, ".workspace", "bin") + + // Helper to run obol commands + run := func(t *testing.T, args ...string) (string, error) { + t.Helper() + cmd := exec.Command(obol, args...) + cmd.Env = append(os.Environ(), + "OBOL_DEVELOPMENT=true", + "PATH="+binDir+":"+os.Getenv("PATH"), + ) + cmd.Dir = projectRoot + out, err := cmd.CombinedOutput() + return string(out), err + } + + // Cleanup before tests + run(t, "stack", "purge", "--force") + + // Cleanup after all tests + t.Cleanup(func() { + run(t, "stack", "purge", "--force") + }) + + t.Run("init", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("stack init failed: %v\n%s", err, out) + } + + // Verify config files created + for _, f := range []string{"k3s-config.yaml", ".stack-id", ".stack-backend"} { + if _, err := os.Stat(filepath.Join(configDir, f)); os.IsNotExist(err) { + t.Errorf("expected %s to exist after init", f) + } + } + + // Verify defaults directory + if _, err := os.Stat(filepath.Join(configDir, "defaults")); os.IsNotExist(err) { + t.Error("expected defaults/ directory after init") + } + + // Verify backend is k3s + data, _ := os.ReadFile(filepath.Join(configDir, ".stack-backend")) + if got := strings.TrimSpace(string(data)); got != "k3s" { + t.Errorf("backend = %q, want k3s", got) + } + }) + + t.Run("init_rejects_without_force", func(t *testing.T) { + _, err := run(t, "stack", "init", "--backend", "k3s") + if err == nil { + t.Error("init without --force should fail when config exists") + } + }) + + t.Run("init_force_preserves_stack_id", func(t *testing.T) { + idBefore, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + out, err := run(t, "stack", "init", "--backend", "k3s", "--force") + if err != nil { + t.Fatalf("stack init --force failed: %v\n%s", err, out) + } + idAfter, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + if string(idBefore) != string(idAfter) { + t.Errorf("stack ID changed: %q → %q", string(idBefore), string(idAfter)) + } + }) + + t.Run("up", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up failed: %v\n%s", err, out) + } + + // Verify PID file and kubeconfig exist + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after stack up") + } + if _, err := os.Stat(filepath.Join(configDir, "kubeconfig.yaml")); os.IsNotExist(err) { + t.Error("kubeconfig not found after stack up") + } + }) + + t.Run("kubectl_passthrough", func(t *testing.T) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err != nil { + t.Fatalf("kubectl passthrough failed: %v\n%s", err, out) + } + lines := strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get nodes returned no nodes") + } + + out, err = run(t, "kubectl", "get", "namespaces", "--no-headers") + if err != nil { + t.Fatalf("kubectl get namespaces failed: %v\n%s", err, out) + } + lines = 
strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get namespaces returned no namespaces") + } + }) + + t.Run("up_idempotent", func(t *testing.T) { + pidBefore, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (idempotent) failed: %v\n%s", err, out) + } + + pidAfter, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + if string(pidBefore) != string(pidAfter) { + t.Errorf("PID changed on idempotent up: %q → %q", string(pidBefore), string(pidAfter)) + } + }) + + t.Run("down", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down failed: %v\n%s", err, out) + } + + // PID file should be cleaned up + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after down") + } + + // Config should be preserved + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); os.IsNotExist(err) { + t.Error("stack ID should be preserved after down") + } + }) + + t.Run("down_already_stopped", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down (already stopped) failed: %v\n%s", err, out) + } + }) + + t.Run("up_restart_after_down", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (restart) failed: %v\n%s", err, out) + } + + // Verify PID file exists + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after restart") + } + + // Wait for node to be ready + deadline := time.Now().Add(60 * time.Second) + for time.Now().Before(deadline) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err == nil && strings.Contains(out, "Ready") { + break + } + time.Sleep(3 * time.Second) + } + + out, _ = run(t, "kubectl", "get", "nodes", "--no-headers") + if !strings.Contains(out, "Ready") { + t.Error("node not ready after restart") + } + }) + + t.Run("purge", func(t *testing.T) { + out, err := run(t, "stack", "purge") + if err != nil { + t.Fatalf("stack purge failed: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("stack ID should be removed after purge") + } + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after purge") + } + }) + + t.Run("full_cycle_purge_force", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("init: %v\n%s", err, out) + } + + out, err = run(t, "stack", "up") + if err != nil { + t.Fatalf("up: %v\n%s", err, out) + } + + out, err = run(t, "stack", "purge", "--force") + if err != nil { + t.Fatalf("purge --force: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("config should be removed after purge --force") + } + }) +} + +func findProjectRoot(t *testing.T) string { + t.Helper() + dir, err := os.Getwd() + if err != nil { + t.Fatalf("failed to get working directory: %v", err) + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + t.Fatal("could not find project root (no go.mod)") + } + dir = parent + } +} diff --git a/internal/stack/stack.go b/internal/stack/stack.go index 
c8366f6..8e2442b 100644 --- a/internal/stack/stack.go +++ b/internal/stack/stack.go @@ -13,21 +13,30 @@ import ( ) const ( - k3dConfigFile = "k3d.yaml" kubeconfigFile = "kubeconfig.yaml" stackIDFile = ".stack-id" ) // Init initializes the stack configuration -func Init(cfg *config.Config, force bool) error { - // Create flat stack config directory - k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) - - // Check if config already exists - if _, err := os.Stat(k3dConfigPath); err == nil { - if !force { - return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", k3dConfigPath) - } +func Init(cfg *config.Config, force bool, backendName string) error { + // Check if any stack config already exists + stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) + backendFilePath := filepath.Join(cfg.ConfigDir, stackBackendFile) + + hasExistingConfig := false + if _, err := os.Stat(stackIDPath); err == nil { + hasExistingConfig = true + } + if _, err := os.Stat(backendFilePath); err == nil { + hasExistingConfig = true + } + // Also check legacy k3d.yaml for backward compatibility + if _, err := os.Stat(filepath.Join(cfg.ConfigDir, k3dConfigFile)); err == nil { + hasExistingConfig = true + } + + if hasExistingConfig && !force { + return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", cfg.ConfigDir) } if err := os.MkdirAll(cfg.ConfigDir, 0755); err != nil { @@ -35,46 +44,37 @@ func Init(cfg *config.Config, force bool) error { } // Check if stack ID already exists (preserve on --force) - stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) var stackID string if existingID, err := os.ReadFile(stackIDPath); err == nil { - stackID = string(existingID) + stackID = strings.TrimSpace(string(existingID)) fmt.Printf("Preserving existing stack ID: %s (use purge to reset)\n", stackID) } else { - // Generate unique stack ID only if one doesn't exist stackID = petname.Generate(2, "-") } - fmt.Println("Initializing cluster configuration") - fmt.Printf("Cluster ID: %s\n", stackID) - - absDataDir, err := filepath.Abs(cfg.DataDir) - if err != nil { - return fmt.Errorf("failed to get absolute path for data directory: %w", err) + // Default to k3d if no backend specified + if backendName == "" { + backendName = BackendK3d } - absConfigDir, err := filepath.Abs(cfg.ConfigDir) + backend, err := NewBackend(backendName) if err != nil { - return fmt.Errorf("failed to get absolute path for config directory: %w", err) - } - - // Check if overwriting config - if _, err := os.Stat(k3dConfigPath); err == nil { - fmt.Printf("Overwriting existing stack configuration: %s\n", k3dConfigPath) + return err } - // Replace placeholder in k3d config with actual stack ID - k3dConfig := embed.K3dConfig - k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID) - k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir) - k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir) + fmt.Println("Initializing cluster configuration") + fmt.Printf("Cluster ID: %s\n", stackID) + fmt.Printf("Backend: %s\n", backend.Name()) - // Write k3d config with stack ID to destination - if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil { - return fmt.Errorf("failed to write k3d config: %w", err) + // Check prerequisites + if err := backend.Prerequisites(cfg); err != nil { + return fmt.Errorf("prerequisites check failed: %w", err) } - fmt.Printf("K3d config saved to: %s\n", k3dConfigPath) + // Generate backend-specific config 
+ if err := backend.Init(cfg, stackID); err != nil { + return err + } // Copy embedded defaults (helmfile + charts for infrastructure) defaultsDir := filepath.Join(cfg.ConfigDir, "defaults") @@ -83,100 +83,50 @@ func Init(cfg *config.Config, force bool) error { } fmt.Printf("Defaults copied to: %s\n", defaultsDir) - // Store stack ID for later use (stackIDPath already declared above) + // Store stack ID if err := os.WriteFile(stackIDPath, []byte(stackID), 0644); err != nil { return fmt.Errorf("failed to write stack ID: %w", err) } - fmt.Printf("Initialized stack configuration: %s\n", k3dConfigPath) + // Save backend choice + if err := SaveBackend(cfg, backendName); err != nil { + return fmt.Errorf("failed to save backend choice: %w", err) + } + + fmt.Printf("Initialized stack configuration\n") fmt.Printf("Stack ID: %s\n", stackID) return nil } -// Up starts the k3d cluster +// Up starts the cluster using the configured backend func Up(cfg *config.Config) error { - k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) - kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) - - // Check if config exists - if _, err := os.Stat(k3dConfigPath); os.IsNotExist(err) { - return fmt.Errorf("stack config not found, run 'obol stack init' first") - } - - // Get stack ID and full stack name stackID := getStackID(cfg) if stackID == "" { return fmt.Errorf("stack ID not found, run 'obol stack init' first") } - stackName := getStackName(cfg) - - // Check if cluster already exists using cluster list - listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers") - listCmdOutput, err := listCmd.Output() + backend, err := LoadBackend(cfg) if err != nil { - return fmt.Errorf("k3d list command failed: %w", err) + return fmt.Errorf("failed to load backend: %w", err) } - if stackExists(string(listCmdOutput), stackName) { - // Cluster exists - check if it's stopped or running - fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID) - startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName) - startCmd.Stdout = os.Stdout - startCmd.Stderr = os.Stderr - if err := startCmd.Run(); err != nil { - return fmt.Errorf("failed to start existing cluster: %w", err) - } - - if err := syncDefaults(cfg, kubeconfigPath); err != nil { - return err - } - - fmt.Println("Stack restarted successfully") - fmt.Printf("Stack ID: %s\n", stackID) - return nil - } - - fmt.Printf("Starting stack: %s (id: %s)\n", stackName, stackID) - - // Get absolute path to data directory for k3d volume mount - absDataDir, err := filepath.Abs(cfg.DataDir) - if err != nil { - return fmt.Errorf("failed to get absolute path for data directory: %w", err) - } - - // Create data directory if it doesn't exist - if err := os.MkdirAll(absDataDir, 0755); err != nil { - return fmt.Errorf("failed to create data directory: %w", err) - } - - // Create cluster using k3d config with custom name - fmt.Println("Creating k3d cluster...") - createCmd := exec.Command( - filepath.Join(cfg.BinDir, "k3d"), - "cluster", "create", stackName, - "--config", k3dConfigPath, - "--kubeconfig-update-default=false", - ) - createCmd.Stdout = os.Stdout - createCmd.Stderr = os.Stderr + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) - if err := createCmd.Run(); err != nil { - return fmt.Errorf("failed to create cluster: %w", err) - } + fmt.Printf("Starting stack (id: %s, backend: %s)\n", stackID, backend.Name()) - // Export kubeconfig - kubeconfigCmd := 
exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName) - kubeconfigData, err := kubeconfigCmd.Output() + kubeconfigData, err := backend.Up(cfg, stackID) if err != nil { - return fmt.Errorf("failed to get kubeconfig: %w", err) + return err } + // Write kubeconfig (backend may have already written it, but ensure consistency) if err := os.WriteFile(kubeconfigPath, kubeconfigData, 0600); err != nil { return fmt.Errorf("failed to write kubeconfig: %w", err) } - if err := syncDefaults(cfg, kubeconfigPath); err != nil { + // Sync defaults with backend-aware dataDir + dataDir := backend.DataDir(cfg) + if err := syncDefaults(cfg, kubeconfigPath, dataDir); err != nil { return err } @@ -187,85 +137,50 @@ func Up(cfg *config.Config) error { return nil } -// Down stops the k3d cluster +// Down stops the cluster func Down(cfg *config.Config) error { stackID := getStackID(cfg) if stackID == "" { return fmt.Errorf("stack ID not found, stack may not be initialized") } - stackName := getStackName(cfg) - - fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID) - - // First attempt graceful stop (allows processes to shutdown gracefully) - stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) - stopCmd.Stdout = os.Stdout - stopCmd.Stderr = os.Stderr - - if err := stopCmd.Run(); err != nil { - fmt.Println("Graceful stop timed out or failed, forcing cluster deletion") - // Fallback to delete if stop fails - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - return fmt.Errorf("failed to stop cluster: %w", err) - } + + backend, err := LoadBackend(cfg) + if err != nil { + return fmt.Errorf("failed to load backend: %w", err) } - fmt.Println("Stack stopped successfully") - return nil + return backend.Down(cfg, stackID) } // Purge deletes the cluster config and optionally data func Purge(cfg *config.Config, force bool) error { - // Delete cluster containers - stackName := getStackName(cfg) - if stackName != "" { + stackID := getStackID(cfg) + + backend, err := LoadBackend(cfg) + if err != nil { + return fmt.Errorf("failed to load backend: %w", err) + } + + // Destroy cluster if we have a stack ID + if stackID != "" { if force { - // Force delete without graceful shutdown - fmt.Printf("Force deleting cluster containers: %s\n", stackName) - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) - } - fmt.Println("Cluster containers force deleted") + fmt.Printf("Force destroying cluster (id: %s)\n", stackID) } else { - // Graceful shutdown first to ensure data is written properly - fmt.Printf("Gracefully stopping cluster before deletion: %s\n", stackName) - stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) - stopCmd.Stdout = os.Stdout - stopCmd.Stderr = os.Stderr - if err := stopCmd.Run(); err != nil { - fmt.Println("Graceful stop timed out or failed, proceeding with deletion anyway") - } else { - fmt.Println("Cluster stopped gracefully") - } - - // Now delete the stopped cluster - fmt.Println("Deleting cluster containers") - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - 
deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) - } - fmt.Println("Cluster containers deleted") + fmt.Printf("Destroying cluster (id: %s)\n", stackID) + } + if err := backend.Destroy(cfg, stackID); err != nil { + fmt.Printf("Failed to destroy cluster (may already be deleted): %v\n", err) } } // Remove stack config directory - stackConfigDir := filepath.Join(cfg.ConfigDir) - if err := os.RemoveAll(stackConfigDir); err != nil { + if err := os.RemoveAll(cfg.ConfigDir); err != nil { return fmt.Errorf("failed to remove stack config: %w", err) } fmt.Println("Removed cluster config directory") // Remove data directory only if force flag is set if force { - // Use sudo to remove data directory since it may contain root-owned files fmt.Println("Removing data directory...") rmCmd := exec.Command("sudo", "rm", "-rf", cfg.DataDir) rmCmd.Stdout = os.Stdout @@ -284,12 +199,6 @@ func Purge(cfg *config.Config, force bool) error { return nil } -// stackExists checks if stack name exists in k3d cluster list output -func stackExists(output, name string) bool { - // Check if the stack name appears in the output - return strings.Contains(output, name) -} - // getStackID reads the stored stack ID func getStackID(cfg *config.Config) string { stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) @@ -300,15 +209,6 @@ func getStackID(cfg *config.Config) string { return strings.TrimSpace(string(data)) } -// getStackName returns the full stack name (obol-stack-{stackid}) -func getStackName(cfg *config.Config) string { - stackID := getStackID(cfg) - if stackID == "" { - return "" - } - return fmt.Sprintf("obol-stack-%s", stackID) -} - // GetStackID reads the stored stack ID (exported for use in main) func GetStackID(cfg *config.Config) string { return getStackID(cfg) @@ -316,23 +216,25 @@ func GetStackID(cfg *config.Config) string { // syncDefaults deploys the default infrastructure using helmfile // If deployment fails, the cluster is automatically stopped via Down() -func syncDefaults(cfg *config.Config, kubeconfigPath string) error { +func syncDefaults(cfg *config.Config, kubeconfigPath string, dataDir string) error { fmt.Println("Deploying default infrastructure with helmfile") - // Sync defaults using helmfile (handles Helm hooks properly) defaultsHelmfilePath := filepath.Join(cfg.ConfigDir, "defaults") helmfileCmd := exec.Command( filepath.Join(cfg.BinDir, "helmfile"), - "--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml"), + "--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml.gotmpl"), "--kubeconfig", kubeconfigPath, "sync", ) + helmfileCmd.Env = append(os.Environ(), + fmt.Sprintf("KUBECONFIG=%s", kubeconfigPath), + fmt.Sprintf("STACK_DATA_DIR=%s", dataDir), + ) helmfileCmd.Stdout = os.Stdout helmfileCmd.Stderr = os.Stderr if err := helmfileCmd.Run(); err != nil { fmt.Println("Failed to apply defaults helmfile, stopping cluster") - // Attempt to stop the cluster to clean up if downErr := Down(cfg); downErr != nil { fmt.Printf("Failed to stop cluster during cleanup: %v\n", downErr) } From 61a7e2067f542af54fecac5abeeeaa52090b793d Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 7 Feb 2026 15:09:49 +0400 Subject: [PATCH 14/15] test(stack): add test-backend skill for k3d/k3s integration testing Adds a Claude Code skill (`/test-backend`) with bash scripts that exercise the full backend lifecycle: init, up, kubectl, down, restart, and purge for both k3d and k3s backends. 
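For reference, the lifecycle each script exercises maps roughly onto this CLI
sequence (k3s shown; the k3d script is identical minus the --backend flag and
drives the Docker-based backend instead):

    obol stack init --backend k3s
    obol stack up
    obol kubectl get nodes
    obol stack down
    obol stack up            # restart from the preserved config
    obol stack purge --force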
--- .agents/skills/test-backend/SKILL.md | 70 ++++++++ .../skills/test-backend/scripts/test-k3d.sh | 153 ++++++++++++++++++ .../skills/test-backend/scripts/test-k3s.sh | 142 ++++++++++++++++ 3 files changed, 365 insertions(+) create mode 100644 .agents/skills/test-backend/SKILL.md create mode 100755 .agents/skills/test-backend/scripts/test-k3d.sh create mode 100755 .agents/skills/test-backend/scripts/test-k3s.sh diff --git a/.agents/skills/test-backend/SKILL.md b/.agents/skills/test-backend/SKILL.md new file mode 100644 index 0000000..2696c80 --- /dev/null +++ b/.agents/skills/test-backend/SKILL.md @@ -0,0 +1,70 @@ +--- +name: test-backend +description: Launch and test the k3d or k3s backend lifecycle (init, up, kubectl, down, purge). Use when you want to run a full integration test of a stack backend. +user_invocable: true +metadata: + author: obol-team + version: "1.0.0" + domain: testing + triggers: test backend, test k3d, test k3s, integration test, flow test, backend test + role: tester + scope: validation + output-format: report +--- + +# Test Backend Skill + +Runs a full lifecycle integration test for the obol stack backend (k3d or k3s). + +## Arguments + +The skill accepts an optional argument specifying which backend to test: + +- `k3s` - Test the k3s (bare-metal) backend only +- `k3d` - Test the k3d (Docker-based) backend only +- `all` - Test both backends sequentially (default) +- No argument defaults to `all` + +Examples: +- `/test-backend k3s` +- `/test-backend k3d` +- `/test-backend all` +- `/test-backend` (same as `all`) + +## Workflow + +### 1. Pre-flight + +- Build the obol binary: `go build -o .workspace/bin/obol ./cmd/obol` from the project root +- Verify the binary was created successfully +- Set `OBOL_DEVELOPMENT=true` and add `.workspace/bin` to PATH + +### 2. Run Test Script + +Based on the argument, run the appropriate test script(s) located alongside this skill: + +- **k3s**: Run `.agents/skills/test-backend/scripts/test-k3s.sh` +- **k3d**: Run `.agents/skills/test-backend/scripts/test-k3d.sh` +- **all**: Run k3s first, then k3d (k3s requires sudo so test it first while credentials are fresh) + +Execute the script via Bash tool from the project root directory. The scripts require: +- **k3s**: Linux, sudo access, k3s binary in `.workspace/bin/` +- **k3d**: Docker running, k3d binary in `.workspace/bin/` + +### 3. Report Results + +After each script completes, report: +- Total pass/fail counts (shown in the RESULTS line) +- Any specific test failures with their names +- Overall verdict: all green or needs attention + +If a test script fails (non-zero exit), read the output to identify which test(s) failed and summarize. 
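+
+A minimal sketch of what sections 1 and 2 above amount to when run by hand
+(paths assume the repository layout referenced in this skill):
+
+```bash
+# From the project root
+go build -o .workspace/bin/obol ./cmd/obol
+export OBOL_DEVELOPMENT=true
+export PATH="$PWD/.workspace/bin:$PATH"
+
+# k3s needs sudo; k3d needs Docker running
+.agents/skills/test-backend/scripts/test-k3s.sh
+.agents/skills/test-backend/scripts/test-k3d.sh
+```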
+ +## Important Notes + +- The k3s backend requires **sudo access** - the user may need to enter their password +- The k3d backend requires **Docker to be running** +- Each test script performs its own cleanup (purge) before and after +- Tests are sequential and ordered: init -> up -> verify -> down -> restart -> purge +- Typical runtime: ~2-4 minutes per backend +- If the environment has issues (Docker not starting, k3s not installing), report the problem clearly rather than retrying endlessly diff --git a/.agents/skills/test-backend/scripts/test-k3d.sh b/.agents/skills/test-backend/scripts/test-k3d.sh new file mode 100755 index 0000000..9657254 --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3d.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3d Backend Integration Test +# Requires: Docker running, k3d binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." && pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! "$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3d_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +# Pre-flight: verify Docker is running +if ! docker info >/dev/null 2>&1; then + log "ERROR: Docker is not running. Start Docker and try again." + exit 1 +fi + +log "=========================================" +log "K3d Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init (default = k3d) --- +log "" +log "--- TEST 1: stack init (default = k3d) ---" +check "stack init" $OBOL stack init +check "k3d.yaml exists" test -f .workspace/config/k3d.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3d" test "$BACKEND" = "k3d" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init + +# --- TEST 3: stack init --force --- +log "" +log "--- TEST 3: stack init --force ---" +$OBOL stack init --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml + +# Wait for nodes to be ready (k3d can take a moment) +log " Waiting for nodes to be ready..." 
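+# Node readiness can lag cluster creation (image pulls, node registration),
+# so poll with a hard deadline instead of a single fixed sleep.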
+DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d is functional (nodes ready)" k3d_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack down --- +log "" +log "--- TEST 6: stack down ---" +check "stack down" $OBOL stack down +check "config preserved after down" test -f .workspace/config/.stack-id + +# Verify cluster stopped (kubectl should fail) +sleep 2 +check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers + +# --- TEST 7: stack down already stopped --- +log "" +log "--- TEST 7: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 8: stack up (restart after down) --- +log "" +log "--- TEST 8: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up + +# Wait for nodes to be ready after restart +log " Waiting for nodes to be ready..." +DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d functional after restart" k3d_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 9: stack purge --- +log "" +log "--- TEST 9: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id + +# --- TEST 10: full cycle + purge --force --- +log "" +log "--- TEST 10: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3d RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/.agents/skills/test-backend/scripts/test-k3s.sh b/.agents/skills/test-backend/scripts/test-k3s.sh new file mode 100755 index 0000000..1dcaac4 --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3s.sh @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3s Backend Integration Test +# Requires: Linux, sudo access, k3s binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." && pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! 
"$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3s_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +log "=========================================" +log "K3s Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init --backend k3s --- +log "" +log "--- TEST 1: stack init --backend k3s ---" +check "stack init --backend k3s" $OBOL stack init --backend k3s +check "k3s-config.yaml exists" test -f .workspace/config/k3s-config.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3s" test "$BACKEND" = "k3s" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init --backend k3s + +# --- TEST 3: stack init --force (should preserve stack ID) --- +log "" +log "--- TEST 3: stack init --force (should preserve stack ID) ---" +$OBOL stack init --backend k3s --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "PID file exists" test -f .workspace/config/.k3s.pid +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml +check "k3s is functional (nodes ready)" k3s_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack up idempotent (already running) --- +log "" +log "--- TEST 6: stack up idempotent ---" +OLD_PID=$(cat .workspace/config/.k3s.pid) +check "stack up while running" $OBOL stack up +NEW_PID=$(cat .workspace/config/.k3s.pid) +check "PID unchanged (idempotent) ($OLD_PID = $NEW_PID)" test "$OLD_PID" = "$NEW_PID" + +# --- TEST 7: stack down --- +log "" +log "--- TEST 7: stack down ---" +check "stack down" $OBOL stack down +check "PID file cleaned up" test ! -f .workspace/config/.k3s.pid +check "config preserved after down" test -f .workspace/config/.stack-id +log " Waiting for API server to become unreachable..." 
+sleep 5 +check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers + +# --- TEST 8: stack down again (already stopped) --- +log "" +log "--- TEST 8: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 9: stack up (restart after down) --- +log "" +log "--- TEST 9: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up +check "PID file exists after restart" test -f .workspace/config/.k3s.pid +check "k3s functional after restart" k3s_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 10: stack purge (without --force) --- +log "" +log "--- TEST 10: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id +check "k3s pid file removed" test ! -f .workspace/config/.k3s.pid + +# --- TEST 11: full cycle + purge --force --- +log "" +log "--- TEST 11: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init --backend k3s +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3s RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi From 46481830a6c215d2fe2ea0b9beffa113024c42ec Mon Sep 17 00:00:00 2001 From: bussyjd Date: Fri, 13 Feb 2026 18:01:07 +0400 Subject: [PATCH 15/15] fix(stack): prevent process group kill from crashing desktop session The k3s Down() method was using kill -TERM with a negative PID (process group kill), which could kill unrelated system processes like systemd-logind sharing the same process group as the sudo wrapper. This caused the entire desktop session to crash. Changes: - Kill only the specific sudo/k3s process, not the process group - Remove unused Setpgid/syscall since we no longer use process groups - Add containerd-shim cleanup fallback for binary-only k3s installs - Add 600s helm timeout for kube-prometheus-stack deployment - Disable admission webhook pre-install hooks that timeout on fresh k3s - Fix flaky test: replace fixed sleep with polling loop for API shutdown --- .../skills/test-backend/scripts/test-k3s.sh | 11 ++++++-- .../embed/infrastructure/helmfile.yaml.gotmpl | 1 + .../values/monitoring.yaml.gotmpl | 4 +++ internal/stack/backend_k3s.go | 28 +++++++++++++------ 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/.agents/skills/test-backend/scripts/test-k3s.sh b/.agents/skills/test-backend/scripts/test-k3s.sh index 1dcaac4..03e3bca 100755 --- a/.agents/skills/test-backend/scripts/test-k3s.sh +++ b/.agents/skills/test-backend/scripts/test-k3s.sh @@ -97,8 +97,15 @@ check "stack down" $OBOL stack down check "PID file cleaned up" test ! -f .workspace/config/.k3s.pid check "config preserved after down" test -f .workspace/config/.stack-id log " Waiting for API server to become unreachable..." -sleep 5 -check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers +API_DOWN=false +for i in $(seq 1 15); do + if ! 
$OBOL kubectl get nodes --no-headers 2>/dev/null; then + API_DOWN=true + break + fi + sleep 2 +done +check "kubectl unreachable after down" test "$API_DOWN" = "true" # --- TEST 8: stack down again (already stopped) --- log "" diff --git a/internal/embed/infrastructure/helmfile.yaml.gotmpl b/internal/embed/infrastructure/helmfile.yaml.gotmpl index d5b1d8a..1fd2e7e 100644 --- a/internal/embed/infrastructure/helmfile.yaml.gotmpl +++ b/internal/embed/infrastructure/helmfile.yaml.gotmpl @@ -35,6 +35,7 @@ releases: createNamespace: true chart: prometheus-community/kube-prometheus-stack version: 79.5.0 + timeout: 600 values: - ./values/monitoring.yaml.gotmpl diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl index d7a0dc1..a7a6095 100644 --- a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -20,6 +20,10 @@ prometheus: cpu: 500m memory: 1Gi +prometheusOperator: + admissionWebhooks: + enabled: false # Disable webhook pre-install hooks (avoids timeout on fresh k3s) + grafana: enabled: false # Enable when we want UI access diff --git a/internal/stack/backend_k3s.go b/internal/stack/backend_k3s.go index 482d7e8..3325b13 100644 --- a/internal/stack/backend_k3s.go +++ b/internal/stack/backend_k3s.go @@ -8,7 +8,6 @@ import ( "runtime" "strconv" "strings" - "syscall" "time" "github.com/ObolNetwork/obol-stack/internal/config" @@ -131,8 +130,6 @@ func (b *K3sBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { ) cmd.Stdout = logFile cmd.Stderr = logFile - // Set process group so we can clean up child processes - cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} if err := cmd.Start(); err != nil { logFile.Close() @@ -207,14 +204,18 @@ func (b *K3sBackend) Down(cfg *config.Config, stackID string) error { fmt.Printf("Stopping k3s (pid: %d)...\n", pid) - // Send SIGTERM to the process group for clean shutdown (negative PID = process group) - pgid := fmt.Sprintf("-%d", pid) - stopCmd := exec.Command("sudo", "kill", "-TERM", pgid) + // Send SIGTERM to the sudo/k3s process only (not the process group). + // Using negative PID (process group kill) is unsafe here because the saved PID + // is the sudo wrapper, whose process group can include unrelated system processes + // like systemd-logind — killing those crashes the desktop session. + // sudo forwards SIGTERM to k3s, which handles its own child process cleanup. + pidStr := strconv.Itoa(pid) + stopCmd := exec.Command("sudo", "kill", "-TERM", pidStr) stopCmd.Stdout = os.Stdout stopCmd.Stderr = os.Stderr if err := stopCmd.Run(); err != nil { - fmt.Printf("SIGTERM to process group failed, sending SIGKILL: %v\n", err) - exec.Command("sudo", "kill", "-9", pgid).Run() + fmt.Printf("SIGTERM failed, sending SIGKILL: %v\n", err) + exec.Command("sudo", "kill", "-9", pidStr).Run() } // Wait for process to exit (up to 30 seconds) @@ -226,7 +227,8 @@ func (b *K3sBackend) Down(cfg *config.Config, stackID string) error { time.Sleep(1 * time.Second) } - // Run k3s-killall.sh if available (cleans up containerd/iptables) + // Clean up orphaned k3s child processes (containerd-shim, etc.) + // Use k3s-killall.sh if available, otherwise kill containerd shims directly. 
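+	// (k3s-killall.sh is placed at /usr/local/bin by the upstream k3s install
+	// script, which a binary-only install does not run.)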
killallPath := "/usr/local/bin/k3s-killall.sh" if _, err := os.Stat(killallPath); err == nil { fmt.Println("Running k3s cleanup...") @@ -234,6 +236,14 @@ func (b *K3sBackend) Down(cfg *config.Config, stackID string) error { cleanCmd.Stdout = os.Stdout cleanCmd.Stderr = os.Stderr cleanCmd.Run() + } else { + // k3s-killall.sh not installed (binary-only install via obolup). + // Kill orphaned containerd-shim processes that use the k3s socket. + fmt.Println("Cleaning up k3s child processes...") + exec.Command("sudo", "pkill", "-TERM", "-f", "containerd-shim.*k3s").Run() + time.Sleep(2 * time.Second) + // Force-kill any that survived SIGTERM + exec.Command("sudo", "pkill", "-KILL", "-f", "containerd-shim.*k3s").Run() } b.removePidFile(cfg)