docker · dgageot · May 7, 2026 · May 7, 2026 · May 7, 2026 · aheritier
@@ -182,7 +182,7 @@
         },
         "unload_api": {
           "type": "string",
-          "description": "Optional path (or absolute URL) to the provider's model-unload endpoint. POSTed with `{\"model\": \"<id>\"}` when the agent wires the `unload` builtin into its `on_agent_switch` hook chain, to free GPU/RAM held by the previous model. Today only Docker Model Runner ships a provider that calls this endpoint; cloud providers don't implement [provider.Unloader] and the hook silently skips them. A relative path is resolved against the scheme+host of base_url; an absolute URL is used verbatim.",
+          "description": "Optional path (or absolute URL) to the provider's model-unload endpoint. POSTed with `{\"model\": \"<id>\"}` when the agent wires the `unload` builtin into its `on_agent_switch` hook chain, to free GPU/RAM held by the previous model. Today only Docker Model Runner exposes such an endpoint; the `unload` builtin is a pure HTTP hook that silently skips non-DMR providers. A relative path is resolved against the scheme+host of base_url; an absolute URL is used verbatim.",
           "examples": [
             "/engines/_unload",
             "/api/unload",
@@ -881,7 +881,7 @@
         },
         "type": {
           "type": "string",
-          "description": "Type of hook. 'command' executes a shell command; 'builtin' invokes a named in-process Go function registered by the runtime; 'model' asks an LLM and translates its reply into the hook's native output (used for LLM-as-a-judge pre_tool_use, summarizers, etc., with no Go code). The docker-agent runtime ships these builtins: 'add_date' (turn_start: today's date), 'add_environment_info' (session_start: cwd, git, OS, arch), 'add_prompt_files' (turn_start: contents of named files looked up in the workdir hierarchy and the home directory), 'add_git_status' (turn_start: `git status --short --branch`), 'add_git_diff' (turn_start: `git diff --stat`, or full diff with args=['full']), 'add_directory_listing' (session_start: top-level entries of cwd), 'add_user_info' (session_start: current OS user and hostname), 'add_recent_commits' (session_start: `git log --oneline -n N`, default N=10, override via args=['<N>']), 'max_iterations' (before_llm_call: hard stop after N model calls; args=['<N>'] required), 'redact_secrets' (pre_tool_use / before_llm_call / tool_response_transform: scrubs detected secrets from tool arguments, outgoing chat content, and tool output — the same builtin handles all three legs and dispatches on the event; the matching agent-level 'redact_secrets: true' flag auto-injects the entries for all three), 'unload' (on_agent_switch: walks the previous agent's models and calls Unload() on every provider that implements provider.Unloader — e.g. asks Docker Model Runner to release the GPU/RAM held by the just-departing model so the next agent's model can claim it; opt in by adding the entry to the agent's hooks.on_agent_switch list).",
+          "description": "Type of hook. 'command' executes a shell command; 'builtin' invokes a named in-process Go function registered by the runtime; 'model' asks an LLM and translates its reply into the hook's native output (used for LLM-as-a-judge pre_tool_use, summarizers, etc., with no Go code). The docker-agent runtime ships these builtins: 'add_date' (turn_start: today's date), 'add_environment_info' (session_start: cwd, git, OS, arch), 'add_prompt_files' (turn_start: contents of named files looked up in the workdir hierarchy and the home directory), 'add_git_status' (turn_start: `git status --short --branch`), 'add_git_diff' (turn_start: `git diff --stat`, or full diff with args=['full']), 'add_directory_listing' (session_start: top-level entries of cwd), 'add_user_info' (session_start: current OS user and hostname), 'add_recent_commits' (session_start: `git log --oneline -n N`, default N=10, override via args=['<N>']), 'max_iterations' (before_llm_call: hard stop after N model calls; args=['<N>'] required), 'redact_secrets' (pre_tool_use / before_llm_call / tool_response_transform: scrubs detected secrets from tool arguments, outgoing chat content, and tool output — the same builtin handles all three legs and dispatches on the event; the matching agent-level 'redact_secrets: true' flag auto-injects the entries for all three), 'unload' (on_agent_switch: POSTs `{\"model\": \"<id>\"}` to the previous agent's DMR model endpoints — e.g. asks Docker Model Runner to release the GPU/RAM held by the just-departing model so the next agent's model can claim it. Pure HTTP, no provider-specific runtime coupling; non-DMR providers are silently skipped. Opt in by adding the entry to the agent's hooks.on_agent_switch list).",
           "enum": [
             "command",
             "builtin",

@@ -156,7 +156,7 @@ Built-ins are typically zero-config and faster than equivalent shell hooks becau
 | `max_iterations`        | `before_llm_call`                                                                         | `["<N>"]` (required)  | Hard-stops the agent after `N` model calls. Stateless: the runtime supplies the iteration counter on every dispatch.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
 | `snapshot`              | `session_start`, `turn_start`, `turn_end`, `pre_tool_use`, `post_tool_use`, `session_end` | _none_                | Records filesystem snapshots in a shadow git repo under the docker-agent data directory. No-op outside git repos; respects the source repo's ignore rules and skips newly-added files larger than 2 MiB.                                                                                                                                                                                                                                                                                                                                                                                                                           |
 | `redact_secrets`        | `pre_tool_use`, `before_llm_call`, `tool_response_transform`                              | _none_                | Scrubs detected secrets (API keys, tokens, private keys, …) out of tool call arguments, outgoing chat content, and tool output. The same builtin handles all three events and dispatches on the event name. Auto-registered on all three events by `redact_secrets: true` on the agent — see [`examples/redact_secrets_hooks.yaml`](https://github.com/docker/docker-agent/blob/main/examples/redact_secrets_hooks.yaml) for the manual wiring.                                                                                                                                                                                     |
-| `unload`                | `on_agent_switch`                                                                         | _none_                | Walks the previous agent's models and calls `Unload()` on every provider that implements [`provider.Unloader`](https://pkg.go.dev/github.com/docker/docker-agent/pkg/model/provider#Unloader) — typically Docker Model Runner — to free the GPU/RAM the just-departing model was holding. Cloud-only providers don't implement the interface and are silently skipped. Errors are logged and swallowed; agent switching never blocks on a slow or unreachable engine (each Unload call has a 10 s timeout). See [`examples/unload_on_switch.yaml`](https://github.com/docker/docker-agent/blob/main/examples/unload_on_switch.yaml). |
+| `unload`                | `on_agent_switch`                                                                         | _none_                | POSTs `{"model": "<id>"}` to each of the previous agent's DMR model endpoints (`/_unload` by default, overridable per-model via `unload_api`) to free the GPU/RAM the just-departing model was holding. Pure HTTP — reads the model snapshot the runtime ships on `on_agent_switch` and depends on no provider-specific runtime state. Non-DMR providers (OpenAI, Anthropic, …) are silently skipped, so cross-provider chains are safe. Errors are logged and swallowed; agent switching never blocks on a slow or unreachable engine (each call has a 10 s timeout). See [`examples/unload_on_switch.yaml`](https://github.com/docker/docker-agent/blob/main/examples/unload_on_switch.yaml). |
 
 <div class="callout callout-info" markdown="1">
 <div class="callout-title">ℹ️ Per-turn vs. per-session
@@ -590,7 +590,7 @@ models:
     model: ai/qwen3-coder
 ```
 
-At every transfer the runtime calls `Unload()` on the previous agent's model providers. For Docker Model Runner this hits the engine's `_unload` endpoint; for cloud providers (OpenAI, Anthropic, …) it is a silent no-op. Cross-provider chains are safe — only the providers that actually implement [`provider.Unloader`](https://pkg.go.dev/github.com/docker/docker-agent/pkg/model/provider#Unloader) are touched. See [`examples/unload_on_switch.yaml`](https://github.com/docker/docker-agent/blob/main/examples/unload_on_switch.yaml) for the full file.
+At every transfer the runtime ships a snapshot of the previous agent's model endpoints on the `on_agent_switch` hook input, and the `unload` builtin POSTs `{"model": "<id>"}` to each DMR endpoint's `/_unload` URL over plain HTTP. For cloud providers (OpenAI, Anthropic, …) the hook is a silent no-op since they don't expose an HTTP unload endpoint. Cross-provider chains are safe — only DMR endpoints are touched. See [`examples/unload_on_switch.yaml`](https://github.com/docker/docker-agent/blob/main/examples/unload_on_switch.yaml) for the full file.
 
 `on_session_resume` fires when the user explicitly approves the runtime to continue past its configured `max_iterations` limit. `previous_max_iterations` carries the cap that was reached and `new_max_iterations` carries the new cap after approval. Useful for alerting on extended-runtime sessions or for billing / quota pipelines that meter resumes.
 

@@ -4,11 +4,13 @@
 #
 # Two agents share Docker Model Runner but use different models that don't
 # fit in GPU memory at the same time. Wiring the `unload` builtin into
-# each agent's `on_agent_switch` hook chain tells the runtime to call
-# Unload on the previous agent's model providers every time the active
-# agent transfers control. For DMR this hits the engine's `_unload`
-# endpoint; for cloud-only providers (OpenAI, Anthropic, ...) the hook is
-# a silent no-op since they don't implement provider.Unloader.
+# each agent's `on_agent_switch` hook chain asks the previous agent's
+# DMR endpoint(s) to release GPU memory every time the active agent
+# transfers control. The hook is pure: it reads the model snapshot the
+# runtime ships on every on_agent_switch dispatch and POSTs to DMR's
+# `_unload` endpoint over plain HTTP — no provider-specific runtime
+# coupling. For cloud-only providers (OpenAI, Anthropic, ...) the hook
+# is a silent no-op since they don't expose an HTTP unload endpoint.
 #
 # Switching back and forth between `coder` and `reviewer` therefore costs
 # one model load per switch instead of failing on out-of-memory.

@@ -238,7 +238,7 @@ type ProviderConfig struct {
 	// models are POSTed `{"model": "<id>"}` here at every switch.
 	// Cloud providers should leave this unset.
 	//
-	// [unload]: https://pkg.go.dev/github.com/docker/docker-agent/pkg/runtime#BuiltinUnload
+	// [unload]: https://pkg.go.dev/github.com/docker/docker-agent/pkg/hooks/builtins#Unload
 	UnloadAPI string `json:"unload_api,omitempty"`
 	// TokenKey is the environment variable name containing the API token
 	TokenKey string `json:"token_key,omitempty"`

@@ -12,6 +12,8 @@
 //   - add_user_info         (session_start)   — current OS user and host
 //   - add_recent_commits    (session_start)   — `git log --oneline -n N`
 //   - max_iterations        (before_llm_call) — hard stop after N model calls
+//   - unload                (on_agent_switch) — release the previous
+//     agent's local-engine resources via HTTP unload (DMR today)
 //   - snapshot              (session_start,
 //     turn_start, turn_end,
 //     pre_tool_use, post_tool_use,
@@ -72,6 +74,7 @@ func Register(r *hooks.Registry) error {
 		r.RegisterBuiltin(MaxIterations, maxIterations),
 		r.RegisterBuiltin(RedactSecrets, redactSecrets),
 		r.RegisterBuiltin(HTTPPost, httpPost),
+		r.RegisterBuiltin(Unload, unload),
 	)
 }
 

@@ -35,6 +35,7 @@ func TestRegisterInstallsAllBuiltins(t *testing.T) {
 		builtins.MaxIterations,
 		builtins.RedactSecrets,
 		builtins.HTTPPost,
+		builtins.Unload,
 	} {
 		fn, ok := r.LookupBuiltin(name)
 		assert.True(t, ok, "builtin %q must be registered", name)

@@ -0,0 +1,108 @@
+package builtins
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/docker/docker-agent/pkg/hooks"
+	"github.com/docker/docker-agent/pkg/model/provider/dmr"
+)
+
+// Unload is the registered name of the on_agent_switch builtin that
+// asks the previous agent's local inference engines (today: Docker
+// Model Runner) to release the resources they hold.
+//
+// Wire it into a config with:
+//
+//	hooks:
+//	  on_agent_switch:
+//	    - type: builtin
+//	      command: unload
+//
+// The hook is pure: it depends only on the [hooks.Input.FromAgentModels]
+// snapshot the runtime ships on every on_agent_switch dispatch, plus
+// net/http. It carries no runtime-side coupling and silently skips any
+// model whose provider is not [dmr.ProviderType] (e.g. cloud providers)
+// or for which [dmr.UnloadURL] resolves no endpoint.
+//
+// Provider dispatch and URL resolution are owned by
+// [pkg/model/provider/dmr] (see [dmr.ProviderType] and [dmr.UnloadURL]),
+// so this builtin stays a dumb dispatcher and DMR keeps full control
+// of its conventions.
+const Unload = "unload"
+
+// unloadTimeout caps each per-model unload HTTP request so a stalled
+// engine cannot stall agent switching.
+const unloadTimeout = 10 * time.Second
+
+// unload is the on_agent_switch handler registered under [Unload].
+// It walks the [hooks.Input.FromAgentModels] snapshot carried by the
+// hook input and, for every entry whose provider is
+// [dmr.ProviderType], hands the entry to unloadOne. Entries for any
+// other provider are ignored.
+//
+// The hook does nothing when the input is nil, when no previous agent
+// is named, or when the previous and next agent are the same. A
+// failed unload is logged at warn level and otherwise dropped: the
+// function always returns (nil, nil) so an unreachable or slow engine
+// never blocks agent switching.
+func unload(ctx context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) {
+	switch {
+	case in == nil:
+		return nil, nil
+	case in.FromAgent == "", in.FromAgent == in.ToAgent:
+		return nil, nil
+	}
+
+	previous := in.FromAgent
+	for _, endpoint := range in.FromAgentModels {
+		if endpoint.Provider == dmr.ProviderType {
+			if err := unloadOne(ctx, endpoint); err != nil {
+				slog.WarnContext(ctx, "unload: failed",
+					"agent", previous, "model", endpoint.Model, "error", err)
+			}
+		}
+	}
+	return nil, nil
+}
+
+// unloadOne sends a single unload request for m: it asks
+// [dmr.UnloadURL] for the target URL and POSTs `{"model": "<id>"}`
+// to it under an [unloadTimeout] deadline derived from parent.
+//
+// An error from URL resolution is returned as-is. If resolution
+// succeeds but yields an empty URL, nothing is sent and nil is
+// returned, which keeps the hook harmless for models that have no
+// endpoint to call. Transport failures and non-2xx responses come
+// back as errors; for the latter, up to 4 KiB of the response body
+// is included in the message.
+func unloadOne(parent context.Context, m hooks.ModelEndpoint) error {
+	target, err := dmr.UnloadURL(m.BaseURL, m.UnloadAPI)
+	if err != nil {
+		return err
+	}
+	if target == "" {
+		return nil
+	}
+
+	ctx, cancel := context.WithTimeout(parent, unloadTimeout)
+	defer cancel()
+
+	payload, _ := json.Marshal(map[string]string{"model": m.Model})
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, target, bytes.NewReader(payload))
+	if err != nil {
+		return fmt.Errorf("building unload request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	slog.DebugContext(ctx, "Unloading model", "url", target, "model", m.Model)
+
+	// The default client is deliberate. The target is the DMR base
+	// URL chosen by the operator, which is usually a loopback engine
+	// (Docker Desktop socket, 127.0.0.1:12434, …). The SSRF-safe
+	// dialer behind the http_post builtin rejects exactly those
+	// addresses, so it cannot be reused here.
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("calling unload endpoint %s: %w", target, err)
+	}
+	defer resp.Body.Close()
+
+	if status := resp.StatusCode; status < 200 || status >= 300 {
+		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4*1024))
+		detail := strings.TrimSpace(string(snippet))
+		return fmt.Errorf("unload endpoint returned %d: %s", status, detail)
+	}
+
+	// On success, read the body through to EOF before the deferred
+	// Close runs: the transport only returns a connection to its pool
+	// once the body has been fully consumed and closed.
+	_, _ = io.Copy(io.Discard, resp.Body)
+	return nil
+}