From e2bd22704b31e549f80bef2089cee340e0b530d6 Mon Sep 17 00:00:00 2001
From: Zeke Sikelianos <zeke@sikelianos.com>
Date: Mon, 27 Apr 2026 10:33:55 -0700
Subject: [PATCH 1/2] feat: add build-models and publish-models skills

Two new skills covering the model-creation lifecycle:

- build-models: packaging custom models with Cog (cog.yaml, predict.py,
  weights loading with pget, cold-boot tricks, async predictors).
- publish-models: cog push, cog-safe-push, GitHub Actions patterns,
  multi-model matrix pushes, and post-publish monitoring.

Descriptions are written to be easy for agents to trigger on, with
specific file names, commands, and trigger phrases.

Patterns are distilled from production Replicate model repos including
cog-flux, cog-flux-kontext, cog-vllm, cog-comfyui, flux-fine-tuner,
vibevoice, cog-template, and model-ci-template.
---
 .claude-plugin/marketplace.json |  10 +
 AGENTS.md                       |   6 +-
 skills/build-models/SKILL.md    | 390 ++++++++++++++++++++++++++++++++
 skills/publish-models/SKILL.md  | 330 +++++++++++++++++++++++++++
 4 files changed, 735 insertions(+), 1 deletion(-)
 create mode 100644 skills/build-models/SKILL.md
 create mode 100644 skills/publish-models/SKILL.md

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 3b09d80..e7c9c11 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -21,6 +21,16 @@
       "source": "skills/run-models",
       "description": "Run AI models on Replicate via predictions, webhooks, and streaming."
     },
+    {
+      "name": "build-models",
+      "source": "skills/build-models",
+      "description": "Package and build custom AI models with Cog for deployment on Replicate."
+    },
+    {
+      "name": "publish-models",
+      "source": "skills/publish-models",
+      "description": "Push and publish custom AI models to Replicate, and set up CI/CD for releasing new model versions safely."
+    },
     {
       "name": "prompt-images",
       "source": "skills/prompt-images",
diff --git a/AGENTS.md b/AGENTS.md
index 0cb636b..185548a 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -2,13 +2,15 @@
 
 ## Purpose
 
-This repo publishes Agent Skills for Replicate: focused guides for finding, comparing, running, and prompting AI models.
+This repo publishes Agent Skills for Replicate: focused guides for finding, comparing, running, building, publishing, and prompting AI models.
 
 ## Files that matter
 
 - `skills/find-models/SKILL.md` — search, collections, schemas, picking the right model.
 - `skills/compare-models/SKILL.md` — evaluating models by cost, speed, quality, and capabilities.
 - `skills/run-models/SKILL.md` — predictions, polling, webhooks, streaming, file I/O, concurrency, multi-model workflows.
+- `skills/build-models/SKILL.md` — packaging custom models with Cog: cog.yaml, predict.py, weights loading, cold-boot tricks.
+- `skills/publish-models/SKILL.md` — pushing models to Replicate with cog push and cog-safe-push, and CI/CD for releases.
 - `skills/prompt-images/SKILL.md` — prompting techniques for image generation and editing models.
 - `skills/prompt-videos/SKILL.md` — prompting techniques for video generation models.
 - `script/lint` — validates the skills.
@@ -28,6 +30,8 @@ This repo publishes Agent Skills for Replicate: focused guides for finding, comp
 script/lint skills/find-models
 script/lint skills/compare-models
 script/lint skills/run-models
+script/lint skills/build-models
+script/lint skills/publish-models
 script/lint skills/prompt-images
 script/lint skills/prompt-videos
 ```
diff --git a/skills/build-models/SKILL.md b/skills/build-models/SKILL.md
new file mode 100644
index 0000000..d981ca3
--- /dev/null
+++ b/skills/build-models/SKILL.md
@@ -0,0 +1,390 @@
+---
+name: build-models
+description: >
+  Package and build custom AI models with Cog for deployment on Replicate.
+  Use when creating a cog.yaml or predict.py, defining model inputs and
+  outputs, loading model weights at setup time, building Docker images for
+  ML models, serving locally with cog serve or cog predict, or porting a
+  HuggingFace, GitHub, or ComfyUI model to run on Replicate. Trigger on
+  phrases like "build a model", "package a model", "create a Cog model",
+  "wrap a model", "containerize an AI model", "predict.py", "cog.yaml",
+  "BasePredictor", or "Cog container", and when referencing cog.run,
+  github.com/replicate/cog, or github.com/replicate/cog-examples. Covers
+  GPU and CUDA setup, pget for fast weight downloads, async predictors
+  with continuous batching, streaming outputs, and cold-boot optimization
+  for image, video, audio, and LLM models. For pushing built models to
+  Replicate, see publish-models. For running existing models, see
+  run-models.
+---
+
+## Docs
+
+- Cog reference (single file): <https://cog.run/llms.txt>
+- `cog.yaml` reference: <https://cog.run/yaml>
+- Python predictor reference: <https://cog.run/python>
+- Examples: <https://github.com/replicate/cog-examples>
+- Template: <https://github.com/replicate/cog-template>
+
+## When to use this skill
+
+- You have model code, weights, or a HuggingFace/GitHub project you want to host on Replicate.
+- You're writing or editing a `cog.yaml`, `predict.py`, or `train.py`.
+- For pushing a built model to Replicate, see `publish-models`.
+- For running existing Replicate models, see `run-models`.
+
+## Prerequisites
+
+- Docker running locally.
+- Cog installed: `brew install replicate/tap/cog` or `sh <(curl -fsSL https://cog.run/install.sh)`.
+- Optional: `cog init` to scaffold `cog.yaml` and `predict.py`.
+
+## Project layout
+
+The canonical Replicate model layout:
+
+```
+cog.yaml
+predict.py
+weights.py                 # optional download helpers
+requirements.txt
+cog-safe-push-configs/
+  default.yaml             # see publish-models skill
+.github/workflows/
+  ci.yaml
+script/                    # github.com/github/scripts-to-rule-them-all
+  lint
+  test
+  push
+```
+
+## cog.yaml essentials
+
+A modern config for a GPU model:
+
+```yaml
+build:
+  gpu: true
+  cuda: "12.8"
+  python_version: "3.12"
+  python_requirements: requirements.txt
+  cog_runtime: true
+  system_packages:
+    - libgl1
+    - libglib2.0-0
+predict: predict.py:Predictor
+```
+
+Notes:
+
+- `cog_runtime: true` opts into the newer Rust-based runtime. Set it for new models.
+- Pin Python to a specific minor version, and pin every line in `requirements.txt`. Floating versions break cold boots.
+- Use `python_requirements` over inline `python_packages` once the list grows.
+- `cuda` follows your torch wheel (e.g. `12.8` paired with `torch==2.7.1+cu128`).
+- Add `train: train.py:train` if your model is fine-tunable.
+- Add `image: r8.im/owner/name` to enable bare `cog push`.
+
+For async predictors with continuous batching:
+
+```yaml
+concurrency:
+  max: 32
+```
+
+## predict.py essentials
+
+```python
+from cog import BasePredictor, Input, Path
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """One-time loads. Heavy work goes here, not in predict()."""
+        self.model = load_model("weights/")
+
+    def predict(
+        self,
+        prompt: str = Input(description="Text prompt for generation"),
+        seed: int = Input(description="Random seed; leave blank for random", default=None),
+        num_steps: int = Input(description="Number of denoising steps", ge=1, le=50, default=20),
+        output_format: str = Input(description="Output image format", choices=["webp", "jpg", "png"], default="webp"),
+    ) -> Path:
+        """Run a single prediction."""
+        if not prompt.strip():
+            raise ValueError("prompt cannot be empty")
+        out = self.model.generate(prompt, seed=seed, steps=num_steps)
+        return Path(out)
+```
+
+Input rules:
+
+- Every input needs a `description`. The description shows up in the model schema and on Replicate's web UI.
+- Use `ge`/`le` for numeric bounds, `choices=[...]` for enums, `regex=` for strings.
+- Use `cog.Path` for file inputs and outputs, never raw bytes.
+- Use `cog.Secret` for any token-like input (HF tokens, API keys), never plain `str`.
+- Provide a default that's inside `choices` for categorical inputs.
+- Validate inputs early in `predict()` and raise `ValueError`.
+
+Streaming text output (for LLMs):
+
+```python
+from cog import BasePredictor, Input, ConcatenateIterator
+
+class Predictor(BasePredictor):
+    def predict(self, prompt: str = Input(description="Prompt")) -> ConcatenateIterator[str]:
+        for token in self.model.stream(prompt):
+            yield token
+```
+
+Async predictor with continuous batching (paired with `concurrency.max` in cog.yaml):
+
+```python
+from cog import BasePredictor, Input, AsyncConcatenateIterator
+
+class Predictor(BasePredictor):
+    async def setup(self) -> None:
+        self.engine = await load_async_engine()
+
+    async def predict(
+        self,
+        prompt: str = Input(description="Prompt"),
+    ) -> AsyncConcatenateIterator[str]:
+        async for token in self.engine.generate(prompt):
+            yield token
+```
+
+Dynamic `choices` from on-disk assets (e.g. a `voices/` directory of audio samples):
+
+```python
+from pathlib import Path as _P
+AVAILABLE_VOICES = sorted(p.stem for p in _P("voices").glob("*.wav"))
+
+class Predictor(BasePredictor):
+    def predict(
+        self,
+        speaker: str = Input(description="Voice", choices=AVAILABLE_VOICES, default=AVAILABLE_VOICES[0]),
+    ) -> Path: ...
+```
+
+## Loading weights fast
+
+Cold boot dominates user-perceived latency. Three patterns, ranked by simplicity:
+
+### 1. Bake weights into the image at build time
+
+Best for small or medium weights (< 5GB) when you want cold boots with no weight-download step.
+
+For torchvision:
+
+```python
+import os
+os.environ["TORCH_HOME"] = "."  # set before importing torch
+import torch
+from torchvision import models
+```
+
+For HuggingFace:
+
+```python
+import os
+os.environ["HF_HUB_CACHE"] = "./.cache"
+os.environ["HF_XET_HIGH_PERFORMANCE"] = "1"
+```
+
+Then download once during `cog build` (e.g. in a `run:` step or by running a small fetcher script as part of the build). The weights become part of the image layer.
+
+### 2. Pull from `weights.replicate.delivery` with pget
+
+Best for large weights, or when you want to share weights across multiple models. `pget` is Replicate's parallel HTTP fetcher.
+
+In `cog.yaml`:
+
+```yaml
+build:
+  run:
+    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64"
+    - chmod +x /usr/local/bin/pget
+```
+
+In `setup()`:
+
+```python
+import subprocess
+from pathlib import Path
+
+WEIGHTS_URL = "https://weights.replicate.delivery/default/my-model/weights.tar"
+WEIGHTS_DIR = Path("weights")
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        if not WEIGHTS_DIR.exists():
+            # -x extracts tar in-memory; default concurrency is 4 * NumCPU
+            subprocess.check_call(["pget", "-x", WEIGHTS_URL, str(WEIGHTS_DIR)])
+        self.model = load_from(WEIGHTS_DIR)
+```
+
+For multiple files in one shot:
+
+```python
+manifest = "\n".join([
+    f"{base}/unet.safetensors weights/unet.safetensors",
+    f"{base}/vae.safetensors  weights/vae.safetensors",
+    f"{base}/text_encoder.safetensors weights/text_encoder.safetensors",
+])
+subprocess.run(["pget", "multifile", "-"], input=manifest, text=True, check=True)
+```
+
+### 3. HuggingFace Hub with hf_transfer
+
+Set `HF_HUB_ENABLE_HF_TRANSFER=1` and use `huggingface_hub.snapshot_download` or `from_pretrained`. Faster than vanilla HF downloads. Use a `cog.Secret` input for gated models.
+
+## Weight cache for user-supplied weights
+
+For LoRAs or any weights URL the user passes at predict time, use a sha256-keyed disk cache with LRU eviction:
+
+```python
+import hashlib, shutil, subprocess
+from pathlib import Path
+
+class WeightsDownloadCache:
+    def __init__(self, cache_dir: str = "/tmp/weights-cache", min_disk_free_gb: int = 10):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.min_disk_free = min_disk_free_gb * 1024**3
+
+    def ensure(self, url: str) -> Path:
+        key = hashlib.sha256(url.encode()).hexdigest()
+        target = self.cache_dir / key
+        if target.exists():
+            target.touch()  # bump LRU mtime
+            return target
+        self._evict_until_room()
+        subprocess.check_call(["pget", url, str(target)])
+        return target
+
+    def _evict_until_room(self) -> None:
+        while shutil.disk_usage(self.cache_dir).free < self.min_disk_free:
+            entries = sorted(self.cache_dir.iterdir(), key=lambda p: p.stat().st_mtime)
+            if not entries:
+                return
+            entries[0].unlink()
+```
+
+See `replicate/cog-flux/weights.py` for a production version that handles HF, CivitAI, Replicate, and arbitrary `.safetensors` URLs.
+
+## Multi-LoRA composition
+
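+Reload only when the URL changes; compose two LoRAs with separate scales. The snippet assumes `setup()` also creates `self.cache = WeightsDownloadCache()` (the class from the previous section):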
+
+```python
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        self.pipe = load_base_pipeline()
+        self.loaded = {"main": None, "extra": None}
+
+    def _ensure_lora(self, slot: str, url: str | None) -> None:
+        if url == self.loaded[slot]:
+            return
+        if self.loaded[slot] is not None:
+            self.pipe.delete_adapters(slot)  # unload_lora_weights() takes no adapter name and drops every LoRA
+        if url:
+            path = self.cache.ensure(url)
+            self.pipe.load_lora_weights(str(path), adapter_name=slot)
+        self.loaded[slot] = url
+
+    def predict(
+        self,
+        prompt: str = Input(description="Prompt"),
+        lora_url: str = Input(description="Primary LoRA URL", default=None),
+        lora_scale: float = Input(description="Primary LoRA scale", ge=0.0, le=2.0, default=1.0),
+        extra_lora_url: str = Input(description="Optional second LoRA URL", default=None),
+        extra_lora_scale: float = Input(description="Second LoRA scale", ge=0.0, le=2.0, default=1.0),
+    ) -> Path:
+        self._ensure_lora("main", lora_url)
+        self._ensure_lora("extra", extra_lora_url)
+        scale = {"main": lora_scale, "extra": extra_lora_scale}
+        if adapters := [s for s, u in self.loaded.items() if u]:
+            self.pipe.set_adapters(adapters, adapter_weights=[scale[s] for s in adapters])
+        self.pipe(prompt).images[0].save("/tmp/out.png")  # save() returns None, so write first
+        return Path("/tmp/out.png")
+```
+
+## Cold-boot tricks
+
+From production diffusion models like `replicate/cog-flux` and `replicate/cog-flux-kontext`:
+
+- Set perf flags once in `setup()`:
+  ```python
+  import torch
+  torch.set_float32_matmul_precision("high")
+  torch.backends.cuda.matmul.allow_tf32 = True
+  torch.backends.cudnn.benchmark = True
+  ```
+- Compile and warm up:
+  ```python
+  self.model = torch.compile(self.model, dynamic=True)
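+  _ = self.predict(prompt="warmup", seed=0, num_steps=1, output_format="webp")  # absorbs compile cost in setup; pass every input, since Input() defaults are not resolved on direct calls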
+  ```
+- Load big weights with meta device + `assign=True` to avoid double-allocating:
+  ```python
+  with torch.device("meta"):
+      model = build_model_skeleton()
+  state = torch.load("weights.pt", map_location="cpu")
+  model.load_state_dict(state, assign=True)
+  ```
+- Share VAE / text encoder across multiple pipelines (e.g. base + img2img + inpaint) instead of loading three copies.
+- For fp8/int8, save quantized weights ahead of time and load directly; don't quantize at boot.
+
+## Local development
+
+```
+cog init                                    # scaffold cog.yaml + predict.py
+cog predict -i prompt="hello"               # build + run a single prediction
+cog predict -i image=@input.jpg -o out.png  # file inputs and outputs
+cog serve -p 8393                           # HTTP server matching production
+cog exec python                             # interactive shell inside the build env
+```
+
+## Building
+
+```
+cog build -t my-model
+cog build --separate-weights -t my-model    # weights in their own image layer
+cog build --secret id=hf,src=$HOME/.hf_token -t my-model
+```
+
+Tips:
+
+- Use `--separate-weights` for any model with weights > ~1GB. It speeds up cold boots and registry pushes.
+- Use `--mount=type=cache,target=/root/.cache/pip` in `run:` steps to cache pip across builds.
+- Use `--secret` instead of `ARG` to keep tokens out of image history.
+- The default Cog base image (`--use-cog-base-image=true`) is faster than rolling your own.
+
+## Training
+
+If your model supports fine-tuning, add `train: train.py:train` to `cog.yaml` and write a `train()` function that returns `TrainingOutput(weights=Path("model.tar"))`. The predictor then accepts the URL via `setup(self, weights)` or the `COG_WEIGHTS` env var. See <https://cog.run/training> and `replicate/flux-fine-tuner` for a full example.
+
+## Internal infrastructure
+
+Replicate runs an internal base-image system (`monobase`) and a FUSE-backed lazy weights layer for production models. You don't need to configure these; standard `cog build` benefits from them automatically.
+
+## Guidelines
+
+- Keep `setup()` for one-time loads; keep `predict()` fast and deterministic in shape.
+- Pin Python and every dependency. Use `numpy<2` if your torch is older.
+- Always describe every input. Schemas without descriptions are unusable on the web UI.
+- Use `cog.Path` for files and `cog.Secret` for tokens.
+- Pin `pget` to a specific release (`v0.8.2`) for reproducibility.
+- Set `HF_HUB_ENABLE_HF_TRANSFER=1` whenever you call HuggingFace Hub.
+- Once weights are baked into the image or cached on disk, set `TRANSFORMERS_OFFLINE=1` before importing `transformers` (the flag is read at import time) to prevent runtime HF lookups.
+- Test with `cog predict` before pushing. If it doesn't work locally, it won't work in production.
+
+## Production references
+
+- <https://github.com/replicate/cog-examples> — minimal patterns (resnet, hello-world, streaming, training)
+- <https://github.com/replicate/cog-template> — scaffolder for new model repos
+- <https://github.com/replicate/cog-flux> — multi-variant FLUX models, weights cache, fp8 + torch.compile
+- <https://github.com/replicate/cog-flux-kontext> — meta-device loading, warmup compilation
+- <https://github.com/replicate/cog-vllm> — async LLM server with continuous batching, training-as-packaging
+- <https://github.com/replicate/cog-comfyui> — ComfyUI workflows as a Cog model, custom-node helpers
+- <https://github.com/replicate/flux-fine-tuner> — multi-LoRA composition, shared pipeline components
+- <https://github.com/replicate/vibevoice> — TTS with dynamic `choices`, minimal cog.yaml
+- <https://github.com/replicate/pget> — parallel weights fetcher
diff --git a/skills/publish-models/SKILL.md b/skills/publish-models/SKILL.md
new file mode 100644
index 0000000..40955f7
--- /dev/null
+++ b/skills/publish-models/SKILL.md
@@ -0,0 +1,330 @@
+---
+name: publish-models
+description: >
+  Push and publish custom AI models to Replicate, and set up CI/CD for
+  releasing new model versions safely. Use when running cog push,
+  deploying a model to Replicate, releasing a new version, validating
+  a model with cog-safe-push before publishing, configuring a Replicate
+  deployment, setting up GitHub Actions for model releases, or porting a
+  community model to an official one. Trigger on phrases like "push a
+  model to Replicate", "publish a model", "deploy a model", "release a
+  new version", "cog push", "cog-safe-push", "model CI", "r8.im", or
+  "schema compatibility", and when referencing
+  github.com/replicate/cog-safe-push or
+  github.com/replicate/model-ci-template. Covers cog push, the full
+  cog-safe-push config (test cases, fuzz, deployment, official_model),
+  GitHub Actions patterns, multi-model matrix pushes, and post-publish
+  monitoring. Assumes you already have a working Cog project; see
+  build-models if you need to package one first.
+---
+
+## Docs
+
+- Cog reference: <https://cog.run/llms.txt>
+- `cog push` reference: <https://cog.run/cli#cog-push>
+- cog-safe-push: <https://github.com/replicate/cog-safe-push>
+- Model CI template: <https://github.com/replicate/model-ci-template>
+- Continuous deployment guide: <https://replicate.com/docs/guides/continuous-model-deployment>
+
+## When to use this skill
+
+- You have a working Cog project (see `build-models` if you don't yet).
+- You want to publish a private or public model on Replicate.
+- You're releasing a new version of an existing model and want to avoid breaking changes.
+- You're setting up CI/CD for model releases.
+
+## Prerequisites
+
+- Cog installed and `cog login` against `r8.im` (or `echo $TOKEN | cog login --token-stdin`).
+- A model created at `replicate.com/{owner}/{name}` via the API, web UI, or `r8-model` CLI.
+- `REPLICATE_API_TOKEN` set in your environment.
+
+## Plain `cog push`
+
+The simplest path. Build and upload a new version:
+
+```
+cog push r8.im/owner/my-model
+```
+
+Or set `image: r8.im/owner/my-model` in `cog.yaml` and run a bare:
+
+```
+cog push
+```
+
+Useful flags:
+
+- `--separate-weights` — store weights in a separate layer; faster cold boots and pushes for models with > 1GB of weights.
+- `--x-fast` — faster pushes during iteration (skips some validation).
+- `--secret id=hf,src=$HOME/.hf_token` — pass build-time secrets without baking them into image history.
+
+## cog-safe-push (recommended for any model with users)
+
+`cog-safe-push` pushes to a private `-test` model first, checks schema compatibility against the live version, runs prediction comparisons, and fuzzes inputs. Catches breaking changes before they reach users.
+
+Install:
+
+```
+pip install git+https://github.com/replicate/cog-safe-push.git
+```
+
+Required env vars:
+
+- `REPLICATE_API_TOKEN`
+- `ANTHROPIC_API_KEY` (Claude judges output similarity for stochastic models)
+
+Basic usage:
+
+```
+cog-safe-push --test-hardware=gpu-l40s owner/my-model
+```
+
+This will:
+
+1. Lint `predict.py` with ruff.
+2. Create a private test model `owner/my-model-test` if missing.
+3. Push the local Cog model to the test model.
+4. Lint the schema (descriptions, defaults, etc.).
+5. Check schema compatibility against the live `owner/my-model` version.
+6. Run prediction comparisons between live and test versions.
+7. Fuzz the test model with AI-generated inputs.
+8. If everything passes, push to `owner/my-model`.
+
+## cog-safe-push.yaml schema
+
+Drop a `cog-safe-push.yaml` in your project root (or `cog-safe-push-configs/<variant>.yaml` for multi-model repos). All five test-case checker types in one example:
+
+```yaml
+model: owner/my-model
+test_model: owner/my-model-test
+test_hardware: gpu-l40s
+
+predict:
+  compare_outputs: false              # set false for stochastic models
+  predict_timeout: 600
+  test_cases:
+    - inputs:
+        prompt: "a serene mountain landscape"
+      match_prompt: "a landscape photo of mountains"   # AI-judged via Claude
+    - inputs:
+        prompt: "a cat"
+      match_url: "https://example.com/reference-cat.png"   # binary/image match
+    - inputs:
+        prompt: ""
+      error_contains: "prompt cannot be empty"           # negative test
+    - inputs:
+        mode: "json"
+      jq_query: '.confidence > 0.8 and .status == "success"'   # JSON output
+    - inputs:
+        prompt: "echo this"
+      exact_string: "echo this"                          # exact string match
+  fuzz:
+    fixed_inputs:
+      seed: 42
+    disabled_inputs:
+      - debug
+    iterations: 10
+    prompt: "Generate creative and diverse prompts"
+
+train:                                  # if your model has a trainer
+  destination: owner/my-model-trained
+  destination_hardware: gpu-l40s
+  train_timeout: 1800
+  test_cases:
+    - inputs:
+        input_images: "https://.../training.zip"
+        steps: 10
+
+deployment:                             # auto-create or update on push
+  name: my-model
+  owner: owner
+  hardware: gpu-l40s
+
+parallel: 4
+fast_push: false
+ignore_schema_compatibility: false
+official_model: owner/my-model         # for proxy/wrapper models, see below
+```
+
+Test case checkers are mutually exclusive: pick exactly one of `match_prompt`, `match_url`, `error_contains`, `jq_query`, or `exact_string` per case. Use `compare_outputs: false` for any stochastic model (diffusion, LLMs); the default `true` is brittle.
+
+## CI/CD: GitHub Actions
+
+Two paths, depending on how much glue you want.
+
+### Path A: roll your own
+
+```yaml
+# .github/workflows/push.yaml
+name: Push to Replicate
+on:
+  workflow_dispatch:
+    inputs:
+      no_push:
+        type: boolean
+        default: false
+
+jobs:
+  push:
+    runs-on: ubuntu-latest-4-cores       # builds need disk + cores
+    steps:
+      - uses: actions/checkout@v4
+      - uses: jlumbroso/free-disk-space@v1.3.1
+        with:
+          tool-cache: false
+          docker-images: false
+      - uses: replicate/setup-cog@v2
+        with:
+          token: ${{ secrets.REPLICATE_API_TOKEN }}
+      - run: pip install git+https://github.com/replicate/cog-safe-push.git
+      - env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }}
+        run: |
+          cog-safe-push -vv ${{ inputs.no_push && '--no-push' || '' }}
+```
+
+Add a `concurrency:` block so PR builds cancel each other while main-branch pushes queue:
+
+```yaml
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+```
+
+### Path B: reusable workflow from model-ci-template
+
+For Replicate-style multi-model repos, drop in:
+
+```yaml
+# .github/workflows/ci.yaml
+name: CI
+on:
+  pull_request: { branches: [main] }
+  push: { branches: [main] }
+  workflow_dispatch:
+    inputs:
+      models: { type: string, default: "all" }
+      ignore_schema_checks: { type: boolean, default: false }
+      cog_version: { type: string, default: "latest" }
+      test_only: { type: boolean, default: false }
+
+jobs:
+  ci:
+    uses: replicate/model-ci-template/.github/workflows/template.yaml@main
+    with:
+      trigger_type: ${{ github.event_name }}
+      models: ${{ inputs.models || 'all' }}
+      ignore_schema_checks: ${{ inputs.ignore_schema_checks || false }}
+      cog_version: ${{ inputs.cog_version || 'latest' }}
+      test_only: ${{ inputs.test_only || false }}
+    secrets: inherit
+```
+
+The reusable workflow expects:
+
+- `cog-safe-push-configs/<model>.yaml` — one per model variant.
+- `script/select-model` — bash file with `if/elif [[ "$MODEL" == "..." ]]` blocks listing valid model names.
+- Secrets: `COG_TOKEN`, `REPLICATE_API_TOKEN`, `ANTHROPIC_API_KEY`.
+
+## Multi-model matrix pushes
+
+Pattern from `replicate/cog-flux`: one repo, N variants, push them in parallel.
+
+```yaml
+jobs:
+  prepare:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set.outputs.matrix }}
+    steps:
+      - id: set
+        run: |
+          if [ "${{ inputs.models }}" = "all" ]; then
+            echo 'matrix={"model":["schnell","dev","krea-dev"]}' >> "$GITHUB_OUTPUT"
+          else
+            list=$(echo "${{ inputs.models }}" | jq -Rc 'split(",")')
+            echo "matrix={\"model\":$list}" >> "$GITHUB_OUTPUT"
+          fi
+
+  push:
+    needs: prepare
+    runs-on: ubuntu-latest-4-cores
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
+    steps:
+      - uses: actions/checkout@v4
+      - run: ./script/select.sh ${{ matrix.model }}     # produces cog.yaml from a template
+      - run: cog-safe-push --config cog-safe-push-configs/${{ matrix.model }}.yaml -vv
+```
+
+## Two-pass push for proxy / official models
+
+When you maintain a proxy that wraps a third-party API, you push to a private wrapper first, then update the public-facing official model card. Pattern from `replicate/cog-official-template`:
+
+```bash
+./script/write-api-key                                              # bake API key into config
+cog-safe-push --config cog-safe-push-configs/${MODEL}.yaml -vv
+
+./script/delete-api-key                                             # strip the key
+cog-safe-push --push-official-model --config cog-safe-push-configs/${MODEL}.yaml -vv
+```
+
+Set `official_model: owner/name` in the config so `--push-official-model` knows where to publish.
+
+## Deployments
+
+Add a `deployment` block to `cog-safe-push.yaml` to create or update a Replicate deployment automatically on each push:
+
+```yaml
+deployment:
+  name: my-model
+  owner: owner
+  hardware: gpu-l40s
+```
+
+Scaling defaults: CPU deployments scale 1-20 instances, GPU deployments scale 0-2. Adjust manually via the API or web UI when needed.
+
+## Monitoring published models
+
+Run an hourly canary that exercises the registry path. Pattern from `replicate/cog-pagerduty-check`:
+
+```yaml
+name: Hourly cog push check
+on:
+  schedule:
+    - cron: "0 * * * *"
+  workflow_dispatch:
+
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      # canary.sh generates a tiny model with a unique uuid, pushes it, runs a
+      # prediction by digest, and fails loudly if anything breaks.
+      - run: ./script/canary.sh
+```
+
+Worth doing for any production-critical model, especially when revenue depends on the registry being up.
+
+## Guidelines
+
+- Don't break schema compatibility unless you mean to. cog-safe-push catches it; `--ignore-schema-compatibility` is the opt-out.
+- Pin `test_hardware` so test pushes are reproducible.
+- Use `--no-push` for dry runs in PR CI; full push on merge to main or on version tags.
+- Push from CI rather than laptops once you have users.
+- Use `compare_outputs: false` for stochastic models. Use `match_prompt:` for image/video outputs (VLM judgment), `match_url:` for binary outputs you control, `jq_query:` for JSON, `exact_string:` for deterministic text, `error_contains:` for negative tests.
+- Never commit `REPLICATE_API_TOKEN` or `ANTHROPIC_API_KEY`. Use repo secrets.
+- For models with weights > 1GB, push with `--separate-weights`.
+
+## Production references
+
+- <https://github.com/replicate/cog-safe-push> — the tool itself, plus its config schema.
+- <https://github.com/replicate/model-ci-template> — reusable GitHub Actions workflow.
+- <https://github.com/replicate/cog-official-template> — proxy/official model template.
+- <https://github.com/replicate/cog-flux/blob/main/.github/workflows/push.yaml> — matrix push across FLUX variants.
+- <https://github.com/replicate/cog-comfyui/blob/main/.github/workflows/ci.yaml> — ComfyUI model CI with custom-node install step.
+- <https://github.com/replicate/cog-pagerduty-check> — hourly canary pattern.

From ff67d7c2f3ced2ee73bac2aa530def979e275b5a Mon Sep 17 00:00:00 2001
From: Zeke Sikelianos <zeke@sikelianos.com>
Date: Tue, 28 Apr 2026 10:44:25 -0700
Subject: [PATCH 2/2] chore(build-models): remove stale cog_runtime and
 monobase references

---
 skills/build-models/SKILL.md | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/skills/build-models/SKILL.md b/skills/build-models/SKILL.md
index d981ca3..03b30d4 100644
--- a/skills/build-models/SKILL.md
+++ b/skills/build-models/SKILL.md
@@ -67,7 +67,6 @@ build:
   cuda: "12.8"
   python_version: "3.12"
   python_requirements: requirements.txt
-  cog_runtime: true
   system_packages:
     - libgl1
     - libglib2.0-0
@@ -76,7 +75,6 @@ predict: predict.py:Predictor
 
 Notes:
 
-- `cog_runtime: true` opts into the newer Rust-based runtime. Set it for new models.
 - Pin Python to a specific minor version, and pin every line in `requirements.txt`. Floating versions break cold boots.
 - Use `python_requirements` over inline `python_packages` once the list grows.
 - `cuda` follows your torch wheel (e.g. `12.8` paired with `torch==2.7.1+cu128`).
@@ -362,10 +360,6 @@ Tips:
 
 If your model supports fine-tuning, add `train: train.py:train` to `cog.yaml` and write a `train()` function that returns `TrainingOutput(weights=Path("model.tar"))`. The predictor then accepts the URL via `setup(self, weights)` or the `COG_WEIGHTS` env var. See <https://cog.run/training> and `replicate/flux-fine-tuner` for a full example.
 
-## Internal infrastructure
-
-Replicate runs an internal base-image system (`monobase`) and a FUSE-backed lazy weights layer for production models. You don't need to configure these; standard `cog build` benefits from them automatically.
-
 ## Guidelines
 
 - Keep `setup()` for one-time loads; keep `predict()` fast and deterministic in shape.