diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 3b09d80..e7c9c11 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -21,6 +21,16 @@ "source": "skills/run-models", "description": "Run AI models on Replicate via predictions, webhooks, and streaming." }, + { + "name": "build-models", + "source": "skills/build-models", + "description": "Package and build custom AI models with Cog for deployment on Replicate." + }, + { + "name": "publish-models", + "source": "skills/publish-models", + "description": "Push and publish custom AI models to Replicate, and set up CI/CD for releasing new model versions safely." + }, { "name": "prompt-images", "source": "skills/prompt-images", diff --git a/AGENTS.md b/AGENTS.md index 0cb636b..185548a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,13 +2,15 @@ ## Purpose -This repo publishes Agent Skills for Replicate: focused guides for finding, comparing, running, and prompting AI models. +This repo publishes Agent Skills for Replicate: focused guides for finding, comparing, running, building, publishing, and prompting AI models. ## Files that matter - `skills/find-models/SKILL.md` — search, collections, schemas, picking the right model. - `skills/compare-models/SKILL.md` — evaluating models by cost, speed, quality, and capabilities. - `skills/run-models/SKILL.md` — predictions, polling, webhooks, streaming, file I/O, concurrency, multi-model workflows. +- `skills/build-models/SKILL.md` — packaging custom models with Cog: cog.yaml, predict.py, weights loading, cold-boot tricks. +- `skills/publish-models/SKILL.md` — pushing models to Replicate with cog push and cog-safe-push, and CI/CD for releases. - `skills/prompt-images/SKILL.md` — prompting techniques for image generation and editing models. - `skills/prompt-videos/SKILL.md` — prompting techniques for video generation models. - `script/lint` — validates the skills. 
@@ -28,6 +30,8 @@ This repo publishes Agent Skills for Replicate: focused guides for finding, comp script/lint skills/find-models script/lint skills/compare-models script/lint skills/run-models +script/lint skills/build-models +script/lint skills/publish-models script/lint skills/prompt-images script/lint skills/prompt-videos ``` diff --git a/skills/build-models/SKILL.md b/skills/build-models/SKILL.md new file mode 100644 index 0000000..03b30d4 --- /dev/null +++ b/skills/build-models/SKILL.md @@ -0,0 +1,384 @@ +--- +name: build-models +description: > + Package and build custom AI models with Cog for deployment on Replicate. + Use when creating a cog.yaml or predict.py, defining model inputs and + outputs, loading model weights at setup time, building Docker images for + ML models, serving locally with cog serve or cog predict, or porting a + HuggingFace, GitHub, or ComfyUI model to run on Replicate. Trigger on + phrases like "build a model", "package a model", "create a Cog model", + "wrap a model", "containerize an AI model", "predict.py", "cog.yaml", + "BasePredictor", or "Cog container", and when referencing cog.run, + github.com/replicate/cog, or github.com/replicate/cog-examples. Covers + GPU and CUDA setup, pget for fast weight downloads, async predictors + with continuous batching, streaming outputs, and cold-boot optimization + for image, video, audio, and LLM models. For pushing built models to + Replicate, see publish-models. For running existing models, see + run-models. +--- + +## Docs + +- Cog reference (single file): +- `cog.yaml` reference: +- Python predictor reference: +- Examples: +- Template: + +## When to use this skill + +- You have model code, weights, or a HuggingFace/GitHub project you want to host on Replicate. +- You're writing or editing a `cog.yaml`, `predict.py`, or `train.py`. +- For pushing a built model to Replicate, see `publish-models`. +- For running existing Replicate models, see `run-models`. 
+ +## Prerequisites + +- Docker running locally. +- Cog installed: `brew install replicate/tap/cog` or `sh <(curl -fsSL https://cog.run/install.sh)`. +- Optional: `cog init` to scaffold `cog.yaml` and `predict.py`. + +## Project layout + +The canonical Replicate model layout: + +``` +cog.yaml +predict.py +weights.py # optional download helpers +requirements.txt +cog-safe-push-configs/ + default.yaml # see publish-models skill +.github/workflows/ + ci.yaml +script/ # github.com/github/scripts-to-rule-them-all + lint + test + push +``` + +## cog.yaml essentials + +A modern config for a GPU model: + +```yaml +build: + gpu: true + cuda: "12.8" + python_version: "3.12" + python_requirements: requirements.txt + system_packages: + - libgl1 + - libglib2.0-0 +predict: predict.py:Predictor +``` + +Notes: + +- Pin Python to a specific minor version, and pin every line in `requirements.txt`. Floating versions break cold boots. +- Use `python_requirements` over inline `python_packages` once the list grows. +- `cuda` follows your torch wheel (e.g. `12.8` paired with `torch==2.7.1+cu128`). +- Add `train: train.py:train` if your model is fine-tunable. +- Add `image: r8.im/owner/name` to enable bare `cog push`. + +For async predictors with continuous batching: + +```yaml +concurrency: + max: 32 +``` + +## predict.py essentials + +```python +from cog import BasePredictor, Input, Path + +class Predictor(BasePredictor): + def setup(self) -> None: + """One-time loads. 
Heavy work goes here, not in predict().""" + self.model = load_model("weights/") + + def predict( + self, + prompt: str = Input(description="Text prompt for generation"), + seed: int = Input(description="Random seed; leave blank for random", default=None), + num_steps: int = Input(description="Number of denoising steps", ge=1, le=50, default=20), + output_format: str = Input(description="Output image format", choices=["webp", "jpg", "png"], default="webp"), + ) -> Path: + """Run a single prediction.""" + if not prompt.strip(): + raise ValueError("prompt cannot be empty") + out = self.model.generate(prompt, seed=seed, steps=num_steps) + return Path(out) +``` + +Input rules: + +- Every input needs a `description`. The description shows up in the model schema and on Replicate's web UI. +- Use `ge`/`le` for numeric bounds, `choices=[...]` for enums, `regex=` for strings. +- Use `cog.Path` for file inputs and outputs, never raw bytes. +- Use `cog.Secret` for any token-like input (HF tokens, API keys), never plain `str`. +- Provide a default that's inside `choices` for categorical inputs. +- Validate inputs early in `predict()` and raise `ValueError`. + +Streaming text output (for LLMs): + +```python +from cog import BasePredictor, Input, ConcatenateIterator + +class Predictor(BasePredictor): + def predict(self, prompt: str = Input(description="Prompt")) -> ConcatenateIterator[str]: + for token in self.model.stream(prompt): + yield token +``` + +Async predictor with continuous batching (paired with `concurrency.max` in cog.yaml): + +```python +from cog import BasePredictor, Input, AsyncConcatenateIterator + +class Predictor(BasePredictor): + async def setup(self) -> None: + self.engine = await load_async_engine() + + async def predict( + self, + prompt: str = Input(description="Prompt"), + ) -> AsyncConcatenateIterator[str]: + async for token in self.engine.generate(prompt): + yield token +``` + +Dynamic `choices` from on-disk assets (e.g. 
a `voices/` directory of audio samples): + +```python +from pathlib import Path as _P +AVAILABLE_VOICES = sorted(p.stem for p in _P("voices").glob("*.wav")) + +class Predictor(BasePredictor): + def predict( + self, + speaker: str = Input(description="Voice", choices=AVAILABLE_VOICES, default=AVAILABLE_VOICES[0]), + ) -> Path: ... +``` + +## Loading weights fast + +Cold boot dominates user-perceived latency. Three patterns, ranked by simplicity: + +### 1. Bake weights into the image at build time + +Best for small or medium weights (< 5GB) that you want zero-cold-boot for. + +For torchvision: + +```python +import os +os.environ["TORCH_HOME"] = "." # set before importing torch +import torch +from torchvision import models +``` + +For HuggingFace: + +```python +import os +os.environ["HF_HUB_CACHE"] = "./.cache" +os.environ["HF_XET_HIGH_PERFORMANCE"] = "1" +``` + +Then download once during `cog build` (e.g. in a `run:` step or by running a small fetcher script as part of the build). The weights become part of the image layer. + +### 2. Pull from `weights.replicate.delivery` with pget + +Best for large weights, or when you want to share weights across multiple models. `pget` is Replicate's parallel HTTP fetcher. 
+ +In `cog.yaml`: + +```yaml +build: + run: + - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" + - chmod +x /usr/local/bin/pget +``` + +In `setup()`: + +```python +import subprocess +from pathlib import Path + +WEIGHTS_URL = "https://weights.replicate.delivery/default/my-model/weights.tar" +WEIGHTS_DIR = Path("weights") + +class Predictor(BasePredictor): + def setup(self) -> None: + if not WEIGHTS_DIR.exists(): + # -x extracts tar in-memory; default concurrency is 4 * NumCPU + subprocess.check_call(["pget", "-x", WEIGHTS_URL, str(WEIGHTS_DIR)]) + self.model = load_from(WEIGHTS_DIR) +``` + +For multiple files in one shot: + +```python +manifest = "\n".join([ + f"{base}/unet.safetensors weights/unet.safetensors", + f"{base}/vae.safetensors weights/vae.safetensors", + f"{base}/text_encoder.safetensors weights/text_encoder.safetensors", +]) +subprocess.run(["pget", "multifile", "-"], input=manifest, text=True, check=True) +``` + +### 3. HuggingFace Hub with hf_transfer + +Set `HF_HUB_ENABLE_HF_TRANSFER=1` and use `huggingface_hub.snapshot_download` or `from_pretrained`. Faster than vanilla HF downloads. Use a `cog.Secret` input for gated models. 
+ +## Weight cache for user-supplied weights + +For LoRAs or any weights URL the user passes at predict time, use a sha256-keyed disk cache with LRU eviction: + +```python +import hashlib, shutil, subprocess +from pathlib import Path + +class WeightsDownloadCache: + def __init__(self, cache_dir: str = "/tmp/weights-cache", min_disk_free_gb: int = 10): + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(parents=True, exist_ok=True) + self.min_disk_free = min_disk_free_gb * 1024**3 + + def ensure(self, url: str) -> Path: + key = hashlib.sha256(url.encode()).hexdigest() + target = self.cache_dir / key + if target.exists(): + target.touch() # bump LRU mtime + return target + self._evict_until_room() + subprocess.check_call(["pget", url, str(target)]) + return target + + def _evict_until_room(self) -> None: + while shutil.disk_usage(self.cache_dir).free < self.min_disk_free: + entries = sorted(self.cache_dir.iterdir(), key=lambda p: p.stat().st_mtime) + if not entries: + return + entries[0].unlink() +``` + +See `replicate/cog-flux/weights.py` for a production version that handles HF, CivitAI, Replicate, and arbitrary `.safetensors` URLs. 
+ +## Multi-LoRA composition + +Reload only when the URL changes; compose two LoRAs with separate scales: + +```python +class Predictor(BasePredictor): + def setup(self) -> None: + self.pipe = load_base_pipeline() + self.cache = WeightsDownloadCache() # see "Weight cache for user-supplied weights" + self.loaded = {"main": None, "extra": None} + + def _ensure_lora(self, slot: str, url: str | None) -> None: + if url == self.loaded[slot]: + return + if self.loaded[slot] is not None: + self.pipe.unload_lora_weights(adapter_name=slot) + if url: + path = self.cache.ensure(url) + self.pipe.load_lora_weights(str(path), adapter_name=slot) + self.loaded[slot] = url + + def predict( + self, + prompt: str = Input(description="Prompt"), + lora_url: str = Input(description="Primary LoRA URL", default=None), + lora_scale: float = Input(description="Primary LoRA scale", ge=0.0, le=2.0, default=1.0), + extra_lora_url: str = Input(description="Optional second LoRA URL", default=None), + extra_lora_scale: float = Input(description="Second LoRA scale", ge=0.0, le=2.0, default=1.0), + ) -> Path: + self._ensure_lora("main", lora_url) + self._ensure_lora("extra", extra_lora_url) + adapters = [s for s, u in self.loaded.items() if u] + scales = [lora_scale if s == "main" else extra_lora_scale for s in adapters] + if adapters: + self.pipe.set_adapters(adapters, adapter_weights=scales) + out_path = "/tmp/out.png" + self.pipe(prompt).images[0].save(out_path) # PIL's save() returns None, so return the path itself + return Path(out_path) +``` + +## Cold-boot tricks + +From production diffusion models like `replicate/cog-flux` and `replicate/cog-flux-kontext`: + +- Set perf flags once in `setup()`: + ```python + import torch + torch.set_float32_matmul_precision("high") + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.benchmark = True + ``` +- Compile and warm up: + ```python + self.model = torch.compile(self.model, dynamic=True) + _ = self.predict(prompt="warmup", num_steps=1) # absorbs compile cost in setup + ``` +- Load big weights with meta device + `assign=True` to avoid double-allocating: + ```python + with torch.device("meta"): + model =
build_model_skeleton() + state = torch.load("weights.pt", map_location="cpu") + model.load_state_dict(state, assign=True) + ``` +- Share VAE / text encoder across multiple pipelines (e.g. base + img2img + inpaint) instead of loading three copies. +- For fp8/int8, save quantized weights ahead of time and load directly; don't quantize at boot. + +## Local development + +``` +cog init # scaffold cog.yaml + predict.py +cog predict -i prompt="hello" # build + run a single prediction +cog predict -i image=@input.jpg -o out.png # file inputs and outputs +cog serve -p 8393 # HTTP server matching production +cog exec python # interactive shell inside the build env +``` + +## Building + +``` +cog build -t my-model +cog build --separate-weights -t my-model # weights in their own image layer +cog build --secret id=hf,src=$HOME/.hf_token -t my-model +``` + +Tips: + +- Use `--separate-weights` for any model with weights > ~1GB. It speeds up cold boots and registry pushes. +- Use `--mount=type=cache,target=/root/.cache/pip` in `run:` steps to cache pip across builds. +- Use `--secret` instead of `ARG` to keep tokens out of image history. +- The default Cog base image (`--use-cog-base-image=true`) is faster than rolling your own. + +## Training + +If your model supports fine-tuning, add `train: train.py:train` to `cog.yaml` and write a `train()` function that returns `TrainingOutput(weights=Path("model.tar"))`. The predictor then accepts the URL via `setup(self, weights)` or the `COG_WEIGHTS` env var. See the Cog training documentation and `replicate/flux-fine-tuner` for a full example. + +## Guidelines + +- Keep `setup()` for one-time loads; keep `predict()` fast and deterministic in shape. +- Pin Python and every dependency. Use `numpy<2` if your torch is older. +- Always describe every input. Schemas without descriptions are unusable on the web UI. +- Use `cog.Path` for files and `cog.Secret` for tokens. +- Pin `pget` to a specific release (`v0.8.2`) for reproducibility.
+- Set `HF_HUB_ENABLE_HF_TRANSFER=1` whenever you call HuggingFace Hub. +- Set `TRANSFORMERS_OFFLINE=1` after weights are loaded to prevent runtime HF lookups. +- Test with `cog predict` before pushing. If it doesn't work locally, it won't work in production. + +## Production references + +- — minimal patterns (resnet, hello-world, streaming, training) +- — scaffolder for new model repos +- — multi-variant FLUX models, weights cache, fp8 + torch.compile +- — meta-device loading, warmup compilation +- — async LLM server with continuous batching, training-as-packaging +- — ComfyUI workflows as a Cog model, custom-node helpers +- — multi-LoRA composition, shared pipeline components +- — TTS with dynamic `choices`, minimal cog.yaml +- — parallel weights fetcher diff --git a/skills/publish-models/SKILL.md b/skills/publish-models/SKILL.md new file mode 100644 index 0000000..40955f7 --- /dev/null +++ b/skills/publish-models/SKILL.md @@ -0,0 +1,330 @@ +--- +name: publish-models +description: > + Push and publish custom AI models to Replicate, and set up CI/CD for + releasing new model versions safely. Use when running cog push, + deploying a model to Replicate, releasing a new version, validating + a model with cog-safe-push before publishing, configuring a Replicate + deployment, setting up GitHub Actions for model releases, or porting a + community model to an official one. Trigger on phrases like "push a + model to Replicate", "publish a model", "deploy a model", "release a + new version", "cog push", "cog-safe-push", "model CI", "r8.im", or + "schema compatibility", and when referencing + github.com/replicate/cog-safe-push or + github.com/replicate/model-ci-template. Covers cog push, the full + cog-safe-push config (test cases, fuzz, deployment, official_model), + GitHub Actions patterns, multi-model matrix pushes, and post-publish + monitoring. Assumes you already have a working Cog project; see + build-models if you need to package one first. 
+--- + +## Docs + +- Cog reference: +- `cog push` reference: +- cog-safe-push: +- Model CI template: +- Continuous deployment guide: + +## When to use this skill + +- You have a working Cog project (see `build-models` if you don't yet). +- You want to publish a private or public model on Replicate. +- You're releasing a new version of an existing model and want to avoid breaking changes. +- You're setting up CI/CD for model releases. + +## Prerequisites + +- Cog installed and `cog login` against `r8.im` (or `echo $TOKEN | cog login --token-stdin`). +- A model created at `replicate.com/{owner}/{name}` via the API, web UI, or `r8-model` CLI. +- `REPLICATE_API_TOKEN` set in your environment. + +## Plain `cog push` + +The simplest path. Build and upload a new version: + +``` +cog push r8.im/owner/my-model +``` + +Or set `image: r8.im/owner/my-model` in `cog.yaml` and run a bare: + +``` +cog push +``` + +Useful flags: + +- `--separate-weights` — store weights in a separate layer; faster cold boots and pushes for models with > 1GB of weights. +- `--x-fast` — faster pushes during iteration (skips some validation). +- `--secret id=hf,src=$HOME/.hf_token` — pass build-time secrets without baking them into image history. + +## cog-safe-push (recommended for any model with users) + +`cog-safe-push` pushes to a private `-test` model first, checks schema compatibility against the live version, runs prediction comparisons, and fuzzes inputs. Catches breaking changes before they reach users. + +Install: + +``` +pip install git+https://github.com/replicate/cog-safe-push.git +``` + +Required env vars: + +- `REPLICATE_API_TOKEN` +- `ANTHROPIC_API_KEY` (Claude judges output similarity for stochastic models) + +Basic usage: + +``` +cog-safe-push --test-hardware=gpu-l40s owner/my-model +``` + +This will: + +1. Lint `predict.py` with ruff. +2. Create a private test model `owner/my-model-test` if missing. +3. Push the local Cog model to the test model. +4. 
Lint the schema (descriptions, defaults, etc.). +5. Check schema compatibility against the live `owner/my-model` version. +6. Run prediction comparisons between live and test versions. +7. Fuzz the test model with AI-generated inputs. +8. If everything passes, push to `owner/my-model`. + +## cog-safe-push.yaml schema + +Drop a `cog-safe-push.yaml` in your project root (or `cog-safe-push-configs/<model>.yaml` for multi-model repos). All five test-case checker types in one example: + +```yaml +model: owner/my-model +test_model: owner/my-model-test +test_hardware: gpu-l40s + +predict: + compare_outputs: false # set false for stochastic models + predict_timeout: 600 + test_cases: + - inputs: + prompt: "a serene mountain landscape" + match_prompt: "a landscape photo of mountains" # AI-judged via Claude + - inputs: + prompt: "a cat" + match_url: "https://example.com/reference-cat.png" # binary/image match + - inputs: + prompt: "" + error_contains: "prompt cannot be empty" # negative test + - inputs: + mode: "json" + jq_query: '.confidence > 0.8 and .status == "success"' # JSON output + - inputs: + prompt: "echo this" + exact_string: "echo this" # exact string match + fuzz: + fixed_inputs: + seed: 42 + disabled_inputs: + - debug + iterations: 10 + prompt: "Generate creative and diverse prompts" + +train: # if your model has a trainer + destination: owner/my-model-trained + destination_hardware: gpu-l40s + train_timeout: 1800 + test_cases: + - inputs: + input_images: "https://.../training.zip" + steps: 10 + +deployment: # auto-create or update on push + name: my-model + owner: owner + hardware: gpu-l40s + +parallel: 4 +fast_push: false +ignore_schema_compatibility: false +official_model: owner/my-model # for proxy/wrapper models, see below +``` + +Test case checkers are mutually exclusive: pick exactly one of `match_prompt`, `match_url`, `error_contains`, `jq_query`, or `exact_string` per case.
Use `compare_outputs: false` for any stochastic model (diffusion, LLMs); the default `true` is brittle. + +## CI/CD: GitHub Actions + +Two paths, depending on how much glue you want. + +### Path A: roll your own + +```yaml +# .github/workflows/push.yaml +name: Push to Replicate +on: + workflow_dispatch: + inputs: + no_push: + type: boolean + default: false + +jobs: + push: + runs-on: ubuntu-latest-4-cores # builds need disk + cores + steps: + - uses: actions/checkout@v4 + - uses: jlumbroso/free-disk-space@v1.3.1 + with: + tool-cache: false + docker-images: false + - uses: replicate/setup-cog@v2 + with: + token: ${{ secrets.REPLICATE_API_TOKEN }} + - run: pip install git+https://github.com/replicate/cog-safe-push.git + - env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }} + run: | + cog-safe-push -vv ${{ inputs.no_push && '--no-push' || '' }} +``` + +Add a `concurrency:` block so PR builds cancel each other while main-branch pushes queue: + +```yaml +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} +``` + +### Path B: reusable workflow from model-ci-template + +For Replicate-style multi-model repos, drop in: + +```yaml +# .github/workflows/ci.yaml +name: CI +on: + pull_request: { branches: [main] } + push: { branches: [main] } + workflow_dispatch: + inputs: + models: { type: string, default: "all" } + ignore_schema_checks: { type: boolean, default: false } + cog_version: { type: string, default: "latest" } + test_only: { type: boolean, default: false } + +jobs: + ci: + uses: replicate/model-ci-template/.github/workflows/template.yaml@main + with: + trigger_type: ${{ github.event_name }} + models: ${{ inputs.models || 'all' }} + ignore_schema_checks: ${{ inputs.ignore_schema_checks || false }} + cog_version: ${{ inputs.cog_version || 'latest' }} + test_only: ${{ inputs.test_only || false }} + secrets: inherit +``` + +The reusable 
workflow expects: + +- `cog-safe-push-configs/<model>.yaml` — one per model variant. +- `script/select-model` — bash file with `if/elif [[ "$MODEL" == "..." ]]` blocks listing valid model names. +- Secrets: `COG_TOKEN`, `REPLICATE_API_TOKEN`, `ANTHROPIC_API_KEY`. + +## Multi-model matrix pushes + +Pattern from `replicate/cog-flux`: one repo, N variants, push them in parallel. + +```yaml +jobs: + prepare: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set.outputs.matrix }} + steps: + - id: set + run: | + if [ "${{ inputs.models }}" = "all" ]; then + echo 'matrix={"model":["schnell","dev","krea-dev"]}' >> "$GITHUB_OUTPUT" + else + list=$(echo "${{ inputs.models }}" | jq -Rc 'split(",")') + echo "matrix={\"model\":$list}" >> "$GITHUB_OUTPUT" + fi + + push: + needs: prepare + runs-on: ubuntu-latest-4-cores + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.prepare.outputs.matrix) }} + steps: + - uses: actions/checkout@v4 + - run: ./script/select.sh ${{ matrix.model }} # produces cog.yaml from a template + - run: cog-safe-push --config cog-safe-push-configs/${{ matrix.model }}.yaml -vv +``` + +## Two-pass push for proxy / official models + +When you maintain a proxy that wraps a third-party API, you push to a private wrapper first, then update the public-facing official model card. Pattern from `replicate/cog-official-template`: + +```bash +./script/write-api-key # bake API key into config +cog-safe-push --config cog-safe-push-configs/${MODEL}.yaml -vv + +./script/delete-api-key # strip the key +cog-safe-push --push-official-model --config cog-safe-push-configs/${MODEL}.yaml -vv +``` + +Set `official_model: owner/name` in the config so `--push-official-model` knows where to publish.
+ +## Deployments + +Add a `deployment` block to `cog-safe-push.yaml` to create or update a Replicate deployment automatically on each push: + +```yaml +deployment: + name: my-model + owner: owner + hardware: gpu-l40s +``` + +Scaling defaults: CPU deployments scale 1-20 instances, GPU deployments scale 0-2. Adjust manually via the API or web UI when needed. + +## Monitoring published models + +Run an hourly canary that exercises the registry path. Pattern from `replicate/cog-pagerduty-check`: + +```yaml +name: Hourly cog push check +on: + schedule: + - cron: "0 * * * *" + workflow_dispatch: + +jobs: + check: + runs-on: ubuntu-latest + steps: + - run: | + # generate a tiny model with a unique uuid, push it, run a prediction + # by digest, fail loudly if anything breaks. + ./script/canary.sh +``` + +Worth doing for any production-critical model, especially when revenue depends on the registry being up. + +## Guidelines + +- Don't break schema compatibility unless you mean to. cog-safe-push catches it; `--ignore-schema-compatibility` is the opt-out. +- Pin `test_hardware` so test pushes are reproducible. +- Use `--no-push` for dry runs in PR CI; full push on merge to main or on version tags. +- Push from CI rather than laptops once you have users. +- Use `compare_outputs: false` for stochastic models. Use `match_prompt:` for image/video outputs (VLM judgment), `match_url:` for binary outputs you control, `jq_query:` for JSON, `error_contains:` for negative tests. +- Never commit `REPLICATE_API_TOKEN` or `ANTHROPIC_API_KEY`. Use repo secrets. +- For models with weights > 1GB, push with `--separate-weights`. + +## Production references + +- `replicate/cog-safe-push` — the tool itself, plus its config schema. +- `replicate/model-ci-template` — reusable GitHub Actions workflow. +- `replicate/cog-official-template` — proxy/official model template. +- `replicate/cog-flux` — matrix push across FLUX variants. +- `replicate/cog-comfyui` — ComfyUI model CI with custom-node install step. +- `replicate/cog-pagerduty-check` — hourly canary pattern.