From 790d88b48237b6e06f59f5d679af880a019548ae Mon Sep 17 00:00:00 2001
From: Mark Phelps <mphelps@cloudflare.com>
Date: Wed, 20 May 2026 11:52:14 -0400
Subject: [PATCH] docs: update predict terminology to run

---
 README.md                         |  12 +--
 docs/cli.md                       |  10 +--
 docs/deploy.md                    |  12 +--
 docs/environment.md               |   2 +-
 docs/getting-started-own-model.md |  16 ++--
 docs/getting-started.md           |  12 +--
 docs/llms.txt                     | 128 +++++++++++++++---------------
 docs/python.md                    |  46 +++++------
 docs/training.md                  |   2 +-
 docs/wsl2/wsl2.md                 |  12 +--
 docs/yaml.md                      |   4 +-
 pkg/cli/predict.go                |  13 +--
 pkg/cli/serve.go                  |   4 +-
 13 files changed, 137 insertions(+), 136 deletions(-)

diff --git a/README.md b/README.md
index 3efb843b75..7766bdc9bd 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ You can deploy your packaged model to your own infrastructure, or to [Replicate]
 
 - ✅ **Define the inputs and outputs for your model with standard Python.** Then, Cog generates an OpenAPI schema and validates the inputs and outputs.
 
-- 🎁 **Automatic HTTP prediction server**: Your model's types are used to dynamically generate a RESTful HTTP API using a high-performance Rust/Axum server.
+- 🎁 **Automatic HTTP inference server**: Your model's types are used to dynamically generate a RESTful HTTP API using a high-performance Rust/Axum server.
 
 - 🚀 **Ready for production.** Deploy your model anywhere that Docker images run. Your own infrastructure, or [Replicate](https://replicate.com).
 
@@ -31,7 +31,7 @@ build:
 run: "run.py:Runner"
 ```
 
-Define how predictions are run on your model with `run.py`:
+Define how your model runs with `run.py`:
 
 ```python
 from cog import BaseRunner, Input, Path
@@ -39,14 +39,14 @@ import torch
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.model = torch.load("./weights.pth")
 
     # The arguments and types the model takes as input
     def run(self,
           image: Path = Input(description="Grayscale input image")
     ) -> Path:
-        """Run a single prediction on the model"""
+        """Run the model"""
         processed_image = preprocess(image)
         output = self.model(processed_image)
         return postprocess(output)
@@ -54,12 +54,12 @@ class Runner(BaseRunner):
 
 In the above we accept a path to the image as an input, and return a path to our transformed image after running it through our model.
 
-Now, you can run predictions on this model:
+Now, you can run the model:
 
 ```console
 $ cog run -i image=@input.jpg
 --> Building Docker image...
---> Running Prediction...
+--> Running...
 --> Output written to output.jpg
 ```
 
diff --git a/docs/cli.md b/docs/cli.md
index b4b1fe7fcb..9f472fae32 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -215,13 +215,13 @@ cog push [IMAGE] [flags]
 
 ## `cog run`
 
-Run a prediction.
+Run the model.
 
-If 'image' is passed, it will run the prediction on that Docker image.
+If 'image' is passed, it will run the model on that Docker image.
 It must be an image that has been built by Cog.
 
 Otherwise, it will build the model in the current directory and run
-the prediction on that.
+it.
 
 ```
 cog run [image] [flags]
@@ -230,7 +230,7 @@ cog run [image] [flags]
 **Examples**
 
 ```
-  # Run a prediction with named inputs
+  # Run the model with named inputs
   cog run -i prompt="a photo of a cat"
 
   # Pass a file as input
@@ -268,7 +268,7 @@ cog run [image] [flags]
 
 ## `cog serve`
 
-Run a prediction HTTP server.
+Run an HTTP server.
 
 Builds the model and starts an HTTP server that exposes the model's inputs
 and outputs as a REST API. Compatible with the Cog HTTP protocol.
diff --git a/docs/deploy.md b/docs/deploy.md
index ef3dd1990d..6abaaa12d5 100644
--- a/docs/deploy.md
+++ b/docs/deploy.md
@@ -1,11 +1,11 @@
 # Deploy models with Cog
 
 Cog containers are Docker containers that serve an HTTP server
-for running predictions on your model.
+for running your model.
 You can deploy them anywhere that Docker containers run.
 
-The server inside Cog containers is **coglet**, a Rust-based prediction server
-that handles HTTP requests, worker process management, and prediction execution.
+The server inside Cog containers is **coglet**, a Rust-based inference server
+that handles HTTP requests, worker process management, and run execution.
 
 This guide assumes you have a model packaged with Cog.
 If you don't, [follow our getting started guide](getting-started-own-model.md),
@@ -19,7 +19,7 @@ First, build your model:
 cog build -t my-model
 ```
 
-You can serve predictions locally with `cog serve`:
+You can serve your model locally with `cog serve`:
 
 ```console
 cog serve
@@ -54,7 +54,7 @@ To stop the server, run:
 docker kill my-model
 ```
 
-To run a prediction on the model,
+To run the model,
 call the `/predictions` endpoint,
 passing input in the format expected by your model:
 
@@ -79,7 +79,7 @@ The response includes a `status` field with values like `STARTING`, `READY`, `BU
 
 ## Concurrency
 
-By default, the server processes one prediction at a time. To enable concurrent predictions, set the `concurrency.max` option in `cog.yaml`:
+By default, the server processes one run at a time. To enable concurrent runs, set the `concurrency.max` option in `cog.yaml`:
 
 ```yaml
 concurrency:
diff --git a/docs/environment.md b/docs/environment.md
index 072e013a36..011a03e240 100644
--- a/docs/environment.md
+++ b/docs/environment.md
@@ -44,7 +44,7 @@ The `dist` option searches for wheels in:
 
 ### `COGLET_WHEEL`
 
-Controls which coglet wheel is installed in the Docker image. Coglet is the Rust-based prediction server.
+Controls which coglet wheel is installed in the Docker image. Coglet is the Rust-based inference server.
 
 **Supported values:** Same as `COG_SDK_WHEEL`
 
diff --git a/docs/getting-started-own-model.md b/docs/getting-started-own-model.md
index 220509c4c3..1e139b3676 100644
--- a/docs/getting-started-own-model.md
+++ b/docs/getting-started-own-model.md
@@ -27,7 +27,7 @@ sudo chmod +x /usr/local/bin/cog
 To configure your project for use with Cog, you'll need to add two files:
 
 - [`cog.yaml`](yaml.md) defines system requirements, Python package dependencies, etc
-- [`run.py`](python.md) describes the prediction interface for your model
+- [`run.py`](python.md) describes the run interface for your model
 
 Use the `cog init` command to generate these files in your project:
 
@@ -74,9 +74,9 @@ This is handy for ensuring a consistent environment for development or training.
 
 With `cog.yaml`, you can also install system packages and other things. [Take a look at the full reference to see what else you can do.](yaml.md)
 
-## Define how to run predictions
+## Define how to run your model
 
-The next step is to update `run.py` to define the interface for running predictions on your model. The `run.py` generated by `cog init` looks something like this:
+The next step is to update `run.py` to define the interface for running your model. The `run.py` generated by `cog init` looks something like this:
 
 ```python
 from cog import BaseRunner, Path, Input
@@ -84,21 +84,21 @@ import torch
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.net = torch.load("weights.pth")
 
     def run(self,
             image: Path = Input(description="Image to enlarge"),
             scale: float = Input(description="Factor to scale image by", default=1.5)
     ) -> Path:
-        """Run a single prediction on the model"""
+        """Run the model"""
         # ... pre-processing ...
         output = self.net(input)
         # ... post-processing ...
         return output
 ```
 
-Edit your `run.py` file and fill in the functions with your own model's setup and prediction code. You might need to import parts of your model from another file.
+Edit your `run.py` file and fill in the functions with your own model's setup and run code. You might need to import parts of your model from another file.
 
 You also need to define the inputs to your model as arguments to the `run()` function, as demonstrated above. For each argument, you need to annotate with a type. The supported types are:
 
@@ -121,7 +121,7 @@ You can provide more information about the input with the `Input()` function, as
 - `choices`: For `str` or `int` types, a list of possible values for this input.
 - `deprecated`: Mark this input as deprecated with a message explaining what to use instead.
 
-There are some more advanced options you can pass, too. For more details, [take a look at the prediction interface documentation](python.md).
+There are some more advanced options you can pass, too. For more details, [take a look at the run interface documentation](python.md).
 
 Next, add the line `run: "run.py:Runner"` to your `cog.yaml`, so it looks something like this:
 
@@ -132,7 +132,7 @@ build:
 run: "run.py:Runner"
 ```
 
-That's it! To test this works, try running a prediction on the model:
+That's it! To test this works, try running the model:
 
 ```
 $ cog run -i image=@input.jpg
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 04fd72acd3..db9477da10 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -85,11 +85,11 @@ Type "help", "copyright", "credits" or "license" for more information.
 
 Inside this Docker environment you can do anything – run a Jupyter notebook, your training script, your evaluation script, and so on.
 
-## Run predictions on a model
+## Run a model
 
-Let's pretend we've trained a model. With Cog, we can define how to run predictions on it in a standard way, so other people can easily run predictions on it without having to hunt around for a prediction script.
+Let's pretend we've trained a model. With Cog, we can define how to run it in a standard way, so other people can easily run it without having to hunt around for a run script.
 
-We need to write some code to describe how predictions are run on the model.
+We need to write some code to describe how the model runs.
 
 Save this to `run.py`:
 
@@ -107,13 +107,13 @@ WEIGHTS = models.ResNet50_Weights.IMAGENET1K_V1
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model = models.resnet50(weights=WEIGHTS).to(self.device)
         self.model.eval()
 
     def run(self, image: Path = Input(description="Image to classify")) -> dict:
-        """Run a single prediction on the model"""
+        """Run the model"""
         img = Image.open(image).convert("RGB")
         preds = self.model(WEIGHTS.transforms()(img).unsqueeze(0).to(self.device))
         top3 = preds[0].softmax(0).topk(3)
@@ -174,7 +174,7 @@ Note: The first time you run `cog run`, the build process will be triggered to g
 
 ## Build an image
 
-We can bake your model's code, the trained weights, and the Docker environment into a Docker image. This image serves predictions with an HTTP server, and can be deployed to anywhere that Docker runs to serve real-time predictions.
+We can bake your model's code, the trained weights, and the Docker environment into a Docker image. This image runs an HTTP server, and can be deployed anywhere that Docker runs to serve real-time inference.
 
 ```bash
 cog build -t resnet
diff --git a/docs/llms.txt b/docs/llms.txt
index c7b328d8c5..9c23e24c65 100644
--- a/docs/llms.txt
+++ b/docs/llms.txt
@@ -12,7 +12,7 @@ You can deploy your packaged model to your own infrastructure, or to [Replicate]
 
 - ✅ **Define the inputs and outputs for your model with standard Python.** Then, Cog generates an OpenAPI schema and validates the inputs and outputs.
 
-- 🎁 **Automatic HTTP prediction server**: Your model's types are used to dynamically generate a RESTful HTTP API using a high-performance Rust/Axum server.
+- 🎁 **Automatic HTTP inference server**: Your model's types are used to dynamically generate a RESTful HTTP API using a high-performance Rust/Axum server.
 
 - 🚀 **Ready for production.** Deploy your model anywhere that Docker images run. Your own infrastructure, or [Replicate](https://replicate.com).
 
@@ -31,7 +31,7 @@ build:
 run: "run.py:Runner"
 ```
 
-Define how predictions are run on your model with `run.py`:
+Define how your model runs with `run.py`:
 
 ```python
 from cog import BaseRunner, Input, Path
@@ -39,14 +39,14 @@ import torch
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.model = torch.load("./weights.pth")
 
     # The arguments and types the model takes as input
     def run(self,
           image: Path = Input(description="Grayscale input image")
     ) -> Path:
-        """Run a single prediction on the model"""
+        """Run the model"""
         processed_image = preprocess(image)
         output = self.model(processed_image)
         return postprocess(output)
@@ -54,12 +54,12 @@ class Runner(BaseRunner):
 
 In the above we accept a path to the image as an input, and return a path to our transformed image after running it through our model.
 
-Now, you can run predictions on this model:
+Now, you can run the model:
 
 ```console
 $ cog run -i image=@input.jpg
 --> Building Docker image...
---> Running Prediction...
+--> Running...
 --> Output written to output.jpg
 ```
 
@@ -411,13 +411,13 @@ cog push [IMAGE] [flags]
 
 ## `cog run`
 
-Run a prediction.
+Run the model.
 
-If 'image' is passed, it will run the prediction on that Docker image.
+If 'image' is passed, it will run the model on that Docker image.
 It must be an image that has been built by Cog.
 
 Otherwise, it will build the model in the current directory and run
-the prediction on that.
+it.
 
 ```
 cog run [image] [flags]
@@ -426,7 +426,7 @@ cog run [image] [flags]
 **Examples**
 
 ```
-  # Run a prediction with named inputs
+  # Run the model with named inputs
   cog run -i prompt="a photo of a cat"
 
   # Pass a file as input
@@ -464,7 +464,7 @@ cog run [image] [flags]
 
 ## `cog serve`
 
-Run a prediction HTTP server.
+Run an HTTP server.
 
 Builds the model and starts an HTTP server that exposes the model's inputs
 and outputs as a REST API. Compatible with the Cog HTTP protocol.
@@ -508,11 +508,11 @@ cog serve [flags]
 # Deploy models with Cog
 
 Cog containers are Docker containers that serve an HTTP server
-for running predictions on your model.
+for running your model.
 You can deploy them anywhere that Docker containers run.
 
-The server inside Cog containers is **coglet**, a Rust-based prediction server
-that handles HTTP requests, worker process management, and prediction execution.
+The server inside Cog containers is **coglet**, a Rust-based inference server
+that handles HTTP requests, worker process management, and run execution.
 
 This guide assumes you have a model packaged with Cog.
 If you don't, [follow our getting started guide](getting-started-own-model.md),
@@ -526,7 +526,7 @@ First, build your model:
 cog build -t my-model
 ```
 
-You can serve predictions locally with `cog serve`:
+You can serve your model locally with `cog serve`:
 
 ```console
 cog serve
@@ -561,7 +561,7 @@ To stop the server, run:
 docker kill my-model
 ```
 
-To run a prediction on the model,
+To run the model,
 call the `/predictions` endpoint,
 passing input in the format expected by your model:
 
@@ -586,7 +586,7 @@ The response includes a `status` field with values like `STARTING`, `READY`, `BU
 
 ## Concurrency
 
-By default, the server processes one prediction at a time. To enable concurrent predictions, set the `concurrency.max` option in `cog.yaml`:
+By default, the server processes one run at a time. To enable concurrent runs, set the `concurrency.max` option in `cog.yaml`:
 
 ```yaml
 concurrency:
@@ -652,7 +652,7 @@ The `dist` option searches for wheels in:
 
 ### `COGLET_WHEEL`
 
-Controls which coglet wheel is installed in the Docker image. Coglet is the Rust-based prediction server.
+Controls which coglet wheel is installed in the Docker image. Coglet is the Rust-based inference server.
 
 **Supported values:** Same as `COG_SDK_WHEEL`
 
@@ -754,7 +754,7 @@ sudo chmod +x /usr/local/bin/cog
 To configure your project for use with Cog, you'll need to add two files:
 
 - [`cog.yaml`](yaml.md) defines system requirements, Python package dependencies, etc
-- [`run.py`](python.md) describes the prediction interface for your model
+- [`run.py`](python.md) describes the run interface for your model
 
 Use the `cog init` command to generate these files in your project:
 
@@ -801,9 +801,9 @@ This is handy for ensuring a consistent environment for development or training.
 
 With `cog.yaml`, you can also install system packages and other things. [Take a look at the full reference to see what else you can do.](yaml.md)
 
-## Define how to run predictions
+## Define how to run your model
 
-The next step is to update `run.py` to define the interface for running predictions on your model. The `run.py` generated by `cog init` looks something like this:
+The next step is to update `run.py` to define the interface for running your model. The `run.py` generated by `cog init` looks something like this:
 
 ```python
 from cog import BaseRunner, Path, Input
@@ -811,21 +811,21 @@ import torch
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.net = torch.load("weights.pth")
 
     def run(self,
             image: Path = Input(description="Image to enlarge"),
             scale: float = Input(description="Factor to scale image by", default=1.5)
     ) -> Path:
-        """Run a single prediction on the model"""
+        """Run the model"""
         # ... pre-processing ...
         output = self.net(input)
         # ... post-processing ...
         return output
 ```
 
-Edit your `run.py` file and fill in the functions with your own model's setup and prediction code. You might need to import parts of your model from another file.
+Edit your `run.py` file and fill in the functions with your own model's setup and run code. You might need to import parts of your model from another file.
 
 You also need to define the inputs to your model as arguments to the `run()` function, as demonstrated above. For each argument, you need to annotate with a type. The supported types are:
 
@@ -848,7 +848,7 @@ You can provide more information about the input with the `Input()` function, as
 - `choices`: For `str` or `int` types, a list of possible values for this input.
 - `deprecated`: Mark this input as deprecated with a message explaining what to use instead.
 
-There are some more advanced options you can pass, too. For more details, [take a look at the prediction interface documentation](python.md).
+There are some more advanced options you can pass, too. For more details, [take a look at the run interface documentation](python.md).
 
 Next, add the line `run: "run.py:Runner"` to your `cog.yaml`, so it looks something like this:
 
@@ -859,7 +859,7 @@ build:
 run: "run.py:Runner"
 ```
 
-That's it! To test this works, try running a prediction on the model:
+That's it! To test this works, try running the model:
 
 ```
 $ cog run -i image=@input.jpg
@@ -989,11 +989,11 @@ Type "help", "copyright", "credits" or "license" for more information.
 
 Inside this Docker environment you can do anything – run a Jupyter notebook, your training script, your evaluation script, and so on.
 
-## Run predictions on a model
+## Run a model
 
-Let's pretend we've trained a model. With Cog, we can define how to run predictions on it in a standard way, so other people can easily run predictions on it without having to hunt around for a prediction script.
+Let's pretend we've trained a model. With Cog, we can define how to run it in a standard way, so other people can easily run it without having to hunt around for a run script.
 
-We need to write some code to describe how predictions are run on the model.
+We need to write some code to describe how the model runs.
 
 Save this to `run.py`:
 
@@ -1011,13 +1011,13 @@ WEIGHTS = models.ResNet50_Weights.IMAGENET1K_V1
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model = models.resnet50(weights=WEIGHTS).to(self.device)
         self.model.eval()
 
     def run(self, image: Path = Input(description="Image to classify")) -> dict:
-        """Run a single prediction on the model"""
+        """Run the model"""
         img = Image.open(image).convert("RGB")
         preds = self.model(WEIGHTS.transforms()(img).unsqueeze(0).to(self.device))
         top3 = preds[0].softmax(0).topk(3)
@@ -1078,7 +1078,7 @@ Note: The first time you run `cog run`, the build process will be triggered to g
 
 ## Build an image
 
-We can bake your model's code, the trained weights, and the Docker environment into a Docker image. This image serves predictions with an HTTP server, and can be deployed to anywhere that Docker runs to serve real-time predictions.
+We can bake your model's code, the trained weights, and the Docker environment into a Docker image. This image runs an HTTP server, and can be deployed anywhere that Docker runs to serve real-time inference.
 
 ```bash
 cog build -t resnet
@@ -1716,7 +1716,7 @@ Using a secret mount allows the private registry credentials to be securely pass
 
 # Run interface reference
 
-This document defines the API of the `cog` Python module, which is used to define the interface for running predictions on your model.
+This document defines the API of the `cog` Python module, which is used to define the interface for running your model.
 
 > [!TIP]
 > Run [`cog init`](getting-started-own-model.md#initialization) to generate an annotated `run.py` file that can be used as a starting point for setting up your model.
@@ -1765,7 +1765,7 @@ This document defines the API of the `cog` Python module, which is used to defin
 
 ## `BaseRunner`
 
-You define how Cog runs predictions on your model by defining a class that inherits from `BaseRunner`. It looks something like this:
+You define how Cog runs your model by defining a class that inherits from `BaseRunner`. It looks something like this:
 
 ```python
 from cog import BaseRunner, Path, Input
@@ -1773,14 +1773,14 @@ import torch
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.model = torch.load("weights.pth")
 
     def run(self,
             image: Path = Input(description="Image to enlarge"),
             scale: float = Input(description="Factor to scale image by", default=1.5)
     ) -> Path:
-        """Run a single prediction on the model"""
+        """Run the model"""
         # ... pre-processing ...
         output = self.model(image)
         # ... post-processing ...
@@ -1793,7 +1793,7 @@ Your Runner class should define two methods: `setup()` and `run()`.
 
 ### `Runner.setup()`
 
-Prepare the model so multiple predictions run efficiently.
+Prepare the model so multiple runs are efficient.
 
 Use this _optional_ method to include expensive one-off operations like loading trained models, instantiating data transformations, etc.
 
@@ -1817,7 +1817,7 @@ While this will increase your image size and build time, it offers other advanta
 
 ### `Runner.run(**kwargs)`
 
-Run a single prediction.
+Run the model.
 
 This _required_ method is where you call the model that was loaded during `setup()`, but you may also want to add pre- and post-processing code here.
 
@@ -1843,7 +1843,7 @@ class Runner(BaseRunner):
         return "hello world";
 ```
 
-Models that have an async `run()` function can run predictions concurrently, up to the limit specified by [`concurrency.max`](yaml.md#max) in cog.yaml. Attempting to exceed this limit will return a 409 Conflict response.
+Models that have an async `run()` function can run concurrently, up to the limit specified by [`concurrency.max`](yaml.md#max) in cog.yaml. Attempting to exceed this limit will return a 409 Conflict response.
 
 ## `Input(**kwargs)`
 
@@ -1941,12 +1941,12 @@ from cog import BaseRunner, Path
 
 class Runner(BaseRunner):
     def run(self) -> list[Path]:
-        predictions = ["foo", "bar", "baz"]
+        items = ["foo", "bar", "baz"]
         output = []
-        for i, prediction in enumerate(predictions):
+        for i, item in enumerate(items):
             out_path = Path(f"/tmp/out-{i}.txt")
             with out_path.open("w") as f:
-                f.write(prediction)
+                f.write(item)
             output.append(out_path)
         return output
 ```
@@ -2031,7 +2031,7 @@ class Runner(BaseRunner):
 
 ## Metrics
 
-You can record custom metrics from your `run()` function to track model-specific data like token counts, timing breakdowns, or confidence scores. Metrics are included in the prediction response alongside the output.
+You can record custom metrics from your `run()` function to track model-specific data like token counts, timing breakdowns, or confidence scores. Metrics are included in the response alongside the output.
 
 ### Recording metrics
 
@@ -2056,7 +2056,7 @@ self.scope.metrics["token_count"] = 42
 del self.scope.metrics["token_count"]
 ```
 
-Metrics appear in the prediction response `metrics` field:
+Metrics appear in the response `metrics` field:
 
 ```json
 {
@@ -2153,11 +2153,11 @@ del self.scope.metrics["count"]
 self.record_metric("count", "now a string")
 ```
 
-Outside an active prediction, `self.record_metric()` and `self.scope` are silent no-ops — no need for `None` checks.
+Outside an active run, `self.record_metric()` and `self.scope` are silent no-ops — no need for `None` checks.
 
 ## Cancellation
 
-When a prediction is canceled (via the [cancel HTTP endpoint](http.md#post-predictionsprediction_idcancel) or a dropped connection), the Cog runtime interrupts the running `run()` function. The exception raised depends on whether the runner is sync or async:
+When a run is canceled (via the [cancel HTTP endpoint](http.md#post-predictionsprediction_idcancel) or a dropped connection), the Cog runtime interrupts the running `run()` function. The exception raised depends on whether the runner is sync or async:
 
 | Runner type             | Exception raised         |
 | ----------------------- | ------------------------ |
@@ -2170,9 +2170,9 @@ When a prediction is canceled (via the [cancel HTTP endpoint](http.md#post-predi
 from cog import CancelationException
 ```
 
-`CancelationException` is raised in **sync** runners when a prediction is cancelled. It is a `BaseException` subclass — **not** an `Exception` subclass. This means bare `except Exception` blocks in your run code will not accidentally catch it, matching the behavior of `KeyboardInterrupt` and `asyncio.CancelledError`.
+`CancelationException` is raised in **sync** runners when a run is canceled. It is a `BaseException` subclass — **not** an `Exception` subclass. This means bare `except Exception` blocks in your run code will not accidentally catch it, matching the behavior of `KeyboardInterrupt` and `asyncio.CancelledError`.
 
-You do **not** need to handle this exception in normal runner code — the runtime manages cancellation automatically. However, if you need to run cleanup logic when a prediction is cancelled, you can catch it explicitly:
+You do **not** need to handle this exception in normal runner code — the runtime manages cancellation automatically. However, if you need to run cleanup logic when a run is canceled, you can catch it explicitly:
 
 ```python
 from cog import BaseRunner, CancelationException, Path
@@ -2187,7 +2187,7 @@ class Runner(BaseRunner):
 ```
 
 > [!WARNING]
-> You **must** re-raise `CancelationException` after cleanup. Swallowing it will prevent the runtime from marking the prediction as canceled, and may result in the termination of the container.
+> You **must** re-raise `CancelationException` after cleanup. Swallowing it will prevent the runtime from marking the run as canceled, and may result in the termination of the container.
 
 `CancelationException` is available as:
 
@@ -2220,7 +2220,7 @@ These types can be used directly as input parameter types and output return type
 
 `cog.Path` is a subclass of Python's [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html#basic-use) and can be used as a drop-in replacement. Any `os.PathLike` subclass is also accepted as an input type and treated as `cog.Path`.
 
-For models that return a `cog.Path` object, the prediction output returned by Cog's built-in HTTP server will be a URL.
+For models that return a `cog.Path` object, the output returned by Cog's built-in HTTP server will be a URL.
 
 This example takes an input file, resizes it, and returns the resized image:
 
@@ -2244,7 +2244,7 @@ class Runner(BaseRunner):
 > [!WARNING]  
 > `cog.File` is deprecated and will be removed in a future version of Cog. Use [`cog.Path`](#cogpath) instead.
 
-`cog.File` represents a _file handle_. For models that return a `cog.File` object, the prediction output returned by Cog's built-in HTTP server will be a URL.
+`cog.File` represents a _file handle_. For models that return a `cog.File` object, the output returned by Cog's built-in HTTP server will be a URL.
 
 ```python
 from cog import BaseRunner, File, Input
@@ -2285,7 +2285,7 @@ A runner's `Secret` inputs are represented in OpenAPI with the following schema:
 }
 ```
 
-Models uploaded to Replicate treat secret inputs differently throughout its system. When you create a prediction on Replicate, any value passed to a `Secret` input is redacted after being sent to the model.
+Replicate treats secret inputs to uploaded models differently throughout its system. When you create a run on Replicate, any value passed to a `Secret` input is redacted after being sent to the model.
 
 > [!WARNING]  
 > Passing secret values to untrusted models can result in
@@ -2363,12 +2363,12 @@ from cog import BaseRunner, Path
 
 class Runner(BaseRunner):
     def run(self) -> list[Path]:
-        predictions = ["foo", "bar", "baz"]
+        items = ["foo", "bar", "baz"]
         output = []
-        for i, prediction in enumerate(predictions):
+        for i, item in enumerate(items):
             out_path = Path(f"/tmp/out-{i}.txt")
             with out_path.open("w") as f:
-                f.write(prediction)
+                f.write(item)
             output.append(out_path)
         return output
 ```
@@ -2508,7 +2508,7 @@ Cog's training API allows you to define a fine-tuning interface for an existing
 
 ## How it works
 
-If you've used Cog before, you've probably seen the [Runner](./python.md) class, which defines the interface for creating predictions against your model. Cog's training API works similarly: You define a Python function that describes the inputs and outputs of the training process. The inputs are things like training data, epochs, batch size, seed, etc. The output is typically a file with the fine-tuned weights.
+If you've used Cog before, you've probably seen the [Runner](./python.md) class, which defines the interface for running your model. Cog's training API works similarly: You define a Python function that describes the inputs and outputs of the training process. The inputs are things like training data, epochs, batch size, seed, etc. The output is typically a file with the fine-tuned weights.
 
 `cog.yaml`:
 
@@ -2807,12 +2807,12 @@ cog --version # should output the cog version number.
 Finally, make sure it works. Let's try running `afiaka87/glid-3-xl` locally:
 
 ```bash
-cog run 'r8.im/afiaka87/glid-3-xl' -i prompt="a fresh avocado floating in the water" -o prediction.json
+cog run 'r8.im/afiaka87/glid-3-xl' -i prompt="a fresh avocado floating in the water" -o output.json
 ```
 
-![Output from a running cog prediction in Windows Terminal](images/cog_model_output.png)
+![Output from a cog run in Windows Terminal](images/cog_model_output.png)
 
-While your prediction is running, you can use `Task Manager` to keep an eye on GPU memory consumption:
+While your run is executing, you can use `Task Manager` to keep an eye on GPU memory consumption:
 
 ![Windows task manager will show the shared host/guest GPU memory](images/memory-usage.png)
 
@@ -2826,16 +2826,16 @@ Notice that output is returned as JSON for this model as it has a complex return
 sudo apt install jq
 ```
 
-The following bash uses `jq` to grab the first element in our prediction array and converts it from a base64 string to a `png` file.
+The following bash uses `jq` to grab the first element in our output array and converts it from a base64 string to a `png` file.
 
 ```bash
-jq -cs '.[0][0][0]' prediction.json | cut --delimiter "," --field 2 | base64 --ignore-garbage --decode > prediction.png
+jq -cs '.[0][0][0]' output.json | cut --delimiter "," --field 2 | base64 --ignore-garbage --decode > output.png
 ```
 
 When using WSL 2, you can access Windows binaries with the `.exe` extension. This lets you open photos easily within linux.
 
 ```bash
-explorer.exe prediction.png
+explorer.exe output.png
 ```
 
 ![a square image of an avocado, generated by the model](images/glide_out.png)
@@ -2853,7 +2853,7 @@ explorer.exe prediction.png
 
 # `cog.yaml` reference
 
-`cog.yaml` defines how to build a Docker image and how to run predictions on your model inside that image.
+`cog.yaml` defines how to build a Docker image and how to run your model inside that image.
 
 It has three keys: [`build`](#build), [`image`](#image), and [`run`](#run). It looks a bit like this:
 
@@ -3047,7 +3047,7 @@ This stanza describes the concurrency capabilities of the model. It has one opti
 
 ### `max`
 
-The maximum number of concurrent predictions the model can process. If this is set, the model must specify an [async `run()` method](python.md#async-runners-and-concurrency).
+The maximum number of concurrent runs the model can process. If this is set, the model must specify an [async `run()` method](python.md#async-runners-and-concurrency).
 
 For example:
 
diff --git a/docs/python.md b/docs/python.md
index f5f3bf1b8a..c86ac4b9b4 100644
--- a/docs/python.md
+++ b/docs/python.md
@@ -1,6 +1,6 @@
 # Run interface reference
 
-This document defines the API of the `cog` Python module, which is used to define the interface for running predictions on your model.
+This document defines the API of the `cog` Python module, which is used to define the interface for running your model.
 
 > [!TIP]
 > Run [`cog init`](getting-started-own-model.md#initialization) to generate an annotated `run.py` file that can be used as a starting point for setting up your model.
@@ -49,7 +49,7 @@ This document defines the API of the `cog` Python module, which is used to defin
 
 ## `BaseRunner`
 
-You define how Cog runs predictions on your model by defining a class that inherits from `BaseRunner`. It looks something like this:
+You define how Cog runs your model by defining a class that inherits from `BaseRunner`. It looks something like this:
 
 ```python
 from cog import BaseRunner, Path, Input
@@ -57,14 +57,14 @@ import torch
 
 class Runner(BaseRunner):
     def setup(self):
-        """Load the model into memory to make running multiple predictions efficient"""
+        """Load the model into memory to make running multiple inferences efficient"""
         self.model = torch.load("weights.pth")
 
     def run(self,
             image: Path = Input(description="Image to enlarge"),
             scale: float = Input(description="Factor to scale image by", default=1.5)
     ) -> Path:
-        """Run a single prediction on the model"""
+        """Run the model"""
         # ... pre-processing ...
         output = self.model(image)
         # ... post-processing ...
@@ -77,7 +77,7 @@ Your Runner class should define two methods: `setup()` and `run()`.
 
 ### `Runner.setup()`
 
-Prepare the model so multiple predictions run efficiently.
+Prepare the model so multiple runs are efficient.
 
 Use this _optional_ method to include expensive one-off operations like loading trained models, instantiating data transformations, etc.
 
@@ -101,7 +101,7 @@ While this will increase your image size and build time, it offers other advanta
 
 ### `Runner.run(**kwargs)`
 
-Run a single prediction.
+Run the model.
 
 This _required_ method is where you call the model that was loaded during `setup()`, but you may also want to add pre- and post-processing code here.
 
@@ -127,7 +127,7 @@ class Runner(BaseRunner):
         return "hello world";
 ```
 
-Models that have an async `run()` function can run predictions concurrently, up to the limit specified by [`concurrency.max`](yaml.md#max) in cog.yaml. Attempting to exceed this limit will return a 409 Conflict response.
+Models that have an async `run()` function can process multiple runs concurrently, up to the limit specified by [`concurrency.max`](yaml.md#max) in cog.yaml. Attempting to exceed this limit will return a 409 Conflict response.
 
 ## `Input(**kwargs)`
 
@@ -225,12 +225,12 @@ from cog import BaseRunner, Path
 
 class Runner(BaseRunner):
     def run(self) -> list[Path]:
-        predictions = ["foo", "bar", "baz"]
+        items = ["foo", "bar", "baz"]
         output = []
-        for i, prediction in enumerate(predictions):
+        for i, item in enumerate(items):
             out_path = Path(f"/tmp/out-{i}.txt")
             with out_path.open("w") as f:
-                f.write(prediction)
+                f.write(item)
             output.append(out_path)
         return output
 ```
@@ -315,7 +315,7 @@ class Runner(BaseRunner):
 
 ## Metrics
 
-You can record custom metrics from your `run()` function to track model-specific data like token counts, timing breakdowns, or confidence scores. Metrics are included in the prediction response alongside the output.
+You can record custom metrics from your `run()` function to track model-specific data like token counts, timing breakdowns, or confidence scores. Metrics are included in the response alongside the output.
 
 ### Recording metrics
 
@@ -340,7 +340,7 @@ self.scope.metrics["token_count"] = 42
 del self.scope.metrics["token_count"]
 ```
 
-Metrics appear in the prediction response `metrics` field:
+Metrics appear in the response `metrics` field:
 
 ```json
 {
@@ -437,11 +437,11 @@ del self.scope.metrics["count"]
 self.record_metric("count", "now a string")
 ```
 
-Outside an active prediction, `self.record_metric()` and `self.scope` are silent no-ops — no need for `None` checks.
+Outside an active run, `self.record_metric()` and `self.scope` are silent no-ops — no need for `None` checks.
 
 ## Cancellation
 
-When a prediction is canceled (via the [cancel HTTP endpoint](http.md#post-predictionsprediction_idcancel) or a dropped connection), the Cog runtime interrupts the running `run()` function. The exception raised depends on whether the runner is sync or async:
+When a run is canceled (via the [cancel HTTP endpoint](http.md#post-predictionsprediction_idcancel) or a dropped connection), the Cog runtime interrupts the running `run()` function. The exception raised depends on whether the runner is sync or async:
 
 | Runner type             | Exception raised         |
 | ----------------------- | ------------------------ |
@@ -454,9 +454,9 @@ When a prediction is canceled (via the [cancel HTTP endpoint](http.md#post-predi
 from cog import CancelationException
 ```
 
-`CancelationException` is raised in **sync** runners when a prediction is cancelled. It is a `BaseException` subclass — **not** an `Exception` subclass. This means bare `except Exception` blocks in your run code will not accidentally catch it, matching the behavior of `KeyboardInterrupt` and `asyncio.CancelledError`.
+`CancelationException` is raised in **sync** runners when a run is canceled. It is a `BaseException` subclass — **not** an `Exception` subclass. This means bare `except Exception` blocks in your run code will not accidentally catch it, matching the behavior of `KeyboardInterrupt` and `asyncio.CancelledError`.
 
-You do **not** need to handle this exception in normal runner code — the runtime manages cancellation automatically. However, if you need to run cleanup logic when a prediction is cancelled, you can catch it explicitly:
+You do **not** need to handle this exception in normal runner code — the runtime manages cancellation automatically. However, if you need to run cleanup logic when a run is canceled, you can catch it explicitly:
 
 ```python
 from cog import BaseRunner, CancelationException, Path
@@ -471,7 +471,7 @@ class Runner(BaseRunner):
 ```
 
 > [!WARNING]
-> You **must** re-raise `CancelationException` after cleanup. Swallowing it will prevent the runtime from marking the prediction as canceled, and may result in the termination of the container.
+> You **must** re-raise `CancelationException` after cleanup. Swallowing it will prevent the runtime from marking the run as canceled, and may result in the termination of the container.
 
 `CancelationException` is available as:
 
@@ -504,7 +504,7 @@ These types can be used directly as input parameter types and output return type
 
 `cog.Path` is a subclass of Python's [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html#basic-use) and can be used as a drop-in replacement. Any `os.PathLike` subclass is also accepted as an input type and treated as `cog.Path`.
 
-For models that return a `cog.Path` object, the prediction output returned by Cog's built-in HTTP server will be a URL.
+For models that return a `cog.Path` object, the output returned by Cog's built-in HTTP server will be a URL.
 
 This example takes an input file, resizes it, and returns the resized image:
 
@@ -528,7 +528,7 @@ class Runner(BaseRunner):
 > [!WARNING]  
 > `cog.File` is deprecated and will be removed in a future version of Cog. Use [`cog.Path`](#cogpath) instead.
 
-`cog.File` represents a _file handle_. For models that return a `cog.File` object, the prediction output returned by Cog's built-in HTTP server will be a URL.
+`cog.File` represents a _file handle_. For models that return a `cog.File` object, the output returned by Cog's built-in HTTP server will be a URL.
 
 ```python
 from cog import BaseRunner, File, Input
@@ -569,7 +569,7 @@ A runner's `Secret` inputs are represented in OpenAPI with the following schema:
 }
 ```
 
-Models uploaded to Replicate treat secret inputs differently throughout its system. When you create a prediction on Replicate, any value passed to a `Secret` input is redacted after being sent to the model.
+Replicate treats the secret inputs of uploaded models differently throughout its system. When you create a run on Replicate, any value passed to a `Secret` input is redacted after being sent to the model.
 
 > [!WARNING]  
 > Passing secret values to untrusted models can result in
@@ -647,12 +647,12 @@ from cog import BaseRunner, Path
 
 class Runner(BaseRunner):
     def run(self) -> list[Path]:
-        predictions = ["foo", "bar", "baz"]
+        items = ["foo", "bar", "baz"]
         output = []
-        for i, prediction in enumerate(predictions):
+        for i, item in enumerate(items):
             out_path = Path(f"/tmp/out-{i}.txt")
             with out_path.open("w") as f:
-                f.write(prediction)
+                f.write(item)
             output.append(out_path)
         return output
 ```
diff --git a/docs/training.md b/docs/training.md
index 026f6dc52a..715485dc11 100644
--- a/docs/training.md
+++ b/docs/training.md
@@ -7,7 +7,7 @@ Cog's training API allows you to define a fine-tuning interface for an existing
 
 ## How it works
 
-If you've used Cog before, you've probably seen the [Runner](./python.md) class, which defines the interface for creating predictions against your model. Cog's training API works similarly: You define a Python function that describes the inputs and outputs of the training process. The inputs are things like training data, epochs, batch size, seed, etc. The output is typically a file with the fine-tuned weights.
+If you've used Cog before, you've probably seen the [Runner](./python.md) class, which defines the interface for running your model. Cog's training API works similarly: You define a Python function that describes the inputs and outputs of the training process. The inputs are things like training data, epochs, batch size, seed, etc. The output is typically a file with the fine-tuned weights.
 
 `cog.yaml`:
 
diff --git a/docs/wsl2/wsl2.md b/docs/wsl2/wsl2.md
index bc3316d58a..a04223896f 100644
--- a/docs/wsl2/wsl2.md
+++ b/docs/wsl2/wsl2.md
@@ -175,12 +175,12 @@ cog --version # should output the cog version number.
 Finally, make sure it works. Let's try running `afiaka87/glid-3-xl` locally:
 
 ```bash
-cog run 'r8.im/afiaka87/glid-3-xl' -i prompt="a fresh avocado floating in the water" -o prediction.json
+cog run 'r8.im/afiaka87/glid-3-xl' -i prompt="a fresh avocado floating in the water" -o output.json
 ```
 
-![Output from a running cog prediction in Windows Terminal](images/cog_model_output.png)
+![Output from a cog run in Windows Terminal](images/cog_model_output.png)
 
-While your prediction is running, you can use `Task Manager` to keep an eye on GPU memory consumption:
+While your run is executing, you can use `Task Manager` to keep an eye on GPU memory consumption:
 
 ![Windows task manager will show the shared host/guest GPU memory](images/memory-usage.png)
 
@@ -194,16 +194,16 @@ Notice that output is returned as JSON for this model as it has a complex return
 sudo apt install jq
 ```
 
-The following bash uses `jq` to grab the first element in our prediction array and converts it from a base64 string to a `png` file.
+The following bash uses `jq` to grab the first element in our output array and converts it from a base64 string to a `png` file.
 
 ```bash
-jq -cs '.[0][0][0]' prediction.json | cut --delimiter "," --field 2 | base64 --ignore-garbage --decode > prediction.png
+jq -cs '.[0][0][0]' output.json | cut --delimiter "," --field 2 | base64 --ignore-garbage --decode > output.png
 ```
 
 When using WSL 2, you can access Windows binaries with the `.exe` extension. This lets you open photos easily within linux.
 
 ```bash
-explorer.exe prediction.png
+explorer.exe output.png
 ```
 
 ![a square image of an avocado, generated by the model](images/glide_out.png)
diff --git a/docs/yaml.md b/docs/yaml.md
index 56fecdb923..535b9aafa4 100644
--- a/docs/yaml.md
+++ b/docs/yaml.md
@@ -1,6 +1,6 @@
 # `cog.yaml` reference
 
-`cog.yaml` defines how to build a Docker image and how to run predictions on your model inside that image.
+`cog.yaml` defines how to build a Docker image and how to run your model inside that image.
 
 It has three keys: [`build`](#build), [`image`](#image), and [`run`](#run). It looks a bit like this:
 
@@ -194,7 +194,7 @@ This stanza describes the concurrency capabilities of the model. It has one opti
 
 ### `max`
 
-The maximum number of concurrent predictions the model can process. If this is set, the model must specify an [async `run()` method](python.md#async-runners-and-concurrency).
+The maximum number of concurrent runs the model can process. If this is set, the model must specify an [async `run()` method](python.md#async-runners-and-concurrency).
 
 For example:
 
diff --git a/pkg/cli/predict.go b/pkg/cli/predict.go
index 51853136a1..a8c322215c 100644
--- a/pkg/cli/predict.go
+++ b/pkg/cli/predict.go
@@ -44,7 +44,7 @@ var (
 	inputJSON            string
 )
 
-const existingPredictExamples = `  # Run a prediction with named inputs
+const existingPredictExamples = `  # Run the model with named inputs
   cog predict -i prompt="a photo of a cat"
 
   # Pass a file as input
@@ -69,14 +69,15 @@ func newPredictCommand() *cobra.Command {
 func newPredictionCommand(use string, hidden bool) *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   use + " [image]",
-		Short: "Run a prediction",
-		Long: `Run a prediction.
+		Short: "Run the model",
+		Long: `Run the model.
 
-If 'image' is passed, it will run the prediction on that Docker image.
+If 'image' is passed, it will run the model on that Docker image.
 It must be an image that has been built by Cog.
 
 Otherwise, it will build the model in the current directory and run
-the prediction on that.`,
+it.`,
+
 		Example:    strings.ReplaceAll(existingPredictExamples, "cog predict", "cog "+use),
 		RunE:       cmdPredict,
 		Args:       cobra.MaximumNArgs(1),
@@ -84,7 +85,7 @@ the prediction on that.`,
 		SuggestFor: []string{"infer"},
 	}
 	if hidden {
-		cmd.Short = "Run a prediction (deprecated, use cog run)"
+		cmd.Short = "Run the model (deprecated, use cog run)"
 	}
 
 	addUseCudaBaseImageFlag(cmd)
diff --git a/pkg/cli/serve.go b/pkg/cli/serve.go
index 11916ec0ff..b99562ce0c 100644
--- a/pkg/cli/serve.go
+++ b/pkg/cli/serve.go
@@ -23,8 +23,8 @@ var (
 func newServeCommand() *cobra.Command {
 	cmd := &cobra.Command{
 		Use:   "serve",
-		Short: "Run a prediction HTTP server",
-		Long: `Run a prediction HTTP server.
+		Short: "Run an HTTP server",
+		Long: `Run an HTTP server.
 
 Builds the model and starts an HTTP server that exposes the model's inputs
 and outputs as a REST API. Compatible with the Cog HTTP protocol.`,