Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 42 additions & 11 deletions packages/evals/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@
* Evals CLI entry point.
*
* Modes:
* - `evals` (no args) → interactive REPL
* - `evals run <target> …` → single-shot run with rich progress
* - `evals list [tier]` → list discovered tasks
* - `evals config [sub]` → print / get / set defaults
* - `evals experiments [sub]` → inspect / compare Braintrust runs
* - `evals new <tier> <cat> <name>` → scaffold a task file
* - `evals help` / `-h` → help
* - `evals` (no args) → interactive REPL
* - `evals --quiet` / `evals -q` → REPL with no banner / welcome / inline warnings
* - `evals run <target> …` → single-shot run with rich progress
* - `evals list [tier]` → list discovered tasks
* - `evals config [sub]` → print / get / set defaults
* - `evals experiments [sub]` → inspect / compare Braintrust runs
* - `evals doctor` / `health` → env-key + config + discovery health report
* - `evals new <tier> <cat> <name>`→ scaffold a task file
* - `evals help` / `-h` → help
*
* Env vars:
* - EVALS_NO_WELCOME=1 → suppress first-run welcome panel (REPL only)
*
* No child processes. All runs flow through framework/runEvals in-process.
*
Expand Down Expand Up @@ -89,14 +94,19 @@ const args = process.argv.slice(2);
process.on("SIGINT", () => void handleSignal("SIGINT"));
process.on("SIGTERM", () => void handleSignal("SIGTERM"));

// REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are
// REPL-only (they suppress chrome); other args route to the argv switch.
const isQuietFlag = (a: string): boolean => {
  // Exact, case-sensitive match on the long and short spellings only.
  switch (a) {
    case "--quiet":
    case "-q":
      return true;
    default:
      return false;
  }
};
const replLaunch = args.length === 0 || args.every(isQuietFlag);

// Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress
// handler that does cooperative-then-aggressive abort instead — this
// path is only active when the CLI is not launching the REPL.
//
// Note: raw mode disables the OS-level Ctrl+C → SIGINT translation,
// so we forward it ourselves.
let cleanupArgvInput = (): void => {};
if (args.length > 0 && process.stdin.isTTY) {
if (!replLaunch && args.length > 0 && process.stdin.isTTY) {
const readline = await import("node:readline");
const wasRaw = process.stdin.isRaw;
readline.emitKeypressEvents(process.stdin);
Expand All @@ -117,10 +127,16 @@ const args = process.argv.slice(2);
};
}

// Whether to write the first-run marker in `finally`. Help-only paths and
// the doctor command don't count as "first uses" — they're discovery
// actions. The REPL marks itself. Set by the dispatch outcome below.
let shouldMarkFirstRun = false;

try {
if (args.length === 0) {
if (replLaunch) {
const { startRepl } = await import("./tui/repl.js");
await startRepl(ENTRY_DIR);
const quiet = args.some(isQuietFlag);
await startRepl(ENTRY_DIR, { quiet });
return;
}

Expand All @@ -140,7 +156,7 @@ const args = process.argv.slice(2);
const tree = buildCommandTree();

const tokens = tokenizeArgv(args);
await dispatch(tree, tokens, {
const outcome = await dispatch(tree, tokens, {
entryDir: ENTRY_DIR,
getRegistry,
setRegistry: (r) => {
Expand All @@ -149,10 +165,25 @@ const args = process.argv.slice(2);
abortRef: null,
contextPath: null,
});

// Only count real handler invocations as "first use". Doctor is a
// diagnostic, not a first use; help/meta paths are discovery.
if (outcome.kind === "ran") {
const top = outcome.absolutePath[0];
shouldMarkFirstRun = top !== "doctor";
}
} catch (err) {
console.error(red(`Error: ${(err as Error).message}`));
process.exitCode = 1;
} finally {
if (shouldMarkFirstRun) {
try {
const { markFirstRunComplete } = await import("./tui/welcomeState.js");
markFirstRunComplete(ENTRY_DIR);
} catch {
// best-effort
}
}
cleanupArgvInput();
}
})();
8 changes: 8 additions & 0 deletions packages/evals/scripts/build-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ if (fs.existsSync(distConfigPath)) {
...existing.defaults,
};
}
// Preserve the first-run welcome marker across rebuilds so a contributor
// who's already seen the welcome on the dist path doesn't see it again
// after every `pnpm run build:cli`. If the source has _meta and dist
// doesn't (fresh dist install), the source value is inherited via the
// sourceConfig literal — already correct.
if (existing._meta) {
sourceConfig._meta = { ...sourceConfig._meta, ...existing._meta };
}
} catch {
// invalid existing config – overwrite entirely
}
Expand Down
70 changes: 70 additions & 0 deletions packages/evals/tests/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@ const SOURCE_CONFIG = path.join(
"evals.config.json",
);

// File-level snapshot/restore: any `evals run …` invocation through the
// real CLI writes `_meta.firstRunCompletedAt` into the source config
// (because the test runs in source mode). Restore at the end so the
// repo file stays pristine.
//
// The snapshot stays `undefined` until `beforeAll` has read the file
// successfully, so a failed read never leads to a bogus write-back.
let __fileLevelConfigSnapshot: string | undefined;
beforeAll(() => {
  __fileLevelConfigSnapshot = fs.readFileSync(SOURCE_CONFIG, "utf-8");
});
afterAll(() => {
  // Nothing was captured (the initial read threw): skip the restore rather
  // than passing `undefined` to writeFileSync, which would raise a second
  // error and obscure the original failure.
  if (__fileLevelConfigSnapshot === undefined) {
    return;
  }
  fs.writeFileSync(SOURCE_CONFIG, __fileLevelConfigSnapshot);
});

async function runCli(
args: string[],
): Promise<{ stdout: string; stderr: string; code: number }> {
Expand All @@ -38,6 +50,17 @@ async function runCli(
}
}

/**
 * Removes the top-level `_meta` entry from the source config file and
 * rewrites the file as 2-space-indented JSON followed by a newline.
 */
function resetSourceWelcomeMeta(): void {
  const rawText = fs.readFileSync(SOURCE_CONFIG, "utf-8");
  const parsed = JSON.parse(rawText);
  delete parsed._meta;
  const serialized = `${JSON.stringify(parsed, null, 2)}\n`;
  fs.writeFileSync(SOURCE_CONFIG, serialized);
}

/**
 * Reads the source config file and returns `_meta.firstRunCompletedAt`,
 * or `undefined` when the config has no `_meta` entry.
 */
function readSourceWelcomeCompletedAt(): string | undefined {
  const rawText = fs.readFileSync(SOURCE_CONFIG, "utf-8");
  const welcomeMeta = JSON.parse(rawText)._meta;
  if (welcomeMeta == null) {
    return undefined;
  }
  return welcomeMeta.firstRunCompletedAt;
}

describe("CLI entrypoint", () => {
it("shows help", async () => {
const { stdout, code } = await runCli(["-h"]);
Expand All @@ -59,6 +82,39 @@ describe("CLI entrypoint", () => {
expect(stdout).toContain("compare");
});

it("includes doctor in top-level help", async () => {
const { stdout, code } = await runCli(["-h"]);
expect(code).toBe(0);
expect(stdout).toContain("doctor");
});

it("shows doctor help via --help", async () => {
const { stdout, code } = await runCli(["doctor", "--help"]);
expect(code).toBe(0);
expect(stdout).toContain("evals doctor");
expect(stdout).toContain("--json");
// Hidden --probe flag must not appear
expect(stdout).not.toContain("--probe");
});

it("doctor --json emits a parseable report", async () => {
const { stdout, code } = await runCli(["doctor", "--json"]);
// --json always exits 0 regardless of verdict
expect(code).toBe(0);
const payload = JSON.parse(stdout);
expect(payload).toHaveProperty("verdict");
expect(payload).toHaveProperty("runtime.node");
expect(payload).toHaveProperty("keys.openai");
expect(Array.isArray(payload.reasons)).toBe(true);
});

it("health is an alias for doctor", async () => {
const { stdout, code } = await runCli(["health", "--json"]);
expect(code).toBe(0);
const payload = JSON.parse(stdout);
expect(payload).toHaveProperty("verdict");
});

it("shows experiments compare help", async () => {
const { stdout, code } = await runCli(["experiments", "compare", "-h"]);
expect(code).toBe(0);
Expand Down Expand Up @@ -141,6 +197,20 @@ describe("CLI entrypoint", () => {
},
);

it("does not mark first-run complete for nested help invocations", async () => {
resetSourceWelcomeMeta();

for (const args of [
["config", "set", "--help"],
["experiments", "compare", "--help"],
]) {
const { stdout, code } = await runCli(args);
expect(code).toBe(0);
expect(stdout).toContain("evals");
expect(readSourceWelcomeCompletedAt()).toBeUndefined();
}
});

// Regression: help interception must not reach into value positions.
// `config set <key> <value>` must surface a parse/value error, not silently
// print help — otherwise `--help` would be a magical sentinel anywhere.
Expand Down
Loading
Loading