browserbase · miguelg719 · May 13, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/packages/evals/cli.ts b/packages/evals/cli.ts
@@ -2,13 +2,18 @@
  * Evals CLI entry point.
  *
  * Modes:
- *   - `evals` (no args)          → interactive REPL
- *   - `evals run <target> …`     → single-shot run with rich progress
- *   - `evals list [tier]`        → list discovered tasks
- *   - `evals config [sub]`       → print / get / set defaults
- *   - `evals experiments [sub]`  → inspect / compare Braintrust runs
- *   - `evals new <tier> <cat> <name>` → scaffold a task file
- *   - `evals help` / `-h`        → help
+ *   - `evals` (no args)              → interactive REPL
+ *   - `evals --quiet` / `evals -q`   → REPL with no banner / welcome / inline warnings
+ *   - `evals run <target> …`         → single-shot run with rich progress
+ *   - `evals list [tier]`            → list discovered tasks
+ *   - `evals config [sub]`           → print / get / set defaults
+ *   - `evals experiments [sub]`      → inspect / compare Braintrust runs
+ *   - `evals doctor` / `health`      → env-key + config + discovery health report
+ *   - `evals new <tier> <cat> <name>`→ scaffold a task file
+ *   - `evals help` / `-h`            → help
+ *
+ * Env vars:
+ *   - EVALS_NO_WELCOME=1             → suppress first-run welcome panel (REPL only)
  *
  * No child processes. All runs flow through framework/runEvals in-process.
  *
@@ -89,14 +94,19 @@ const args = process.argv.slice(2);
   process.on("SIGINT", () => void handleSignal("SIGINT"));
   process.on("SIGTERM", () => void handleSignal("SIGTERM"));
 
+  // REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are
+  // REPL-only (they suppress chrome); other args route to the argv switch.
+  const isQuietFlag = (a: string): boolean => a === "--quiet" || a === "-q";
+  const replLaunch = args.length === 0 || args.every(isQuietFlag);
+
   // Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress
   // handler that does cooperative-then-aggressive abort instead — this
-  // path is only active when no arg-less REPL is running.
+  // path is only active when the REPL is not being launched.
   //
   // Note: raw mode disables the OS-level Ctrl+C → SIGINT translation,
   // so we forward it ourselves.
   let cleanupArgvInput = (): void => {};
-  if (args.length > 0 && process.stdin.isTTY) {
+  if (!replLaunch && args.length > 0 && process.stdin.isTTY) {
     const readline = await import("node:readline");
     const wasRaw = process.stdin.isRaw;
     readline.emitKeypressEvents(process.stdin);
@@ -117,10 +127,16 @@ const args = process.argv.slice(2);
     };
   }
 
+  // Whether to write the first-run marker in `finally`. Help-only paths and
+  // the doctor command don't count as "first uses" — they're discovery
+  // actions. The REPL marks itself. Set by the dispatch outcome below.
+  let shouldMarkFirstRun = false;
+
   try {
-    if (args.length === 0) {
+    if (replLaunch) {
       const { startRepl } = await import("./tui/repl.js");
-      await startRepl(ENTRY_DIR);
+      const quiet = args.some(isQuietFlag);
+      await startRepl(ENTRY_DIR, { quiet });
       return;
     }
 
@@ -140,7 +156,7 @@ const args = process.argv.slice(2);
     const tree = buildCommandTree();
 
     const tokens = tokenizeArgv(args);
-    await dispatch(tree, tokens, {
+    const outcome = await dispatch(tree, tokens, {
       entryDir: ENTRY_DIR,
       getRegistry,
       setRegistry: (r) => {
@@ -149,10 +165,25 @@ const args = process.argv.slice(2);
       abortRef: null,
       contextPath: null,
     });
+
+    // Only count real handler invocations as "first use". Doctor is a
+    // diagnostic, not a first use; help/meta paths are discovery.
+    if (outcome.kind === "ran") {
+      const top = outcome.absolutePath[0];
+      shouldMarkFirstRun = top !== "doctor";
+    }
   } catch (err) {
     console.error(red(`Error: ${(err as Error).message}`));
     process.exitCode = 1;
   } finally {
+    if (shouldMarkFirstRun) {
+      try {
+        const { markFirstRunComplete } = await import("./tui/welcomeState.js");
+        markFirstRunComplete(ENTRY_DIR);
+      } catch {
+        // best-effort
+      }
+    }
     cleanupArgvInput();
   }
 })();
diff --git a/packages/evals/scripts/build-cli.ts b/packages/evals/scripts/build-cli.ts
@@ -50,6 +50,14 @@ if (fs.existsSync(distConfigPath)) {
         ...existing.defaults,
       };
     }
+    // Preserve the first-run welcome marker across rebuilds so a contributor
+    // who's already seen the welcome on the dist path doesn't see it again
+    // after every `pnpm run build:cli`. If the source has _meta and dist
+    // doesn't (fresh dist install), the source value is inherited via the
+    // sourceConfig literal — already correct.
+    if (existing._meta) {
+      sourceConfig._meta = { ...sourceConfig._meta, ...existing._meta };
+    }
   } catch {
     // invalid existing config – overwrite entirely
   }

diff --git a/packages/evals/tests/cli.test.ts b/packages/evals/tests/cli.test.ts
@@ -15,6 +15,18 @@ const SOURCE_CONFIG = path.join(
   "evals.config.json",
 );
 
+// File-level snapshot/restore: the tests run the real CLI in source mode, so
+// any command that counts as a first use (e.g. `evals run …`) writes
+// `_meta.firstRunCompletedAt` into the source config, and some tests rewrite
+// that file directly. Restore the original bytes so the repo stays pristine.
+let __fileLevelConfigSnapshot: string;
+beforeAll(() => {
+  __fileLevelConfigSnapshot = fs.readFileSync(SOURCE_CONFIG, "utf-8");
+});
+afterAll(() => {
+  fs.writeFileSync(SOURCE_CONFIG, __fileLevelConfigSnapshot);
+});
+
 async function runCli(
   args: string[],
 ): Promise<{ stdout: string; stderr: string; code: number }> {
@@ -38,6 +50,17 @@ async function runCli(
   }
 }
 
+function resetSourceWelcomeMeta(): void {
+  const config = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8"));
+  delete config._meta;
+  fs.writeFileSync(SOURCE_CONFIG, JSON.stringify(config, null, 2) + "\n");
+}
+
+function readSourceWelcomeCompletedAt(): string | undefined {
+  const config = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8"));
+  return config._meta?.firstRunCompletedAt;
+}
+
 describe("CLI entrypoint", () => {
   it("shows help", async () => {
     const { stdout, code } = await runCli(["-h"]);
@@ -59,6 +82,39 @@ describe("CLI entrypoint", () => {
     expect(stdout).toContain("compare");
   });
 
+  it("includes doctor in top-level help", async () => {
+    const { stdout, code } = await runCli(["-h"]);
+    expect(code).toBe(0);
+    expect(stdout).toContain("doctor");
+  });
+
+  it("shows doctor help via --help", async () => {
+    const { stdout, code } = await runCli(["doctor", "--help"]);
+    expect(code).toBe(0);
+    expect(stdout).toContain("evals doctor");
+    expect(stdout).toContain("--json");
+    // Hidden --probe flag must not appear
+    expect(stdout).not.toContain("--probe");
+  });
+
+  it("doctor --json emits a parseable report", async () => {
+    const { stdout, code } = await runCli(["doctor", "--json"]);
+    // --json always exits 0 regardless of verdict
+    expect(code).toBe(0);
+    const payload = JSON.parse(stdout);
+    expect(payload).toHaveProperty("verdict");
+    expect(payload).toHaveProperty("runtime.node");
+    expect(payload).toHaveProperty("keys.openai");
+    expect(Array.isArray(payload.reasons)).toBe(true);
+  });
+
+  it("health is an alias for doctor", async () => {
+    const { stdout, code } = await runCli(["health", "--json"]);
+    expect(code).toBe(0);
+    const payload = JSON.parse(stdout);
+    expect(payload).toHaveProperty("verdict");
+  });
+
   it("shows experiments compare help", async () => {
     const { stdout, code } = await runCli(["experiments", "compare", "-h"]);
     expect(code).toBe(0);
@@ -141,6 +197,20 @@ describe("CLI entrypoint", () => {
     },
   );
 
+  it("does not mark first-run complete for nested help invocations", async () => {
+    resetSourceWelcomeMeta();
+
+    for (const args of [
+      ["config", "set", "--help"],
+      ["experiments", "compare", "--help"],
+    ]) {
+      const { stdout, code } = await runCli(args);
+      expect(code).toBe(0);
+      expect(stdout).toContain("evals");
+      expect(readSourceWelcomeCompletedAt()).toBeUndefined();
+    }
+  });
+
   // Regression: help interception must not reach into value positions.
   // `config set <key> <value>` must surface a parse/value error, not silently
   // print help — otherwise `--help` would be a magical sentinel anywhere.