remove some braintrust headers

This commit is contained in:
Pedro Rodrigues
2026-02-25 19:11:56 +00:00
parent 9b08864e94
commit e65642b752
10 changed files with 555 additions and 117 deletions

View File

@@ -76,16 +76,6 @@ mise run eval:upload
mise run --force eval:upload
```
Or directly (no caching, always runs):
```bash
cd packages/evals
npx tsx src/runner.ts
# Single scenario, baseline mode
EVAL_BASELINE=true EVAL_SCENARIO=auth-rls-new-project npx tsx src/runner.ts
```
## Baseline Mode
Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default,

View File

@@ -29,12 +29,13 @@ RUN npm --prefix packages/skills-build run build
# ---------- Stage 2: runtime ----------
FROM node:22-slim
# Install Docker CLI and curl (needed for supabase CLI install)
# Install Docker CLI, psql client, and curl (needed for supabase CLI install)
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
curl \
ca-certificates \
docker.io \
postgresql-client \
&& rm -rf /var/lib/apt/lists/*
# Install supabase CLI binary (pinned version)

View File

@@ -1,11 +1,21 @@
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join, resolve } from "node:path";
import { runAgent } from "./runner/agent.js";
import { uploadToBraintrust } from "./runner/braintrust.js";
import {
initBraintrustLogger,
logScenarioToLogger,
uploadToBraintrust,
} from "./runner/braintrust.js";
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js";
import { createWorkspace } from "./runner/scaffold.js";
import {
assertionsPassedScorer,
finalResultScorer,
referenceFilesUsageScorer,
skillUsageScorer,
} from "./runner/scorers.js";
import {
getKeys,
resetDB,
@@ -24,9 +34,11 @@ import type { EvalRunResult, EvalScenario } from "./types.js";
// ---------------------------------------------------------------------------
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";
const DEFAULT_SKILL = "supabase";
const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
const skillName = process.env.EVAL_SKILL ?? DEFAULT_SKILL;
const scenarioFilter = process.env.EVAL_SCENARIO;
const isBaseline = process.env.EVAL_BASELINE === "true";
const skillEnabled = !isBaseline;
@@ -107,12 +119,14 @@ async function runEval(
// 3. Run the agent
console.log(` Running agent (${model})...`);
const startedAt = Date.now();
const agentResult = await runAgent({
cwd: workspacePath,
prompt,
model,
timeout: AGENT_TIMEOUT,
skillEnabled,
skillName: skillEnabled ? skillName : undefined,
});
console.log(
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
@@ -149,6 +163,26 @@ async function runEval(
// 6. Build transcript summary
const summary = buildTranscriptSummary(agentResult.events);
// 7. Load expectedReferenceFiles from EVAL.ts (if declared)
const { expectedReferenceFiles = [] } = await import(evalFilePath).catch(
() => ({ expectedReferenceFiles: [] as string[] }),
);
// 8. Run scorers
const skillScore = skillUsageScorer(summary, skillName);
const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles);
const assertScore = assertionsPassedScorer({
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
status: testResult.passed ? "passed" : "failed",
} as EvalRunResult);
const finalScore = finalResultScorer({
status: testResult.passed ? "passed" : "failed",
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
passThreshold: passThreshold ?? undefined,
} as EvalRunResult);
const result: EvalRunResult = {
scenario: scenario.id,
agent: "claude-code",
@@ -166,6 +200,23 @@ async function runEval(
costUsd: summary.totalCostUsd ?? undefined,
prompt,
individualTests: testResult.individualTests,
startedAt,
durationApiMs: summary.totalDurationApiMs,
totalInputTokens: summary.totalInputTokens,
totalOutputTokens: summary.totalOutputTokens,
totalCacheReadTokens: summary.totalCacheReadTokens,
totalCacheCreationTokens: summary.totalCacheCreationTokens,
modelUsage: summary.modelUsage,
toolErrorCount: summary.toolErrorCount,
permissionDenialCount: summary.permissionDenialCount,
loadedSkills: summary.skills,
referenceFilesRead: summary.referenceFilesRead,
scores: {
skillUsage: skillScore.score,
referenceFilesUsage: refScore.score,
assertionsPassed: assertScore.score,
finalResult: finalScore.score,
},
};
// 7. Persist results
@@ -239,6 +290,9 @@ async function main() {
const results: EvalRunResult[] = [];
const transcripts = new Map<string, TranscriptSummary>();
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
const logger = braintrustUpload ? initBraintrustLogger() : undefined;
try {
for (const scenario of scenarios) {
// Reset the database before each scenario for a clean slate.
@@ -250,16 +304,22 @@ async function main() {
if (transcript) {
transcripts.set(result.scenario, transcript);
}
// Log immediately after each scenario for real-time visibility.
if (logger) {
logScenarioToLogger(logger, result, transcript);
}
}
} finally {
stopSupabase();
await logger?.flush();
}
// Use the results dir from the first result (all share the same timestamp)
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
printSummary(results, resultsDir);
if (process.env.BRAINTRUST_UPLOAD === "true") {
if (braintrustUpload) {
console.log("\nUploading to Braintrust...");
await uploadToBraintrust(results, {
model,

View File

@@ -26,6 +26,12 @@ export interface AgentRunResult {
* and has access to the local Supabase MCP server so it can apply migrations
* and query the real database. --strict-mcp-config ensures only the local
* Supabase instance is reachable — no host MCP servers leak in.
*
* --setting-sources project,local prevents skills from the user's global
* ~/.agents/skills/ from leaking into the eval environment.
*
* When skillEnabled, --agents injects the target skill directly into the
* agent's context, guaranteeing it is present (not just discoverable).
*/
export async function runAgent(opts: {
cwd: string;
@@ -33,6 +39,8 @@ export async function runAgent(opts: {
model: string;
timeout: number;
skillEnabled: boolean;
/** Skill name to inject via --agents (e.g. "supabase"). Used when skillEnabled. */
skillName?: string;
}): Promise<AgentRunResult> {
const start = Date.now();
@@ -62,10 +70,26 @@ export async function runAgent(opts: {
"--mcp-config",
mcpConfig,
"--strict-mcp-config",
// Prevent skills from the user's global ~/.agents/skills/ from leaking
// into the eval environment. Only workspace (project) and local sources
// are loaded, so the eval sees only what was explicitly installed.
"--setting-sources",
"project,local",
];
// Disable skills for baseline runs so the agent relies on innate knowledge
if (!opts.skillEnabled) {
if (opts.skillEnabled && opts.skillName) {
// Inject the target skill directly into the agent context via --agents.
// This guarantees the skill is embedded in the subagent's context at
// startup (not just available as a slash command).
const agentsDef = JSON.stringify({
main: {
description: `Supabase developer agent with ${opts.skillName} skill`,
skills: [opts.skillName],
},
});
args.push("--agents", agentsDef);
} else if (!opts.skillEnabled) {
// Baseline runs: disable all skills so the agent relies on innate knowledge
args.push("--disable-slash-commands");
}

View File

@@ -1,14 +1,119 @@
import assert from "node:assert";
import { init } from "braintrust";
import { init, initLogger, type Logger } from "braintrust";
import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js";
/** Convert a test name to a snake_case score key. */
function toScoreKey(name: string): string {
return `test_${name
.toLowerCase()
.replace(/[^a-z0-9]+/g, "_")
.replace(/^_|_$/g, "")}`;
/**
 * Initialize a Braintrust project logger for real-time per-scenario logging.
 * Call this once at startup and pass the logger to logScenarioToLogger().
 *
 * Requires BRAINTRUST_API_KEY and BRAINTRUST_PROJECT_ID in the environment;
 * fails fast with a clear message when either is missing.
 */
export function initBraintrustLogger(): Logger<true> {
  const { BRAINTRUST_API_KEY, BRAINTRUST_PROJECT_ID } = process.env;
  assert(BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
  assert(BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
  // asyncFlush batches log writes; main() awaits logger.flush() on shutdown.
  return initLogger({
    projectId: BRAINTRUST_PROJECT_ID,
    asyncFlush: true,
  });
}
/**
 * Log a single scenario result to the Braintrust project logger in real-time.
 * This runs alongside the experiment upload, giving immediate visibility in
 * the project log as each scenario completes.
 *
 * Creates one root span per scenario (named after the scenario and backdated
 * to r.startedAt when available). When a transcript with tool calls is
 * present, one child span is added per agent tool call.
 */
export function logScenarioToLogger(
  logger: Logger<true>,
  r: EvalRunResult,
  transcript?: TranscriptSummary,
): void {
  // Scores default to 0 when a scorer did not produce a value.
  const scores: Record<string, number> = {
    skill_usage: r.scores?.skillUsage ?? 0,
    reference_files_usage: r.scores?.referenceFilesUsage ?? 0,
    assertions_passed: r.scores?.assertionsPassed ?? 0,
    final_result: r.scores?.finalResult ?? 0,
  };
  const metadata: Record<string, unknown> = {
    agent: r.agent,
    model: r.model,
    skillEnabled: r.skillEnabled,
    testsPassed: r.testsPassed,
    testsTotal: r.testsTotal,
    toolCallCount: r.toolCallCount ?? 0,
    // Total context consumed: fresh input + cache reads + cache writes.
    contextWindowUsed:
      (r.totalInputTokens ?? 0) +
      (r.totalCacheReadTokens ?? 0) +
      (r.totalCacheCreationTokens ?? 0),
    totalOutputTokens: r.totalOutputTokens,
    modelUsage: r.modelUsage,
    toolErrorCount: r.toolErrorCount,
    permissionDenialCount: r.permissionDenialCount,
    loadedSkills: r.loadedSkills,
    referenceFilesRead: r.referenceFilesRead,
    ...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
    ...(r.error ? { error: r.error } : {}),
  };
  // startedAt is epoch ms; Braintrust span startTime is epoch seconds.
  const spanOptions = r.startedAt
    ? { name: r.scenario, startTime: r.startedAt / 1000 }
    : { name: r.scenario };
  // The two previous branches (with/without transcript) logged an identical
  // root-span payload and differed only in the child-span loop; unify them
  // into one traced call with an optional tool-call list.
  const toolCalls =
    transcript && transcript.toolCalls.length > 0 ? transcript.toolCalls : [];
  logger.traced((span) => {
    span.log({
      input: {
        scenario: r.scenario,
        prompt: r.prompt ?? "",
        skillEnabled: r.skillEnabled,
      },
      output: {
        status: r.status,
        agentOutput: r.agentOutput,
        filesModified: r.filesModified,
        testOutput: r.testOutput,
      },
      expected: { testsTotal: r.testsTotal },
      scores,
      metadata,
    });
    for (const tc of toolCalls) {
      span.traced(
        (childSpan) => {
          childSpan.log({
            input: { tool: tc.tool, args: tc.input },
            output: {
              preview: tc.outputPreview,
              isError: tc.isError,
              ...(tc.stderr ? { stderr: tc.stderr } : {}),
            },
            metadata: { toolUseId: tc.toolUseId },
          });
        },
        { name: `tool:${tc.tool}` },
      );
    }
  }, spanOptions);
}
/**
@@ -18,8 +123,8 @@ function toScoreKey(name: string): string {
* - input: scenario ID, prompt content, skillEnabled flag
* - output: status, agent output, files modified, test output
* - expected: total tests, pass threshold
* - scores: pass (0|1), test_pass_rate (0-1), per-test scores
* - metadata: model, duration, cost, tool call count, files modified
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
* - spans: one child span per agent tool call (when transcript available)
*/
export async function uploadToBraintrust(
@@ -50,20 +155,11 @@ export async function uploadToBraintrust(
for (const r of results) {
const transcript = opts.transcripts.get(r.scenario);
// Build per-test scores
const perTestScores: Record<string, number> = {};
if (r.individualTests) {
for (const [testName, didPass] of Object.entries(r.individualTests)) {
perTestScores[toScoreKey(testName)] = didPass ? 1 : 0;
}
}
const testPassRate = r.testsTotal > 0 ? r.testsPassed / r.testsTotal : 0;
const scores: Record<string, number> = {
pass: r.status === "passed" ? 1 : 0,
test_pass_rate: testPassRate,
...perTestScores,
skill_usage: r.scores?.skillUsage ?? 0,
reference_files_usage: r.scores?.referenceFilesUsage ?? 0,
assertions_passed: r.scores?.assertionsPassed ?? 0,
final_result: r.scores?.finalResult ?? 0,
};
const input = {
@@ -88,38 +184,52 @@ export async function uploadToBraintrust(
agent: r.agent,
model: r.model,
skillEnabled: r.skillEnabled,
duration: r.duration,
testsPassed: r.testsPassed,
testsTotal: r.testsTotal,
toolCallCount: r.toolCallCount ?? 0,
filesModified: r.filesModified,
contextWindowUsed:
(r.totalInputTokens ?? 0) +
(r.totalCacheReadTokens ?? 0) +
(r.totalCacheCreationTokens ?? 0),
totalOutputTokens: r.totalOutputTokens,
modelUsage: r.modelUsage,
toolErrorCount: r.toolErrorCount,
permissionDenialCount: r.permissionDenialCount,
loadedSkills: r.loadedSkills,
referenceFilesRead: r.referenceFilesRead,
...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
...(r.error ? { error: r.error } : {}),
};
if (transcript && transcript.toolCalls.length > 0) {
// Use traced() to create a root span with child spans for tool calls
experiment.traced(
(span) => {
span.log({ input, output, expected, scores, metadata });
const spanOptions = r.startedAt
? { name: r.scenario, startTime: r.startedAt / 1000 }
: { name: r.scenario };
for (const tc of transcript.toolCalls) {
span.traced(
(childSpan) => {
childSpan.log({
input: { tool: tc.tool, args: tc.input },
output: { preview: tc.outputPreview },
metadata: { toolUseId: tc.toolUseId },
});
},
{ name: `tool:${tc.tool}` },
);
}
},
{ name: r.scenario },
);
if (transcript && transcript.toolCalls.length > 0) {
experiment.traced((span) => {
span.log({ input, output, expected, scores, metadata });
for (const tc of transcript.toolCalls) {
span.traced(
(childSpan) => {
childSpan.log({
input: { tool: tc.tool, args: tc.input },
output: {
preview: tc.outputPreview,
isError: tc.isError,
...(tc.stderr ? { stderr: tc.stderr } : {}),
},
metadata: { toolUseId: tc.toolUseId },
});
},
{ name: `tool:${tc.tool}` },
);
}
}, spanOptions);
} else {
experiment.log({ input, output, expected, scores, metadata });
experiment.traced((span) => {
span.log({ input, output, expected, scores, metadata });
}, spanOptions);
}
}

View File

@@ -1,4 +1,3 @@
import { execFileSync } from "node:child_process";
import {
cpSync,
existsSync,
@@ -6,43 +5,21 @@ import {
mkdtempSync,
readdirSync,
rmSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { dirname, join, resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { join } from "node:path";
import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/** Resolve the `skills` binary from the evals package node_modules. */
function resolveSkillsBin(): string {
// __dirname is packages/evals/src/runner/ (or compiled equivalent)
// Walk up to packages/evals/ and into node_modules/.bin/skills
const bin = resolve(__dirname, "..", "..", "node_modules", ".bin", "skills");
if (existsSync(bin)) return bin;
throw new Error(`skills binary not found at ${bin}. Run npm install.`);
}
/** Walk up from cwd to find the repository root (contains skills/ and packages/). */
function findRepoRoot(): string {
let dir = process.cwd();
for (let i = 0; i < 10; i++) {
if (existsSync(join(dir, "skills")) && existsSync(join(dir, "packages"))) {
return dir;
}
const parent = resolve(dir, "..");
if (parent === dir) break;
dir = parent;
}
throw new Error("Could not find repository root (skills/ + packages/)");
}
/**
* Create an isolated workspace for an eval run.
*
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
* 2. Optionally install skills via the `skills` CLI so Claude Code can discover them
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts/EVAL.tsx)
* 2. Seed with the eval project's supabase/config.toml
*
* Skills are injected via the --agents flag in agent.ts (not installed into
* the workspace here). Combined with --setting-sources project,local, this
* prevents host ~/.agents/skills/ from leaking into the eval environment.
*
* Returns the path to the workspace and a cleanup function.
*/
@@ -50,10 +27,9 @@ export function createWorkspace(opts: {
evalDir: string;
skillEnabled: boolean;
}): { workspacePath: string; cleanup: () => void } {
const repoRoot = findRepoRoot();
const workspacePath = mkdtempSync(join(tmpdir(), "supabase-eval-"));
// Copy eval directory, excluding EVAL.ts (hidden from agent)
// Copy eval directory, excluding EVAL.ts/EVAL.tsx (hidden from agent)
const entries = readdirSync(opts.evalDir, { withFileTypes: true });
for (const entry of entries) {
if (entry.name === "EVAL.ts" || entry.name === "EVAL.tsx") continue;
@@ -62,6 +38,23 @@ export function createWorkspace(opts: {
cpSync(src, dest, { recursive: true });
}
// Add .mcp.json so the agent connects to the local Supabase MCP server
writeFileSync(
join(workspacePath, ".mcp.json"),
JSON.stringify(
{
mcpServers: {
"local-supabase": {
type: "http",
url: "http://localhost:54321/mcp",
},
},
},
null,
"\t",
),
);
// Seed the workspace with the eval project's supabase/config.toml so the
// agent can run `supabase db push` against the shared local instance without
// needing to run `supabase init` or `supabase start` first.
@@ -72,26 +65,6 @@ export function createWorkspace(opts: {
cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
}
// Install skills into the workspace via the `skills` CLI
if (opts.skillEnabled) {
const skillsDir = join(repoRoot, "skills");
if (existsSync(skillsDir)) {
const skillsBin = resolveSkillsBin();
const args = ["add", skillsDir, "-a", "claude-code", "-y"];
const skillFilter = process.env.EVAL_SKILL;
if (skillFilter) {
args.push("--skill", skillFilter);
}
execFileSync(skillsBin, args, {
cwd: workspacePath,
stdio: "pipe",
timeout: 60_000,
});
}
}
return {
workspacePath,
cleanup: () => {

View File

@@ -0,0 +1,94 @@
import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js";
/** Result of a single scorer: a normalized score plus debugging context. */
export interface ScoreResult {
  /** Scorer identifier, e.g. "skill_usage" (snake_case, used as Braintrust score key). */
  name: string;
  /** Normalized score in the range 0.0–1.0. */
  score: number;
  /** Optional scorer-specific context for debugging (expected vs actual, etc.). */
  metadata?: Record<string, unknown>;
}
/**
 * skillUsageScorer — binary score: 1 when the target skill was loaded into
 * the agent's context, 0 otherwise.
 *
 * Detection relies on the `skills` array captured from the system init event
 * of the NDJSON transcript. Because agent.ts passes
 * `--setting-sources project,local`, no host skills leak into that array,
 * making membership a reliable signal.
 */
export function skillUsageScorer(
  transcript: TranscriptSummary,
  skillName: string,
): ScoreResult {
  const loadedSkills = transcript.skills;
  const score = loadedSkills.some((s) => s === skillName) ? 1 : 0;
  return {
    name: "skill_usage",
    score,
    metadata: { loadedSkills, targetSkill: skillName },
  };
}
/**
 * referenceFilesUsageScorer — fraction of expected reference files the agent
 * actually read (1.0 with `skipped` metadata when none are expected).
 *
 * Reads are detected via Read tool calls whose file_path matches
 * "/.agents/skills/*\/references/". Each EVAL.ts declares its
 * expectedReferenceFiles, which should mirror the "Skill References
 * Exercised" table in the corresponding scenarios/*.md file.
 */
export function referenceFilesUsageScorer(
  transcript: TranscriptSummary,
  expectedReferenceFiles: string[],
): ScoreResult {
  const total = expectedReferenceFiles.length;
  if (total === 0) {
    // Nothing declared for this scenario — treat as a perfect, skipped score.
    return { name: "reference_files_usage", score: 1, metadata: { skipped: true } };
  }
  const readSet = new Set(transcript.referenceFilesRead);
  let hits = 0;
  for (const expected of expectedReferenceFiles) {
    if (readSet.has(expected)) {
      hits += 1;
    }
  }
  return {
    name: "reference_files_usage",
    score: hits / total,
    metadata: {
      expected: expectedReferenceFiles,
      read: transcript.referenceFilesRead,
      hits,
      total,
    },
  };
}
/**
 * assertionsPassedScorer — fraction of vitest assertions that passed.
 * Returns 0 when no tests ran (avoids division by zero).
 */
export function assertionsPassedScorer(result: EvalRunResult): ScoreResult {
  const { testsPassed, testsTotal } = result;
  return {
    name: "assertions_passed",
    score: testsTotal > 0 ? testsPassed / testsTotal : 0,
    metadata: { passed: testsPassed, total: testsTotal },
  };
}
/**
 * finalResultScorer — binary pass/fail for the scenario as a whole.
 *
 * A result is "passed" when assertionsPassed >= passThreshold (declared per
 * scenario in scenarios/*.md). This is the binary outcome that Braintrust
 * comparisons key on.
 */
export function finalResultScorer(result: EvalRunResult): ScoreResult {
  const passed = result.status === "passed";
  return {
    name: "final_result",
    score: passed ? 1 : 0,
    metadata: {
      testsPassed: result.testsPassed,
      testsTotal: result.testsTotal,
      passThreshold: result.passThreshold,
    },
  };
}

View File

@@ -1,3 +1,5 @@
import { basename } from "node:path";
export interface TranscriptEvent {
type: string;
[key: string]: unknown;
@@ -9,15 +11,45 @@ export interface ToolCallSummary {
input: Record<string, unknown>;
/** First ~200 chars of output for quick scanning */
outputPreview: string;
/** Whether the tool call returned an error */
isError: boolean;
/** stderr output for Bash tool calls */
stderr: string;
}
/** Per-model token usage and cost, parsed from the result event's modelUsage field. */
export interface ModelUsage {
  /** Uncached input tokens sent to this model (from inputTokens). */
  inputTokens: number;
  /** Tokens generated by this model (from outputTokens). */
  outputTokens: number;
  /** Input tokens served from the prompt cache (from cacheReadInputTokens). */
  cacheReadInputTokens: number;
  /** Input tokens written into the prompt cache (from cacheCreationInputTokens). */
  cacheCreationInputTokens: number;
  /** Cost in USD attributed to this model (from costUSD). */
  costUSD: number;
}
/** Aggregated view of one agent run's NDJSON transcript, built by buildTranscriptSummary(). */
export interface TranscriptSummary {
  /** Number of agent turns (num_turns from the result event). */
  totalTurns: number;
  /** Wall-clock duration in ms (duration_ms from the result event). */
  totalDurationMs: number;
  /** API-only latency (excludes local processing overhead) */
  totalDurationApiMs: number;
  /** Total cost in USD, or null when the result event omits it. */
  totalCostUsd: number | null;
  /** Model name from the system init event, or null if absent. */
  model: string | null;
  /** Every tool_use block seen, in order, with matched output previews. */
  toolCalls: ToolCallSummary[];
  /** Final text result of the run (empty string when none). */
  finalOutput: string;
  /** Skills listed in the system init event (loaded into agent context) */
  skills: string[];
  /** Basenames of reference files the agent read via the Read tool */
  referenceFilesRead: string[];
  /** Per-model token usage and cost breakdown */
  modelUsage: Record<string, ModelUsage>;
  // Aggregate token counts from the result event's usage field.
  totalInputTokens: number;
  totalOutputTokens: number;
  totalCacheReadTokens: number;
  totalCacheCreationTokens: number;
  /** Count of tool calls that returned is_error === true */
  toolErrorCount: number;
  /** Whether the overall session ended in an error */
  isError: boolean;
  /** Count of permission_denials in the result event */
  permissionDenialCount: number;
}
/** Parse a single NDJSON line. Returns null on empty or invalid input. */
@@ -74,6 +106,13 @@ export function extractFinalOutput(events: TranscriptEvent[]): string {
return "";
}
/**
 * Return true if a file path points to a skill reference file, i.e. it lives
 * under a skill's references/ directory inside .agents/skills/.
 */
function isReferenceFilePath(filePath: string): boolean {
  const underSkills = filePath.includes("/.agents/skills/");
  const underReferences = filePath.includes("/references/");
  return underSkills && underReferences;
}
/** Walk parsed events to build a transcript summary. */
export function buildTranscriptSummary(
events: TranscriptEvent[],
@@ -81,16 +120,30 @@ export function buildTranscriptSummary(
const toolCalls: ToolCallSummary[] = [];
let finalOutput = "";
let totalDurationMs = 0;
let totalDurationApiMs = 0;
let totalCostUsd: number | null = null;
let model: string | null = null;
let totalTurns = 0;
let skills: string[] = [];
const referenceFilesRead: string[] = [];
let modelUsage: Record<string, ModelUsage> = {};
let totalInputTokens = 0;
let totalOutputTokens = 0;
let totalCacheReadTokens = 0;
let totalCacheCreationTokens = 0;
let toolErrorCount = 0;
let isError = false;
let permissionDenialCount = 0;
for (const event of events) {
const e = event as Record<string, unknown>;
// System init: extract model
// System init: extract model and loaded skills
if (e.type === "system" && e.subtype === "init") {
model = typeof e.model === "string" ? e.model : null;
if (Array.isArray(e.skills)) {
skills = e.skills.filter((s): s is string => typeof s === "string");
}
}
// Assistant messages: extract tool_use blocks
@@ -100,12 +153,27 @@ export function buildTranscriptSummary(
if (Array.isArray(content)) {
for (const block of content) {
if (block.type === "tool_use") {
toolCalls.push({
const toolCall: ToolCallSummary = {
tool: block.name ?? "unknown",
toolUseId: block.id ?? "",
input: block.input ?? {},
outputPreview: "",
});
isError: false,
stderr: "",
};
toolCalls.push(toolCall);
// Track reference file reads
if (
block.name === "Read" &&
typeof block.input?.file_path === "string" &&
isReferenceFilePath(block.input.file_path)
) {
const base = basename(block.input.file_path);
if (!referenceFilesRead.includes(base)) {
referenceFilesRead.push(base);
}
}
}
}
}
@@ -127,28 +195,107 @@ export function buildTranscriptSummary(
? block.content
: JSON.stringify(block.content);
matching.outputPreview = text.slice(0, 200);
// Capture error state from tool result
if (block.is_error === true) {
matching.isError = true;
toolErrorCount++;
}
}
}
}
}
// Capture stderr from tool_use_result (Bash tool emits this at the user event level)
const toolUseResult = e.tool_use_result as
| Record<string, unknown>
| undefined;
if (toolUseResult && typeof toolUseResult.stderr === "string") {
// Match to the most recent Bash tool call without stderr set
const lastBash = [...toolCalls]
.reverse()
.find((tc) => tc.tool === "Bash" && !tc.stderr);
if (lastBash) {
lastBash.stderr = toolUseResult.stderr;
}
}
}
// Result event: final output, cost, duration, turns
// Result event: final output, cost, duration, turns, token usage
if (e.type === "result") {
finalOutput = typeof e.result === "string" ? e.result : "";
totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0;
totalDurationApiMs =
typeof e.duration_api_ms === "number" ? e.duration_api_ms : 0;
totalCostUsd =
typeof e.total_cost_usd === "number" ? e.total_cost_usd : null;
totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0;
isError = e.is_error === true;
permissionDenialCount = Array.isArray(e.permission_denials)
? e.permission_denials.length
: 0;
// Aggregate token usage from the result event's usage field
const usage = e.usage as Record<string, unknown> | undefined;
if (usage) {
totalInputTokens =
typeof usage.input_tokens === "number" ? usage.input_tokens : 0;
totalOutputTokens =
typeof usage.output_tokens === "number" ? usage.output_tokens : 0;
totalCacheReadTokens =
typeof usage.cache_read_input_tokens === "number"
? usage.cache_read_input_tokens
: 0;
totalCacheCreationTokens =
typeof usage.cache_creation_input_tokens === "number"
? usage.cache_creation_input_tokens
: 0;
}
// Per-model usage breakdown (modelUsage keyed by model name)
const rawModelUsage = e.modelUsage as
| Record<string, Record<string, unknown>>
| undefined;
if (rawModelUsage) {
modelUsage = {};
for (const [modelName, mu] of Object.entries(rawModelUsage)) {
modelUsage[modelName] = {
inputTokens:
typeof mu.inputTokens === "number" ? mu.inputTokens : 0,
outputTokens:
typeof mu.outputTokens === "number" ? mu.outputTokens : 0,
cacheReadInputTokens:
typeof mu.cacheReadInputTokens === "number"
? mu.cacheReadInputTokens
: 0,
cacheCreationInputTokens:
typeof mu.cacheCreationInputTokens === "number"
? mu.cacheCreationInputTokens
: 0,
costUSD: typeof mu.costUSD === "number" ? mu.costUSD : 0,
};
}
}
}
}
return {
totalTurns,
totalDurationMs,
totalDurationApiMs,
totalCostUsd,
model,
toolCalls,
finalOutput,
skills,
referenceFilesRead,
modelUsage,
totalInputTokens,
totalOutputTokens,
totalCacheReadTokens,
totalCacheCreationTokens,
toolErrorCount,
isError,
permissionDenialCount,
};
}

View File

@@ -44,4 +44,39 @@ export interface EvalRunResult {
prompt?: string;
/** Per-test pass/fail results from vitest */
individualTests?: Record<string, boolean>;
/** Epoch ms when the agent run started (for Braintrust span timing) */
startedAt?: number;
/** API-only latency in ms (excludes local processing overhead) */
durationApiMs?: number;
/** Aggregate token counts from the result event */
totalInputTokens?: number;
totalOutputTokens?: number;
totalCacheReadTokens?: number;
totalCacheCreationTokens?: number;
/** Per-model token usage and cost breakdown */
modelUsage?: Record<
string,
{
inputTokens: number;
outputTokens: number;
cacheReadInputTokens: number;
cacheCreationInputTokens: number;
costUSD: number;
}
>;
/** Count of tool calls that returned is_error === true */
toolErrorCount?: number;
/** Count of permission_denials in the result event */
permissionDenialCount?: number;
/** Skills that were in the agent's context (from system init event) */
loadedSkills?: string[];
/** Basenames of skill reference files the agent read */
referenceFilesRead?: string[];
/** Computed scorer results */
scores?: {
skillUsage: number;
referenceFilesUsage: number;
assertionsPassed: number;
finalResult: number;
};
}