remove some braintrust headers

2026-03-27 10:09:26 +08:00 · 2026-02-25 19:11:56 +00:00
parent 9b08864e94
commit e65642b752
10 changed files with 555 additions and 117 deletions
--- a/mise.toml
+++ b/mise.toml
@@ -76,6 +76,10 @@ docker run --rm \
  -e BRAINTRUST_PROJECT_ID \
  -e EVAL_RESULTS_DIR=/app/results \
  -v "$(pwd)/packages/evals/results:/app/results" \
  -v "$(pwd)/packages/evals/project:/app/packages/evals/project" \
  -v /var/run/docker.sock:/var/run/docker.sock \
  --group-add 0 \
  --network host \
  supabase-evals:local
 """
--- a/packages/evals/AGENTS.md
+++ b/packages/evals/AGENTS.md
@@ -76,16 +76,6 @@ mise run eval:upload
 mise run --force eval:upload
 ```
 Or directly (no caching, always runs):
 ```bash
 cd packages/evals
 npx tsx src/runner.ts
 # Single scenario, baseline mode
 EVAL_BASELINE=true EVAL_SCENARIO=auth-rls-new-project npx tsx src/runner.ts
 ```
 ## Baseline Mode
 Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default,
--- a/packages/evals/Dockerfile
+++ b/packages/evals/Dockerfile
@@ -29,12 +29,13 @@ RUN npm --prefix packages/skills-build run build
 # ---------- Stage 2: runtime ----------
 FROM node:22-slim
-# Install Docker CLI and curl (needed for supabase CLI install)
+# Install Docker CLI, psql client, and curl (needed for supabase CLI install)
 RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    curl \
    ca-certificates \
    docker.io \
    postgresql-client \
    && rm -rf /var/lib/apt/lists/*
 # Install supabase CLI binary (pinned version)
--- a/packages/evals/src/runner.ts
+++ b/packages/evals/src/runner.ts
@@ -1,11 +1,21 @@
 import { existsSync, readdirSync, readFileSync } from "node:fs";
 import { join, resolve } from "node:path";
 import { runAgent } from "./runner/agent.js";
-import { uploadToBraintrust } from "./runner/braintrust.js";
+import {
 	initBraintrustLogger,
 	logScenarioToLogger,
 	uploadToBraintrust,
 } from "./runner/braintrust.js";
 import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
 import { preflight } from "./runner/preflight.js";
 import { listModifiedFiles, printSummary } from "./runner/results.js";
 import { createWorkspace } from "./runner/scaffold.js";
 import {
 	assertionsPassedScorer,
 	finalResultScorer,
 	referenceFilesUsageScorer,
 	skillUsageScorer,
 } from "./runner/scorers.js";
 import {
 	getKeys,
 	resetDB,
@@ -24,9 +34,11 @@ import type { EvalRunResult, EvalScenario } from "./types.js";
 // ---------------------------------------------------------------------------
 const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";
 const DEFAULT_SKILL = "supabase";
 const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes
 const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
 const skillName = process.env.EVAL_SKILL ?? DEFAULT_SKILL;
 const scenarioFilter = process.env.EVAL_SCENARIO;
 const isBaseline = process.env.EVAL_BASELINE === "true";
 const skillEnabled = !isBaseline;
@@ -107,12 +119,14 @@ async function runEval(
 		// 3. Run the agent
 		console.log(`  Running agent (${model})...`);
 		const startedAt = Date.now();
 		const agentResult = await runAgent({
 			cwd: workspacePath,
 			prompt,
 			model,
 			timeout: AGENT_TIMEOUT,
 			skillEnabled,
 			skillName: skillEnabled ? skillName : undefined,
 		});
 		console.log(
 			`  Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
@@ -149,6 +163,26 @@ async function runEval(
 		// 6. Build transcript summary
 		const summary = buildTranscriptSummary(agentResult.events);
 		// 7. Load expectedReferenceFiles from EVAL.ts (if declared)
 		const { expectedReferenceFiles = [] } = await import(evalFilePath).catch(
 			() => ({ expectedReferenceFiles: [] as string[] }),
 		);
 		// 8. Run scorers
 		const skillScore = skillUsageScorer(summary, skillName);
 		const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles);
 		const assertScore = assertionsPassedScorer({
 			testsPassed: testResult.passedCount,
 			testsTotal: testResult.totalCount,
 			status: testResult.passed ? "passed" : "failed",
 		} as EvalRunResult);
 		const finalScore = finalResultScorer({
 			status: testResult.passed ? "passed" : "failed",
 			testsPassed: testResult.passedCount,
 			testsTotal: testResult.totalCount,
 			passThreshold: passThreshold ?? undefined,
 		} as EvalRunResult);
 		const result: EvalRunResult = {
 			scenario: scenario.id,
 			agent: "claude-code",
@@ -166,6 +200,23 @@ async function runEval(
 			costUsd: summary.totalCostUsd ?? undefined,
 			prompt,
 			individualTests: testResult.individualTests,
 			startedAt,
 			durationApiMs: summary.totalDurationApiMs,
 			totalInputTokens: summary.totalInputTokens,
 			totalOutputTokens: summary.totalOutputTokens,
 			totalCacheReadTokens: summary.totalCacheReadTokens,
 			totalCacheCreationTokens: summary.totalCacheCreationTokens,
 			modelUsage: summary.modelUsage,
 			toolErrorCount: summary.toolErrorCount,
 			permissionDenialCount: summary.permissionDenialCount,
 			loadedSkills: summary.skills,
 			referenceFilesRead: summary.referenceFilesRead,
 			scores: {
 				skillUsage: skillScore.score,
 				referenceFilesUsage: refScore.score,
 				assertionsPassed: assertScore.score,
 				finalResult: finalScore.score,
 			},
 		};
 		// 9. Persist results
@@ -239,6 +290,9 @@ async function main() {
 	const results: EvalRunResult[] = [];
 	const transcripts = new Map<string, TranscriptSummary>();
 	const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
 	const logger = braintrustUpload ? initBraintrustLogger() : undefined;
 	try {
 		for (const scenario of scenarios) {
 			// Reset the database before each scenario for a clean slate.
@@ -250,16 +304,22 @@ async function main() {
 			if (transcript) {
 				transcripts.set(result.scenario, transcript);
 			}
 			// Log immediately after each scenario for real-time visibility.
 			if (logger) {
 				logScenarioToLogger(logger, result, transcript);
 			}
 		}
 	} finally {
 		stopSupabase();
 		await logger?.flush();
 	}
 	// Use the results dir from the first result (all share the same timestamp)
 	const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
 	printSummary(results, resultsDir);
-	if (process.env.BRAINTRUST_UPLOAD === "true") {
+	if (braintrustUpload) {
 		console.log("\nUploading to Braintrust...");
 		await uploadToBraintrust(results, {
 			model,
--- a/packages/evals/src/runner/agent.ts
+++ b/packages/evals/src/runner/agent.ts
@@ -26,6 +26,12 @@ export interface AgentRunResult {
 * and has access to the local Supabase MCP server so it can apply migrations
 * and query the real database. --strict-mcp-config ensures only the local
 * Supabase instance is reachable — no host MCP servers leak in.
 *
 * --setting-sources project,local prevents skills from the user's global
 * ~/.agents/skills/ from leaking into the eval environment.
 *
 * When skillEnabled, --agents injects the target skill directly into the
 * agent's context, guaranteeing it is present (not just discoverable).
 */
 export async function runAgent(opts: {
 	cwd: string;
@@ -33,6 +39,8 @@ export async function runAgent(opts: {
 	model: string;
 	timeout: number;
 	skillEnabled: boolean;
 	/** Skill name to inject via --agents (e.g. "supabase"). Used when skillEnabled. */
 	skillName?: string;
 }): Promise<AgentRunResult> {
 	const start = Date.now();
@@ -62,10 +70,26 @@ export async function runAgent(opts: {
 		"--mcp-config",
 		mcpConfig,
 		"--strict-mcp-config",
 		// Prevent skills from the user's global ~/.agents/skills/ from leaking
 		// into the eval environment. Only workspace (project) and local sources
 		// are loaded, so the eval sees only what was explicitly installed.
 		"--setting-sources",
 		"project,local",
 	];
-	// Disable skills for baseline runs so the agent relies on innate knowledge
+	if (opts.skillEnabled && opts.skillName) {
-	if (!opts.skillEnabled) {
+		// Inject the target skill directly into the agent context via --agents.
 		// This guarantees the skill is embedded in the subagent's context at
 		// startup (not just available as a slash command).
 		const agentsDef = JSON.stringify({
 			main: {
 				description: `Supabase developer agent with ${opts.skillName} skill`,
 				skills: [opts.skillName],
 			},
 		});
 		args.push("--agents", agentsDef);
 	} else if (!opts.skillEnabled) {
 		// Baseline runs: disable all skills so the agent relies on innate knowledge
 		args.push("--disable-slash-commands");
 	}
--- a/packages/evals/src/runner/braintrust.ts
+++ b/packages/evals/src/runner/braintrust.ts
@@ -1,14 +1,119 @@
 import assert from "node:assert";
-import { init } from "braintrust";
+import { init, initLogger, type Logger } from "braintrust";
 import type { EvalRunResult } from "../types.js";
 import type { TranscriptSummary } from "./transcript.js";
-/** Convert a test name to a snake_case score key. */
+/**
-function toScoreKey(name: string): string {
+ * Initialize a Braintrust project logger for real-time per-scenario logging.
-	return `test_${name
+ * Call this once at startup and pass the logger to logScenarioToLogger().
-		.toLowerCase()
+ */
-		.replace(/[^a-z0-9]+/g, "_")
+export function initBraintrustLogger(): Logger<true> {
-		.replace(/^_|_$/g, "")}`;
+	assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
 	assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
 	return initLogger({
 		projectId: process.env.BRAINTRUST_PROJECT_ID,
 		asyncFlush: true,
 	});
 }
 /**
  * Emit one scenario's outcome to the Braintrust project logger right away,
  * so each scenario shows up in the project log as soon as it completes
  * (independently of the experiment upload).
  *
  * A root span named after the scenario carries input/output/expected/scores/
  * metadata. When a transcript with tool calls is supplied, each tool call is
  * additionally logged as a `tool:<name>` child span of that root span.
  *
  * @param logger - Project logger obtained from initBraintrustLogger().
  * @param r - Result of the scenario run.
  * @param transcript - Optional transcript summary providing the tool calls.
  */
 export function logScenarioToLogger(
 	logger: Logger<true>,
 	r: EvalRunResult,
 	transcript?: TranscriptSummary,
 ): void {
 	// Missing scorer results are reported as 0 rather than omitted.
 	const computed = r.scores;
 	const scores: Record<string, number> = {
 		skill_usage: computed?.skillUsage ?? 0,
 		reference_files_usage: computed?.referenceFilesUsage ?? 0,
 		assertions_passed: computed?.assertionsPassed ?? 0,
 		final_result: computed?.finalResult ?? 0,
 	};
 	// Context window = fresh input tokens + cache reads + cache writes.
 	const contextWindowUsed =
 		(r.totalInputTokens ?? 0) +
 		(r.totalCacheReadTokens ?? 0) +
 		(r.totalCacheCreationTokens ?? 0);
 	const metadata: Record<string, unknown> = {
 		agent: r.agent,
 		model: r.model,
 		skillEnabled: r.skillEnabled,
 		testsPassed: r.testsPassed,
 		testsTotal: r.testsTotal,
 		toolCallCount: r.toolCallCount ?? 0,
 		contextWindowUsed,
 		totalOutputTokens: r.totalOutputTokens,
 		modelUsage: r.modelUsage,
 		toolErrorCount: r.toolErrorCount,
 		permissionDenialCount: r.permissionDenialCount,
 		loadedSkills: r.loadedSkills,
 		referenceFilesRead: r.referenceFilesRead,
 	};
 	// Cost and error are only attached when present (a cost of 0 is kept).
 	if (r.costUsd != null) {
 		metadata.costUsd = r.costUsd;
 	}
 	if (r.error) {
 		metadata.error = r.error;
 	}
 	// Braintrust span start times are in seconds; startedAt is epoch ms.
 	const spanOptions: { name: string; startTime?: number } = {
 		name: r.scenario,
 	};
 	if (r.startedAt) {
 		spanOptions.startTime = r.startedAt / 1000;
 	}
 	const toolCalls = transcript?.toolCalls ?? [];
 	logger.traced((span) => {
 		span.log({
 			input: {
 				scenario: r.scenario,
 				prompt: r.prompt ?? "",
 				skillEnabled: r.skillEnabled,
 			},
 			output: {
 				status: r.status,
 				agentOutput: r.agentOutput,
 				filesModified: r.filesModified,
 				testOutput: r.testOutput,
 			},
 			expected: { testsTotal: r.testsTotal },
 			scores,
 			metadata,
 		});
 		// One child span per tool call; nothing is emitted when there are none.
 		for (const call of toolCalls) {
 			const toolOutput = {
 				preview: call.outputPreview,
 				isError: call.isError,
 				...(call.stderr ? { stderr: call.stderr } : {}),
 			};
 			span.traced(
 				(toolSpan) => {
 					toolSpan.log({
 						input: { tool: call.tool, args: call.input },
 						output: toolOutput,
 						metadata: { toolUseId: call.toolUseId },
 					});
 				},
 				{ name: `tool:${call.tool}` },
 			);
 		}
 	}, spanOptions);
 }
 /**
@@ -18,8 +123,8 @@ function toScoreKey(name: string): string {
 * - input: scenario ID, prompt content, skillEnabled flag
 * - output: status, agent output, files modified, test output
 * - expected: total tests, pass threshold
- * - scores: pass (0|1), test_pass_rate (0-1), per-test scores
+ * - scores: skill_usage, reference_files_usage, assertions_passed, final_result
- * - metadata: model, duration, cost, tool call count, files modified
+ * - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
 * - spans: one child span per agent tool call (when transcript available)
 */
 export async function uploadToBraintrust(
@@ -50,20 +155,11 @@ export async function uploadToBraintrust(
 	for (const r of results) {
 		const transcript = opts.transcripts.get(r.scenario);
 		// Build per-test scores
 		const perTestScores: Record<string, number> = {};
 		if (r.individualTests) {
 			for (const [testName, didPass] of Object.entries(r.individualTests)) {
 				perTestScores[toScoreKey(testName)] = didPass ? 1 : 0;
 			}
 		}
 		const testPassRate = r.testsTotal > 0 ? r.testsPassed / r.testsTotal : 0;
 		const scores: Record<string, number> = {
-			pass: r.status === "passed" ? 1 : 0,
+			skill_usage: r.scores?.skillUsage ?? 0,
-			test_pass_rate: testPassRate,
+			reference_files_usage: r.scores?.referenceFilesUsage ?? 0,
-			...perTestScores,
+			assertions_passed: r.scores?.assertionsPassed ?? 0,
 			final_result: r.scores?.finalResult ?? 0,
 		};
 		const input = {
@@ -88,38 +184,52 @@ export async function uploadToBraintrust(
 			agent: r.agent,
 			model: r.model,
 			skillEnabled: r.skillEnabled,
 			duration: r.duration,
 			testsPassed: r.testsPassed,
 			testsTotal: r.testsTotal,
 			toolCallCount: r.toolCallCount ?? 0,
-			filesModified: r.filesModified,
+			contextWindowUsed:
 				(r.totalInputTokens ?? 0) +
 				(r.totalCacheReadTokens ?? 0) +
 				(r.totalCacheCreationTokens ?? 0),
 			totalOutputTokens: r.totalOutputTokens,
 			modelUsage: r.modelUsage,
 			toolErrorCount: r.toolErrorCount,
 			permissionDenialCount: r.permissionDenialCount,
 			loadedSkills: r.loadedSkills,
 			referenceFilesRead: r.referenceFilesRead,
 			...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
 			...(r.error ? { error: r.error } : {}),
 		};
-		if (transcript && transcript.toolCalls.length > 0) {
+		const spanOptions = r.startedAt
-			// Use traced() to create a root span with child spans for tool calls
+			? { name: r.scenario, startTime: r.startedAt / 1000 }
-			experiment.traced(
+			: { name: r.scenario };
 				(span) => {
 					span.log({ input, output, expected, scores, metadata });
-					for (const tc of transcript.toolCalls) {
+		if (transcript && transcript.toolCalls.length > 0) {
-						span.traced(
+			experiment.traced((span) => {
-							(childSpan) => {
+				span.log({ input, output, expected, scores, metadata });
-								childSpan.log({
+
-									input: { tool: tc.tool, args: tc.input },
+				for (const tc of transcript.toolCalls) {
-									output: { preview: tc.outputPreview },
+					span.traced(
-									metadata: { toolUseId: tc.toolUseId },
+						(childSpan) => {
-								});
+							childSpan.log({
-							},
+								input: { tool: tc.tool, args: tc.input },
-							{ name: `tool:${tc.tool}` },
+								output: {
-						);
+									preview: tc.outputPreview,
-					}
+									isError: tc.isError,
-				},
+									...(tc.stderr ? { stderr: tc.stderr } : {}),
-				{ name: r.scenario },
+								},
-			);
+								metadata: { toolUseId: tc.toolUseId },
 							});
 						},
 						{ name: `tool:${tc.tool}` },
 					);
 				}
 			}, spanOptions);
 		} else {
-			experiment.log({ input, output, expected, scores, metadata });
+			experiment.traced((span) => {
 				span.log({ input, output, expected, scores, metadata });
 			}, spanOptions);
 		}
 	}
--- a/packages/evals/src/runner/scaffold.ts
+++ b/packages/evals/src/runner/scaffold.ts
@@ -1,4 +1,3 @@
 import { execFileSync } from "node:child_process";
 import {
 	cpSync,
 	existsSync,
@@ -6,43 +5,21 @@ import {
 	mkdtempSync,
 	readdirSync,
 	rmSync,
 	writeFileSync,
 } from "node:fs";
 import { tmpdir } from "node:os";
-import { dirname, join, resolve } from "node:path";
+import { join } from "node:path";
 import { fileURLToPath } from "node:url";
 import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
 /**
  * Locate the `skills` executable in the evals package's node_modules/.bin.
  *
  * The path is built two directories above this module's directory.
  *
  * @returns Absolute path to the `skills` binary.
  * @throws Error when no file exists at the expected location.
  */
 function resolveSkillsBin(): string {
 	const candidate = resolve(
 		__dirname,
 		"..",
 		"..",
 		"node_modules",
 		".bin",
 		"skills",
 	);
 	if (!existsSync(candidate)) {
 		throw new Error(
 			`skills binary not found at ${candidate}. Run npm install.`,
 		);
 	}
 	return candidate;
 }
 /**
  * Locate the repository root by climbing from the current working directory.
  *
  * A directory qualifies when it contains both `skills/` and `packages/`. At
  * most ten directories are inspected (the cwd plus nine ancestors), and the
  * climb stops early once the filesystem root is reached.
  *
  * @returns Path of the first qualifying directory.
  * @throws Error when no qualifying directory is found.
  */
 function findRepoRoot(): string {
 	const maxLevels = 10;
 	let current = process.cwd();
 	let level = 0;
 	while (level < maxLevels) {
 		const looksLikeRoot =
 			existsSync(join(current, "skills")) &&
 			existsSync(join(current, "packages"));
 		if (looksLikeRoot) {
 			return current;
 		}
 		const above = resolve(current, "..");
 		// resolve() yields the same path once the filesystem root is reached.
 		if (above === current) {
 			break;
 		}
 		current = above;
 		level += 1;
 	}
 	throw new Error("Could not find repository root (skills/ + packages/)");
 }
 /**
 * Create an isolated workspace for an eval run.
 *
- * 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
+ * 1. Copy the eval directory to a temp folder (excluding EVAL.ts/EVAL.tsx)
- * 2. Optionally install skills via the `skills` CLI so Claude Code can discover them
+ * 2. Seed with the eval project's supabase/config.toml
 *
 * Skills are injected via the --agents flag in agent.ts (not installed into
 * the workspace here). Combined with --setting-sources project,local, this
 * prevents host ~/.agents/skills/ from leaking into the eval environment.
 *
 * Returns the path to the workspace and a cleanup function.
 */
@@ -50,10 +27,9 @@ export function createWorkspace(opts: {
 	evalDir: string;
 	skillEnabled: boolean;
 }): { workspacePath: string; cleanup: () => void } {
 	const repoRoot = findRepoRoot();
 	const workspacePath = mkdtempSync(join(tmpdir(), "supabase-eval-"));
-	// Copy eval directory, excluding EVAL.ts (hidden from agent)
+	// Copy eval directory, excluding EVAL.ts/EVAL.tsx (hidden from agent)
 	const entries = readdirSync(opts.evalDir, { withFileTypes: true });
 	for (const entry of entries) {
 		if (entry.name === "EVAL.ts" || entry.name === "EVAL.tsx") continue;
@@ -62,6 +38,23 @@ export function createWorkspace(opts: {
 		cpSync(src, dest, { recursive: true });
 	}
 	// Add .mcp.json so the agent connects to the local Supabase MCP server
 	writeFileSync(
 		join(workspacePath, ".mcp.json"),
 		JSON.stringify(
 			{
 				mcpServers: {
 					"local-supabase": {
 						type: "http",
 						url: "http://localhost:54321/mcp",
 					},
 				},
 			},
 			null,
 			"\t",
 		),
 	);
 	// Seed the workspace with the eval project's supabase/config.toml so the
 	// agent can run `supabase db push` against the shared local instance without
 	// needing to run `supabase init` or `supabase start` first.
@@ -72,26 +65,6 @@ export function createWorkspace(opts: {
 		cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
 	}
 	// Install skills into the workspace via the `skills` CLI
 	if (opts.skillEnabled) {
 		const skillsDir = join(repoRoot, "skills");
 		if (existsSync(skillsDir)) {
 			const skillsBin = resolveSkillsBin();
 			const args = ["add", skillsDir, "-a", "claude-code", "-y"];
 			const skillFilter = process.env.EVAL_SKILL;
 			if (skillFilter) {
 				args.push("--skill", skillFilter);
 			}
 			execFileSync(skillsBin, args, {
 				cwd: workspacePath,
 				stdio: "pipe",
 				timeout: 60_000,
 			});
 		}
 	}
 	return {
 		workspacePath,
 		cleanup: () => {
--- a/packages/evals/src/runner/scorers.ts
+++ b/packages/evals/src/runner/scorers.ts
@@ -0,0 +1,94 @@
 import type { EvalRunResult } from "../types.js";
 import type { TranscriptSummary } from "./transcript.js";
 /** Outcome returned by each scorer in this module. */
 export interface ScoreResult {
 	/** Snake_case score key, e.g. "skill_usage" or "final_result" */
 	name: string;
 	/** 0.0 – 1.0 */
 	score: number;
 	/** Optional supporting details showing how the score was derived */
 	metadata?: Record<string, unknown>;
 }
 /**
  * skillUsageScorer — scores 1 when the target skill appears among the skills
  * loaded into the agent's context, and 0 when it does not.
  *
  * The loaded skills come from the `skills` array of the system init event in
  * the NDJSON transcript. Because agent.ts passes
  * `--setting-sources project,local`, host skills do not leak into that array,
  * which makes membership a reliable signal.
  *
  * @param transcript - Summary of the agent run.
  * @param skillName - Name of the skill expected to be loaded.
  */
 export function skillUsageScorer(
 	transcript: TranscriptSummary,
 	skillName: string,
 ): ScoreResult {
 	const { skills: loadedSkills } = transcript;
 	const wasLoaded = loadedSkills.some((skill) => skill === skillName);
 	const score = wasLoaded ? 1 : 0;
 	return {
 		name: "skill_usage",
 		score,
 		metadata: { loadedSkills, targetSkill: skillName },
 	};
 }
 /**
  * referenceFilesUsageScorer — share of the expected reference files that the
  * agent actually read.
  *
  * Reads are collected from Read tool calls whose file_path matches
  * "/.agents/skills/*\/references/". Each EVAL.ts declares its
  * expectedReferenceFiles, which should mirror the "Skill References
  * Exercised" table of the matching scenarios/*.md file.
  *
  * When no files are expected the scorer is skipped and reports a score of 1.
  *
  * @param transcript - Summary of the agent run.
  * @param expectedReferenceFiles - Reference files the scenario expects to be read.
  */
 export function referenceFilesUsageScorer(
 	transcript: TranscriptSummary,
 	expectedReferenceFiles: string[],
 ): ScoreResult {
 	const total = expectedReferenceFiles.length;
 	if (total === 0) {
 		return {
 			name: "reference_files_usage",
 			score: 1,
 			metadata: { skipped: true },
 		};
 	}
 	const read = transcript.referenceFilesRead;
 	const readLookup = new Set(read);
 	let hits = 0;
 	for (const expectedFile of expectedReferenceFiles) {
 		if (readLookup.has(expectedFile)) {
 			hits += 1;
 		}
 	}
 	return {
 		name: "reference_files_usage",
 		score: hits / total,
 		metadata: { expected: expectedReferenceFiles, read, hits, total },
 	};
 }
 /**
  * assertionsPassedScorer — fraction of vitest assertions that passed.
  *
  * Reports 0 when the run has no tests, avoiding a division by zero.
  *
  * @param result - Run result providing testsPassed and testsTotal.
  */
 export function assertionsPassedScorer(result: EvalRunResult): ScoreResult {
 	const { testsPassed: passed, testsTotal: total } = result;
 	let score = 0;
 	if (total > 0) {
 		score = passed / total;
 	}
 	return {
 		name: "assertions_passed",
 		score,
 		metadata: { passed, total },
 	};
 }
 /**
  * finalResultScorer — binary outcome: 1 when the run's status is "passed",
  * 0 for anything else.
  *
  * A run counts as "passed" when assertionsPassed >= passThreshold (configured
  * per scenario in scenarios/*.md). Braintrust comparisons use this value.
  *
  * @param result - Run result providing status, test counts and passThreshold.
  */
 export function finalResultScorer(result: EvalRunResult): ScoreResult {
 	const { status, testsPassed, testsTotal, passThreshold } = result;
 	const metThreshold = status === "passed";
 	return {
 		name: "final_result",
 		score: metThreshold ? 1 : 0,
 		metadata: { testsPassed, testsTotal, passThreshold },
 	};
 }
--- a/packages/evals/src/runner/transcript.ts
+++ b/packages/evals/src/runner/transcript.ts
@@ -1,3 +1,5 @@
 import { basename } from "node:path";
 export interface TranscriptEvent {
 	type: string;
 	[key: string]: unknown;
@@ -9,15 +11,45 @@ export interface ToolCallSummary {
 	input: Record<string, unknown>;
 	/** First ~200 chars of output for quick scanning */
 	outputPreview: string;
 	/** Whether the tool call returned an error */
 	isError: boolean;
 	/** stderr output for Bash tool calls */
 	stderr: string;
 }
 /**
  * Token usage and cost for a single model, taken from one entry of the
  * result event's `modelUsage` map. buildTranscriptSummary() stores 0 for any
  * field that is not a number in the raw event.
  */
 export interface ModelUsage {
 	/** Input token count reported for this model */
 	inputTokens: number;
 	/** Output token count reported for this model */
 	outputTokens: number;
 	/** Input tokens reported as cache reads */
 	cacheReadInputTokens: number;
 	/** Input tokens reported as cache creation */
 	cacheCreationInputTokens: number;
 	/** Cost reported for this model (presumably US dollars, per the field name) */
 	costUSD: number;
 }
 /**
  * Aggregated view of one agent run, as populated by buildTranscriptSummary()
  * from the parsed transcript events.
  */
 export interface TranscriptSummary {
 	/** Turn count from the result event (`num_turns`); 0 when not a number */
 	totalTurns: number;
 	/** Duration from the result event (`duration_ms`); 0 when not a number */
 	totalDurationMs: number;
 	/** API-only latency (excludes local processing overhead) */
 	totalDurationApiMs: number;
 	/** Cost from the result event (`total_cost_usd`); null when not a number */
 	totalCostUsd: number | null;
 	/** Model named in the system init event; null when not a string */
 	model: string | null;
 	/** One entry per tool_use block found in assistant messages */
 	toolCalls: ToolCallSummary[];
 	/** The result event's `result` string; empty when it is not a string */
 	finalOutput: string;
 	/** Skills listed in the system init event (loaded into agent context) */
 	skills: string[];
 	/** Basenames of reference files the agent read via the Read tool */
 	referenceFilesRead: string[];
 	/** Per-model token usage and cost breakdown */
 	modelUsage: Record<string, ModelUsage>;
 	/** `usage.input_tokens` from the result event; 0 when missing */
 	totalInputTokens: number;
 	/** `usage.output_tokens` from the result event; 0 when missing */
 	totalOutputTokens: number;
 	/** `usage.cache_read_input_tokens` from the result event; 0 when missing */
 	totalCacheReadTokens: number;
 	/** `usage.cache_creation_input_tokens` from the result event; 0 when missing */
 	totalCacheCreationTokens: number;
 	/** Count of tool calls that returned is_error === true */
 	toolErrorCount: number;
 	/** Whether the overall session ended in an error */
 	isError: boolean;
 	/** Count of permission_denials in the result event */
 	permissionDenialCount: number;
 }
 /** Parse a single NDJSON line. Returns null on empty or invalid input. */
@@ -74,6 +106,13 @@ export function extractFinalOutput(events: TranscriptEvent[]): string {
 	return "";
 }
 /**
  * Return true if a file path points to a skill reference file, i.e. it has
  * a "/references/" segment located at or below a "/.agents/skills/"
  * directory.
  *
  * The two markers are matched in order. Checking them independently would
  * also accept paths where "/references/" only appears in a parent directory
  * (e.g. "/srv/references/run/.agents/skills/x/SKILL.md"), which would
  * wrongly count a non-reference file as a reference read.
  *
  * @param filePath - Path taken from a Read tool call (POSIX separators).
  * @returns Whether the path lies under a skill's references directory.
  */
 function isReferenceFilePath(filePath: string): boolean {
 	const skillsRoot = "/.agents/skills/";
 	const rootIndex = filePath.indexOf(skillsRoot);
 	if (rootIndex === -1) {
 		return false;
 	}
 	// Search from the skills root's trailing slash so that a "/references/"
 	// segment sitting directly under the root keeps matching as before.
 	return filePath.includes("/references/", rootIndex + skillsRoot.length - 1);
 }
 /** Walk parsed events to build a transcript summary. */
 export function buildTranscriptSummary(
 	events: TranscriptEvent[],
@@ -81,16 +120,30 @@ export function buildTranscriptSummary(
 	const toolCalls: ToolCallSummary[] = [];
 	let finalOutput = "";
 	let totalDurationMs = 0;
 	let totalDurationApiMs = 0;
 	let totalCostUsd: number | null = null;
 	let model: string | null = null;
 	let totalTurns = 0;
 	let skills: string[] = [];
 	const referenceFilesRead: string[] = [];
 	let modelUsage: Record<string, ModelUsage> = {};
 	let totalInputTokens = 0;
 	let totalOutputTokens = 0;
 	let totalCacheReadTokens = 0;
 	let totalCacheCreationTokens = 0;
 	let toolErrorCount = 0;
 	let isError = false;
 	let permissionDenialCount = 0;
 	for (const event of events) {
 		const e = event as Record<string, unknown>;
-		// System init: extract model
+		// System init: extract model and loaded skills
 		if (e.type === "system" && e.subtype === "init") {
 			model = typeof e.model === "string" ? e.model : null;
 			if (Array.isArray(e.skills)) {
 				skills = e.skills.filter((s): s is string => typeof s === "string");
 			}
 		}
 		// Assistant messages: extract tool_use blocks
@@ -100,12 +153,27 @@ export function buildTranscriptSummary(
 			if (Array.isArray(content)) {
 				for (const block of content) {
 					if (block.type === "tool_use") {
-						toolCalls.push({
+						const toolCall: ToolCallSummary = {
 							tool: block.name ?? "unknown",
 							toolUseId: block.id ?? "",
 							input: block.input ?? {},
 							outputPreview: "",
-						});
+							isError: false,
 							stderr: "",
 						};
 						toolCalls.push(toolCall);
 						// Track reference file reads
 						if (
 							block.name === "Read" &&
 							typeof block.input?.file_path === "string" &&
 							isReferenceFilePath(block.input.file_path)
 						) {
 							const base = basename(block.input.file_path);
 							if (!referenceFilesRead.includes(base)) {
 								referenceFilesRead.push(base);
 							}
 						}
 					}
 				}
 			}
@@ -127,28 +195,107 @@ export function buildTranscriptSummary(
 									? block.content
 									: JSON.stringify(block.content);
 							matching.outputPreview = text.slice(0, 200);
 							// Capture error state from tool result
 							if (block.is_error === true) {
 								matching.isError = true;
 								toolErrorCount++;
 							}
 						}
 					}
 				}
 			}
 			// Capture stderr from tool_use_result (Bash tool emits this at the user event level)
 			const toolUseResult = e.tool_use_result as
 				| Record<string, unknown>
 				| undefined;
 			if (toolUseResult && typeof toolUseResult.stderr === "string") {
 				// Match to the most recent Bash tool call without stderr set
 				const lastBash = [...toolCalls]
 					.reverse()
 					.find((tc) => tc.tool === "Bash" && !tc.stderr);
 				if (lastBash) {
 					lastBash.stderr = toolUseResult.stderr;
 				}
 			}
 		}
-		// Result event: final output, cost, duration, turns
+		// Result event: final output, cost, duration, turns, token usage
 		if (e.type === "result") {
 			finalOutput = typeof e.result === "string" ? e.result : "";
 			totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0;
 			totalDurationApiMs =
 				typeof e.duration_api_ms === "number" ? e.duration_api_ms : 0;
 			totalCostUsd =
 				typeof e.total_cost_usd === "number" ? e.total_cost_usd : null;
 			totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0;
 			isError = e.is_error === true;
 			permissionDenialCount = Array.isArray(e.permission_denials)
 				? e.permission_denials.length
 				: 0;
 			// Aggregate token usage from the result event's usage field
 			const usage = e.usage as Record<string, unknown> | undefined;
 			if (usage) {
 				totalInputTokens =
 					typeof usage.input_tokens === "number" ? usage.input_tokens : 0;
 				totalOutputTokens =
 					typeof usage.output_tokens === "number" ? usage.output_tokens : 0;
 				totalCacheReadTokens =
 					typeof usage.cache_read_input_tokens === "number"
 						? usage.cache_read_input_tokens
 						: 0;
 				totalCacheCreationTokens =
 					typeof usage.cache_creation_input_tokens === "number"
 						? usage.cache_creation_input_tokens
 						: 0;
 			}
 			// Per-model usage breakdown (modelUsage keyed by model name)
 			const rawModelUsage = e.modelUsage as
 				| Record<string, Record<string, unknown>>
 				| undefined;
 			if (rawModelUsage) {
 				modelUsage = {};
 				for (const [modelName, mu] of Object.entries(rawModelUsage)) {
 					modelUsage[modelName] = {
 						inputTokens:
 							typeof mu.inputTokens === "number" ? mu.inputTokens : 0,
 						outputTokens:
 							typeof mu.outputTokens === "number" ? mu.outputTokens : 0,
 						cacheReadInputTokens:
 							typeof mu.cacheReadInputTokens === "number"
 								? mu.cacheReadInputTokens
 								: 0,
 						cacheCreationInputTokens:
 							typeof mu.cacheCreationInputTokens === "number"
 								? mu.cacheCreationInputTokens
 								: 0,
 						costUSD: typeof mu.costUSD === "number" ? mu.costUSD : 0,
 					};
 				}
 			}
 		}
 	}
 	return {
 		totalTurns,
 		totalDurationMs,
 		totalDurationApiMs,
 		totalCostUsd,
 		model,
 		toolCalls,
 		finalOutput,
 		skills,
 		referenceFilesRead,
 		modelUsage,
 		totalInputTokens,
 		totalOutputTokens,
 		totalCacheReadTokens,
 		totalCacheCreationTokens,
 		toolErrorCount,
 		isError,
 		permissionDenialCount,
 	};
 }
--- a/packages/evals/src/types.ts
+++ b/packages/evals/src/types.ts
@@ -44,4 +44,39 @@ export interface EvalRunResult {
 	prompt?: string;
 	/** Per-test pass/fail results from vitest */
 	individualTests?: Record<string, boolean>;
 	/** Epoch ms when the agent run started (for Braintrust span timing) */
 	startedAt?: number;
 	/** API-only latency in ms (excludes local processing overhead) */
 	durationApiMs?: number;
 	/** Aggregate token counts from the result event */
 	totalInputTokens?: number;
 	totalOutputTokens?: number;
 	totalCacheReadTokens?: number;
 	totalCacheCreationTokens?: number;
 	/** Per-model token usage and cost breakdown */
 	modelUsage?: Record<
 		string,
 		{
 			inputTokens: number;
 			outputTokens: number;
 			cacheReadInputTokens: number;
 			cacheCreationInputTokens: number;
 			costUSD: number;
 		}
 	>;
 	/** Count of tool calls that returned is_error === true */
 	toolErrorCount?: number;
 	/** Count of permission_denials in the result event */
 	permissionDenialCount?: number;
 	/** Skills that were in the agent's context (from system init event) */
 	loadedSkills?: string[];
 	/** Basenames of skill reference files the agent read */
 	referenceFilesRead?: string[];
 	/** Computed scorer results */
 	scores?: {
 		skillUsage: number;
 		referenceFilesUsage: number;
 		assertionsPassed: number;
 		finalResult: number;
 	};
 }