remove some braintrust headers

This commit is contained in:
Pedro Rodrigues
2026-02-25 19:11:56 +00:00
parent 9b08864e94
commit e65642b752
10 changed files with 555 additions and 117 deletions

View File

@@ -76,16 +76,6 @@ mise run eval:upload
mise run --force eval:upload
```
Or directly (no caching, always runs):
```bash
cd packages/evals
npx tsx src/runner.ts
# Single scenario, baseline mode
EVAL_BASELINE=true EVAL_SCENARIO=auth-rls-new-project npx tsx src/runner.ts
```
## Baseline Mode
Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default,

View File

@@ -29,12 +29,13 @@ RUN npm --prefix packages/skills-build run build
# ---------- Stage 2: runtime ----------
FROM node:22-slim
# Install Docker CLI and curl (needed for supabase CLI install)
# Install Docker CLI, psql client, and curl (needed for supabase CLI install)
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
curl \
ca-certificates \
docker.io \
postgresql-client \
&& rm -rf /var/lib/apt/lists/*
# Install supabase CLI binary (pinned version)

View File

@@ -1,11 +1,21 @@
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join, resolve } from "node:path";
import { runAgent } from "./runner/agent.js";
import { uploadToBraintrust } from "./runner/braintrust.js";
import {
initBraintrustLogger,
logScenarioToLogger,
uploadToBraintrust,
} from "./runner/braintrust.js";
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js";
import { createWorkspace } from "./runner/scaffold.js";
import {
assertionsPassedScorer,
finalResultScorer,
referenceFilesUsageScorer,
skillUsageScorer,
} from "./runner/scorers.js";
import {
getKeys,
resetDB,
@@ -24,9 +34,11 @@ import type { EvalRunResult, EvalScenario } from "./types.js";
// ---------------------------------------------------------------------------
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";
const DEFAULT_SKILL = "supabase";
const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
const skillName = process.env.EVAL_SKILL ?? DEFAULT_SKILL;
const scenarioFilter = process.env.EVAL_SCENARIO;
const isBaseline = process.env.EVAL_BASELINE === "true";
const skillEnabled = !isBaseline;
@@ -107,12 +119,14 @@ async function runEval(
// 3. Run the agent
console.log(` Running agent (${model})...`);
const startedAt = Date.now();
const agentResult = await runAgent({
cwd: workspacePath,
prompt,
model,
timeout: AGENT_TIMEOUT,
skillEnabled,
skillName: skillEnabled ? skillName : undefined,
});
console.log(
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
@@ -149,6 +163,26 @@ async function runEval(
// 6. Build transcript summary
const summary = buildTranscriptSummary(agentResult.events);
// 7. Load expectedReferenceFiles from EVAL.ts (if declared)
const { expectedReferenceFiles = [] } = await import(evalFilePath).catch(
() => ({ expectedReferenceFiles: [] as string[] }),
);
// 8. Run scorers
const skillScore = skillUsageScorer(summary, skillName);
const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles);
const assertScore = assertionsPassedScorer({
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
status: testResult.passed ? "passed" : "failed",
} as EvalRunResult);
const finalScore = finalResultScorer({
status: testResult.passed ? "passed" : "failed",
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
passThreshold: passThreshold ?? undefined,
} as EvalRunResult);
const result: EvalRunResult = {
scenario: scenario.id,
agent: "claude-code",
@@ -166,6 +200,23 @@ async function runEval(
costUsd: summary.totalCostUsd ?? undefined,
prompt,
individualTests: testResult.individualTests,
startedAt,
durationApiMs: summary.totalDurationApiMs,
totalInputTokens: summary.totalInputTokens,
totalOutputTokens: summary.totalOutputTokens,
totalCacheReadTokens: summary.totalCacheReadTokens,
totalCacheCreationTokens: summary.totalCacheCreationTokens,
modelUsage: summary.modelUsage,
toolErrorCount: summary.toolErrorCount,
permissionDenialCount: summary.permissionDenialCount,
loadedSkills: summary.skills,
referenceFilesRead: summary.referenceFilesRead,
scores: {
skillUsage: skillScore.score,
referenceFilesUsage: refScore.score,
assertionsPassed: assertScore.score,
finalResult: finalScore.score,
},
};
// 7. Persist results
@@ -239,6 +290,9 @@ async function main() {
const results: EvalRunResult[] = [];
const transcripts = new Map<string, TranscriptSummary>();
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
const logger = braintrustUpload ? initBraintrustLogger() : undefined;
try {
for (const scenario of scenarios) {
// Reset the database before each scenario for a clean slate.
@@ -250,16 +304,22 @@ async function main() {
if (transcript) {
transcripts.set(result.scenario, transcript);
}
// Log immediately after each scenario for real-time visibility.
if (logger) {
logScenarioToLogger(logger, result, transcript);
}
}
} finally {
stopSupabase();
await logger?.flush();
}
// Use the results dir from the first result (all share the same timestamp)
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
printSummary(results, resultsDir);
if (process.env.BRAINTRUST_UPLOAD === "true") {
if (braintrustUpload) {
console.log("\nUploading to Braintrust...");
await uploadToBraintrust(results, {
model,

View File

@@ -26,6 +26,12 @@ export interface AgentRunResult {
* and has access to the local Supabase MCP server so it can apply migrations
* and query the real database. --strict-mcp-config ensures only the local
* Supabase instance is reachable — no host MCP servers leak in.
*
* --setting-sources project,local prevents skills from the user's global
* ~/.agents/skills/ from leaking into the eval environment.
*
* When skillEnabled, --agents injects the target skill directly into the
* agent's context, guaranteeing it is present (not just discoverable).
*/
export async function runAgent(opts: {
cwd: string;
@@ -33,6 +39,8 @@ export async function runAgent(opts: {
model: string;
timeout: number;
skillEnabled: boolean;
/** Skill name to inject via --agents (e.g. "supabase"). Used when skillEnabled. */
skillName?: string;
}): Promise<AgentRunResult> {
const start = Date.now();
@@ -62,10 +70,26 @@ export async function runAgent(opts: {
"--mcp-config",
mcpConfig,
"--strict-mcp-config",
// Prevent skills from the user's global ~/.agents/skills/ from leaking
// into the eval environment. Only workspace (project) and local sources
// are loaded, so the eval sees only what was explicitly installed.
"--setting-sources",
"project,local",
];
// Disable skills for baseline runs so the agent relies on innate knowledge
if (!opts.skillEnabled) {
if (opts.skillEnabled && opts.skillName) {
// Inject the target skill directly into the agent context via --agents.
// This guarantees the skill is embedded in the subagent's context at
// startup (not just available as a slash command).
const agentsDef = JSON.stringify({
main: {
description: `Supabase developer agent with ${opts.skillName} skill`,
skills: [opts.skillName],
},
});
args.push("--agents", agentsDef);
} else if (!opts.skillEnabled) {
// Baseline runs: disable all skills so the agent relies on innate knowledge
args.push("--disable-slash-commands");
}

View File

@@ -1,14 +1,119 @@
import assert from "node:assert";
import { init } from "braintrust";
import { init, initLogger, type Logger } from "braintrust";
import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js";
/** Convert a test name to a snake_case score key. */
function toScoreKey(name: string): string {
return `test_${name
.toLowerCase()
.replace(/[^a-z0-9]+/g, "_")
.replace(/^_|_$/g, "")}`;
/**
 * Initialize a Braintrust project logger for real-time per-scenario logging.
 * Call this once at startup and pass the logger to logScenarioToLogger().
 *
 * Requires BRAINTRUST_API_KEY and BRAINTRUST_PROJECT_ID in the environment;
 * fails fast with a clear message when either is missing.
 */
export function initBraintrustLogger(): Logger<true> {
  const { BRAINTRUST_API_KEY, BRAINTRUST_PROJECT_ID } = process.env;
  assert(BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
  assert(BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
  // asyncFlush batches log writes; main() awaits logger.flush() on shutdown.
  return initLogger({
    projectId: BRAINTRUST_PROJECT_ID,
    asyncFlush: true,
  });
}
/**
 * Log a single scenario result to the Braintrust project logger in real-time.
 * This runs alongside the experiment upload, giving immediate visibility in
 * the project log as each scenario completes.
 *
 * Creates one root span per scenario (named after the scenario and backdated
 * to r.startedAt when available). When a transcript with tool calls is
 * present, one child span is added per agent tool call.
 */
export function logScenarioToLogger(
  logger: Logger<true>,
  r: EvalRunResult,
  transcript?: TranscriptSummary,
): void {
  // Scores default to 0 when a scorer did not produce a value.
  const scores: Record<string, number> = {
    skill_usage: r.scores?.skillUsage ?? 0,
    reference_files_usage: r.scores?.referenceFilesUsage ?? 0,
    assertions_passed: r.scores?.assertionsPassed ?? 0,
    final_result: r.scores?.finalResult ?? 0,
  };
  const metadata: Record<string, unknown> = {
    agent: r.agent,
    model: r.model,
    skillEnabled: r.skillEnabled,
    testsPassed: r.testsPassed,
    testsTotal: r.testsTotal,
    toolCallCount: r.toolCallCount ?? 0,
    // Total context consumed: fresh input + cache reads + cache writes.
    contextWindowUsed:
      (r.totalInputTokens ?? 0) +
      (r.totalCacheReadTokens ?? 0) +
      (r.totalCacheCreationTokens ?? 0),
    totalOutputTokens: r.totalOutputTokens,
    modelUsage: r.modelUsage,
    toolErrorCount: r.toolErrorCount,
    permissionDenialCount: r.permissionDenialCount,
    loadedSkills: r.loadedSkills,
    referenceFilesRead: r.referenceFilesRead,
    ...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
    ...(r.error ? { error: r.error } : {}),
  };
  // startedAt is epoch ms; Braintrust span startTime is epoch seconds.
  const spanOptions = r.startedAt
    ? { name: r.scenario, startTime: r.startedAt / 1000 }
    : { name: r.scenario };
  // The two previous branches (with/without transcript) logged an identical
  // root-span payload and differed only in the child-span loop; unify them
  // into one traced call with an optional tool-call list.
  const toolCalls =
    transcript && transcript.toolCalls.length > 0 ? transcript.toolCalls : [];
  logger.traced((span) => {
    span.log({
      input: {
        scenario: r.scenario,
        prompt: r.prompt ?? "",
        skillEnabled: r.skillEnabled,
      },
      output: {
        status: r.status,
        agentOutput: r.agentOutput,
        filesModified: r.filesModified,
        testOutput: r.testOutput,
      },
      expected: { testsTotal: r.testsTotal },
      scores,
      metadata,
    });
    for (const tc of toolCalls) {
      span.traced(
        (childSpan) => {
          childSpan.log({
            input: { tool: tc.tool, args: tc.input },
            output: {
              preview: tc.outputPreview,
              isError: tc.isError,
              ...(tc.stderr ? { stderr: tc.stderr } : {}),
            },
            metadata: { toolUseId: tc.toolUseId },
          });
        },
        { name: `tool:${tc.tool}` },
      );
    }
  }, spanOptions);
}
/**
@@ -18,8 +123,8 @@ function toScoreKey(name: string): string {
* - input: scenario ID, prompt content, skillEnabled flag
* - output: status, agent output, files modified, test output
* - expected: total tests, pass threshold
* - scores: pass (0|1), test_pass_rate (0-1), per-test scores
* - metadata: model, duration, cost, tool call count, files modified
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
* - spans: one child span per agent tool call (when transcript available)
*/
export async function uploadToBraintrust(
@@ -50,20 +155,11 @@ export async function uploadToBraintrust(
for (const r of results) {
const transcript = opts.transcripts.get(r.scenario);
// Build per-test scores
const perTestScores: Record<string, number> = {};
if (r.individualTests) {
for (const [testName, didPass] of Object.entries(r.individualTests)) {
perTestScores[toScoreKey(testName)] = didPass ? 1 : 0;
}
}
const testPassRate = r.testsTotal > 0 ? r.testsPassed / r.testsTotal : 0;
const scores: Record<string, number> = {
pass: r.status === "passed" ? 1 : 0,
test_pass_rate: testPassRate,
...perTestScores,
skill_usage: r.scores?.skillUsage ?? 0,
reference_files_usage: r.scores?.referenceFilesUsage ?? 0,
assertions_passed: r.scores?.assertionsPassed ?? 0,
final_result: r.scores?.finalResult ?? 0,
};
const input = {
@@ -88,38 +184,52 @@ export async function uploadToBraintrust(
agent: r.agent,
model: r.model,
skillEnabled: r.skillEnabled,
duration: r.duration,
testsPassed: r.testsPassed,
testsTotal: r.testsTotal,
toolCallCount: r.toolCallCount ?? 0,
filesModified: r.filesModified,
contextWindowUsed:
(r.totalInputTokens ?? 0) +
(r.totalCacheReadTokens ?? 0) +
(r.totalCacheCreationTokens ?? 0),
totalOutputTokens: r.totalOutputTokens,
modelUsage: r.modelUsage,
toolErrorCount: r.toolErrorCount,
permissionDenialCount: r.permissionDenialCount,
loadedSkills: r.loadedSkills,
referenceFilesRead: r.referenceFilesRead,
...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
...(r.error ? { error: r.error } : {}),
};
if (transcript && transcript.toolCalls.length > 0) {
// Use traced() to create a root span with child spans for tool calls
experiment.traced(
(span) => {
span.log({ input, output, expected, scores, metadata });
const spanOptions = r.startedAt
? { name: r.scenario, startTime: r.startedAt / 1000 }
: { name: r.scenario };
for (const tc of transcript.toolCalls) {
span.traced(
(childSpan) => {
childSpan.log({
input: { tool: tc.tool, args: tc.input },
output: { preview: tc.outputPreview },
metadata: { toolUseId: tc.toolUseId },
});
},
{ name: `tool:${tc.tool}` },
);
}
},
{ name: r.scenario },
);
if (transcript && transcript.toolCalls.length > 0) {
experiment.traced((span) => {
span.log({ input, output, expected, scores, metadata });
for (const tc of transcript.toolCalls) {
span.traced(
(childSpan) => {
childSpan.log({
input: { tool: tc.tool, args: tc.input },
output: {
preview: tc.outputPreview,
isError: tc.isError,
...(tc.stderr ? { stderr: tc.stderr } : {}),
},
metadata: { toolUseId: tc.toolUseId },
});
},
{ name: `tool:${tc.tool}` },
);
}
}, spanOptions);
} else {
experiment.log({ input, output, expected, scores, metadata });
experiment.traced((span) => {
span.log({ input, output, expected, scores, metadata });
}, spanOptions);
}
}

View File

@@ -1,4 +1,3 @@
import { execFileSync } from "node:child_process";
import {
cpSync,
existsSync,
@@ -6,43 +5,21 @@ import {
mkdtempSync,
readdirSync,
rmSync,
writeFileSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { dirname, join, resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { join } from "node:path";
import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/** Resolve the `skills` binary from the evals package node_modules. */
function resolveSkillsBin(): string {
// __dirname is packages/evals/src/runner/ (or compiled equivalent)
// Walk up to packages/evals/ and into node_modules/.bin/skills
const bin = resolve(__dirname, "..", "..", "node_modules", ".bin", "skills");
if (existsSync(bin)) return bin;
throw new Error(`skills binary not found at ${bin}. Run npm install.`);
}
/** Walk up from cwd to find the repository root (contains skills/ and packages/). */
function findRepoRoot(): string {
let dir = process.cwd();
for (let i = 0; i < 10; i++) {
if (existsSync(join(dir, "skills")) && existsSync(join(dir, "packages"))) {
return dir;
}
const parent = resolve(dir, "..");
if (parent === dir) break;
dir = parent;
}
throw new Error("Could not find repository root (skills/ + packages/)");
}
/**
* Create an isolated workspace for an eval run.
*
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
* 2. Optionally install skills via the `skills` CLI so Claude Code can discover them
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts/EVAL.tsx)
* 2. Seed with the eval project's supabase/config.toml
*
* Skills are injected via the --agents flag in agent.ts (not installed into
* the workspace here). Combined with --setting-sources project,local, this
* prevents host ~/.agents/skills/ from leaking into the eval environment.
*
* Returns the path to the workspace and a cleanup function.
*/
@@ -50,10 +27,9 @@ export function createWorkspace(opts: {
evalDir: string;
skillEnabled: boolean;
}): { workspacePath: string; cleanup: () => void } {
const repoRoot = findRepoRoot();
const workspacePath = mkdtempSync(join(tmpdir(), "supabase-eval-"));
// Copy eval directory, excluding EVAL.ts (hidden from agent)
// Copy eval directory, excluding EVAL.ts/EVAL.tsx (hidden from agent)
const entries = readdirSync(opts.evalDir, { withFileTypes: true });
for (const entry of entries) {
if (entry.name === "EVAL.ts" || entry.name === "EVAL.tsx") continue;
@@ -62,6 +38,23 @@ export function createWorkspace(opts: {
cpSync(src, dest, { recursive: true });
}
// Add .mcp.json so the agent connects to the local Supabase MCP server
writeFileSync(
join(workspacePath, ".mcp.json"),
JSON.stringify(
{
mcpServers: {
"local-supabase": {
type: "http",
url: "http://localhost:54321/mcp",
},
},
},
null,
"\t",
),
);
// Seed the workspace with the eval project's supabase/config.toml so the
// agent can run `supabase db push` against the shared local instance without
// needing to run `supabase init` or `supabase start` first.
@@ -72,26 +65,6 @@ export function createWorkspace(opts: {
cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
}
// Install skills into the workspace via the `skills` CLI
if (opts.skillEnabled) {
const skillsDir = join(repoRoot, "skills");
if (existsSync(skillsDir)) {
const skillsBin = resolveSkillsBin();
const args = ["add", skillsDir, "-a", "claude-code", "-y"];
const skillFilter = process.env.EVAL_SKILL;
if (skillFilter) {
args.push("--skill", skillFilter);
}
execFileSync(skillsBin, args, {
cwd: workspacePath,
stdio: "pipe",
timeout: 60_000,
});
}
}
return {
workspacePath,
cleanup: () => {

View File

@@ -0,0 +1,94 @@
import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js";
/** Result of a single scorer: a normalized score plus debugging context. */
export interface ScoreResult {
  /** Scorer identifier, e.g. "skill_usage" (snake_case, used as Braintrust score key). */
  name: string;
  /** Normalized score in the range 0.0–1.0. */
  score: number;
  /** Optional scorer-specific context for debugging (expected vs actual, etc.). */
  metadata?: Record<string, unknown>;
}
/**
 * skillUsageScorer — binary score: 1 when the target skill was loaded into
 * the agent's context, 0 otherwise.
 *
 * Detection relies on the `skills` array captured from the system init event
 * of the NDJSON transcript. Because agent.ts passes
 * `--setting-sources project,local`, no host skills leak into that array,
 * making membership a reliable signal.
 */
export function skillUsageScorer(
  transcript: TranscriptSummary,
  skillName: string,
): ScoreResult {
  const loadedSkills = transcript.skills;
  const score = loadedSkills.some((s) => s === skillName) ? 1 : 0;
  return {
    name: "skill_usage",
    score,
    metadata: { loadedSkills, targetSkill: skillName },
  };
}
/**
 * referenceFilesUsageScorer — fraction of expected reference files the agent
 * actually read (1.0 with `skipped` metadata when none are expected).
 *
 * Reads are detected via Read tool calls whose file_path matches
 * "/.agents/skills/*\/references/". Each EVAL.ts declares its
 * expectedReferenceFiles, which should mirror the "Skill References
 * Exercised" table in the corresponding scenarios/*.md file.
 */
export function referenceFilesUsageScorer(
  transcript: TranscriptSummary,
  expectedReferenceFiles: string[],
): ScoreResult {
  const total = expectedReferenceFiles.length;
  if (total === 0) {
    // Nothing declared for this scenario — treat as a perfect, skipped score.
    return { name: "reference_files_usage", score: 1, metadata: { skipped: true } };
  }
  const readSet = new Set(transcript.referenceFilesRead);
  let hits = 0;
  for (const expected of expectedReferenceFiles) {
    if (readSet.has(expected)) {
      hits += 1;
    }
  }
  return {
    name: "reference_files_usage",
    score: hits / total,
    metadata: {
      expected: expectedReferenceFiles,
      read: transcript.referenceFilesRead,
      hits,
      total,
    },
  };
}
/**
 * assertionsPassedScorer — fraction of vitest assertions that passed.
 * Returns 0 when no tests ran (avoids division by zero).
 */
export function assertionsPassedScorer(result: EvalRunResult): ScoreResult {
  const { testsPassed, testsTotal } = result;
  return {
    name: "assertions_passed",
    score: testsTotal > 0 ? testsPassed / testsTotal : 0,
    metadata: { passed: testsPassed, total: testsTotal },
  };
}
/**
 * finalResultScorer — binary pass/fail for the scenario as a whole.
 *
 * A result is "passed" when assertionsPassed >= passThreshold (declared per
 * scenario in scenarios/*.md). This is the binary outcome that Braintrust
 * comparisons key on.
 */
export function finalResultScorer(result: EvalRunResult): ScoreResult {
  const passed = result.status === "passed";
  return {
    name: "final_result",
    score: passed ? 1 : 0,
    metadata: {
      testsPassed: result.testsPassed,
      testsTotal: result.testsTotal,
      passThreshold: result.passThreshold,
    },
  };
}

View File

@@ -1,3 +1,5 @@
import { basename } from "node:path";
export interface TranscriptEvent {
type: string;
[key: string]: unknown;
@@ -9,15 +11,45 @@ export interface ToolCallSummary {
input: Record<string, unknown>;
/** First ~200 chars of output for quick scanning */
outputPreview: string;
/** Whether the tool call returned an error */
isError: boolean;
/** stderr output for Bash tool calls */
stderr: string;
}
/** Per-model token usage and cost, parsed from the result event's modelUsage field. */
export interface ModelUsage {
  /** Uncached input tokens sent to this model (from inputTokens). */
  inputTokens: number;
  /** Tokens generated by this model (from outputTokens). */
  outputTokens: number;
  /** Input tokens served from the prompt cache (from cacheReadInputTokens). */
  cacheReadInputTokens: number;
  /** Input tokens written into the prompt cache (from cacheCreationInputTokens). */
  cacheCreationInputTokens: number;
  /** Cost in USD attributed to this model (from costUSD). */
  costUSD: number;
}
/** Aggregated view of one agent run's NDJSON transcript, built by buildTranscriptSummary(). */
export interface TranscriptSummary {
  /** Number of agent turns (num_turns from the result event). */
  totalTurns: number;
  /** Wall-clock duration in ms (duration_ms from the result event). */
  totalDurationMs: number;
  /** API-only latency (excludes local processing overhead) */
  totalDurationApiMs: number;
  /** Total cost in USD, or null when the result event omits it. */
  totalCostUsd: number | null;
  /** Model name from the system init event, or null if absent. */
  model: string | null;
  /** Every tool_use block seen, in order, with matched output previews. */
  toolCalls: ToolCallSummary[];
  /** Final text result of the run (empty string when none). */
  finalOutput: string;
  /** Skills listed in the system init event (loaded into agent context) */
  skills: string[];
  /** Basenames of reference files the agent read via the Read tool */
  referenceFilesRead: string[];
  /** Per-model token usage and cost breakdown */
  modelUsage: Record<string, ModelUsage>;
  // Aggregate token counts from the result event's usage field.
  totalInputTokens: number;
  totalOutputTokens: number;
  totalCacheReadTokens: number;
  totalCacheCreationTokens: number;
  /** Count of tool calls that returned is_error === true */
  toolErrorCount: number;
  /** Whether the overall session ended in an error */
  isError: boolean;
  /** Count of permission_denials in the result event */
  permissionDenialCount: number;
}
/** Parse a single NDJSON line. Returns null on empty or invalid input. */
@@ -74,6 +106,13 @@ export function extractFinalOutput(events: TranscriptEvent[]): string {
return "";
}
/**
 * Return true if a file path points to a skill reference file, i.e. it lives
 * under a skill's references/ directory inside .agents/skills/.
 */
function isReferenceFilePath(filePath: string): boolean {
  const underSkills = filePath.includes("/.agents/skills/");
  const underReferences = filePath.includes("/references/");
  return underSkills && underReferences;
}
/** Walk parsed events to build a transcript summary. */
export function buildTranscriptSummary(
events: TranscriptEvent[],
@@ -81,16 +120,30 @@ export function buildTranscriptSummary(
const toolCalls: ToolCallSummary[] = [];
let finalOutput = "";
let totalDurationMs = 0;
let totalDurationApiMs = 0;
let totalCostUsd: number | null = null;
let model: string | null = null;
let totalTurns = 0;
let skills: string[] = [];
const referenceFilesRead: string[] = [];
let modelUsage: Record<string, ModelUsage> = {};
let totalInputTokens = 0;
let totalOutputTokens = 0;
let totalCacheReadTokens = 0;
let totalCacheCreationTokens = 0;
let toolErrorCount = 0;
let isError = false;
let permissionDenialCount = 0;
for (const event of events) {
const e = event as Record<string, unknown>;
// System init: extract model
// System init: extract model and loaded skills
if (e.type === "system" && e.subtype === "init") {
model = typeof e.model === "string" ? e.model : null;
if (Array.isArray(e.skills)) {
skills = e.skills.filter((s): s is string => typeof s === "string");
}
}
// Assistant messages: extract tool_use blocks
@@ -100,12 +153,27 @@ export function buildTranscriptSummary(
if (Array.isArray(content)) {
for (const block of content) {
if (block.type === "tool_use") {
toolCalls.push({
const toolCall: ToolCallSummary = {
tool: block.name ?? "unknown",
toolUseId: block.id ?? "",
input: block.input ?? {},
outputPreview: "",
});
isError: false,
stderr: "",
};
toolCalls.push(toolCall);
// Track reference file reads
if (
block.name === "Read" &&
typeof block.input?.file_path === "string" &&
isReferenceFilePath(block.input.file_path)
) {
const base = basename(block.input.file_path);
if (!referenceFilesRead.includes(base)) {
referenceFilesRead.push(base);
}
}
}
}
}
@@ -127,28 +195,107 @@ export function buildTranscriptSummary(
? block.content
: JSON.stringify(block.content);
matching.outputPreview = text.slice(0, 200);
// Capture error state from tool result
if (block.is_error === true) {
matching.isError = true;
toolErrorCount++;
}
}
}
}
}
// Capture stderr from tool_use_result (Bash tool emits this at the user event level)
const toolUseResult = e.tool_use_result as
| Record<string, unknown>
| undefined;
if (toolUseResult && typeof toolUseResult.stderr === "string") {
// Match to the most recent Bash tool call without stderr set
const lastBash = [...toolCalls]
.reverse()
.find((tc) => tc.tool === "Bash" && !tc.stderr);
if (lastBash) {
lastBash.stderr = toolUseResult.stderr;
}
}
}
// Result event: final output, cost, duration, turns
// Result event: final output, cost, duration, turns, token usage
if (e.type === "result") {
finalOutput = typeof e.result === "string" ? e.result : "";
totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0;
totalDurationApiMs =
typeof e.duration_api_ms === "number" ? e.duration_api_ms : 0;
totalCostUsd =
typeof e.total_cost_usd === "number" ? e.total_cost_usd : null;
totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0;
isError = e.is_error === true;
permissionDenialCount = Array.isArray(e.permission_denials)
? e.permission_denials.length
: 0;
// Aggregate token usage from the result event's usage field
const usage = e.usage as Record<string, unknown> | undefined;
if (usage) {
totalInputTokens =
typeof usage.input_tokens === "number" ? usage.input_tokens : 0;
totalOutputTokens =
typeof usage.output_tokens === "number" ? usage.output_tokens : 0;
totalCacheReadTokens =
typeof usage.cache_read_input_tokens === "number"
? usage.cache_read_input_tokens
: 0;
totalCacheCreationTokens =
typeof usage.cache_creation_input_tokens === "number"
? usage.cache_creation_input_tokens
: 0;
}
// Per-model usage breakdown (modelUsage keyed by model name)
const rawModelUsage = e.modelUsage as
| Record<string, Record<string, unknown>>
| undefined;
if (rawModelUsage) {
modelUsage = {};
for (const [modelName, mu] of Object.entries(rawModelUsage)) {
modelUsage[modelName] = {
inputTokens:
typeof mu.inputTokens === "number" ? mu.inputTokens : 0,
outputTokens:
typeof mu.outputTokens === "number" ? mu.outputTokens : 0,
cacheReadInputTokens:
typeof mu.cacheReadInputTokens === "number"
? mu.cacheReadInputTokens
: 0,
cacheCreationInputTokens:
typeof mu.cacheCreationInputTokens === "number"
? mu.cacheCreationInputTokens
: 0,
costUSD: typeof mu.costUSD === "number" ? mu.costUSD : 0,
};
}
}
}
}
return {
totalTurns,
totalDurationMs,
totalDurationApiMs,
totalCostUsd,
model,
toolCalls,
finalOutput,
skills,
referenceFilesRead,
modelUsage,
totalInputTokens,
totalOutputTokens,
totalCacheReadTokens,
totalCacheCreationTokens,
toolErrorCount,
isError,
permissionDenialCount,
};
}

View File

@@ -44,4 +44,39 @@ export interface EvalRunResult {
prompt?: string;
/** Per-test pass/fail results from vitest */
individualTests?: Record<string, boolean>;
/** Epoch ms when the agent run started (for Braintrust span timing) */
startedAt?: number;
/** API-only latency in ms (excludes local processing overhead) */
durationApiMs?: number;
/** Aggregate token counts from the result event */
totalInputTokens?: number;
totalOutputTokens?: number;
totalCacheReadTokens?: number;
totalCacheCreationTokens?: number;
/** Per-model token usage and cost breakdown */
modelUsage?: Record<
string,
{
inputTokens: number;
outputTokens: number;
cacheReadInputTokens: number;
cacheCreationInputTokens: number;
costUSD: number;
}
>;
/** Count of tool calls that returned is_error === true */
toolErrorCount?: number;
/** Count of permission_denials in the result event */
permissionDenialCount?: number;
/** Skills that were in the agent's context (from system init event) */
loadedSkills?: string[];
/** Basenames of skill reference files the agent read */
referenceFilesRead?: string[];
/** Computed scorer results */
scores?: {
skillUsage: number;
referenceFilesUsage: number;
assertionsPassed: number;
finalResult: number;
};
}