mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
remove some braintrust headers
This commit is contained in:
@@ -76,6 +76,10 @@ docker run --rm \
|
|||||||
-e BRAINTRUST_PROJECT_ID \
|
-e BRAINTRUST_PROJECT_ID \
|
||||||
-e EVAL_RESULTS_DIR=/app/results \
|
-e EVAL_RESULTS_DIR=/app/results \
|
||||||
-v "$(pwd)/packages/evals/results:/app/results" \
|
-v "$(pwd)/packages/evals/results:/app/results" \
|
||||||
|
-v "$(pwd)/packages/evals/project:/app/packages/evals/project" \
|
||||||
|
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||||
|
--group-add 0 \
|
||||||
|
--network host \
|
||||||
supabase-evals:local
|
supabase-evals:local
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|||||||
@@ -76,16 +76,6 @@ mise run eval:upload
|
|||||||
mise run --force eval:upload
|
mise run --force eval:upload
|
||||||
```
|
```
|
||||||
|
|
||||||
Or directly (no caching, always runs):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd packages/evals
|
|
||||||
npx tsx src/runner.ts
|
|
||||||
|
|
||||||
# Single scenario, baseline mode
|
|
||||||
EVAL_BASELINE=true EVAL_SCENARIO=auth-rls-new-project npx tsx src/runner.ts
|
|
||||||
```
|
|
||||||
|
|
||||||
## Baseline Mode
|
## Baseline Mode
|
||||||
|
|
||||||
Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default,
|
Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default,
|
||||||
|
|||||||
@@ -29,12 +29,13 @@ RUN npm --prefix packages/skills-build run build
|
|||||||
# ---------- Stage 2: runtime ----------
|
# ---------- Stage 2: runtime ----------
|
||||||
FROM node:22-slim
|
FROM node:22-slim
|
||||||
|
|
||||||
# Install Docker CLI and curl (needed for supabase CLI install)
|
# Install Docker CLI, psql client, and curl (needed for supabase CLI install)
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
git \
|
git \
|
||||||
curl \
|
curl \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
docker.io \
|
docker.io \
|
||||||
|
postgresql-client \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Install supabase CLI binary (pinned version)
|
# Install supabase CLI binary (pinned version)
|
||||||
|
|||||||
@@ -1,11 +1,21 @@
|
|||||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||||
import { join, resolve } from "node:path";
|
import { join, resolve } from "node:path";
|
||||||
import { runAgent } from "./runner/agent.js";
|
import { runAgent } from "./runner/agent.js";
|
||||||
import { uploadToBraintrust } from "./runner/braintrust.js";
|
import {
|
||||||
|
initBraintrustLogger,
|
||||||
|
logScenarioToLogger,
|
||||||
|
uploadToBraintrust,
|
||||||
|
} from "./runner/braintrust.js";
|
||||||
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
||||||
import { preflight } from "./runner/preflight.js";
|
import { preflight } from "./runner/preflight.js";
|
||||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||||
import { createWorkspace } from "./runner/scaffold.js";
|
import { createWorkspace } from "./runner/scaffold.js";
|
||||||
|
import {
|
||||||
|
assertionsPassedScorer,
|
||||||
|
finalResultScorer,
|
||||||
|
referenceFilesUsageScorer,
|
||||||
|
skillUsageScorer,
|
||||||
|
} from "./runner/scorers.js";
|
||||||
import {
|
import {
|
||||||
getKeys,
|
getKeys,
|
||||||
resetDB,
|
resetDB,
|
||||||
@@ -24,9 +34,11 @@ import type { EvalRunResult, EvalScenario } from "./types.js";
|
|||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";
|
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";
|
||||||
|
const DEFAULT_SKILL = "supabase";
|
||||||
const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes
|
const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes
|
||||||
|
|
||||||
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
|
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
|
||||||
|
const skillName = process.env.EVAL_SKILL ?? DEFAULT_SKILL;
|
||||||
const scenarioFilter = process.env.EVAL_SCENARIO;
|
const scenarioFilter = process.env.EVAL_SCENARIO;
|
||||||
const isBaseline = process.env.EVAL_BASELINE === "true";
|
const isBaseline = process.env.EVAL_BASELINE === "true";
|
||||||
const skillEnabled = !isBaseline;
|
const skillEnabled = !isBaseline;
|
||||||
@@ -107,12 +119,14 @@ async function runEval(
|
|||||||
|
|
||||||
// 3. Run the agent
|
// 3. Run the agent
|
||||||
console.log(` Running agent (${model})...`);
|
console.log(` Running agent (${model})...`);
|
||||||
|
const startedAt = Date.now();
|
||||||
const agentResult = await runAgent({
|
const agentResult = await runAgent({
|
||||||
cwd: workspacePath,
|
cwd: workspacePath,
|
||||||
prompt,
|
prompt,
|
||||||
model,
|
model,
|
||||||
timeout: AGENT_TIMEOUT,
|
timeout: AGENT_TIMEOUT,
|
||||||
skillEnabled,
|
skillEnabled,
|
||||||
|
skillName: skillEnabled ? skillName : undefined,
|
||||||
});
|
});
|
||||||
console.log(
|
console.log(
|
||||||
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
|
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
|
||||||
@@ -149,6 +163,26 @@ async function runEval(
|
|||||||
// 6. Build transcript summary
|
// 6. Build transcript summary
|
||||||
const summary = buildTranscriptSummary(agentResult.events);
|
const summary = buildTranscriptSummary(agentResult.events);
|
||||||
|
|
||||||
|
// 7. Load expectedReferenceFiles from EVAL.ts (if declared)
|
||||||
|
const { expectedReferenceFiles = [] } = await import(evalFilePath).catch(
|
||||||
|
() => ({ expectedReferenceFiles: [] as string[] }),
|
||||||
|
);
|
||||||
|
|
||||||
|
// 8. Run scorers
|
||||||
|
const skillScore = skillUsageScorer(summary, skillName);
|
||||||
|
const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles);
|
||||||
|
const assertScore = assertionsPassedScorer({
|
||||||
|
testsPassed: testResult.passedCount,
|
||||||
|
testsTotal: testResult.totalCount,
|
||||||
|
status: testResult.passed ? "passed" : "failed",
|
||||||
|
} as EvalRunResult);
|
||||||
|
const finalScore = finalResultScorer({
|
||||||
|
status: testResult.passed ? "passed" : "failed",
|
||||||
|
testsPassed: testResult.passedCount,
|
||||||
|
testsTotal: testResult.totalCount,
|
||||||
|
passThreshold: passThreshold ?? undefined,
|
||||||
|
} as EvalRunResult);
|
||||||
|
|
||||||
const result: EvalRunResult = {
|
const result: EvalRunResult = {
|
||||||
scenario: scenario.id,
|
scenario: scenario.id,
|
||||||
agent: "claude-code",
|
agent: "claude-code",
|
||||||
@@ -166,6 +200,23 @@ async function runEval(
|
|||||||
costUsd: summary.totalCostUsd ?? undefined,
|
costUsd: summary.totalCostUsd ?? undefined,
|
||||||
prompt,
|
prompt,
|
||||||
individualTests: testResult.individualTests,
|
individualTests: testResult.individualTests,
|
||||||
|
startedAt,
|
||||||
|
durationApiMs: summary.totalDurationApiMs,
|
||||||
|
totalInputTokens: summary.totalInputTokens,
|
||||||
|
totalOutputTokens: summary.totalOutputTokens,
|
||||||
|
totalCacheReadTokens: summary.totalCacheReadTokens,
|
||||||
|
totalCacheCreationTokens: summary.totalCacheCreationTokens,
|
||||||
|
modelUsage: summary.modelUsage,
|
||||||
|
toolErrorCount: summary.toolErrorCount,
|
||||||
|
permissionDenialCount: summary.permissionDenialCount,
|
||||||
|
loadedSkills: summary.skills,
|
||||||
|
referenceFilesRead: summary.referenceFilesRead,
|
||||||
|
scores: {
|
||||||
|
skillUsage: skillScore.score,
|
||||||
|
referenceFilesUsage: refScore.score,
|
||||||
|
assertionsPassed: assertScore.score,
|
||||||
|
finalResult: finalScore.score,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
// 7. Persist results
|
// 7. Persist results
|
||||||
@@ -239,6 +290,9 @@ async function main() {
|
|||||||
const results: EvalRunResult[] = [];
|
const results: EvalRunResult[] = [];
|
||||||
const transcripts = new Map<string, TranscriptSummary>();
|
const transcripts = new Map<string, TranscriptSummary>();
|
||||||
|
|
||||||
|
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
|
||||||
|
const logger = braintrustUpload ? initBraintrustLogger() : undefined;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (const scenario of scenarios) {
|
for (const scenario of scenarios) {
|
||||||
// Reset the database before each scenario for a clean slate.
|
// Reset the database before each scenario for a clean slate.
|
||||||
@@ -250,16 +304,22 @@ async function main() {
|
|||||||
if (transcript) {
|
if (transcript) {
|
||||||
transcripts.set(result.scenario, transcript);
|
transcripts.set(result.scenario, transcript);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Log immediately after each scenario for real-time visibility.
|
||||||
|
if (logger) {
|
||||||
|
logScenarioToLogger(logger, result, transcript);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
stopSupabase();
|
stopSupabase();
|
||||||
|
await logger?.flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use the results dir from the first result (all share the same timestamp)
|
// Use the results dir from the first result (all share the same timestamp)
|
||||||
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
|
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
|
||||||
printSummary(results, resultsDir);
|
printSummary(results, resultsDir);
|
||||||
|
|
||||||
if (process.env.BRAINTRUST_UPLOAD === "true") {
|
if (braintrustUpload) {
|
||||||
console.log("\nUploading to Braintrust...");
|
console.log("\nUploading to Braintrust...");
|
||||||
await uploadToBraintrust(results, {
|
await uploadToBraintrust(results, {
|
||||||
model,
|
model,
|
||||||
|
|||||||
@@ -26,6 +26,12 @@ export interface AgentRunResult {
|
|||||||
* and has access to the local Supabase MCP server so it can apply migrations
|
* and has access to the local Supabase MCP server so it can apply migrations
|
||||||
* and query the real database. --strict-mcp-config ensures only the local
|
* and query the real database. --strict-mcp-config ensures only the local
|
||||||
* Supabase instance is reachable — no host MCP servers leak in.
|
* Supabase instance is reachable — no host MCP servers leak in.
|
||||||
|
*
|
||||||
|
* --setting-sources project,local prevents skills from the user's global
|
||||||
|
* ~/.agents/skills/ from leaking into the eval environment.
|
||||||
|
*
|
||||||
|
* When skillEnabled, --agents injects the target skill directly into the
|
||||||
|
* agent's context, guaranteeing it is present (not just discoverable).
|
||||||
*/
|
*/
|
||||||
export async function runAgent(opts: {
|
export async function runAgent(opts: {
|
||||||
cwd: string;
|
cwd: string;
|
||||||
@@ -33,6 +39,8 @@ export async function runAgent(opts: {
|
|||||||
model: string;
|
model: string;
|
||||||
timeout: number;
|
timeout: number;
|
||||||
skillEnabled: boolean;
|
skillEnabled: boolean;
|
||||||
|
/** Skill name to inject via --agents (e.g. "supabase"). Used when skillEnabled. */
|
||||||
|
skillName?: string;
|
||||||
}): Promise<AgentRunResult> {
|
}): Promise<AgentRunResult> {
|
||||||
const start = Date.now();
|
const start = Date.now();
|
||||||
|
|
||||||
@@ -62,10 +70,26 @@ export async function runAgent(opts: {
|
|||||||
"--mcp-config",
|
"--mcp-config",
|
||||||
mcpConfig,
|
mcpConfig,
|
||||||
"--strict-mcp-config",
|
"--strict-mcp-config",
|
||||||
|
// Prevent skills from the user's global ~/.agents/skills/ from leaking
|
||||||
|
// into the eval environment. Only workspace (project) and local sources
|
||||||
|
// are loaded, so the eval sees only what was explicitly installed.
|
||||||
|
"--setting-sources",
|
||||||
|
"project,local",
|
||||||
];
|
];
|
||||||
|
|
||||||
// Disable skills for baseline runs so the agent relies on innate knowledge
|
if (opts.skillEnabled && opts.skillName) {
|
||||||
if (!opts.skillEnabled) {
|
// Inject the target skill directly into the agent context via --agents.
|
||||||
|
// This guarantees the skill is embedded in the subagent's context at
|
||||||
|
// startup (not just available as a slash command).
|
||||||
|
const agentsDef = JSON.stringify({
|
||||||
|
main: {
|
||||||
|
description: `Supabase developer agent with ${opts.skillName} skill`,
|
||||||
|
skills: [opts.skillName],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
args.push("--agents", agentsDef);
|
||||||
|
} else if (!opts.skillEnabled) {
|
||||||
|
// Baseline runs: disable all skills so the agent relies on innate knowledge
|
||||||
args.push("--disable-slash-commands");
|
args.push("--disable-slash-commands");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,119 @@
|
|||||||
import assert from "node:assert";
|
import assert from "node:assert";
|
||||||
import { init } from "braintrust";
|
import { init, initLogger, type Logger } from "braintrust";
|
||||||
import type { EvalRunResult } from "../types.js";
|
import type { EvalRunResult } from "../types.js";
|
||||||
import type { TranscriptSummary } from "./transcript.js";
|
import type { TranscriptSummary } from "./transcript.js";
|
||||||
|
|
||||||
/** Convert a test name to a snake_case score key. */
|
/**
|
||||||
function toScoreKey(name: string): string {
|
* Initialize a Braintrust project logger for real-time per-scenario logging.
|
||||||
return `test_${name
|
* Call this once at startup and pass the logger to logScenarioToLogger().
|
||||||
.toLowerCase()
|
*/
|
||||||
.replace(/[^a-z0-9]+/g, "_")
|
export function initBraintrustLogger(): Logger<true> {
|
||||||
.replace(/^_|_$/g, "")}`;
|
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||||
|
assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
|
||||||
|
return initLogger({
|
||||||
|
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||||
|
asyncFlush: true,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Log a single scenario result to the Braintrust project logger in real-time.
|
||||||
|
* This runs alongside the experiment upload, giving immediate visibility in
|
||||||
|
* the project log as each scenario completes.
|
||||||
|
*/
|
||||||
|
export function logScenarioToLogger(
|
||||||
|
logger: Logger<true>,
|
||||||
|
r: EvalRunResult,
|
||||||
|
transcript?: TranscriptSummary,
|
||||||
|
): void {
|
||||||
|
const scores: Record<string, number> = {
|
||||||
|
skill_usage: r.scores?.skillUsage ?? 0,
|
||||||
|
reference_files_usage: r.scores?.referenceFilesUsage ?? 0,
|
||||||
|
assertions_passed: r.scores?.assertionsPassed ?? 0,
|
||||||
|
final_result: r.scores?.finalResult ?? 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
const metadata: Record<string, unknown> = {
|
||||||
|
agent: r.agent,
|
||||||
|
model: r.model,
|
||||||
|
skillEnabled: r.skillEnabled,
|
||||||
|
testsPassed: r.testsPassed,
|
||||||
|
testsTotal: r.testsTotal,
|
||||||
|
toolCallCount: r.toolCallCount ?? 0,
|
||||||
|
contextWindowUsed:
|
||||||
|
(r.totalInputTokens ?? 0) +
|
||||||
|
(r.totalCacheReadTokens ?? 0) +
|
||||||
|
(r.totalCacheCreationTokens ?? 0),
|
||||||
|
totalOutputTokens: r.totalOutputTokens,
|
||||||
|
modelUsage: r.modelUsage,
|
||||||
|
toolErrorCount: r.toolErrorCount,
|
||||||
|
permissionDenialCount: r.permissionDenialCount,
|
||||||
|
loadedSkills: r.loadedSkills,
|
||||||
|
referenceFilesRead: r.referenceFilesRead,
|
||||||
|
...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
|
||||||
|
...(r.error ? { error: r.error } : {}),
|
||||||
|
};
|
||||||
|
|
||||||
|
const spanOptions = r.startedAt
|
||||||
|
? { name: r.scenario, startTime: r.startedAt / 1000 }
|
||||||
|
: { name: r.scenario };
|
||||||
|
|
||||||
|
if (transcript && transcript.toolCalls.length > 0) {
|
||||||
|
logger.traced((span) => {
|
||||||
|
span.log({
|
||||||
|
input: {
|
||||||
|
scenario: r.scenario,
|
||||||
|
prompt: r.prompt ?? "",
|
||||||
|
skillEnabled: r.skillEnabled,
|
||||||
|
},
|
||||||
|
output: {
|
||||||
|
status: r.status,
|
||||||
|
agentOutput: r.agentOutput,
|
||||||
|
filesModified: r.filesModified,
|
||||||
|
testOutput: r.testOutput,
|
||||||
|
},
|
||||||
|
expected: { testsTotal: r.testsTotal },
|
||||||
|
scores,
|
||||||
|
metadata,
|
||||||
|
});
|
||||||
|
|
||||||
|
for (const tc of transcript.toolCalls) {
|
||||||
|
span.traced(
|
||||||
|
(childSpan) => {
|
||||||
|
childSpan.log({
|
||||||
|
input: { tool: tc.tool, args: tc.input },
|
||||||
|
output: {
|
||||||
|
preview: tc.outputPreview,
|
||||||
|
isError: tc.isError,
|
||||||
|
...(tc.stderr ? { stderr: tc.stderr } : {}),
|
||||||
|
},
|
||||||
|
metadata: { toolUseId: tc.toolUseId },
|
||||||
|
});
|
||||||
|
},
|
||||||
|
{ name: `tool:${tc.tool}` },
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}, spanOptions);
|
||||||
|
} else {
|
||||||
|
logger.traced((span) => {
|
||||||
|
span.log({
|
||||||
|
input: {
|
||||||
|
scenario: r.scenario,
|
||||||
|
prompt: r.prompt ?? "",
|
||||||
|
skillEnabled: r.skillEnabled,
|
||||||
|
},
|
||||||
|
output: {
|
||||||
|
status: r.status,
|
||||||
|
agentOutput: r.agentOutput,
|
||||||
|
filesModified: r.filesModified,
|
||||||
|
testOutput: r.testOutput,
|
||||||
|
},
|
||||||
|
expected: { testsTotal: r.testsTotal },
|
||||||
|
scores,
|
||||||
|
metadata,
|
||||||
|
});
|
||||||
|
}, spanOptions);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -18,8 +123,8 @@ function toScoreKey(name: string): string {
|
|||||||
* - input: scenario ID, prompt content, skillEnabled flag
|
* - input: scenario ID, prompt content, skillEnabled flag
|
||||||
* - output: status, agent output, files modified, test output
|
* - output: status, agent output, files modified, test output
|
||||||
* - expected: total tests, pass threshold
|
* - expected: total tests, pass threshold
|
||||||
* - scores: pass (0|1), test_pass_rate (0-1), per-test scores
|
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
|
||||||
* - metadata: model, duration, cost, tool call count, files modified
|
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
|
||||||
* - spans: one child span per agent tool call (when transcript available)
|
* - spans: one child span per agent tool call (when transcript available)
|
||||||
*/
|
*/
|
||||||
export async function uploadToBraintrust(
|
export async function uploadToBraintrust(
|
||||||
@@ -50,20 +155,11 @@ export async function uploadToBraintrust(
|
|||||||
for (const r of results) {
|
for (const r of results) {
|
||||||
const transcript = opts.transcripts.get(r.scenario);
|
const transcript = opts.transcripts.get(r.scenario);
|
||||||
|
|
||||||
// Build per-test scores
|
|
||||||
const perTestScores: Record<string, number> = {};
|
|
||||||
if (r.individualTests) {
|
|
||||||
for (const [testName, didPass] of Object.entries(r.individualTests)) {
|
|
||||||
perTestScores[toScoreKey(testName)] = didPass ? 1 : 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const testPassRate = r.testsTotal > 0 ? r.testsPassed / r.testsTotal : 0;
|
|
||||||
|
|
||||||
const scores: Record<string, number> = {
|
const scores: Record<string, number> = {
|
||||||
pass: r.status === "passed" ? 1 : 0,
|
skill_usage: r.scores?.skillUsage ?? 0,
|
||||||
test_pass_rate: testPassRate,
|
reference_files_usage: r.scores?.referenceFilesUsage ?? 0,
|
||||||
...perTestScores,
|
assertions_passed: r.scores?.assertionsPassed ?? 0,
|
||||||
|
final_result: r.scores?.finalResult ?? 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
const input = {
|
const input = {
|
||||||
@@ -88,38 +184,52 @@ export async function uploadToBraintrust(
|
|||||||
agent: r.agent,
|
agent: r.agent,
|
||||||
model: r.model,
|
model: r.model,
|
||||||
skillEnabled: r.skillEnabled,
|
skillEnabled: r.skillEnabled,
|
||||||
duration: r.duration,
|
|
||||||
testsPassed: r.testsPassed,
|
testsPassed: r.testsPassed,
|
||||||
testsTotal: r.testsTotal,
|
testsTotal: r.testsTotal,
|
||||||
toolCallCount: r.toolCallCount ?? 0,
|
toolCallCount: r.toolCallCount ?? 0,
|
||||||
filesModified: r.filesModified,
|
contextWindowUsed:
|
||||||
|
(r.totalInputTokens ?? 0) +
|
||||||
|
(r.totalCacheReadTokens ?? 0) +
|
||||||
|
(r.totalCacheCreationTokens ?? 0),
|
||||||
|
totalOutputTokens: r.totalOutputTokens,
|
||||||
|
modelUsage: r.modelUsage,
|
||||||
|
toolErrorCount: r.toolErrorCount,
|
||||||
|
permissionDenialCount: r.permissionDenialCount,
|
||||||
|
loadedSkills: r.loadedSkills,
|
||||||
|
referenceFilesRead: r.referenceFilesRead,
|
||||||
...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
|
...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
|
||||||
...(r.error ? { error: r.error } : {}),
|
...(r.error ? { error: r.error } : {}),
|
||||||
};
|
};
|
||||||
|
|
||||||
if (transcript && transcript.toolCalls.length > 0) {
|
const spanOptions = r.startedAt
|
||||||
// Use traced() to create a root span with child spans for tool calls
|
? { name: r.scenario, startTime: r.startedAt / 1000 }
|
||||||
experiment.traced(
|
: { name: r.scenario };
|
||||||
(span) => {
|
|
||||||
span.log({ input, output, expected, scores, metadata });
|
|
||||||
|
|
||||||
for (const tc of transcript.toolCalls) {
|
if (transcript && transcript.toolCalls.length > 0) {
|
||||||
span.traced(
|
experiment.traced((span) => {
|
||||||
(childSpan) => {
|
span.log({ input, output, expected, scores, metadata });
|
||||||
childSpan.log({
|
|
||||||
input: { tool: tc.tool, args: tc.input },
|
for (const tc of transcript.toolCalls) {
|
||||||
output: { preview: tc.outputPreview },
|
span.traced(
|
||||||
metadata: { toolUseId: tc.toolUseId },
|
(childSpan) => {
|
||||||
});
|
childSpan.log({
|
||||||
},
|
input: { tool: tc.tool, args: tc.input },
|
||||||
{ name: `tool:${tc.tool}` },
|
output: {
|
||||||
);
|
preview: tc.outputPreview,
|
||||||
}
|
isError: tc.isError,
|
||||||
},
|
...(tc.stderr ? { stderr: tc.stderr } : {}),
|
||||||
{ name: r.scenario },
|
},
|
||||||
);
|
metadata: { toolUseId: tc.toolUseId },
|
||||||
|
});
|
||||||
|
},
|
||||||
|
{ name: `tool:${tc.tool}` },
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}, spanOptions);
|
||||||
} else {
|
} else {
|
||||||
experiment.log({ input, output, expected, scores, metadata });
|
experiment.traced((span) => {
|
||||||
|
span.log({ input, output, expected, scores, metadata });
|
||||||
|
}, spanOptions);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import { execFileSync } from "node:child_process";
|
|
||||||
import {
|
import {
|
||||||
cpSync,
|
cpSync,
|
||||||
existsSync,
|
existsSync,
|
||||||
@@ -6,43 +5,21 @@ import {
|
|||||||
mkdtempSync,
|
mkdtempSync,
|
||||||
readdirSync,
|
readdirSync,
|
||||||
rmSync,
|
rmSync,
|
||||||
|
writeFileSync,
|
||||||
} from "node:fs";
|
} from "node:fs";
|
||||||
import { tmpdir } from "node:os";
|
import { tmpdir } from "node:os";
|
||||||
import { dirname, join, resolve } from "node:path";
|
import { join } from "node:path";
|
||||||
import { fileURLToPath } from "node:url";
|
|
||||||
import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
|
import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
|
||||||
|
|
||||||
const __filename = fileURLToPath(import.meta.url);
|
|
||||||
const __dirname = dirname(__filename);
|
|
||||||
|
|
||||||
/** Resolve the `skills` binary from the evals package node_modules. */
|
|
||||||
function resolveSkillsBin(): string {
|
|
||||||
// __dirname is packages/evals/src/runner/ (or compiled equivalent)
|
|
||||||
// Walk up to packages/evals/ and into node_modules/.bin/skills
|
|
||||||
const bin = resolve(__dirname, "..", "..", "node_modules", ".bin", "skills");
|
|
||||||
if (existsSync(bin)) return bin;
|
|
||||||
throw new Error(`skills binary not found at ${bin}. Run npm install.`);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Walk up from cwd to find the repository root (contains skills/ and packages/). */
|
|
||||||
function findRepoRoot(): string {
|
|
||||||
let dir = process.cwd();
|
|
||||||
for (let i = 0; i < 10; i++) {
|
|
||||||
if (existsSync(join(dir, "skills")) && existsSync(join(dir, "packages"))) {
|
|
||||||
return dir;
|
|
||||||
}
|
|
||||||
const parent = resolve(dir, "..");
|
|
||||||
if (parent === dir) break;
|
|
||||||
dir = parent;
|
|
||||||
}
|
|
||||||
throw new Error("Could not find repository root (skills/ + packages/)");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create an isolated workspace for an eval run.
|
* Create an isolated workspace for an eval run.
|
||||||
*
|
*
|
||||||
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
|
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts/EVAL.tsx)
|
||||||
* 2. Optionally install skills via the `skills` CLI so Claude Code can discover them
|
* 2. Seed with the eval project's supabase/config.toml
|
||||||
|
*
|
||||||
|
* Skills are injected via the --agents flag in agent.ts (not installed into
|
||||||
|
* the workspace here). Combined with --setting-sources project,local, this
|
||||||
|
* prevents host ~/.agents/skills/ from leaking into the eval environment.
|
||||||
*
|
*
|
||||||
* Returns the path to the workspace and a cleanup function.
|
* Returns the path to the workspace and a cleanup function.
|
||||||
*/
|
*/
|
||||||
@@ -50,10 +27,9 @@ export function createWorkspace(opts: {
|
|||||||
evalDir: string;
|
evalDir: string;
|
||||||
skillEnabled: boolean;
|
skillEnabled: boolean;
|
||||||
}): { workspacePath: string; cleanup: () => void } {
|
}): { workspacePath: string; cleanup: () => void } {
|
||||||
const repoRoot = findRepoRoot();
|
|
||||||
const workspacePath = mkdtempSync(join(tmpdir(), "supabase-eval-"));
|
const workspacePath = mkdtempSync(join(tmpdir(), "supabase-eval-"));
|
||||||
|
|
||||||
// Copy eval directory, excluding EVAL.ts (hidden from agent)
|
// Copy eval directory, excluding EVAL.ts/EVAL.tsx (hidden from agent)
|
||||||
const entries = readdirSync(opts.evalDir, { withFileTypes: true });
|
const entries = readdirSync(opts.evalDir, { withFileTypes: true });
|
||||||
for (const entry of entries) {
|
for (const entry of entries) {
|
||||||
if (entry.name === "EVAL.ts" || entry.name === "EVAL.tsx") continue;
|
if (entry.name === "EVAL.ts" || entry.name === "EVAL.tsx") continue;
|
||||||
@@ -62,6 +38,23 @@ export function createWorkspace(opts: {
|
|||||||
cpSync(src, dest, { recursive: true });
|
cpSync(src, dest, { recursive: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add .mcp.json so the agent connects to the local Supabase MCP server
|
||||||
|
writeFileSync(
|
||||||
|
join(workspacePath, ".mcp.json"),
|
||||||
|
JSON.stringify(
|
||||||
|
{
|
||||||
|
mcpServers: {
|
||||||
|
"local-supabase": {
|
||||||
|
type: "http",
|
||||||
|
url: "http://localhost:54321/mcp",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
"\t",
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
// Seed the workspace with the eval project's supabase/config.toml so the
|
// Seed the workspace with the eval project's supabase/config.toml so the
|
||||||
// agent can run `supabase db push` against the shared local instance without
|
// agent can run `supabase db push` against the shared local instance without
|
||||||
// needing to run `supabase init` or `supabase start` first.
|
// needing to run `supabase init` or `supabase start` first.
|
||||||
@@ -72,26 +65,6 @@ export function createWorkspace(opts: {
|
|||||||
cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
|
cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Install skills into the workspace via the `skills` CLI
|
|
||||||
if (opts.skillEnabled) {
|
|
||||||
const skillsDir = join(repoRoot, "skills");
|
|
||||||
if (existsSync(skillsDir)) {
|
|
||||||
const skillsBin = resolveSkillsBin();
|
|
||||||
const args = ["add", skillsDir, "-a", "claude-code", "-y"];
|
|
||||||
|
|
||||||
const skillFilter = process.env.EVAL_SKILL;
|
|
||||||
if (skillFilter) {
|
|
||||||
args.push("--skill", skillFilter);
|
|
||||||
}
|
|
||||||
|
|
||||||
execFileSync(skillsBin, args, {
|
|
||||||
cwd: workspacePath,
|
|
||||||
stdio: "pipe",
|
|
||||||
timeout: 60_000,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
workspacePath,
|
workspacePath,
|
||||||
cleanup: () => {
|
cleanup: () => {
|
||||||
|
|||||||
94
packages/evals/src/runner/scorers.ts
Normal file
94
packages/evals/src/runner/scorers.ts
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
import type { EvalRunResult } from "../types.js";
|
||||||
|
import type { TranscriptSummary } from "./transcript.js";
|
||||||
|
|
||||||
|
export interface ScoreResult {
|
||||||
|
name: string;
|
||||||
|
/** 0.0 – 1.0 */
|
||||||
|
score: number;
|
||||||
|
metadata?: Record<string, unknown>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* skillUsageScorer — 1 if the target skill was in the agent's context, 0 otherwise.
|
||||||
|
*
|
||||||
|
* Detected via the `skills` array in the system init event of the NDJSON transcript.
|
||||||
|
* Combined with `--setting-sources project,local` in agent.ts, this array is clean
|
||||||
|
* (no host skill leakage), so its presence is a reliable signal.
|
||||||
|
*/
|
||||||
|
export function skillUsageScorer(
|
||||||
|
transcript: TranscriptSummary,
|
||||||
|
skillName: string,
|
||||||
|
): ScoreResult {
|
||||||
|
const loaded = transcript.skills.includes(skillName);
|
||||||
|
return {
|
||||||
|
name: "skill_usage",
|
||||||
|
score: loaded ? 1 : 0,
|
||||||
|
metadata: {
|
||||||
|
loadedSkills: transcript.skills,
|
||||||
|
targetSkill: skillName,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* referenceFilesUsageScorer — fraction of expected reference files actually read.
|
||||||
|
*
|
||||||
|
* Detected via Read tool calls whose file_path matches "/.agents/skills/*\/references/".
|
||||||
|
* The expectedReferenceFiles list is declared in each EVAL.ts and should match the
|
||||||
|
* "Skill References Exercised" table in the corresponding scenarios/*.md file.
|
||||||
|
*/
|
||||||
|
export function referenceFilesUsageScorer(
|
||||||
|
transcript: TranscriptSummary,
|
||||||
|
expectedReferenceFiles: string[],
|
||||||
|
): ScoreResult {
|
||||||
|
if (expectedReferenceFiles.length === 0) {
|
||||||
|
return {
|
||||||
|
name: "reference_files_usage",
|
||||||
|
score: 1,
|
||||||
|
metadata: { skipped: true },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
const read = transcript.referenceFilesRead;
|
||||||
|
const hits = expectedReferenceFiles.filter((f) => read.includes(f)).length;
|
||||||
|
return {
|
||||||
|
name: "reference_files_usage",
|
||||||
|
score: hits / expectedReferenceFiles.length,
|
||||||
|
metadata: {
|
||||||
|
expected: expectedReferenceFiles,
|
||||||
|
read,
|
||||||
|
hits,
|
||||||
|
total: expectedReferenceFiles.length,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* assertionsPassedScorer — ratio of vitest assertions passed vs total.
|
||||||
|
*/
|
||||||
|
export function assertionsPassedScorer(result: EvalRunResult): ScoreResult {
|
||||||
|
const score =
|
||||||
|
result.testsTotal > 0 ? result.testsPassed / result.testsTotal : 0;
|
||||||
|
return {
|
||||||
|
name: "assertions_passed",
|
||||||
|
score,
|
||||||
|
metadata: { passed: result.testsPassed, total: result.testsTotal },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * finalResultScorer — binary outcome of a run: 1 when `result.status` is
 * "passed", 0 for any other status.
 *
 * A result counts as "passed" when assertionsPassed >= passThreshold (the
 * threshold is set per scenario in scenarios/*.md); that decision is made
 * before this scorer runs and is only read here. The test counts and the
 * threshold are attached as metadata. This is the value used for Braintrust
 * comparisons.
 */
export function finalResultScorer(result: EvalRunResult): ScoreResult {
  const { status, testsPassed, testsTotal, passThreshold } = result;
  const didPass = status === "passed";

  return {
    name: "final_result",
    score: didPass ? 1 : 0,
    metadata: { testsPassed, testsTotal, passThreshold },
  };
}
|
||||||
@@ -1,3 +1,5 @@
|
|||||||
|
import { basename } from "node:path";
|
||||||
|
|
||||||
export interface TranscriptEvent {
|
export interface TranscriptEvent {
|
||||||
type: string;
|
type: string;
|
||||||
[key: string]: unknown;
|
[key: string]: unknown;
|
||||||
@@ -9,15 +11,45 @@ export interface ToolCallSummary {
|
|||||||
input: Record<string, unknown>;
|
input: Record<string, unknown>;
|
||||||
/** First ~200 chars of output for quick scanning */
|
/** First ~200 chars of output for quick scanning */
|
||||||
outputPreview: string;
|
outputPreview: string;
|
||||||
|
/** Whether the tool call returned an error */
|
||||||
|
isError: boolean;
|
||||||
|
/** stderr output for Bash tool calls */
|
||||||
|
stderr: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Token and cost figures for a single model, as reported per model name in
 * the result event's `modelUsage` map. Fields missing from the raw event are
 * stored as 0 by the transcript parser.
 */
export interface ModelUsage {
  /** Input tokens attributed to this model */
  inputTokens: number;
  /** Output tokens attributed to this model */
  outputTokens: number;
  /** Input tokens read from cache */
  cacheReadInputTokens: number;
  /** Input tokens spent creating cache entries */
  cacheCreationInputTokens: number;
  /** Cost for this model in USD */
  costUSD: number;
}
|
||||||
|
|
||||||
/** Condensed view of one agent run, built by walking the parsed transcript events. */
export interface TranscriptSummary {
  /** `num_turns` from the result event (0 when absent) */
  totalTurns: number;
  /** `duration_ms` from the result event (0 when absent) */
  totalDurationMs: number;
  /** API-only latency (excludes local processing overhead) */
  totalDurationApiMs: number;
  /** `total_cost_usd` from the result event, or null when not reported */
  totalCostUsd: number | null;
  /** Model named in the system init event, or null when not reported */
  model: string | null;
  /** One entry per tool_use block found in assistant messages */
  toolCalls: ToolCallSummary[];
  /** Text of the result event's `result` field ("" when absent) */
  finalOutput: string;
  /** Skills listed in the system init event (loaded into agent context) */
  skills: string[];
  /** Basenames of reference files the agent read via the Read tool */
  referenceFilesRead: string[];
  /** Per-model token usage and cost breakdown */
  modelUsage: Record<string, ModelUsage>;
  /** Aggregate token counts taken from the result event's `usage` field */
  totalInputTokens: number;
  totalOutputTokens: number;
  totalCacheReadTokens: number;
  totalCacheCreationTokens: number;
  /** Count of tool calls that returned is_error === true */
  toolErrorCount: number;
  /** Whether the overall session ended in an error */
  isError: boolean;
  /** Count of permission_denials in the result event */
  permissionDenialCount: number;
}
|
||||||
|
|
||||||
/** Parse a single NDJSON line. Returns null on empty or invalid input. */
|
/** Parse a single NDJSON line. Returns null on empty or invalid input. */
|
||||||
@@ -74,6 +106,13 @@ export function extractFinalOutput(events: TranscriptEvent[]): string {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Return true if a file path points to a skill reference file, i.e. it
 * contains "/.agents/skills/", then a skill directory, then "/references/"
 * somewhere after it.
 *
 * The two markers must appear in that order. Checking them independently
 * would also accept paths such as
 * "/home/references/proj/.agents/skills/foo/SKILL.md" (a "references"
 * directory above the skills root) or "/.agents/skills/references/SKILL.md"
 * (a skill that merely happens to be named "references").
 *
 * @param filePath - Path taken from a Read tool call's `file_path` input.
 * @returns Whether the path lies inside a skill's references directory.
 */
function isReferenceFilePath(filePath: string): boolean {
  const skillsRoot = "/.agents/skills/";
  const rootIdx = filePath.indexOf(skillsRoot);
  if (rootIdx === -1) {
    return false;
  }
  // Start the search at the first character of the skill directory name, so
  // "/references/" can only match after that name and never overlap the
  // trailing slash of the skills root itself.
  return filePath.indexOf("/references/", rootIdx + skillsRoot.length) !== -1;
}
|
||||||
|
|
||||||
/** Walk parsed events to build a transcript summary. */
|
/** Walk parsed events to build a transcript summary. */
|
||||||
export function buildTranscriptSummary(
|
export function buildTranscriptSummary(
|
||||||
events: TranscriptEvent[],
|
events: TranscriptEvent[],
|
||||||
@@ -81,16 +120,30 @@ export function buildTranscriptSummary(
|
|||||||
const toolCalls: ToolCallSummary[] = [];
|
const toolCalls: ToolCallSummary[] = [];
|
||||||
let finalOutput = "";
|
let finalOutput = "";
|
||||||
let totalDurationMs = 0;
|
let totalDurationMs = 0;
|
||||||
|
let totalDurationApiMs = 0;
|
||||||
let totalCostUsd: number | null = null;
|
let totalCostUsd: number | null = null;
|
||||||
let model: string | null = null;
|
let model: string | null = null;
|
||||||
let totalTurns = 0;
|
let totalTurns = 0;
|
||||||
|
let skills: string[] = [];
|
||||||
|
const referenceFilesRead: string[] = [];
|
||||||
|
let modelUsage: Record<string, ModelUsage> = {};
|
||||||
|
let totalInputTokens = 0;
|
||||||
|
let totalOutputTokens = 0;
|
||||||
|
let totalCacheReadTokens = 0;
|
||||||
|
let totalCacheCreationTokens = 0;
|
||||||
|
let toolErrorCount = 0;
|
||||||
|
let isError = false;
|
||||||
|
let permissionDenialCount = 0;
|
||||||
|
|
||||||
for (const event of events) {
|
for (const event of events) {
|
||||||
const e = event as Record<string, unknown>;
|
const e = event as Record<string, unknown>;
|
||||||
|
|
||||||
// System init: extract model
|
// System init: extract model and loaded skills
|
||||||
if (e.type === "system" && e.subtype === "init") {
|
if (e.type === "system" && e.subtype === "init") {
|
||||||
model = typeof e.model === "string" ? e.model : null;
|
model = typeof e.model === "string" ? e.model : null;
|
||||||
|
if (Array.isArray(e.skills)) {
|
||||||
|
skills = e.skills.filter((s): s is string => typeof s === "string");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Assistant messages: extract tool_use blocks
|
// Assistant messages: extract tool_use blocks
|
||||||
@@ -100,12 +153,27 @@ export function buildTranscriptSummary(
|
|||||||
if (Array.isArray(content)) {
|
if (Array.isArray(content)) {
|
||||||
for (const block of content) {
|
for (const block of content) {
|
||||||
if (block.type === "tool_use") {
|
if (block.type === "tool_use") {
|
||||||
toolCalls.push({
|
const toolCall: ToolCallSummary = {
|
||||||
tool: block.name ?? "unknown",
|
tool: block.name ?? "unknown",
|
||||||
toolUseId: block.id ?? "",
|
toolUseId: block.id ?? "",
|
||||||
input: block.input ?? {},
|
input: block.input ?? {},
|
||||||
outputPreview: "",
|
outputPreview: "",
|
||||||
});
|
isError: false,
|
||||||
|
stderr: "",
|
||||||
|
};
|
||||||
|
toolCalls.push(toolCall);
|
||||||
|
|
||||||
|
// Track reference file reads
|
||||||
|
if (
|
||||||
|
block.name === "Read" &&
|
||||||
|
typeof block.input?.file_path === "string" &&
|
||||||
|
isReferenceFilePath(block.input.file_path)
|
||||||
|
) {
|
||||||
|
const base = basename(block.input.file_path);
|
||||||
|
if (!referenceFilesRead.includes(base)) {
|
||||||
|
referenceFilesRead.push(base);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -127,28 +195,107 @@ export function buildTranscriptSummary(
|
|||||||
? block.content
|
? block.content
|
||||||
: JSON.stringify(block.content);
|
: JSON.stringify(block.content);
|
||||||
matching.outputPreview = text.slice(0, 200);
|
matching.outputPreview = text.slice(0, 200);
|
||||||
|
|
||||||
|
// Capture error state from tool result
|
||||||
|
if (block.is_error === true) {
|
||||||
|
matching.isError = true;
|
||||||
|
toolErrorCount++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Capture stderr from tool_use_result (Bash tool emits this at the user event level)
|
||||||
|
const toolUseResult = e.tool_use_result as
|
||||||
|
| Record<string, unknown>
|
||||||
|
| undefined;
|
||||||
|
if (toolUseResult && typeof toolUseResult.stderr === "string") {
|
||||||
|
// Match to the most recent Bash tool call without stderr set
|
||||||
|
const lastBash = [...toolCalls]
|
||||||
|
.reverse()
|
||||||
|
.find((tc) => tc.tool === "Bash" && !tc.stderr);
|
||||||
|
if (lastBash) {
|
||||||
|
lastBash.stderr = toolUseResult.stderr;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Result event: final output, cost, duration, turns
|
// Result event: final output, cost, duration, turns, token usage
|
||||||
if (e.type === "result") {
|
if (e.type === "result") {
|
||||||
finalOutput = typeof e.result === "string" ? e.result : "";
|
finalOutput = typeof e.result === "string" ? e.result : "";
|
||||||
totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0;
|
totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0;
|
||||||
|
totalDurationApiMs =
|
||||||
|
typeof e.duration_api_ms === "number" ? e.duration_api_ms : 0;
|
||||||
totalCostUsd =
|
totalCostUsd =
|
||||||
typeof e.total_cost_usd === "number" ? e.total_cost_usd : null;
|
typeof e.total_cost_usd === "number" ? e.total_cost_usd : null;
|
||||||
totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0;
|
totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0;
|
||||||
|
isError = e.is_error === true;
|
||||||
|
permissionDenialCount = Array.isArray(e.permission_denials)
|
||||||
|
? e.permission_denials.length
|
||||||
|
: 0;
|
||||||
|
|
||||||
|
// Aggregate token usage from the result event's usage field
|
||||||
|
const usage = e.usage as Record<string, unknown> | undefined;
|
||||||
|
if (usage) {
|
||||||
|
totalInputTokens =
|
||||||
|
typeof usage.input_tokens === "number" ? usage.input_tokens : 0;
|
||||||
|
totalOutputTokens =
|
||||||
|
typeof usage.output_tokens === "number" ? usage.output_tokens : 0;
|
||||||
|
totalCacheReadTokens =
|
||||||
|
typeof usage.cache_read_input_tokens === "number"
|
||||||
|
? usage.cache_read_input_tokens
|
||||||
|
: 0;
|
||||||
|
totalCacheCreationTokens =
|
||||||
|
typeof usage.cache_creation_input_tokens === "number"
|
||||||
|
? usage.cache_creation_input_tokens
|
||||||
|
: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-model usage breakdown (modelUsage keyed by model name)
|
||||||
|
const rawModelUsage = e.modelUsage as
|
||||||
|
| Record<string, Record<string, unknown>>
|
||||||
|
| undefined;
|
||||||
|
if (rawModelUsage) {
|
||||||
|
modelUsage = {};
|
||||||
|
for (const [modelName, mu] of Object.entries(rawModelUsage)) {
|
||||||
|
modelUsage[modelName] = {
|
||||||
|
inputTokens:
|
||||||
|
typeof mu.inputTokens === "number" ? mu.inputTokens : 0,
|
||||||
|
outputTokens:
|
||||||
|
typeof mu.outputTokens === "number" ? mu.outputTokens : 0,
|
||||||
|
cacheReadInputTokens:
|
||||||
|
typeof mu.cacheReadInputTokens === "number"
|
||||||
|
? mu.cacheReadInputTokens
|
||||||
|
: 0,
|
||||||
|
cacheCreationInputTokens:
|
||||||
|
typeof mu.cacheCreationInputTokens === "number"
|
||||||
|
? mu.cacheCreationInputTokens
|
||||||
|
: 0,
|
||||||
|
costUSD: typeof mu.costUSD === "number" ? mu.costUSD : 0,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
totalTurns,
|
totalTurns,
|
||||||
totalDurationMs,
|
totalDurationMs,
|
||||||
|
totalDurationApiMs,
|
||||||
totalCostUsd,
|
totalCostUsd,
|
||||||
model,
|
model,
|
||||||
toolCalls,
|
toolCalls,
|
||||||
finalOutput,
|
finalOutput,
|
||||||
|
skills,
|
||||||
|
referenceFilesRead,
|
||||||
|
modelUsage,
|
||||||
|
totalInputTokens,
|
||||||
|
totalOutputTokens,
|
||||||
|
totalCacheReadTokens,
|
||||||
|
totalCacheCreationTokens,
|
||||||
|
toolErrorCount,
|
||||||
|
isError,
|
||||||
|
permissionDenialCount,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,4 +44,39 @@ export interface EvalRunResult {
|
|||||||
prompt?: string;
|
prompt?: string;
|
||||||
/** Per-test pass/fail results from vitest */
|
/** Per-test pass/fail results from vitest */
|
||||||
individualTests?: Record<string, boolean>;
|
individualTests?: Record<string, boolean>;
|
||||||
|
/** Epoch ms when the agent run started (for Braintrust span timing) */
|
||||||
|
startedAt?: number;
|
||||||
|
/** API-only latency in ms (excludes local processing overhead) */
|
||||||
|
durationApiMs?: number;
|
||||||
|
/** Aggregate token counts from the result event */
|
||||||
|
totalInputTokens?: number;
|
||||||
|
totalOutputTokens?: number;
|
||||||
|
totalCacheReadTokens?: number;
|
||||||
|
totalCacheCreationTokens?: number;
|
||||||
|
/** Per-model token usage and cost breakdown */
|
||||||
|
modelUsage?: Record<
|
||||||
|
string,
|
||||||
|
{
|
||||||
|
inputTokens: number;
|
||||||
|
outputTokens: number;
|
||||||
|
cacheReadInputTokens: number;
|
||||||
|
cacheCreationInputTokens: number;
|
||||||
|
costUSD: number;
|
||||||
|
}
|
||||||
|
>;
|
||||||
|
/** Count of tool calls that returned is_error === true */
|
||||||
|
toolErrorCount?: number;
|
||||||
|
/** Count of permission_denials in the result event */
|
||||||
|
permissionDenialCount?: number;
|
||||||
|
/** Skills that were in the agent's context (from system init event) */
|
||||||
|
loadedSkills?: string[];
|
||||||
|
/** Basenames of skill reference files the agent read */
|
||||||
|
referenceFilesRead?: string[];
|
||||||
|
/** Computed scorer results */
|
||||||
|
scores?: {
|
||||||
|
skillUsage: number;
|
||||||
|
referenceFilesUsage: number;
|
||||||
|
assertionsPassed: number;
|
||||||
|
finalResult: number;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user