mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
feat(evals): enrich Braintrust upload with granular scores and tracing
Add per-test pass/fail parsing from vitest verbose output, thread prompt content and individual test results through the runner, and rewrite uploadToBraintrust with experiment naming (model-variant-timestamp), granular scores (pass, test_pass_rate, per-test), rich metadata, and tool-call tracing via experiment.traced(). Also document --force flag for cached mise tasks and add Braintrust env vars to AGENTS.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -45,10 +45,18 @@ This prevents the agent from "teaching to the test."
|
|||||||
|
|
||||||
## Running Evals
|
## Running Evals
|
||||||
|
|
||||||
|
Eval tasks in `mise.toml` have `sources` defined, so mise skips them when
|
||||||
|
source files haven't changed. Use `--force` to bypass caching when you need
|
||||||
|
to re-run evals regardless (e.g., after changing environment variables or
|
||||||
|
re-running the same scenario):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Run all scenarios with skills (default)
|
# Run all scenarios with skills (default)
|
||||||
mise run eval
|
mise run eval
|
||||||
|
|
||||||
|
# Force re-run (bypass source caching)
|
||||||
|
mise run --force eval
|
||||||
|
|
||||||
# Run a specific scenario
|
# Run a specific scenario
|
||||||
EVAL_SCENARIO=auth-rls-new-project mise run eval
|
EVAL_SCENARIO=auth-rls-new-project mise run eval
|
||||||
|
|
||||||
@@ -63,9 +71,12 @@ EVAL_SKILL=supabase mise run eval
|
|||||||
|
|
||||||
# Upload results to Braintrust
|
# Upload results to Braintrust
|
||||||
mise run eval:upload
|
mise run eval:upload
|
||||||
|
|
||||||
|
# Force upload (bypass cache)
|
||||||
|
mise run --force eval:upload
|
||||||
```
|
```
|
||||||
|
|
||||||
Or directly:
|
Or directly (no caching, always runs):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd packages/evals
|
cd packages/evals
|
||||||
@@ -99,12 +110,15 @@ Compare the results to measure how much skills improve agent output.
|
|||||||
## Environment
|
## Environment
|
||||||
|
|
||||||
```
|
```
|
||||||
ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication
|
ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication
|
||||||
EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929)
|
EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929)
|
||||||
EVAL_SCENARIO=... # Optional: run single scenario
|
EVAL_SCENARIO=... # Optional: run single scenario
|
||||||
EVAL_SKILL=... # Optional: install only this skill (e.g., "supabase")
|
EVAL_SKILL=... # Optional: install only this skill (e.g., "supabase")
|
||||||
EVAL_BASELINE=true # Optional: run without skills (baseline mode)
|
EVAL_BASELINE=true # Optional: run without skills (baseline mode)
|
||||||
BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust
|
BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust
|
||||||
|
BRAINTRUST_API_KEY=... # Required when BRAINTRUST_UPLOAD=true
|
||||||
|
BRAINTRUST_PROJECT_ID=... # Required when BRAINTRUST_UPLOAD=true
|
||||||
|
BRAINTRUST_BASE_EXPERIMENT=... # Optional: compare against a named experiment
|
||||||
```
|
```
|
||||||
|
|
||||||
## Key Files
|
## Key Files
|
||||||
|
|||||||
@@ -7,7 +7,10 @@ import { preflight } from "./runner/preflight.js";
|
|||||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||||
import { createWorkspace } from "./runner/scaffold.js";
|
import { createWorkspace } from "./runner/scaffold.js";
|
||||||
import { runTests } from "./runner/test.js";
|
import { runTests } from "./runner/test.js";
|
||||||
import { buildTranscriptSummary } from "./runner/transcript.js";
|
import {
|
||||||
|
buildTranscriptSummary,
|
||||||
|
type TranscriptSummary,
|
||||||
|
} from "./runner/transcript.js";
|
||||||
import type { EvalRunResult, EvalScenario } from "./types.js";
|
import type { EvalRunResult, EvalScenario } from "./types.js";
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -64,7 +67,7 @@ function discoverScenarios(): EvalScenario[] {
|
|||||||
async function runEval(
|
async function runEval(
|
||||||
scenario: EvalScenario,
|
scenario: EvalScenario,
|
||||||
skillEnabled: boolean,
|
skillEnabled: boolean,
|
||||||
): Promise<EvalRunResult> {
|
): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> {
|
||||||
const evalsDir = findEvalsDir();
|
const evalsDir = findEvalsDir();
|
||||||
const evalDir = join(evalsDir, scenario.id);
|
const evalDir = join(evalsDir, scenario.id);
|
||||||
const variant = skillEnabled ? "with-skill" : "baseline";
|
const variant = skillEnabled ? "with-skill" : "baseline";
|
||||||
@@ -129,6 +132,8 @@ async function runEval(
|
|||||||
filesModified,
|
filesModified,
|
||||||
toolCallCount: summary.toolCalls.length,
|
toolCallCount: summary.toolCalls.length,
|
||||||
costUsd: summary.totalCostUsd ?? undefined,
|
costUsd: summary.totalCostUsd ?? undefined,
|
||||||
|
prompt,
|
||||||
|
individualTests: testResult.individualTests,
|
||||||
};
|
};
|
||||||
|
|
||||||
// 7. Persist results
|
// 7. Persist results
|
||||||
@@ -142,22 +147,24 @@ async function runEval(
|
|||||||
transcriptSummary: summary,
|
transcriptSummary: summary,
|
||||||
});
|
});
|
||||||
|
|
||||||
return result;
|
return { result, transcript: summary };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const err = error as Error;
|
const err = error as Error;
|
||||||
return {
|
return {
|
||||||
scenario: scenario.id,
|
result: {
|
||||||
agent: "claude-code",
|
scenario: scenario.id,
|
||||||
model,
|
agent: "claude-code",
|
||||||
skillEnabled,
|
model,
|
||||||
status: "error",
|
skillEnabled,
|
||||||
duration: 0,
|
status: "error",
|
||||||
testOutput: "",
|
duration: 0,
|
||||||
agentOutput: "",
|
testOutput: "",
|
||||||
testsPassed: 0,
|
agentOutput: "",
|
||||||
testsTotal: 0,
|
testsPassed: 0,
|
||||||
filesModified: [],
|
testsTotal: 0,
|
||||||
error: err.message,
|
filesModified: [],
|
||||||
|
error: err.message,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
} finally {
|
} finally {
|
||||||
cleanup();
|
cleanup();
|
||||||
@@ -188,10 +195,14 @@ async function main() {
|
|||||||
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
|
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
|
||||||
|
|
||||||
const results: EvalRunResult[] = [];
|
const results: EvalRunResult[] = [];
|
||||||
|
const transcripts = new Map<string, TranscriptSummary>();
|
||||||
|
|
||||||
for (const scenario of scenarios) {
|
for (const scenario of scenarios) {
|
||||||
const result = await runEval(scenario, skillEnabled);
|
const { result, transcript } = await runEval(scenario, skillEnabled);
|
||||||
results.push(result);
|
results.push(result);
|
||||||
|
if (transcript) {
|
||||||
|
transcripts.set(result.scenario, transcript);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use the results dir from the first result (all share the same timestamp)
|
// Use the results dir from the first result (all share the same timestamp)
|
||||||
@@ -200,7 +211,12 @@ async function main() {
|
|||||||
|
|
||||||
if (process.env.BRAINTRUST_UPLOAD === "true") {
|
if (process.env.BRAINTRUST_UPLOAD === "true") {
|
||||||
console.log("\nUploading to Braintrust...");
|
console.log("\nUploading to Braintrust...");
|
||||||
await uploadToBraintrust(results);
|
await uploadToBraintrust(results, {
|
||||||
|
model,
|
||||||
|
skillEnabled,
|
||||||
|
runTimestamp,
|
||||||
|
transcripts,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,50 +1,126 @@
|
|||||||
import assert from "node:assert";
|
import assert from "node:assert";
|
||||||
import { init } from "braintrust";
|
import { init } from "braintrust";
|
||||||
import type { EvalRunResult } from "../types.js";
|
import type { EvalRunResult } from "../types.js";
|
||||||
|
import type { TranscriptSummary } from "./transcript.js";
|
||||||
|
|
||||||
|
/**
 * Build the score key for an individual test.
 *
 * The name is lower-cased, every run of characters outside `a-z0-9` is
 * collapsed into a single underscore, one leading and one trailing underscore
 * are dropped, and the result is prefixed with `test_`.
 *
 * @param name - Human-readable test name.
 * @returns The derived key, e.g. `"creates a table"` becomes `"test_creates_a_table"`.
 */
function toScoreKey(name: string): string {
  const lowered = name.toLowerCase();
  // Runs are already collapsed here, so at most one underscore can sit at each end.
  const underscored = lowered.replace(/[^a-z0-9]+/g, "_");
  const slug = underscored.replace(/^_|_$/g, "");
  return "test_" + slug;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Upload eval results to Braintrust as an experiment.
|
* Upload eval results to Braintrust as an experiment.
|
||||||
*
|
*
|
||||||
* Each EvalRunResult becomes a row in the experiment with:
|
* Each EvalRunResult becomes a row in the experiment with:
|
||||||
* - input: scenario name + config
|
* - input: scenario ID, prompt content, skillEnabled flag
|
||||||
* - output: agent output summary
|
* - output: status, agent output, files modified, test output
|
||||||
* - scores: pass (0 or 1)
|
* - expected: total tests, pass threshold
|
||||||
* - metadata: model, skill toggle, duration, files modified
|
* - scores: pass (0|1), test_pass_rate (0-1), per-test scores
|
||||||
|
* - metadata: model, duration, cost, tool call count, files modified
|
||||||
|
* - spans: one child span per agent tool call (when transcript available)
|
||||||
*/
|
*/
|
||||||
export async function uploadToBraintrust(
|
export async function uploadToBraintrust(
|
||||||
results: EvalRunResult[],
|
results: EvalRunResult[],
|
||||||
|
opts: {
|
||||||
|
model: string;
|
||||||
|
skillEnabled: boolean;
|
||||||
|
runTimestamp: string;
|
||||||
|
transcripts: Map<string, TranscriptSummary>;
|
||||||
|
},
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||||
assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
|
assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
|
||||||
|
|
||||||
|
const variant = opts.skillEnabled ? "skill" : "baseline";
|
||||||
const experiment = await init({
|
const experiment = await init({
|
||||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||||
|
experiment: `${opts.model}-${variant}-${opts.runTimestamp}`,
|
||||||
|
baseExperiment: process.env.BRAINTRUST_BASE_EXPERIMENT ?? undefined,
|
||||||
|
metadata: {
|
||||||
|
model: opts.model,
|
||||||
|
skillEnabled: opts.skillEnabled,
|
||||||
|
runTimestamp: opts.runTimestamp,
|
||||||
|
scenarioCount: results.length,
|
||||||
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
for (const r of results) {
|
for (const r of results) {
|
||||||
experiment.log({
|
const transcript = opts.transcripts.get(r.scenario);
|
||||||
input: {
|
|
||||||
scenario: r.scenario,
|
// Build per-test scores
|
||||||
skillEnabled: r.skillEnabled,
|
const perTestScores: Record<string, number> = {};
|
||||||
},
|
if (r.individualTests) {
|
||||||
output: {
|
for (const [testName, didPass] of Object.entries(r.individualTests)) {
|
||||||
status: r.status,
|
perTestScores[toScoreKey(testName)] = didPass ? 1 : 0;
|
||||||
filesModified: r.filesModified,
|
}
|
||||||
testOutput: r.testOutput,
|
}
|
||||||
},
|
|
||||||
scores: {
|
const testPassRate = r.testsTotal > 0 ? r.testsPassed / r.testsTotal : 0;
|
||||||
pass: r.status === "passed" ? 1 : 0,
|
|
||||||
},
|
const scores: Record<string, number> = {
|
||||||
metadata: {
|
pass: r.status === "passed" ? 1 : 0,
|
||||||
agent: r.agent,
|
test_pass_rate: testPassRate,
|
||||||
model: r.model,
|
...perTestScores,
|
||||||
skillEnabled: r.skillEnabled,
|
};
|
||||||
duration: r.duration,
|
|
||||||
testsPassed: r.testsPassed,
|
const input = {
|
||||||
testsTotal: r.testsTotal,
|
scenario: r.scenario,
|
||||||
...(r.error ? { error: r.error } : {}),
|
prompt: r.prompt ?? "",
|
||||||
},
|
skillEnabled: r.skillEnabled,
|
||||||
});
|
};
|
||||||
|
|
||||||
|
const output = {
|
||||||
|
status: r.status,
|
||||||
|
agentOutput: r.agentOutput,
|
||||||
|
filesModified: r.filesModified,
|
||||||
|
testOutput: r.testOutput,
|
||||||
|
};
|
||||||
|
|
||||||
|
const expected = {
|
||||||
|
testsTotal: r.testsTotal,
|
||||||
|
passThreshold: 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
const metadata: Record<string, unknown> = {
|
||||||
|
agent: r.agent,
|
||||||
|
model: r.model,
|
||||||
|
skillEnabled: r.skillEnabled,
|
||||||
|
duration: r.duration,
|
||||||
|
testsPassed: r.testsPassed,
|
||||||
|
testsTotal: r.testsTotal,
|
||||||
|
toolCallCount: r.toolCallCount ?? 0,
|
||||||
|
filesModified: r.filesModified,
|
||||||
|
...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
|
||||||
|
...(r.error ? { error: r.error } : {}),
|
||||||
|
};
|
||||||
|
|
||||||
|
if (transcript && transcript.toolCalls.length > 0) {
|
||||||
|
// Use traced() to create a root span with child spans for tool calls
|
||||||
|
experiment.traced(
|
||||||
|
(span) => {
|
||||||
|
span.log({ input, output, expected, scores, metadata });
|
||||||
|
|
||||||
|
for (const tc of transcript.toolCalls) {
|
||||||
|
span.traced(
|
||||||
|
(childSpan) => {
|
||||||
|
childSpan.log({
|
||||||
|
input: { tool: tc.tool, args: tc.input },
|
||||||
|
output: { preview: tc.outputPreview },
|
||||||
|
metadata: { toolUseId: tc.toolUseId },
|
||||||
|
});
|
||||||
|
},
|
||||||
|
{ name: `tool:${tc.tool}` },
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ name: r.scenario },
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
experiment.log({ input, output, expected, scores, metadata });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const summary = await experiment.summarize();
|
const summary = await experiment.summarize();
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import { mkdirSync, readdirSync, statSync, writeFileSync } from "node:fs";
|
import { readdirSync, statSync } from "node:fs";
|
||||||
import { join, resolve } from "node:path";
|
import { join } from "node:path";
|
||||||
import type { EvalRunResult } from "../types.js";
|
import type { EvalRunResult } from "../types.js";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ export interface TestResult {
|
|||||||
passedCount: number;
|
passedCount: number;
|
||||||
/** Total number of tests */
|
/** Total number of tests */
|
||||||
totalCount: number;
|
totalCount: number;
|
||||||
|
/** Per-test pass/fail extracted from vitest verbose output */
|
||||||
|
individualTests: Record<string, boolean>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -91,6 +93,24 @@ export async function runTests(opts: {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract per-test pass/fail from vitest verbose output.
 *
 * Vitest verbose format:
 *   ✓ EVAL.ts > test name here 0ms  → passed
 *   × EVAL.ts > test name here 2ms  → failed
 *
 * Only entries for `EVAL.ts` / `EVAL.tsx` that end with a `<n>ms` duration are
 * recognised; everything else in the output is ignored. If the same test name
 * appears more than once, the last occurrence wins.
 *
 * @param output - Raw text produced by the vitest run.
 * @returns A map from test name (the text after `EVAL.ts > `, without the
 *   duration) to `true` for a `✓` entry and `false` for a `×` entry.
 */
function parseIndividualTests(output: string): Record<string, boolean> {
  const results: Record<string, boolean> = {};
  // Group 1 is the pass/fail marker, group 2 the test name. The name is matched
  // greedily (within one line, since `.` never crosses a line break) so that
  // only the LAST "<n>ms" token is treated as the duration; a title such as
  // "times out after 500ms" is therefore kept intact instead of being cut short.
  const re = /([✓×])\s+EVAL\.tsx?\s+>\s+(.+)\s+\d+ms/g;
  for (const match of output.matchAll(re)) {
    const [, marker, rawName] = match;
    // Both groups are mandatory in the pattern; the guard only narrows the types.
    if (marker === undefined || rawName === undefined) continue;
    results[rawName.trim()] = marker === "✓";
  }
  return results;
}
|
||||||
|
|
||||||
function parseTestOutput(output: string): TestResult {
|
function parseTestOutput(output: string): TestResult {
|
||||||
// Parse vitest output for pass/fail counts
|
// Parse vitest output for pass/fail counts
|
||||||
// Vitest formats:
|
// Vitest formats:
|
||||||
@@ -114,6 +134,7 @@ function parseTestOutput(output: string): TestResult {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const passed = totalCount > 0 && passedCount === totalCount;
|
const passed = totalCount > 0 && passedCount === totalCount;
|
||||||
|
const individualTests = parseIndividualTests(output);
|
||||||
|
|
||||||
return { passed, output, passedCount, totalCount };
|
return { passed, output, passedCount, totalCount, individualTests };
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,4 +38,8 @@ export interface EvalRunResult {
|
|||||||
toolCallCount?: number;
|
toolCallCount?: number;
|
||||||
/** Total cost in USD (from stream-json result event) */
|
/** Total cost in USD (from stream-json result event) */
|
||||||
costUsd?: number;
|
costUsd?: number;
|
||||||
|
/** The PROMPT.md content sent to the agent */
|
||||||
|
prompt?: string;
|
||||||
|
/** Per-test pass/fail results from vitest */
|
||||||
|
individualTests?: Record<string, boolean>;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user