diff --git a/packages/evals/AGENTS.md b/packages/evals/AGENTS.md index 622828c..bc8c4f9 100644 --- a/packages/evals/AGENTS.md +++ b/packages/evals/AGENTS.md @@ -45,10 +45,18 @@ This prevents the agent from "teaching to the test." ## Running Evals +Eval tasks in `mise.toml` have `sources` defined, so mise skips them when +source files haven't changed. Use `--force` to bypass caching when you need +to re-run evals regardless (e.g., after changing environment variables or +re-running the same scenario): + ```bash # Run all scenarios with skills (default) mise run eval +# Force re-run (bypass source caching) +mise run --force eval + # Run a specific scenario EVAL_SCENARIO=auth-rls-new-project mise run eval @@ -63,9 +71,12 @@ EVAL_SKILL=supabase mise run eval # Upload results to Braintrust mise run eval:upload + +# Force upload (bypass cache) +mise run --force eval:upload ``` -Or directly: +Or directly (no caching, always runs): ```bash cd packages/evals @@ -99,12 +110,15 @@ Compare the results to measure how much skills improve agent output. ## Environment ``` -ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication -EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929) -EVAL_SCENARIO=... # Optional: run single scenario -EVAL_SKILL=... # Optional: install only this skill (e.g., "supabase") -EVAL_BASELINE=true # Optional: run without skills (baseline mode) -BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust +ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication +EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929) +EVAL_SCENARIO=... # Optional: run single scenario +EVAL_SKILL=... # Optional: install only this skill (e.g., "supabase") +EVAL_BASELINE=true # Optional: run without skills (baseline mode) +BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust +BRAINTRUST_API_KEY=... # Required when BRAINTRUST_UPLOAD=true +BRAINTRUST_PROJECT_ID=... 
# Required when BRAINTRUST_UPLOAD=true +BRAINTRUST_BASE_EXPERIMENT=... # Optional: compare against a named experiment ``` ## Key Files diff --git a/packages/evals/src/runner.ts b/packages/evals/src/runner.ts index 9bfd53c..4229cb3 100644 --- a/packages/evals/src/runner.ts +++ b/packages/evals/src/runner.ts @@ -7,7 +7,10 @@ import { preflight } from "./runner/preflight.js"; import { listModifiedFiles, printSummary } from "./runner/results.js"; import { createWorkspace } from "./runner/scaffold.js"; import { runTests } from "./runner/test.js"; -import { buildTranscriptSummary } from "./runner/transcript.js"; +import { + buildTranscriptSummary, + type TranscriptSummary, +} from "./runner/transcript.js"; import type { EvalRunResult, EvalScenario } from "./types.js"; // --------------------------------------------------------------------------- @@ -64,7 +67,7 @@ function discoverScenarios(): EvalScenario[] { async function runEval( scenario: EvalScenario, skillEnabled: boolean, -): Promise { +): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> { const evalsDir = findEvalsDir(); const evalDir = join(evalsDir, scenario.id); const variant = skillEnabled ? "with-skill" : "baseline"; @@ -129,6 +132,8 @@ async function runEval( filesModified, toolCallCount: summary.toolCalls.length, costUsd: summary.totalCostUsd ?? undefined, + prompt, + individualTests: testResult.individualTests, }; // 7. 
Persist results @@ -142,22 +147,24 @@ async function runEval( transcriptSummary: summary, }); - return result; + return { result, transcript: summary }; } catch (error) { const err = error as Error; return { - scenario: scenario.id, - agent: "claude-code", - model, - skillEnabled, - status: "error", - duration: 0, - testOutput: "", - agentOutput: "", - testsPassed: 0, - testsTotal: 0, - filesModified: [], - error: err.message, + result: { + scenario: scenario.id, + agent: "claude-code", + model, + skillEnabled, + status: "error", + duration: 0, + testOutput: "", + agentOutput: "", + testsPassed: 0, + testsTotal: 0, + filesModified: [], + error: err.message, + }, }; } finally { cleanup(); @@ -188,10 +195,14 @@ async function main() { console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`); const results: EvalRunResult[] = []; + const transcripts = new Map(); for (const scenario of scenarios) { - const result = await runEval(scenario, skillEnabled); + const { result, transcript } = await runEval(scenario, skillEnabled); results.push(result); + if (transcript) { + transcripts.set(result.scenario, transcript); + } } // Use the results dir from the first result (all share the same timestamp) @@ -200,7 +211,12 @@ async function main() { if (process.env.BRAINTRUST_UPLOAD === "true") { console.log("\nUploading to Braintrust..."); - await uploadToBraintrust(results); + await uploadToBraintrust(results, { + model, + skillEnabled, + runTimestamp, + transcripts, + }); } } diff --git a/packages/evals/src/runner/braintrust.ts b/packages/evals/src/runner/braintrust.ts index 72dddb2..c1ab27a 100644 --- a/packages/evals/src/runner/braintrust.ts +++ b/packages/evals/src/runner/braintrust.ts @@ -1,50 +1,126 @@ import assert from "node:assert"; import { init } from "braintrust"; import type { EvalRunResult } from "../types.js"; +import type { TranscriptSummary } from "./transcript.js"; + +/** Convert a test name to a snake_case score key. 
/**
 * Build the Braintrust score key for a single test.
 *
 * The name is lower-cased, every run of characters outside `a-z0-9` becomes
 * a single underscore separator (none leading or trailing), and the result
 * is prefixed with `test_`.
 */
function toScoreKey(name: string): string {
  const words = name
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((word) => word.length > 0);
  return "test_" + words.join("_");
}
1 : 0, - }, - metadata: { - agent: r.agent, - model: r.model, - skillEnabled: r.skillEnabled, - duration: r.duration, - testsPassed: r.testsPassed, - testsTotal: r.testsTotal, - ...(r.error ? { error: r.error } : {}), - }, - }); + const transcript = opts.transcripts.get(r.scenario); + + // Build per-test scores + const perTestScores: Record = {}; + if (r.individualTests) { + for (const [testName, didPass] of Object.entries(r.individualTests)) { + perTestScores[toScoreKey(testName)] = didPass ? 1 : 0; + } + } + + const testPassRate = r.testsTotal > 0 ? r.testsPassed / r.testsTotal : 0; + + const scores: Record = { + pass: r.status === "passed" ? 1 : 0, + test_pass_rate: testPassRate, + ...perTestScores, + }; + + const input = { + scenario: r.scenario, + prompt: r.prompt ?? "", + skillEnabled: r.skillEnabled, + }; + + const output = { + status: r.status, + agentOutput: r.agentOutput, + filesModified: r.filesModified, + testOutput: r.testOutput, + }; + + const expected = { + testsTotal: r.testsTotal, + passThreshold: 1.0, + }; + + const metadata: Record = { + agent: r.agent, + model: r.model, + skillEnabled: r.skillEnabled, + duration: r.duration, + testsPassed: r.testsPassed, + testsTotal: r.testsTotal, + toolCallCount: r.toolCallCount ?? 0, + filesModified: r.filesModified, + ...(r.costUsd != null ? { costUsd: r.costUsd } : {}), + ...(r.error ? 
{ error: r.error } : {}), + }; + + if (transcript && transcript.toolCalls.length > 0) { + // Use traced() to create a root span with child spans for tool calls + experiment.traced( + (span) => { + span.log({ input, output, expected, scores, metadata }); + + for (const tc of transcript.toolCalls) { + span.traced( + (childSpan) => { + childSpan.log({ + input: { tool: tc.tool, args: tc.input }, + output: { preview: tc.outputPreview }, + metadata: { toolUseId: tc.toolUseId }, + }); + }, + { name: `tool:${tc.tool}` }, + ); + } + }, + { name: r.scenario }, + ); + } else { + experiment.log({ input, output, expected, scores, metadata }); + } } const summary = await experiment.summarize(); diff --git a/packages/evals/src/runner/results.ts b/packages/evals/src/runner/results.ts index 901ed3a..2f6df7b 100644 --- a/packages/evals/src/runner/results.ts +++ b/packages/evals/src/runner/results.ts @@ -1,5 +1,5 @@ -import { mkdirSync, readdirSync, statSync, writeFileSync } from "node:fs"; -import { join, resolve } from "node:path"; +import { readdirSync, statSync } from "node:fs"; +import { join } from "node:path"; import type { EvalRunResult } from "../types.js"; /** diff --git a/packages/evals/src/runner/test.ts b/packages/evals/src/runner/test.ts index a4e625c..2a9c4fa 100644 --- a/packages/evals/src/runner/test.ts +++ b/packages/evals/src/runner/test.ts @@ -16,6 +16,8 @@ export interface TestResult { passedCount: number; /** Total number of tests */ totalCount: number; + /** Per-test pass/fail extracted from vitest verbose output */ + individualTests: Record; } /** @@ -91,6 +93,24 @@ export async function runTests(opts: { } } +/** + * Extract per-test pass/fail from vitest verbose output. 
/**
 * Extract per-test pass/fail from vitest verbose output.
 *
 * Vitest verbose format:
 *   ✓ EVAL.ts > test name here 0ms  → passed
 *   × EVAL.ts > test name here 2ms  → failed
 *
 * @param output - Raw text captured from the vitest run.
 * @returns An object mapping each test name (the text between `EVAL.ts > `
 *   and the trailing duration) to `true` when the line starts with ✓ and
 *   `false` when it starts with ×. Empty when no line matches.
 */
function parseIndividualTests(output: string): Record<string, boolean> {
  const results: Record<string, boolean> = {};
  // The name is matched greedily so that the duration is the LAST `<n>ms`
  // token on the line; a lazy match would cut a name such as
  // "responds within 100ms budget" at its first duration-looking word.
  // `.` never crosses a newline and the separator before the duration is
  // horizontal whitespace only, so a match cannot spill onto the next line.
  const re = /([✓×])\s+EVAL\.tsx?\s+>\s+(.+)[ \t]+\d+ms/g;
  for (const match of output.matchAll(re)) {
    // Read the pass/fail mark from its own capture group instead of indexing
    // back into `output` through the optionally-typed `match.index`.
    const [, mark, rawName] = match;
    if (mark === undefined || rawName === undefined) continue;
    results[rawName.trim()] = mark === "✓";
  }
  return results;
}