Files
supabase-postgres-best-prac…/packages/evals/src/runner.ts
Pedro Rodrigues 2da5cae2ac feat(evals): enrich Braintrust upload with granular scores and tracing
Add per-test pass/fail parsing from vitest verbose output, thread prompt
content and individual test results through the runner, and rewrite
uploadToBraintrust with experiment naming (model-variant-timestamp),
granular scores (pass, test_pass_rate, per-test), rich metadata, and
tool-call tracing via experiment.traced(). Also document --force flag
for cached mise tasks and add Braintrust env vars to AGENTS.md.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-24 13:26:48 +00:00

227 lines
6.3 KiB
TypeScript

import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join, resolve } from "node:path";
import { runAgent } from "./runner/agent.js";
import { uploadToBraintrust } from "./runner/braintrust.js";
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js";
import { createWorkspace } from "./runner/scaffold.js";
import { runTests } from "./runner/test.js";
import {
buildTranscriptSummary,
type TranscriptSummary,
} from "./runner/transcript.js";
import type { EvalRunResult, EvalScenario } from "./types.js";
// ---------------------------------------------------------------------------
// Configuration from environment
// ---------------------------------------------------------------------------
// Model identifier used when EVAL_MODEL is not set in the environment.
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";

// Timeout handed to the agent runner, in milliseconds (30 minutes).
const AGENT_TIMEOUT = 1000 * 60 * 30;

// Environment-driven settings, read once at module load.
const { EVAL_MODEL, EVAL_SCENARIO, EVAL_BASELINE } = process.env;
const model = EVAL_MODEL ?? DEFAULT_MODEL;
const scenarioFilter = EVAL_SCENARIO;
// Only the exact string "true" switches the run into baseline mode.
const isBaseline = EVAL_BASELINE === "true";
const skillEnabled = !isBaseline;

// One timestamp per invocation, shared by every scenario in the run.
// It is the ISO-8601 instant with ":" and "." turned into "-" and the
// trailing "Z" dropped, e.g. 2026-02-24T13-26-48-000.
const runTimestamp = new Date()
  .toISOString()
  .split(/[:.]/)
  .join("-")
  .replace("Z", "");
// ---------------------------------------------------------------------------
// Discover scenarios
// ---------------------------------------------------------------------------
/**
 * Locates the `packages/evals/evals` directory by walking upwards from the
 * current working directory.
 *
 * At most ten directories are inspected (the cwd and up to nine ancestors);
 * the walk also stops early once the filesystem root is reached.
 *
 * @returns The path of the first `packages/evals/evals` entry that exists.
 * @throws Error when no such entry is found within the search limit.
 */
function findEvalsDir(): string {
  const MAX_LEVELS = 10;
  let current = process.cwd();
  let levelsLeft = MAX_LEVELS;

  while (levelsLeft > 0) {
    const evalsPath = join(current, "packages", "evals", "evals");
    if (existsSync(evalsPath)) {
      return evalsPath;
    }

    // resolve(x, "..") returns x itself at the filesystem root: nowhere left to go.
    const above = resolve(current, "..");
    if (above === current) {
      break;
    }

    current = above;
    levelsLeft -= 1;
  }

  throw new Error("Could not find packages/evals/evals/ directory");
}
/**
 * Lists the available eval scenarios.
 *
 * A scenario is any direct subdirectory of the evals directory that contains
 * a `PROMPT.md` file. The directory name is used as both `id` and `name`,
 * and `tags` starts out empty.
 *
 * @returns One scenario per qualifying subdirectory, in `readdirSync` order.
 */
function discoverScenarios(): EvalScenario[] {
  const root = findEvalsDir();
  const scenarios: EvalScenario[] = [];

  for (const entry of readdirSync(root, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;
    if (!existsSync(join(root, entry.name, "PROMPT.md"))) continue;

    scenarios.push({
      id: entry.name,
      name: entry.name,
      tags: [],
    });
  }

  return scenarios;
}
// ---------------------------------------------------------------------------
// Run a single eval
// ---------------------------------------------------------------------------
/**
 * Runs a single scenario end to end and returns its result.
 *
 * Steps: create a workspace for the scenario, read its `PROMPT.md`, run the
 * agent against the workspace, run the scenario's `EVAL.tsx` (or `EVAL.ts`)
 * tests, collect the modified files, summarise the transcript and persist
 * the run artifacts.
 *
 * Any error thrown after the workspace has been created is caught and
 * reported as a result with `status: "error"` (no transcript); the
 * workspace `cleanup` callback runs in every case. An error thrown by
 * `createWorkspace` itself is not caught here and propagates to the caller.
 *
 * @param scenario - Scenario to run; its `id` names the directory under the evals dir.
 * @param skillEnabled - Passed to the workspace and agent; also selects the
 *   variant label ("with-skill" vs "baseline").
 * @returns The run result, plus the transcript summary when the run completed.
 */
async function runEval(
  scenario: EvalScenario,
  skillEnabled: boolean,
): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> {
  const evalsDir = findEvalsDir();
  const evalDir = join(evalsDir, scenario.id);
  const variant = skillEnabled ? "with-skill" : "baseline";

  console.log(`\n--- ${scenario.id} (${variant}) ---`);

  // 1. Create isolated workspace
  const { workspacePath, cleanup } = createWorkspace({
    evalDir,
    skillEnabled,
  });
  console.log(` Workspace: ${workspacePath}`);

  try {
    // 2. Read the prompt
    const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();

    // 3. Run the agent
    console.log(` Running agent (${model})...`);
    const agentResult = await runAgent({
      cwd: workspacePath,
      prompt,
      model,
      timeout: AGENT_TIMEOUT,
      skillEnabled,
    });
    console.log(
      ` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
    );

    // 4. Run the hidden tests (prefer EVAL.tsx, fall back to EVAL.ts)
    const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
      ? join(evalDir, "EVAL.tsx")
      : join(evalDir, "EVAL.ts");
    console.log(" Running tests...");
    const testResult = await runTests({
      workspacePath,
      evalFilePath,
    });
    console.log(
      ` Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
    );

    // 5. Collect modified files
    const filesModified = listModifiedFiles(workspacePath, evalDir);

    // 6. Build transcript summary
    const summary = buildTranscriptSummary(agentResult.events);

    const result: EvalRunResult = {
      scenario: scenario.id,
      agent: "claude-code",
      model,
      skillEnabled,
      status: testResult.passed ? "passed" : "failed",
      duration: agentResult.duration,
      testOutput: testResult.output,
      agentOutput: agentResult.output,
      testsPassed: testResult.passedCount,
      testsTotal: testResult.totalCount,
      filesModified,
      toolCallCount: summary.toolCalls.length,
      // Normalise a null cost to undefined so the field is simply absent.
      costUsd: summary.totalCostUsd ?? undefined,
      prompt,
      individualTests: testResult.individualTests,
    };

    // 7. Persist results
    const resultDir = createResultDir(runTimestamp, scenario.id, variant);
    result.resultsDir = resultDir;
    saveRunArtifacts({
      resultDir,
      rawTranscript: agentResult.rawTranscript,
      testOutput: testResult.output,
      result,
      transcriptSummary: summary,
    });

    return { result, transcript: summary };
  } catch (error: unknown) {
    // Anything can be thrown, not only Error instances. Narrow before reading
    // `.message` so a thrown string or object still yields a usable
    // description instead of `undefined`.
    const message = error instanceof Error ? error.message : String(error);
    return {
      result: {
        scenario: scenario.id,
        agent: "claude-code",
        model,
        skillEnabled,
        status: "error",
        duration: 0,
        testOutput: "",
        agentOutput: "",
        testsPassed: 0,
        testsTotal: 0,
        filesModified: [],
        error: message,
      },
    };
  } finally {
    // Always release the workspace, whether the run succeeded or failed.
    cleanup();
  }
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
/**
 * Script entry point: runs preflight, prints the run configuration, selects
 * the scenarios (optionally narrowed to the one named by `scenarioFilter`),
 * runs them one after another, prints a summary and, when
 * BRAINTRUST_UPLOAD is "true", uploads the results to Braintrust.
 *
 * Exits the process with code 1 when a scenario filter matches nothing.
 */
async function main() {
  preflight();

  const modeLabel = isBaseline ? "baseline (no skills)" : "with skills";
  console.log("Supabase Skills Evals");
  console.log(`Model: ${model}`);
  console.log(`Mode: ${modeLabel}`);

  const discovered = discoverScenarios();
  const scenarios = scenarioFilter
    ? discovered.filter((candidate) => candidate.id === scenarioFilter)
    : discovered;
  if (scenarioFilter && scenarios.length === 0) {
    console.error(`Scenario not found: ${scenarioFilter}`);
    process.exit(1);
  }

  const scenarioIds = scenarios.map((candidate) => candidate.id);
  console.log(`Scenarios: ${scenarioIds.join(", ")}`);

  const results: EvalRunResult[] = [];
  const transcripts = new Map<string, TranscriptSummary>();

  // Scenarios are run strictly one at a time, in discovery order.
  for (const scenario of scenarios) {
    const outcome = await runEval(scenario, skillEnabled);
    results.push(outcome.result);
    if (outcome.transcript) {
      transcripts.set(outcome.result.scenario, outcome.transcript);
    }
  }

  // Take the results dir from the first result that recorded one.
  const firstPersisted = results.find((entry) => entry.resultsDir);
  printSummary(results, firstPersisted?.resultsDir);

  if (process.env.BRAINTRUST_UPLOAD !== "true") {
    return;
  }

  console.log("\nUploading to Braintrust...");
  await uploadToBraintrust(results, {
    model,
    skillEnabled,
    runTimestamp,
    transcripts,
  });
}
// Kick off the run; any rejection escaping main() is logged and the process
// exits with a non-zero status.
main().catch((fatal) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});