Files
supabase-postgres-best-prac…/packages/evals/src/runner/results.ts
Pedro Rodrigues 2da5cae2ac feat(evals): enrich Braintrust upload with granular scores and tracing
Add per-test pass/fail parsing from vitest verbose output, thread prompt
content and individual test results through the runner, and rewrite
uploadToBraintrust with experiment naming (model-variant-timestamp),
granular scores (pass, test_pass_rate, per-test), rich metadata, and
tool-call tracing via experiment.traced(). Also document --force flag
for cached mise tasks and add Braintrust env vars to AGENTS.md.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-24 13:26:48 +00:00

77 lines
2.0 KiB
TypeScript

import { readdirSync, statSync } from "node:fs";
import { join } from "node:path";
import type { EvalRunResult } from "../types.js";
/**
 * Collect the workspace files that have no counterpart in the original eval
 * directory.
 *
 * The workspace is walked recursively. Entries named `node_modules`,
 * `.agents`, `.claude`, `EVAL.ts` or `EVAL.tsx` are skipped at every depth.
 * Each remaining non-directory entry is looked up at the same relative
 * location under `originalEvalDir`; when that lookup fails, the entry is
 * reported.
 *
 * Note: only existence is checked. A file present in both trees is never
 * reported, even if its contents differ.
 *
 * @param workspacePath - Root of the directory tree to scan.
 * @param originalEvalDir - Root of the reference tree to compare against.
 * @returns Paths relative to `workspacePath`, always joined with `/`, in
 * directory-traversal order.
 */
export function listModifiedFiles(
  workspacePath: string,
  originalEvalDir: string,
): string[] {
  // Names that are never reported and never descended into.
  const skippedNames = new Set([
    "node_modules",
    ".agents",
    ".claude",
    "EVAL.ts",
    "EVAL.tsx",
  ]);

  // Any failure of statSync is treated as "no counterpart in the original".
  const existsInOriginal = (relative: string): boolean => {
    try {
      statSync(join(originalEvalDir, relative));
      return true;
    } catch {
      return false;
    }
  };

  // Depth-first: a directory's results are spliced in at the directory's
  // own position among its siblings.
  const collect = (currentDir: string, relativeDir: string): string[] =>
    readdirSync(currentDir, { withFileTypes: true })
      .filter((item) => !skippedNames.has(item.name))
      .flatMap((item) => {
        const relative =
          relativeDir === "" ? item.name : `${relativeDir}/${item.name}`;
        if (item.isDirectory()) {
          return collect(join(currentDir, item.name), relative);
        }
        return existsInOriginal(relative) ? [] : [relative];
      });

  return collect(workspacePath, "");
}
/**
 * Write a plain-text report of eval runs to the console.
 *
 * After a header, each result gets one line showing PASS or FAIL (PASS only
 * when `status` is `"passed"`), the scenario, the model, whether the skill
 * was enabled, and `duration / 1000` to one decimal followed by `s`
 * (presumably `duration` is in milliseconds — confirm against the runner).
 * A non-empty `filesModified` list and, for `"error"` results, a non-empty
 * `error` are printed on follow-up lines. A passed/total tally closes the
 * report.
 *
 * @param results - The runs to report on.
 * @param resultsDir - When provided and non-empty, printed last as the
 * location where results were saved.
 */
export function printSummary(
  results: EvalRunResult[],
  resultsDir?: string,
): void {
  console.log("\n=== Eval Results ===\n");

  let passedCount = 0;
  for (const run of results) {
    const didPass = run.status === "passed";
    if (didPass) {
      passedCount += 1;
    }

    const label = didPass ? "PASS" : "FAIL";
    const variant = run.skillEnabled ? "with-skill" : "baseline";
    const seconds = (run.duration / 1000).toFixed(1);
    console.log(
      `[${label}] ${run.scenario} | ${run.model} | ${variant} | ${seconds}s`,
    );

    const touched = run.filesModified;
    if (touched.length > 0) {
      console.log(` Files: ${touched.join(", ")}`);
    }

    // Error text is shown only for runs that errored, not for plain failures.
    if (run.status === "error" && run.error) {
      console.log(` Error: ${run.error}`);
    }
  }

  console.log(`\nTotal: ${passedCount}/${results.length} passed`);

  if (resultsDir) {
    console.log(`\nResults saved to: ${resultsDir}`);
  }
}