mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
Add per-test pass/fail parsing from vitest verbose output, thread prompt content and individual test results through the runner, and rewrite uploadToBraintrust with experiment naming (model-variant-timestamp), granular scores (pass, test_pass_rate, per-test), rich metadata, and tool-call tracing via experiment.traced(). Also document --force flag for cached mise tasks and add Braintrust env vars to AGENTS.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
227 lines
6.3 KiB
TypeScript
227 lines
6.3 KiB
TypeScript
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
|
import { join, resolve } from "node:path";
|
|
import { runAgent } from "./runner/agent.js";
|
|
import { uploadToBraintrust } from "./runner/braintrust.js";
|
|
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
|
import { preflight } from "./runner/preflight.js";
|
|
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
|
import { createWorkspace } from "./runner/scaffold.js";
|
|
import { runTests } from "./runner/test.js";
|
|
import {
|
|
buildTranscriptSummary,
|
|
type TranscriptSummary,
|
|
} from "./runner/transcript.js";
|
|
import type { EvalRunResult, EvalScenario } from "./types.js";
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Configuration from environment
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Model identifier used when the EVAL_MODEL environment variable is unset. */
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";

/** Agent timeout in milliseconds (30 minutes). */
const AGENT_TIMEOUT = 1000 * 60 * 30;

/** Model for this invocation: EVAL_MODEL when set, otherwise the default. */
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;

/** Optional scenario id from EVAL_SCENARIO; undefined when the variable is unset. */
const scenarioFilter = process.env.EVAL_SCENARIO;

/** True only when EVAL_BASELINE is exactly the string "true". */
const isBaseline = process.env.EVAL_BASELINE === "true";

/** Skills are enabled for every run that is not a baseline run. */
const skillEnabled = !isBaseline;

/**
 * Run-level timestamp shared across all scenarios in a single invocation.
 *
 * Derived from the ISO-8601 UTC string with the trailing "Z" dropped and
 * every ":" and "." replaced by "-", e.g. `2026-03-27T02-09-26-000`.
 */
const runTimestamp = new Date()
  .toISOString()
  .slice(0, -1) // toISOString() always ends in "Z"
  .replace(/[:.]/g, "-");
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Discover scenarios
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Locates the `packages/evals/evals` directory by walking upward from the
 * current working directory.
 *
 * At most ten directories are inspected (the cwd and up to nine ancestors);
 * the walk also stops early once the filesystem root is reached.
 *
 * @returns The path of the first `packages/evals/evals` directory found.
 * @throws Error when no inspected directory contains `packages/evals/evals`.
 */
function findEvalsDir(): string {
  const maxLevels = 10;
  let current = process.cwd();

  for (let remaining = maxLevels; remaining > 0; remaining--) {
    const evalsPath = join(current, "packages", "evals", "evals");
    if (existsSync(evalsPath)) {
      return evalsPath;
    }

    // Resolving ".." from the root yields the root again: nowhere left to go.
    const above = resolve(current, "..");
    if (above === current) {
      break;
    }
    current = above;
  }

  throw new Error("Could not find packages/evals/evals/ directory");
}
|
|
|
|
/**
 * Builds the list of eval scenarios from the evals directory.
 *
 * A scenario is any immediate subdirectory that contains a `PROMPT.md` file.
 * The directory name is used as both the scenario `id` and `name`; `tags`
 * is always an empty array.
 *
 * @returns One scenario per qualifying subdirectory, in `readdirSync` order.
 * @throws Error (from `findEvalsDir`) when the evals directory cannot be found.
 */
function discoverScenarios(): EvalScenario[] {
  const root = findEvalsDir();
  const scenarios: EvalScenario[] = [];

  for (const entry of readdirSync(root, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;
    if (!existsSync(join(root, entry.name, "PROMPT.md"))) continue;

    scenarios.push({
      id: entry.name,
      name: entry.name,
      tags: [],
    });
  }

  return scenarios;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Run a single eval
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Runs a single eval scenario end to end and reports its outcome.
 *
 * Steps: create a workspace, read the scenario's `PROMPT.md`, run the agent
 * in the workspace, run the scenario's `EVAL.tsx` (or `EVAL.ts`) tests,
 * collect the modified files, summarize the transcript, and persist the
 * artifacts under a per-run result directory.
 *
 * Anything thrown after the workspace has been created is converted into a
 * result with `status: "error"` instead of propagating, and the workspace
 * `cleanup` callback always runs. A failure of `createWorkspace` itself
 * happens before the `try` block and therefore rejects the returned promise.
 *
 * @param scenario - Scenario to run; `scenario.id` is the directory name
 *   under the evals directory.
 * @param skillEnabled - Forwarded to workspace creation and to the agent;
 *   also selects the "with-skill" / "baseline" variant label.
 * @returns The run result, plus the transcript summary when the run
 *   completed without throwing.
 */
async function runEval(
  scenario: EvalScenario,
  skillEnabled: boolean,
): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> {
  const evalsDir = findEvalsDir();
  const evalDir = join(evalsDir, scenario.id);
  const variant = skillEnabled ? "with-skill" : "baseline";

  console.log(`\n--- ${scenario.id} (${variant}) ---`);

  // 1. Create isolated workspace
  const { workspacePath, cleanup } = createWorkspace({
    evalDir,
    skillEnabled,
  });
  console.log(`  Workspace: ${workspacePath}`);

  try {
    // 2. Read the prompt
    const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();

    // 3. Run the agent
    console.log(`  Running agent (${model})...`);
    const agentResult = await runAgent({
      cwd: workspacePath,
      prompt,
      model,
      timeout: AGENT_TIMEOUT,
      skillEnabled,
    });
    console.log(
      `  Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
    );

    // 4. Run the hidden tests (prefer EVAL.tsx, otherwise fall back to EVAL.ts)
    const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
      ? join(evalDir, "EVAL.tsx")
      : join(evalDir, "EVAL.ts");

    console.log("  Running tests...");
    const testResult = await runTests({
      workspacePath,
      evalFilePath,
    });
    console.log(
      `  Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
    );

    // 5. Collect modified files
    const filesModified = listModifiedFiles(workspacePath, evalDir);

    // 6. Build transcript summary
    const summary = buildTranscriptSummary(agentResult.events);

    const result: EvalRunResult = {
      scenario: scenario.id,
      agent: "claude-code",
      model,
      skillEnabled,
      status: testResult.passed ? "passed" : "failed",
      duration: agentResult.duration,
      testOutput: testResult.output,
      agentOutput: agentResult.output,
      testsPassed: testResult.passedCount,
      testsTotal: testResult.totalCount,
      filesModified,
      toolCallCount: summary.toolCalls.length,
      // Normalize a null/undefined cost to undefined.
      costUsd: summary.totalCostUsd ?? undefined,
      prompt,
      individualTests: testResult.individualTests,
    };

    // 7. Persist results
    const resultDir = createResultDir(runTimestamp, scenario.id, variant);
    result.resultsDir = resultDir;
    saveRunArtifacts({
      resultDir,
      rawTranscript: agentResult.rawTranscript,
      testOutput: testResult.output,
      result,
      transcriptSummary: summary,
    });

    return { result, transcript: summary };
  } catch (error) {
    // Any value can be thrown, not only Error instances. Use `.message` when
    // it is a string (Error and error-like objects); otherwise stringify the
    // thrown value so the failure reason is never recorded as undefined.
    const thrownMessage = (
      error as { message?: unknown } | null | undefined
    )?.message;
    const message =
      typeof thrownMessage === "string" ? thrownMessage : String(error);

    return {
      result: {
        scenario: scenario.id,
        agent: "claude-code",
        model,
        skillEnabled,
        status: "error",
        duration: 0,
        testOutput: "",
        agentOutput: "",
        testsPassed: 0,
        testsTotal: 0,
        filesModified: [],
        error: message,
      },
    };
  } finally {
    // Runs on both the success and the error path.
    cleanup();
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Main
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Entry point for the eval runner.
 *
 * Calls `preflight()`, prints the run configuration, selects the scenarios
 * (optionally narrowed to the single id in EVAL_SCENARIO), runs them one at
 * a time, prints a summary, and uploads to Braintrust when
 * BRAINTRUST_UPLOAD is exactly "true".
 *
 * Exits the process with code 1 when EVAL_SCENARIO matches no scenario.
 */
async function main() {
  preflight();

  const modeLabel = isBaseline ? "baseline (no skills)" : "with skills";
  console.log("Supabase Skills Evals");
  console.log(`Model: ${model}`);
  console.log(`Mode: ${modeLabel}`);

  const discovered = discoverScenarios();
  const selected = scenarioFilter
    ? discovered.filter(({ id }) => id === scenarioFilter)
    : discovered;

  if (scenarioFilter && selected.length === 0) {
    console.error(`Scenario not found: ${scenarioFilter}`);
    process.exit(1);
  }

  console.log(`Scenarios: ${selected.map(({ id }) => id).join(", ")}`);

  const results: EvalRunResult[] = [];
  const transcripts = new Map<string, TranscriptSummary>();

  // Scenarios run sequentially: each eval is awaited before the next starts.
  for (const scenario of selected) {
    const outcome = await runEval(scenario, skillEnabled);
    results.push(outcome.result);
    if (outcome.transcript) {
      transcripts.set(outcome.result.scenario, outcome.transcript);
    }
  }

  // Use the results dir from the first result that has one
  // (all share the same timestamp).
  const firstPersisted = results.find((entry) => entry.resultsDir);
  printSummary(results, firstPersisted?.resultsDir);

  if (process.env.BRAINTRUST_UPLOAD !== "true") {
    return;
  }

  console.log("\nUploading to Braintrust...");
  await uploadToBraintrust(results, {
    model,
    skillEnabled,
    runTimestamp,
    transcripts,
  });
}
|
|
|
|
// Start the runner; any rejection escaping main() is logged and the process
// exits with a non-zero status.
main().catch((fatal) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});
|