clean supabase project and use braintrust datasets

This commit is contained in:
Pedro Rodrigues
2026-02-25 20:20:36 +00:00
parent 34e807a3f6
commit 0894f5683e
3 changed files with 88 additions and 19 deletions

View File

@@ -128,7 +128,9 @@ export const assertions: EvalAssertion[] = [
{ {
name: "function uses hyphenated name", name: "function uses hyphenated name",
check: () => { check: () => {
const dirs = existsSync(getFunctionsDir()) ? readdirSync(getFunctionsDir()) : []; const dirs = existsSync(getFunctionsDir())
? readdirSync(getFunctionsDir())
: [];
const helloDir = dirs.find( const helloDir = dirs.find(
(d) => d.includes("hello") && d.includes("world"), (d) => d.includes("hello") && d.includes("world"),
); );

View File

@@ -2,7 +2,10 @@ import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join, resolve } from "node:path"; import { join, resolve } from "node:path";
import type { AssertionResult, EvalAssertion } from "./eval-types.js"; import type { AssertionResult, EvalAssertion } from "./eval-types.js";
import { runAgent } from "./runner/agent.js"; import { runAgent } from "./runner/agent.js";
import { uploadToBraintrust } from "./runner/braintrust.js"; import {
seedBraintrustDataset,
uploadToBraintrust,
} from "./runner/braintrust.js";
import { createResultDir, saveRunArtifacts } from "./runner/persist.js"; import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js"; import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js"; import { listModifiedFiles, printSummary } from "./runner/results.js";
@@ -129,7 +132,11 @@ async function runAssertions(
async function runEval( async function runEval(
scenario: EvalScenario, scenario: EvalScenario,
skillEnabled: boolean, skillEnabled: boolean,
): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> { ): Promise<{
result: EvalRunResult;
transcript?: TranscriptSummary;
expectedReferenceFiles: string[];
}> {
const evalsDir = findEvalsDir(); const evalsDir = findEvalsDir();
const evalDir = join(evalsDir, scenario.id); const evalDir = join(evalsDir, scenario.id);
const variant = skillEnabled ? "with-skill" : "baseline"; const variant = skillEnabled ? "with-skill" : "baseline";
@@ -263,7 +270,7 @@ async function runEval(
transcriptSummary: summary, transcriptSummary: summary,
}); });
return { result, transcript: summary }; return { result, transcript: summary, expectedReferenceFiles };
} catch (error) { } catch (error) {
const err = error as Error; const err = error as Error;
return { return {
@@ -280,6 +287,7 @@ async function runEval(
filesModified: [], filesModified: [],
error: err.message, error: err.message,
}, },
expectedReferenceFiles: [],
}; };
} finally { } finally {
cleanup(); cleanup();
@@ -321,8 +329,7 @@ async function main() {
const results: EvalRunResult[] = []; const results: EvalRunResult[] = [];
const transcripts = new Map<string, TranscriptSummary>(); const transcripts = new Map<string, TranscriptSummary>();
const expectedRefFiles = new Map<string, string[]>();
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
try { try {
for (const scenario of scenarios) { for (const scenario of scenarios) {
@@ -330,11 +337,15 @@ async function main() {
console.log(`\n Resetting DB for ${scenario.id}...`); console.log(`\n Resetting DB for ${scenario.id}...`);
resetDB(keys.dbUrl); resetDB(keys.dbUrl);
const { result, transcript } = await runEval(scenario, skillEnabled); const { result, transcript, expectedReferenceFiles } = await runEval(
scenario,
skillEnabled,
);
results.push(result); results.push(result);
if (transcript) { if (transcript) {
transcripts.set(result.scenario, transcript); transcripts.set(result.scenario, transcript);
} }
expectedRefFiles.set(result.scenario, expectedReferenceFiles);
} }
} finally { } finally {
stopSupabase(); stopSupabase();
@@ -344,15 +355,15 @@ async function main() {
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir; const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
printSummary(results, resultsDir); printSummary(results, resultsDir);
if (braintrustUpload) {
console.log("\nUploading to Braintrust..."); console.log("\nUploading to Braintrust...");
await seedBraintrustDataset(results, expectedRefFiles);
await uploadToBraintrust(results, { await uploadToBraintrust(results, {
model, model,
skillEnabled, skillEnabled,
runTimestamp, runTimestamp,
transcripts, transcripts,
expectedRefFiles,
}); });
}
} }
main().catch((err) => { main().catch((err) => {

View File

@@ -1,5 +1,5 @@
import assert from "node:assert"; import assert from "node:assert";
import { init, initLogger, type Logger } from "braintrust"; import { init, initDataset, initLogger, type Logger } from "braintrust";
import type { EvalRunResult } from "../types.js"; import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js"; import type { TranscriptSummary } from "./transcript.js";
@@ -116,6 +116,46 @@ export function logScenarioToLogger(
} }
} }
/**
 * Writes one row per eval result into the "supabase-skill-scenarios"
 * Braintrust dataset.
 *
 * Every row is keyed by the result's `scenario` value, which is intended to
 * let repeated seeding update the same row instead of adding a new one
 * (NOTE(review): relies on Braintrust treating `id` as an upsert key — confirm).
 * A row carries the scenario prompt as input and, as the expected value, the
 * test count, the pass threshold (1.0 when the result has none) and the
 * reference files registered for that scenario (empty list when none).
 *
 * @param results - Eval results to turn into dataset rows, one row each.
 * @param expectedRefFiles - Expected reference files, looked up by scenario.
 * @throws If `BRAINTRUST_API_KEY` or `BRAINTRUST_PROJECT_ID` is not set.
 */
export async function seedBraintrustDataset(
  results: EvalRunResult[],
  expectedRefFiles: Map<string, string[]>,
): Promise<void> {
  const { BRAINTRUST_API_KEY: apiKey, BRAINTRUST_PROJECT_ID: projectId } =
    process.env;
  assert(apiKey, "BRAINTRUST_API_KEY is not set");
  assert(projectId, "BRAINTRUST_PROJECT_ID is not set");

  const scenarioDataset = initDataset({
    projectId,
    dataset: "supabase-skill-scenarios",
  });

  results.forEach((run) => {
    const scenarioId = run.scenario;
    const referenceFiles = expectedRefFiles.get(scenarioId) ?? [];

    scenarioDataset.insert({
      id: scenarioId,
      input: {
        scenario: scenarioId,
        prompt: run.prompt ?? "",
      },
      expected: {
        testsTotal: run.testsTotal,
        passThreshold: run.passThreshold ?? 1.0,
        expectedReferenceFiles: referenceFiles,
      },
      metadata: { scenario: scenarioId },
    });
  });

  // Inserts are only queued above; wait for them to be sent before reporting.
  await scenarioDataset.flush();
  console.log("Braintrust dataset seeded: supabase-skill-scenarios");
}
/** /**
* Upload eval results to Braintrust as an experiment. * Upload eval results to Braintrust as an experiment.
* *
@@ -126,6 +166,7 @@ export function logScenarioToLogger(
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result * - scores: skill_usage, reference_files_usage, assertions_passed, final_result
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost * - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
* - spans: one child span per agent tool call (when transcript available) * - spans: one child span per agent tool call (when transcript available)
* - datasetRecordId: links this row to the dataset row for per-scenario tracking
*/ */
export async function uploadToBraintrust( export async function uploadToBraintrust(
results: EvalRunResult[], results: EvalRunResult[],
@@ -134,6 +175,7 @@ export async function uploadToBraintrust(
skillEnabled: boolean; skillEnabled: boolean;
runTimestamp: string; runTimestamp: string;
transcripts: Map<string, TranscriptSummary>; transcripts: Map<string, TranscriptSummary>;
expectedRefFiles: Map<string, string[]>;
}, },
): Promise<void> { ): Promise<void> {
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set"); assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
@@ -207,7 +249,14 @@ export async function uploadToBraintrust(
if (transcript && transcript.toolCalls.length > 0) { if (transcript && transcript.toolCalls.length > 0) {
experiment.traced((span) => { experiment.traced((span) => {
span.log({ input, output, expected, scores, metadata }); span.log({
input,
output,
expected,
scores,
metadata,
datasetRecordId: r.scenario,
});
for (const tc of transcript.toolCalls) { for (const tc of transcript.toolCalls) {
span.traced( span.traced(
@@ -228,7 +277,14 @@ export async function uploadToBraintrust(
}, spanOptions); }, spanOptions);
} else { } else {
experiment.traced((span) => { experiment.traced((span) => {
span.log({ input, output, expected, scores, metadata }); span.log({
input,
output,
expected,
scores,
metadata,
datasetRecordId: r.scenario,
});
}, spanOptions); }, spanOptions);
} }
} }