From 0894f5683e304c8a039ab2eeb016ee17dc3d0476 Mon Sep 17 00:00:00 2001 From: Pedro Rodrigues Date: Wed, 25 Feb 2026 20:20:36 +0000 Subject: [PATCH] clean supabase project and use braintrust datasets --- .../evals/edge-function-hello-world/EVAL.ts | 4 +- packages/evals/src/runner.ts | 41 +++++++----- packages/evals/src/runner/braintrust.ts | 62 ++++++++++++++++++- 3 files changed, 88 insertions(+), 19 deletions(-) diff --git a/packages/evals/evals/edge-function-hello-world/EVAL.ts b/packages/evals/evals/edge-function-hello-world/EVAL.ts index 83f329a..9cab306 100644 --- a/packages/evals/evals/edge-function-hello-world/EVAL.ts +++ b/packages/evals/evals/edge-function-hello-world/EVAL.ts @@ -128,7 +128,9 @@ export const assertions: EvalAssertion[] = [ { name: "function uses hyphenated name", check: () => { - const dirs = existsSync(getFunctionsDir()) ? readdirSync(getFunctionsDir()) : []; + const dirs = existsSync(getFunctionsDir()) + ? readdirSync(getFunctionsDir()) + : []; const helloDir = dirs.find( (d) => d.includes("hello") && d.includes("world"), ); diff --git a/packages/evals/src/runner.ts b/packages/evals/src/runner.ts index 3396f02..f82f44b 100644 --- a/packages/evals/src/runner.ts +++ b/packages/evals/src/runner.ts @@ -2,7 +2,10 @@ import { existsSync, readdirSync, readFileSync } from "node:fs"; import { join, resolve } from "node:path"; import type { AssertionResult, EvalAssertion } from "./eval-types.js"; import { runAgent } from "./runner/agent.js"; -import { uploadToBraintrust } from "./runner/braintrust.js"; +import { + seedBraintrustDataset, + uploadToBraintrust, +} from "./runner/braintrust.js"; import { createResultDir, saveRunArtifacts } from "./runner/persist.js"; import { preflight } from "./runner/preflight.js"; import { listModifiedFiles, printSummary } from "./runner/results.js"; @@ -129,7 +132,11 @@ async function runAssertions( async function runEval( scenario: EvalScenario, skillEnabled: boolean, -): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> { +): Promise<{ + result: EvalRunResult; + transcript?: TranscriptSummary; + expectedReferenceFiles: string[]; +}> { const evalsDir = findEvalsDir(); const evalDir = join(evalsDir, scenario.id); const variant = skillEnabled ? "with-skill" : "baseline"; @@ -263,7 +270,7 @@ async function runEval( transcriptSummary: summary, }); - return { result, transcript: summary }; + return { result, transcript: summary, expectedReferenceFiles }; } catch (error) { const err = error as Error; return { @@ -280,6 +287,7 @@ async function runEval( filesModified: [], error: err.message, }, + expectedReferenceFiles: [], }; } finally { cleanup(); @@ -321,8 +329,7 @@ async function main() { const results: EvalRunResult[] = []; const transcripts = new Map(); - - const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true"; + const expectedRefFiles = new Map(); try { for (const scenario of scenarios) { @@ -330,11 +337,15 @@ async function main() { console.log(`\n Resetting DB for ${scenario.id}...`); resetDB(keys.dbUrl); - const { result, transcript } = await runEval(scenario, skillEnabled); + const { result, transcript, expectedReferenceFiles } = await runEval( + scenario, + skillEnabled, + ); results.push(result); if (transcript) { transcripts.set(result.scenario, transcript); } + expectedRefFiles.set(result.scenario, expectedReferenceFiles); } } finally { stopSupabase(); @@ -344,15 +355,15 @@ async function main() { const resultsDir = results.find((r) => r.resultsDir)?.resultsDir; printSummary(results, resultsDir); - if (braintrustUpload) { - console.log("\nUploading to Braintrust..."); - await uploadToBraintrust(results, { - model, - skillEnabled, - runTimestamp, - transcripts, - }); - } + console.log("\nUploading to Braintrust..."); + await seedBraintrustDataset(results, expectedRefFiles); + await uploadToBraintrust(results, { + model, + skillEnabled, + runTimestamp, + transcripts, + expectedRefFiles, + }); } main().catch((err) => { diff --git a/packages/evals/src/runner/braintrust.ts b/packages/evals/src/runner/braintrust.ts index 0504175..03d6d77 100644 --- a/packages/evals/src/runner/braintrust.ts +++ b/packages/evals/src/runner/braintrust.ts @@ -1,5 +1,5 @@ import assert from "node:assert"; -import { init, initLogger, type Logger } from "braintrust"; +import { init, initDataset, initLogger, type Logger } from "braintrust"; import type { EvalRunResult } from "../types.js"; import type { TranscriptSummary } from "./transcript.js"; @@ -116,6 +116,46 @@ export function logScenarioToLogger( } } +/** + * Seed a Braintrust dataset with one row per scenario. + * + * Uses scenario.id as the stable row ID so re-seeding is idempotent. + * Each row stores the prompt and expected assertions/reference files, + * giving Braintrust a stable baseline to track per-scenario score trends + * across experiment runs. + */ +export async function seedBraintrustDataset( + results: EvalRunResult[], + expectedRefFiles: Map, +): Promise { + assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set"); + assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set"); + + const dataset = initDataset({ + projectId: process.env.BRAINTRUST_PROJECT_ID, + dataset: "supabase-skill-scenarios", + }); + + for (const r of results) { + dataset.insert({ + id: r.scenario, + input: { + scenario: r.scenario, + prompt: r.prompt ?? "", + }, + expected: { + testsTotal: r.testsTotal, + passThreshold: r.passThreshold ?? 1.0, + expectedReferenceFiles: expectedRefFiles.get(r.scenario) ?? [], + }, + metadata: { scenario: r.scenario }, + }); + } + + await dataset.flush(); + console.log("Braintrust dataset seeded: supabase-skill-scenarios"); +} + /** * Upload eval results to Braintrust as an experiment. * @@ -126,6 +166,7 @@ export function logScenarioToLogger( * - scores: skill_usage, reference_files_usage, assertions_passed, final_result * - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost * - spans: one child span per agent tool call (when transcript available) + * - datasetRecordId: links this row to the dataset row for per-scenario tracking */ export async function uploadToBraintrust( results: EvalRunResult[], @@ -134,6 +175,7 @@ export async function uploadToBraintrust( skillEnabled: boolean; runTimestamp: string; transcripts: Map; + expectedRefFiles: Map; }, ): Promise { assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set"); @@ -207,7 +249,14 @@ export async function uploadToBraintrust( if (transcript && transcript.toolCalls.length > 0) { experiment.traced((span) => { - span.log({ input, output, expected, scores, metadata }); + span.log({ + input, + output, + expected, + scores, + metadata, + datasetRecordId: r.scenario, + }); for (const tc of transcript.toolCalls) { span.traced( @@ -228,7 +277,14 @@ export async function uploadToBraintrust( }, spanOptions); } else { experiment.traced((span) => { - span.log({ input, output, expected, scores, metadata }); + span.log({ + input, + output, + expected, + scores, + metadata, + datasetRecordId: r.scenario, + }); }, spanOptions); } }