mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
clean supabase project and use braintrust datasets
This commit is contained in:
@@ -128,7 +128,9 @@ export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "function uses hyphenated name",
|
||||
check: () => {
|
||||
const dirs = existsSync(getFunctionsDir()) ? readdirSync(getFunctionsDir()) : [];
|
||||
const dirs = existsSync(getFunctionsDir())
|
||||
? readdirSync(getFunctionsDir())
|
||||
: [];
|
||||
const helloDir = dirs.find(
|
||||
(d) => d.includes("hello") && d.includes("world"),
|
||||
);
|
||||
|
||||
@@ -2,7 +2,10 @@ import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join, resolve } from "node:path";
|
||||
import type { AssertionResult, EvalAssertion } from "./eval-types.js";
|
||||
import { runAgent } from "./runner/agent.js";
|
||||
import { uploadToBraintrust } from "./runner/braintrust.js";
|
||||
import {
|
||||
seedBraintrustDataset,
|
||||
uploadToBraintrust,
|
||||
} from "./runner/braintrust.js";
|
||||
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
||||
import { preflight } from "./runner/preflight.js";
|
||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||
@@ -129,7 +132,11 @@ async function runAssertions(
|
||||
async function runEval(
|
||||
scenario: EvalScenario,
|
||||
skillEnabled: boolean,
|
||||
): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> {
|
||||
): Promise<{
|
||||
result: EvalRunResult;
|
||||
transcript?: TranscriptSummary;
|
||||
expectedReferenceFiles: string[];
|
||||
}> {
|
||||
const evalsDir = findEvalsDir();
|
||||
const evalDir = join(evalsDir, scenario.id);
|
||||
const variant = skillEnabled ? "with-skill" : "baseline";
|
||||
@@ -263,7 +270,7 @@ async function runEval(
|
||||
transcriptSummary: summary,
|
||||
});
|
||||
|
||||
return { result, transcript: summary };
|
||||
return { result, transcript: summary, expectedReferenceFiles };
|
||||
} catch (error) {
|
||||
const err = error as Error;
|
||||
return {
|
||||
@@ -280,6 +287,7 @@ async function runEval(
|
||||
filesModified: [],
|
||||
error: err.message,
|
||||
},
|
||||
expectedReferenceFiles: [],
|
||||
};
|
||||
} finally {
|
||||
cleanup();
|
||||
@@ -321,8 +329,7 @@ async function main() {
|
||||
|
||||
const results: EvalRunResult[] = [];
|
||||
const transcripts = new Map<string, TranscriptSummary>();
|
||||
|
||||
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
|
||||
const expectedRefFiles = new Map<string, string[]>();
|
||||
|
||||
try {
|
||||
for (const scenario of scenarios) {
|
||||
@@ -330,11 +337,15 @@ async function main() {
|
||||
console.log(`\n Resetting DB for ${scenario.id}...`);
|
||||
resetDB(keys.dbUrl);
|
||||
|
||||
const { result, transcript } = await runEval(scenario, skillEnabled);
|
||||
const { result, transcript, expectedReferenceFiles } = await runEval(
|
||||
scenario,
|
||||
skillEnabled,
|
||||
);
|
||||
results.push(result);
|
||||
if (transcript) {
|
||||
transcripts.set(result.scenario, transcript);
|
||||
}
|
||||
expectedRefFiles.set(result.scenario, expectedReferenceFiles);
|
||||
}
|
||||
} finally {
|
||||
stopSupabase();
|
||||
@@ -344,15 +355,15 @@ async function main() {
|
||||
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
|
||||
printSummary(results, resultsDir);
|
||||
|
||||
if (braintrustUpload) {
|
||||
console.log("\nUploading to Braintrust...");
|
||||
await seedBraintrustDataset(results, expectedRefFiles);
|
||||
await uploadToBraintrust(results, {
|
||||
model,
|
||||
skillEnabled,
|
||||
runTimestamp,
|
||||
transcripts,
|
||||
expectedRefFiles,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import assert from "node:assert";
|
||||
import { init, initLogger, type Logger } from "braintrust";
|
||||
import { init, initDataset, initLogger, type Logger } from "braintrust";
|
||||
import type { EvalRunResult } from "../types.js";
|
||||
import type { TranscriptSummary } from "./transcript.js";
|
||||
|
||||
@@ -116,6 +116,46 @@ export function logScenarioToLogger(
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Seed a Braintrust dataset with one row per scenario.
|
||||
*
|
||||
* Uses scenario.id as the stable row ID so re-seeding is idempotent.
|
||||
* Each row stores the prompt and expected assertions/reference files,
|
||||
* giving Braintrust a stable baseline to track per-scenario score trends
|
||||
* across experiment runs.
|
||||
*/
|
||||
export async function seedBraintrustDataset(
|
||||
results: EvalRunResult[],
|
||||
expectedRefFiles: Map<string, string[]>,
|
||||
): Promise<void> {
|
||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||
assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
|
||||
|
||||
const dataset = initDataset({
|
||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||
dataset: "supabase-skill-scenarios",
|
||||
});
|
||||
|
||||
for (const r of results) {
|
||||
dataset.insert({
|
||||
id: r.scenario,
|
||||
input: {
|
||||
scenario: r.scenario,
|
||||
prompt: r.prompt ?? "",
|
||||
},
|
||||
expected: {
|
||||
testsTotal: r.testsTotal,
|
||||
passThreshold: r.passThreshold ?? 1.0,
|
||||
expectedReferenceFiles: expectedRefFiles.get(r.scenario) ?? [],
|
||||
},
|
||||
metadata: { scenario: r.scenario },
|
||||
});
|
||||
}
|
||||
|
||||
await dataset.flush();
|
||||
console.log("Braintrust dataset seeded: supabase-skill-scenarios");
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload eval results to Braintrust as an experiment.
|
||||
*
|
||||
@@ -126,6 +166,7 @@ export function logScenarioToLogger(
|
||||
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
|
||||
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
|
||||
* - spans: one child span per agent tool call (when transcript available)
|
||||
* - datasetRecordId: links this row to the dataset row for per-scenario tracking
|
||||
*/
|
||||
export async function uploadToBraintrust(
|
||||
results: EvalRunResult[],
|
||||
@@ -134,6 +175,7 @@ export async function uploadToBraintrust(
|
||||
skillEnabled: boolean;
|
||||
runTimestamp: string;
|
||||
transcripts: Map<string, TranscriptSummary>;
|
||||
expectedRefFiles: Map<string, string[]>;
|
||||
},
|
||||
): Promise<void> {
|
||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||
@@ -207,7 +249,14 @@ export async function uploadToBraintrust(
|
||||
|
||||
if (transcript && transcript.toolCalls.length > 0) {
|
||||
experiment.traced((span) => {
|
||||
span.log({ input, output, expected, scores, metadata });
|
||||
span.log({
|
||||
input,
|
||||
output,
|
||||
expected,
|
||||
scores,
|
||||
metadata,
|
||||
datasetRecordId: r.scenario,
|
||||
});
|
||||
|
||||
for (const tc of transcript.toolCalls) {
|
||||
span.traced(
|
||||
@@ -228,7 +277,14 @@ export async function uploadToBraintrust(
|
||||
}, spanOptions);
|
||||
} else {
|
||||
experiment.traced((span) => {
|
||||
span.log({ input, output, expected, scores, metadata });
|
||||
span.log({
|
||||
input,
|
||||
output,
|
||||
expected,
|
||||
scores,
|
||||
metadata,
|
||||
datasetRecordId: r.scenario,
|
||||
});
|
||||
}, spanOptions);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user