mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
clean supabase project and use braintrust datasets
This commit is contained in:
@@ -128,7 +128,9 @@ export const assertions: EvalAssertion[] = [
|
|||||||
{
|
{
|
||||||
name: "function uses hyphenated name",
|
name: "function uses hyphenated name",
|
||||||
check: () => {
|
check: () => {
|
||||||
const dirs = existsSync(getFunctionsDir()) ? readdirSync(getFunctionsDir()) : [];
|
const dirs = existsSync(getFunctionsDir())
|
||||||
|
? readdirSync(getFunctionsDir())
|
||||||
|
: [];
|
||||||
const helloDir = dirs.find(
|
const helloDir = dirs.find(
|
||||||
(d) => d.includes("hello") && d.includes("world"),
|
(d) => d.includes("hello") && d.includes("world"),
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -2,7 +2,10 @@ import { existsSync, readdirSync, readFileSync } from "node:fs";
|
|||||||
import { join, resolve } from "node:path";
|
import { join, resolve } from "node:path";
|
||||||
import type { AssertionResult, EvalAssertion } from "./eval-types.js";
|
import type { AssertionResult, EvalAssertion } from "./eval-types.js";
|
||||||
import { runAgent } from "./runner/agent.js";
|
import { runAgent } from "./runner/agent.js";
|
||||||
import { uploadToBraintrust } from "./runner/braintrust.js";
|
import {
|
||||||
|
seedBraintrustDataset,
|
||||||
|
uploadToBraintrust,
|
||||||
|
} from "./runner/braintrust.js";
|
||||||
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
||||||
import { preflight } from "./runner/preflight.js";
|
import { preflight } from "./runner/preflight.js";
|
||||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||||
@@ -129,7 +132,11 @@ async function runAssertions(
|
|||||||
async function runEval(
|
async function runEval(
|
||||||
scenario: EvalScenario,
|
scenario: EvalScenario,
|
||||||
skillEnabled: boolean,
|
skillEnabled: boolean,
|
||||||
): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> {
|
): Promise<{
|
||||||
|
result: EvalRunResult;
|
||||||
|
transcript?: TranscriptSummary;
|
||||||
|
expectedReferenceFiles: string[];
|
||||||
|
}> {
|
||||||
const evalsDir = findEvalsDir();
|
const evalsDir = findEvalsDir();
|
||||||
const evalDir = join(evalsDir, scenario.id);
|
const evalDir = join(evalsDir, scenario.id);
|
||||||
const variant = skillEnabled ? "with-skill" : "baseline";
|
const variant = skillEnabled ? "with-skill" : "baseline";
|
||||||
@@ -263,7 +270,7 @@ async function runEval(
|
|||||||
transcriptSummary: summary,
|
transcriptSummary: summary,
|
||||||
});
|
});
|
||||||
|
|
||||||
return { result, transcript: summary };
|
return { result, transcript: summary, expectedReferenceFiles };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const err = error as Error;
|
const err = error as Error;
|
||||||
return {
|
return {
|
||||||
@@ -280,6 +287,7 @@ async function runEval(
|
|||||||
filesModified: [],
|
filesModified: [],
|
||||||
error: err.message,
|
error: err.message,
|
||||||
},
|
},
|
||||||
|
expectedReferenceFiles: [],
|
||||||
};
|
};
|
||||||
} finally {
|
} finally {
|
||||||
cleanup();
|
cleanup();
|
||||||
@@ -321,8 +329,7 @@ async function main() {
|
|||||||
|
|
||||||
const results: EvalRunResult[] = [];
|
const results: EvalRunResult[] = [];
|
||||||
const transcripts = new Map<string, TranscriptSummary>();
|
const transcripts = new Map<string, TranscriptSummary>();
|
||||||
|
const expectedRefFiles = new Map<string, string[]>();
|
||||||
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (const scenario of scenarios) {
|
for (const scenario of scenarios) {
|
||||||
@@ -330,11 +337,15 @@ async function main() {
|
|||||||
console.log(`\n Resetting DB for ${scenario.id}...`);
|
console.log(`\n Resetting DB for ${scenario.id}...`);
|
||||||
resetDB(keys.dbUrl);
|
resetDB(keys.dbUrl);
|
||||||
|
|
||||||
const { result, transcript } = await runEval(scenario, skillEnabled);
|
const { result, transcript, expectedReferenceFiles } = await runEval(
|
||||||
|
scenario,
|
||||||
|
skillEnabled,
|
||||||
|
);
|
||||||
results.push(result);
|
results.push(result);
|
||||||
if (transcript) {
|
if (transcript) {
|
||||||
transcripts.set(result.scenario, transcript);
|
transcripts.set(result.scenario, transcript);
|
||||||
}
|
}
|
||||||
|
expectedRefFiles.set(result.scenario, expectedReferenceFiles);
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
stopSupabase();
|
stopSupabase();
|
||||||
@@ -344,15 +355,15 @@ async function main() {
|
|||||||
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
|
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
|
||||||
printSummary(results, resultsDir);
|
printSummary(results, resultsDir);
|
||||||
|
|
||||||
if (braintrustUpload) {
|
console.log("\nUploading to Braintrust...");
|
||||||
console.log("\nUploading to Braintrust...");
|
await seedBraintrustDataset(results, expectedRefFiles);
|
||||||
await uploadToBraintrust(results, {
|
await uploadToBraintrust(results, {
|
||||||
model,
|
model,
|
||||||
skillEnabled,
|
skillEnabled,
|
||||||
runTimestamp,
|
runTimestamp,
|
||||||
transcripts,
|
transcripts,
|
||||||
});
|
expectedRefFiles,
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch((err) => {
|
main().catch((err) => {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import assert from "node:assert";
|
import assert from "node:assert";
|
||||||
import { init, initLogger, type Logger } from "braintrust";
|
import { init, initDataset, initLogger, type Logger } from "braintrust";
|
||||||
import type { EvalRunResult } from "../types.js";
|
import type { EvalRunResult } from "../types.js";
|
||||||
import type { TranscriptSummary } from "./transcript.js";
|
import type { TranscriptSummary } from "./transcript.js";
|
||||||
|
|
||||||
@@ -116,6 +116,46 @@ export function logScenarioToLogger(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Seed a Braintrust dataset with one row per scenario.
|
||||||
|
*
|
||||||
|
* Uses scenario.id as the stable row ID so re-seeding is idempotent.
|
||||||
|
* Each row stores the prompt and expected assertions/reference files,
|
||||||
|
* giving Braintrust a stable baseline to track per-scenario score trends
|
||||||
|
* across experiment runs.
|
||||||
|
*/
|
||||||
|
export async function seedBraintrustDataset(
|
||||||
|
results: EvalRunResult[],
|
||||||
|
expectedRefFiles: Map<string, string[]>,
|
||||||
|
): Promise<void> {
|
||||||
|
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||||
|
assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
|
||||||
|
|
||||||
|
const dataset = initDataset({
|
||||||
|
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||||
|
dataset: "supabase-skill-scenarios",
|
||||||
|
});
|
||||||
|
|
||||||
|
for (const r of results) {
|
||||||
|
dataset.insert({
|
||||||
|
id: r.scenario,
|
||||||
|
input: {
|
||||||
|
scenario: r.scenario,
|
||||||
|
prompt: r.prompt ?? "",
|
||||||
|
},
|
||||||
|
expected: {
|
||||||
|
testsTotal: r.testsTotal,
|
||||||
|
passThreshold: r.passThreshold ?? 1.0,
|
||||||
|
expectedReferenceFiles: expectedRefFiles.get(r.scenario) ?? [],
|
||||||
|
},
|
||||||
|
metadata: { scenario: r.scenario },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
await dataset.flush();
|
||||||
|
console.log("Braintrust dataset seeded: supabase-skill-scenarios");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Upload eval results to Braintrust as an experiment.
|
* Upload eval results to Braintrust as an experiment.
|
||||||
*
|
*
|
||||||
@@ -126,6 +166,7 @@ export function logScenarioToLogger(
|
|||||||
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
|
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
|
||||||
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
|
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
|
||||||
* - spans: one child span per agent tool call (when transcript available)
|
* - spans: one child span per agent tool call (when transcript available)
|
||||||
|
* - datasetRecordId: links this row to the dataset row for per-scenario tracking
|
||||||
*/
|
*/
|
||||||
export async function uploadToBraintrust(
|
export async function uploadToBraintrust(
|
||||||
results: EvalRunResult[],
|
results: EvalRunResult[],
|
||||||
@@ -134,6 +175,7 @@ export async function uploadToBraintrust(
|
|||||||
skillEnabled: boolean;
|
skillEnabled: boolean;
|
||||||
runTimestamp: string;
|
runTimestamp: string;
|
||||||
transcripts: Map<string, TranscriptSummary>;
|
transcripts: Map<string, TranscriptSummary>;
|
||||||
|
expectedRefFiles: Map<string, string[]>;
|
||||||
},
|
},
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||||
@@ -207,7 +249,14 @@ export async function uploadToBraintrust(
|
|||||||
|
|
||||||
if (transcript && transcript.toolCalls.length > 0) {
|
if (transcript && transcript.toolCalls.length > 0) {
|
||||||
experiment.traced((span) => {
|
experiment.traced((span) => {
|
||||||
span.log({ input, output, expected, scores, metadata });
|
span.log({
|
||||||
|
input,
|
||||||
|
output,
|
||||||
|
expected,
|
||||||
|
scores,
|
||||||
|
metadata,
|
||||||
|
datasetRecordId: r.scenario,
|
||||||
|
});
|
||||||
|
|
||||||
for (const tc of transcript.toolCalls) {
|
for (const tc of transcript.toolCalls) {
|
||||||
span.traced(
|
span.traced(
|
||||||
@@ -228,7 +277,14 @@ export async function uploadToBraintrust(
|
|||||||
}, spanOptions);
|
}, spanOptions);
|
||||||
} else {
|
} else {
|
||||||
experiment.traced((span) => {
|
experiment.traced((span) => {
|
||||||
span.log({ input, output, expected, scores, metadata });
|
span.log({
|
||||||
|
input,
|
||||||
|
output,
|
||||||
|
expected,
|
||||||
|
scores,
|
||||||
|
metadata,
|
||||||
|
datasetRecordId: r.scenario,
|
||||||
|
});
|
||||||
}, spanOptions);
|
}, spanOptions);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user