mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
multi model testing
This commit is contained in:
@@ -46,11 +46,15 @@ sources = ["test/**", "skills/**"]
|
||||
# ── Eval tasks ────────────────────────────────────────────────────────
|
||||
|
||||
[tasks.eval]
|
||||
description = "Run code-fix evals (local, no upload)"
|
||||
description = "Run code-fix evals for all configured models (local, no upload)"
|
||||
run = "npm --prefix packages/evals run eval"
|
||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
||||
|
||||
[tasks."eval:model"]
|
||||
description = "Run code-fix eval for a single model (local, no upload)"
|
||||
run = "EVAL_MODEL={{arg(name='model')}} npm --prefix packages/evals run eval"
|
||||
|
||||
[tasks."eval:upload"]
|
||||
description = "Run code-fix evals and upload to Braintrust"
|
||||
description = "Run code-fix evals for all models and upload to Braintrust"
|
||||
run = "npm --prefix packages/evals run eval:upload"
|
||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
ANTHROPIC_API_KEY=
|
||||
BRAINTRUST_API_KEY=
|
||||
BRAINTRUST_PROJECT_ID=
|
||||
ANTHROPIC_API_KEY=
|
||||
# Provider API keys for eval models are configured in the Braintrust dashboard
|
||||
# under Settings → AI providers (not needed locally).
|
||||
|
||||
@@ -9,19 +9,26 @@ using skill documentation as context. It uses
|
||||
|
||||
Two-step **LLM-as-judge** pattern powered by Braintrust's `Eval()`:
|
||||
|
||||
1. The **eval model** (default: `claude-sonnet-4-5-20250929`) receives a prompt
|
||||
with skill context and produces a code fix.
|
||||
2. Three independent **judge scorers** (default: `claude-opus-4-6`) evaluate the
|
||||
fix via structured output (Zod schemas via AI SDK's `Output.object()`).
|
||||
1. The **eval model** receives a prompt with skill context and produces a code
|
||||
fix. Non-Claude eval model calls go through the **Braintrust AI proxy** — a
|
||||
single OpenAI-compatible endpoint that routes to any provider (OpenAI,
|
||||
Google, etc.); Claude eval models call the Anthropic API directly.
|
||||
2. Five independent **judge scorers** (`claude-opus-4-6` via direct Anthropic
|
||||
API) evaluate the fix via structured output (Zod schemas via AI SDK's
|
||||
`Output.object()`).
|
||||
|
||||
The eval runs once per model in the model matrix, creating a separate Braintrust
|
||||
experiment per model for side-by-side comparison.
|
||||
|
||||
Key files:
|
||||
|
||||
```
|
||||
src/
|
||||
code-fix.eval.ts # Braintrust Eval() entry point
|
||||
code-fix.eval.ts # Braintrust Eval() entry point (loops over models)
|
||||
dataset.ts # Maps extracted test cases to EvalCase format
|
||||
scorer.ts # Three AI SDK-based scorers (Correctness, Completeness, Best Practice)
|
||||
models.ts # Model provider factory (Anthropic / OpenAI)
|
||||
scorer.ts # Five AI SDK-based scorers (quality, safety, minimality)
|
||||
models.ts # Braintrust proxy + direct Anthropic provider
|
||||
models.config.ts # Model matrix (add/remove models here)
|
||||
dataset/
|
||||
types.ts # CodeFixTestCase interface
|
||||
extract.ts # Auto-extracts test cases from skill references
|
||||
@@ -35,14 +42,18 @@ src/
|
||||
(`dataset/extract.ts`) finds consecutive `**Incorrect:**` / `**Correct:**` code
|
||||
block pairs under `##` sections. Each pair becomes one test case.
|
||||
|
||||
Three independent scorers evaluate each fix (0–1 scale):
|
||||
Five independent scorers evaluate each fix (0–1 scale):
|
||||
|
||||
- **Correctness** — does the fix address the core issue?
|
||||
- **Completeness** — does the fix include all necessary changes?
|
||||
- **Best Practice** — does the fix follow Supabase conventions?
|
||||
- **Regression Safety** — does the fix avoid introducing new problems (broken
|
||||
functionality, removed security measures, new vulnerabilities)?
|
||||
- **Minimality** — is the fix tightly scoped to the identified issue without
|
||||
unnecessary rewrites or over-engineering?
|
||||
|
||||
Braintrust aggregates the scores and provides a dashboard for tracking
|
||||
regressions over time.
|
||||
Each model in the matrix generates a separate Braintrust experiment. The
|
||||
dashboard supports side-by-side comparison of experiments.
|
||||
|
||||
## Adding Test Cases
|
||||
|
||||
@@ -81,6 +92,67 @@ Rules:
|
||||
- Files prefixed with `_` (like `_sections.md`, `_template.md`) are skipped
|
||||
- Each pair gets an ID like `supabase/db-rls-mandatory#0` (skill/filename#index)
|
||||
|
||||
## Adding/Removing Models
|
||||
|
||||
Edit the `EVAL_MODELS` array in `src/models.config.ts`:
|
||||
|
||||
```typescript
|
||||
export const EVAL_MODELS: EvalModelConfig[] = [
|
||||
{ id: "claude-sonnet-4-5-20250929", label: "Claude Sonnet 4.5", provider: "anthropic", ci: true },
|
||||
{ id: "gpt-5.3", label: "GPT 5.3", provider: "openai", ci: true },
|
||||
// Add new models here
|
||||
];
|
||||
```
|
||||
|
||||
Provider API keys must be configured in the Braintrust dashboard under
|
||||
Settings → AI providers.
|
||||
|
||||
## Running Evals
|
||||
|
||||
```bash
|
||||
# Run all models locally (no Braintrust upload)
|
||||
mise run eval
|
||||
|
||||
# Run a single model
|
||||
mise run eval:model model=claude-sonnet-4-5-20250929
|
||||
|
||||
# Run and upload to Braintrust dashboard
|
||||
mise run eval:upload
|
||||
```
|
||||
|
||||
Or directly:
|
||||
|
||||
```bash
|
||||
cd packages/evals
|
||||
|
||||
# Local run (all models)
|
||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||
|
||||
# Single model
|
||||
EVAL_MODEL=claude-sonnet-4-5-20250929 npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||
|
||||
# Filter to one test case (across all models)
|
||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts --filter 'input.testCase.id=db-migrations-idempotent'
|
||||
```
|
||||
|
||||
## Environment
|
||||
|
||||
API keys are loaded by mise from `packages/evals/.env` (configured in root
|
||||
`mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
|
||||
|
||||
```
|
||||
BRAINTRUST_API_KEY=... # Required: proxy routing + dashboard upload
|
||||
BRAINTRUST_PROJECT_ID=... # Required: Braintrust project identifier
|
||||
ANTHROPIC_API_KEY=sk-ant-... # Required: judge model (Claude Opus 4.6) + Claude eval models
|
||||
```
|
||||
|
||||
Optional overrides:
|
||||
|
||||
```
|
||||
EVAL_MODEL=claude-sonnet-4-5-20250929 # Run only this model (skips matrix)
|
||||
EVAL_JUDGE_MODEL=claude-opus-4-6 # Judge model for scorers
|
||||
```
|
||||
|
||||
## Modifying Prompts
|
||||
|
||||
- `src/prompts/code-fix.ts` — what the eval model sees
|
||||
@@ -97,46 +169,3 @@ Each scorer in `src/scorer.ts` is independent. To add a new dimension:
|
||||
|
||||
1. Create a new `EvalScorer` function in `scorer.ts`
|
||||
2. Add it to the `scores` array in `code-fix.eval.ts`
|
||||
|
||||
## Running Evals
|
||||
|
||||
```bash
|
||||
# Run locally (no Braintrust upload)
|
||||
mise run eval
|
||||
|
||||
# Run and upload to Braintrust dashboard
|
||||
mise run eval:upload
|
||||
```
|
||||
|
||||
Or directly:
|
||||
|
||||
```bash
|
||||
cd packages/evals
|
||||
|
||||
# Local run
|
||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||
|
||||
# Upload to Braintrust
|
||||
npx braintrust eval src/code-fix.eval.ts
|
||||
```
|
||||
|
||||
In CI, evals run via `braintrustdata/eval-action@v1` and are gated by the
|
||||
`run-evals` PR label.
|
||||
|
||||
## Environment
|
||||
|
||||
API keys are loaded by mise from `packages/evals/.env` (configured in root
|
||||
`mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
|
||||
|
||||
```
|
||||
ANTHROPIC_API_KEY=sk-ant-... # Required: eval model + judge model
|
||||
BRAINTRUST_API_KEY=... # Required for upload to Braintrust dashboard
|
||||
BRAINTRUST_PROJECT_ID=... # Required for upload to Braintrust dashboard
|
||||
```
|
||||
|
||||
Optional overrides:
|
||||
|
||||
```
|
||||
EVAL_MODEL=claude-sonnet-4-5-20250929 # Model under test
|
||||
EVAL_JUDGE_MODEL=claude-opus-4-6 # Judge model for scorers
|
||||
```
|
||||
|
||||
@@ -2,7 +2,9 @@ import assert from "node:assert";
|
||||
import { generateText } from "ai";
|
||||
import { Eval } from "braintrust";
|
||||
import { dataset } from "./dataset.js";
|
||||
import { getModel } from "./models.js";
|
||||
import type { EvalModelConfig } from "./models.config.js";
|
||||
import { EVAL_MODELS } from "./models.config.js";
|
||||
import { getProxyModel } from "./models.js";
|
||||
import {
|
||||
buildCodeFixPrompt,
|
||||
buildCodeFixSystemPrompt,
|
||||
@@ -11,26 +13,70 @@ import {
|
||||
bestPracticeScorer,
|
||||
completenessScorer,
|
||||
correctnessScorer,
|
||||
minimalityScorer,
|
||||
regressionSafetyScorer,
|
||||
} from "./scorer.js";
|
||||
|
||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||
assert(process.env.ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY is not set");
|
||||
|
||||
const modelId = process.env.EVAL_MODEL || "claude-sonnet-4-5-20250929";
|
||||
/**
|
||||
* Resolve which models to run based on environment:
|
||||
* - EVAL_MODEL set → single model only (local dev / cost control)
|
||||
* - CI without EVAL_ALL_MODELS → ci:true models only
|
||||
* - Otherwise → all models
|
||||
*/
|
||||
function getModelsToRun(): EvalModelConfig[] {
|
||||
const singleModel = process.env.EVAL_MODEL;
|
||||
if (singleModel) {
|
||||
const found = EVAL_MODELS.find((m) => m.id === singleModel);
|
||||
return [
|
||||
found ?? {
|
||||
id: singleModel,
|
||||
label: singleModel,
|
||||
provider: "unknown",
|
||||
ci: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
Eval("CodeFix", {
|
||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||
trialCount: process.env.CI ? 3 : 1,
|
||||
data: () => dataset(),
|
||||
task: async (input) => {
|
||||
const model = getModel(modelId);
|
||||
const response = await generateText({
|
||||
model,
|
||||
system: buildCodeFixSystemPrompt(),
|
||||
prompt: buildCodeFixPrompt(input.testCase),
|
||||
temperature: 0.2,
|
||||
maxRetries: 2,
|
||||
});
|
||||
return { llmOutput: response.text };
|
||||
},
|
||||
scores: [correctnessScorer, completenessScorer, bestPracticeScorer],
|
||||
});
|
||||
if (process.env.CI && !process.env.EVAL_ALL_MODELS) {
|
||||
return EVAL_MODELS.filter((m) => m.ci);
|
||||
}
|
||||
|
||||
return EVAL_MODELS;
|
||||
}
|
||||
|
||||
const models = getModelsToRun();
|
||||
|
||||
// Register one "CodeFix" Eval per selected model. The model ID is used as the
// experiment name, and the model's id/label/provider are attached as metadata.
for (const modelConfig of models) {
|
||||
Eval("CodeFix", {
|
||||
experimentName: modelConfig.id,
|
||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||
// Three trials per case when the CI env var is set, otherwise a single trial.
trialCount: process.env.CI ? 3 : 1,
|
||||
metadata: {
|
||||
model: modelConfig.id,
|
||||
label: modelConfig.label,
|
||||
provider: modelConfig.provider,
|
||||
},
|
||||
data: () => dataset(),
|
||||
// Task: build the system/user prompts for the test case, generate text with
// the model under test, and return the raw text as `llmOutput`.
task: async (input) => {
|
||||
const model = getProxyModel(modelConfig.id);
|
||||
const response = await generateText({
|
||||
model,
|
||||
system: buildCodeFixSystemPrompt(),
|
||||
prompt: buildCodeFixPrompt(input.testCase),
|
||||
temperature: 0.2,
|
||||
maxRetries: 2,
|
||||
});
|
||||
return { llmOutput: response.text };
|
||||
},
|
||||
// All five scorers are applied to every task output.
scores: [
|
||||
correctnessScorer,
|
||||
completenessScorer,
|
||||
bestPracticeScorer,
|
||||
regressionSafetyScorer,
|
||||
minimalityScorer,
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
@@ -10,23 +10,41 @@ export type Expected = {
|
||||
};
|
||||
|
||||
/** Per-case metadata attached to each eval case built by `dataset()`. */
export type Metadata = {
|
||||
// Populated from the test case's `title`.
name: string;
|
||||
skillName: string;
|
||||
section: string;
|
||||
// Populated from the test case's `referenceFilename`.
referenceFile: string;
|
||||
tags: string[];
|
||||
};
|
||||
|
||||
export type Output = { llmOutput: string };
|
||||
|
||||
/**
|
||||
* Extract the feature category from a reference filename.
|
||||
* e.g. "db-migrations-idempotent.md" → "db"
|
||||
* "auth-core-sessions.md" → "auth"
|
||||
*/
|
||||
function featureCategory(filename: string): string {
|
||||
// Drop a trailing ".md", then keep only the text before the first "-".
// A name without "-" is returned whole (minus the extension).
return filename.replace(/\.md$/, "").split("-")[0];
|
||||
}
|
||||
|
||||
export function dataset(): EvalCase<Input, Expected, Metadata>[] {
|
||||
return extractCodeFixDataset().map((tc) => ({
|
||||
id: tc.id,
|
||||
input: { testCase: tc },
|
||||
tags: [
|
||||
featureCategory(tc.referenceFilename),
|
||||
tc.referenceFilename.replace(/\.md$/, ""),
|
||||
],
|
||||
expected: {
|
||||
correctCode: tc.goodExample.code,
|
||||
correctLanguage: tc.goodExample.language,
|
||||
},
|
||||
metadata: {
|
||||
name: tc.title,
|
||||
skillName: tc.skillName,
|
||||
section: tc.section,
|
||||
referenceFile: tc.referenceFilename,
|
||||
tags: tc.tags,
|
||||
},
|
||||
}));
|
||||
|
||||
47
packages/evals/src/models.config.ts
Normal file
47
packages/evals/src/models.config.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
/** Shape of one entry in the eval model matrix. */
export interface EvalModelConfig {
|
||||
/** Model ID passed to the Braintrust proxy */
|
||||
id: string;
|
||||
/** Human-readable label for dashboards */
|
||||
label: string;
|
||||
/** Provider name for display/grouping */
|
||||
provider: string;
|
||||
/** Whether to include in CI runs by default */
|
||||
ci: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Models to evaluate. Add/remove entries to change the eval matrix.
|
||||
* Set `ci: false` to exclude expensive models from automated CI runs.
|
||||
*/
|
||||
export const EVAL_MODELS: EvalModelConfig[] = [
|
||||
{
|
||||
id: "claude-sonnet-4-5-20250929",
|
||||
label: "Claude Sonnet 4.5",
|
||||
provider: "anthropic",
|
||||
ci: true,
|
||||
},
|
||||
{
|
||||
id: "gpt-5.3",
|
||||
label: "GPT 5.3",
|
||||
provider: "openai",
|
||||
ci: true,
|
||||
},
|
||||
{
|
||||
id: "gpt-5.2",
|
||||
label: "GPT 5.2",
|
||||
provider: "openai",
|
||||
ci: true,
|
||||
},
|
||||
{
|
||||
id: "gemini-3-pro",
|
||||
label: "Gemini 3.0 Pro",
|
||||
provider: "google",
|
||||
ci: true,
|
||||
},
|
||||
// The only entry with ci: false, i.e. not included in CI runs by default.
// NOTE(review): this is the same model ID the README names as the default
// judge (`claude-opus-4-6`), so this entry's scores would be self-judged —
// confirm that is intended.
{
|
||||
id: "claude-opus-4-6",
|
||||
label: "Claude Opus 4.6",
|
||||
provider: "anthropic",
|
||||
ci: false,
|
||||
},
|
||||
];
|
||||
@@ -1,48 +1,49 @@
|
||||
import type { AnthropicProvider } from "@ai-sdk/anthropic";
|
||||
import { anthropic } from "@ai-sdk/anthropic";
|
||||
import type { OpenAIProvider } from "@ai-sdk/openai";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { createOpenAI } from "@ai-sdk/openai";
|
||||
import type { LanguageModel } from "ai";
|
||||
|
||||
/** Model ID accepted by the Anthropic provider (string literal union + string). */
|
||||
export type AnthropicModelId = Parameters<AnthropicProvider["chat"]>[0];
|
||||
|
||||
/** Model ID accepted by the OpenAI provider (string literal union + string). */
|
||||
export type OpenAIModelId = Parameters<OpenAIProvider["chat"]>[0];
|
||||
/**
|
||||
* Braintrust AI proxy — routes to any provider (Anthropic, OpenAI, Google)
|
||||
* via a single OpenAI-compatible endpoint.
|
||||
*
|
||||
* Provider API keys are configured in the Braintrust dashboard at
|
||||
* project or org level. The x-bt-parent header scopes the request to
|
||||
* the project so project-level keys are resolved.
|
||||
*/
|
||||
// OpenAI-compatible provider instance pointed at the Braintrust proxy endpoint.
const braintrustProxy = createOpenAI({
|
||||
baseURL: "https://api.braintrust.dev/v1/proxy",
|
||||
// Falls back to an empty string when BRAINTRUST_API_KEY is unset.
apiKey: process.env.BRAINTRUST_API_KEY ?? "",
|
||||
// The x-bt-parent header is only sent when BRAINTRUST_PROJECT_ID is set;
// otherwise no extra headers are passed.
headers: process.env.BRAINTRUST_PROJECT_ID
|
||||
? { "x-bt-parent": `project_id:${process.env.BRAINTRUST_PROJECT_ID}` }
|
||||
: undefined,
|
||||
});
|
||||
|
||||
/** Any model ID accepted by the eval harness. */
|
||||
export type SupportedModelId = AnthropicModelId | OpenAIModelId;
|
||||
|
||||
const MODEL_MAP: Record<string, () => LanguageModel> = {
|
||||
"claude-opus-4-6": () => anthropic("claude-opus-4-6"),
|
||||
"claude-sonnet-4-5-20250929": () => anthropic("claude-sonnet-4-5-20250929"),
|
||||
"claude-haiku-4-5-20251001": () => anthropic("claude-haiku-4-5-20251001"),
|
||||
"claude-opus-4-5-20251101": () => anthropic("claude-opus-4-5-20251101"),
|
||||
"claude-sonnet-4-20250514": () => anthropic("claude-sonnet-4-20250514"),
|
||||
"gpt-4o": () => openai("gpt-4o"),
|
||||
"gpt-4o-mini": () => openai("gpt-4o-mini"),
|
||||
"o3-mini": () => openai("o3-mini"),
|
||||
};
|
||||
|
||||
export function getModel(modelId: SupportedModelId): LanguageModel {
|
||||
const factory = MODEL_MAP[modelId];
|
||||
if (factory) return factory();
|
||||
|
||||
// Fall back to provider detection from model ID prefix
|
||||
/**
|
||||
* Get a model for the eval task. Claude models use the Anthropic SDK
|
||||
* directly (via ANTHROPIC_API_KEY). All other models route through the
|
||||
* Braintrust proxy (keys configured at the org level in Braintrust).
|
||||
*/
|
||||
export function getProxyModel(modelId: string): LanguageModel {
|
||||
if (modelId.startsWith("claude")) {
|
||||
return anthropic(modelId as AnthropicModelId);
|
||||
}
|
||||
if (
|
||||
modelId.startsWith("gpt") ||
|
||||
modelId.startsWith("o1") ||
|
||||
modelId.startsWith("o3")
|
||||
) {
|
||||
return openai(modelId as OpenAIModelId);
|
||||
return braintrustProxy(modelId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a model for the judge. Claude IDs use the Anthropic SDK directly via
|
||||
* ANTHROPIC_API_KEY (no proxy); any other ID is delegated to getProxyModel.
|
||||
*/
|
||||
export function getModel(modelId: string): LanguageModel {
|
||||
if (modelId.startsWith("claude")) {
|
||||
return anthropic(modelId as AnthropicModelId);
|
||||
}
|
||||
|
||||
throw new Error(
|
||||
`Unknown model: ${modelId}. Available: ${Object.keys(MODEL_MAP).join(", ")}`,
|
||||
);
|
||||
return getProxyModel(modelId);
|
||||
}
|
||||
|
||||
export function getJudgeModel(): LanguageModel {
|
||||
|
||||
@@ -124,3 +124,64 @@ Score 1 if the fix follows best practices, 0.5 if it mostly follows best practic
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Scorer for the "Regression Safety" dimension.
 *
 * Builds the shared context from the test case and the LLM output, appends the
 * regression-safety rubric below, and passes the combined prompt to `judge`.
 * Returns the judge's `score`, with its `reasoning` attached as metadata.
 */
export const regressionSafetyScorer: EvalScorer<
|
||||
Input,
|
||||
TaskOutput,
|
||||
Expected
|
||||
> = async ({ input, output }) => {
|
||||
const context = buildContext(input.testCase, output.llmOutput);
|
||||
const result = await judge(`${context}
|
||||
|
||||
## Task
|
||||
|
||||
Evaluate **regression safety**: Does the LLM's fix avoid introducing new problems?
|
||||
|
||||
Carefully check whether the fix:
|
||||
- Breaks existing functionality that was working in the original code
|
||||
- Removes security measures (RLS policies, auth checks, input validation) that were already present
|
||||
- Changes function signatures, return types, or column names in ways that would break callers
|
||||
- Introduces SQL injection, XSS, or other security vulnerabilities not present in the original
|
||||
- Drops data, removes columns, or alters schemas destructively without necessity
|
||||
- Changes behavior beyond the scope of the identified issue
|
||||
|
||||
The fix should repair the identified problem WITHOUT creating new ones. A fix that solves the original issue but breaks something else is dangerous in production.
|
||||
|
||||
Score 1 if the fix introduces no new problems. Score 0.5 if the fix introduces minor issues (e.g., slightly different naming that could confuse but not break). Score 0 if the fix introduces a new bug, security vulnerability, or breaking change.`);
|
||||
|
||||
// Report under a fixed scorer name; the judge's reasoning is kept as metadata.
return {
|
||||
name: "Regression Safety",
|
||||
score: result.score,
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Scorer for the "Minimality" dimension.
 *
 * Builds the shared context from the test case and the LLM output, appends the
 * minimality rubric below, and passes the combined prompt to `judge`.
 * Returns the judge's `score`, with its `reasoning` attached as metadata.
 */
export const minimalityScorer: EvalScorer<
|
||||
Input,
|
||||
TaskOutput,
|
||||
Expected
|
||||
> = async ({ input, output }) => {
|
||||
const context = buildContext(input.testCase, output.llmOutput);
|
||||
const result = await judge(`${context}
|
||||
|
||||
## Task
|
||||
|
||||
Evaluate **minimality**: Does the LLM's fix make only the changes necessary to address the identified issue?
|
||||
|
||||
Check whether the fix:
|
||||
- Rewrites or restructures code beyond what is needed to fix the problem
|
||||
- Adds features, abstractions, or utilities not present in the reference solution
|
||||
- Changes formatting, variable names, or style in unrelated parts of the code
|
||||
- Adds excessive comments, logging, or error handling not required by the fix
|
||||
- Over-engineers the solution (e.g., adding configuration options, generalization, or layers of abstraction when a simple targeted fix suffices)
|
||||
|
||||
Compare the scope of changes in the LLM's fix against the reference. The reference represents the ideal minimal fix. The LLM's fix should be similarly focused.
|
||||
|
||||
Score 1 if the fix is tightly scoped to the identified issue (similar scope to the reference). Score 0.5 if the fix includes some unnecessary changes but the core fix is present. Score 0 if the fix significantly over-reaches — rewriting large portions of code, adding unrelated features, or restructuring beyond what is needed.`);
|
||||
|
||||
// Report under a fixed scorer name; the judge's reasoning is kept as metadata.
return {
|
||||
name: "Minimality",
|
||||
score: result.score,
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user