mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
multi model testing
This commit is contained in:
@@ -46,11 +46,15 @@ sources = ["test/**", "skills/**"]
|
|||||||
# ── Eval tasks ────────────────────────────────────────────────────────
|
# ── Eval tasks ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
[tasks.eval]
|
[tasks.eval]
|
||||||
description = "Run code-fix evals (local, no upload)"
|
description = "Run code-fix evals for all configured models (local, no upload)"
|
||||||
run = "npm --prefix packages/evals run eval"
|
run = "npm --prefix packages/evals run eval"
|
||||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
||||||
|
|
||||||
|
[tasks."eval:model"]
|
||||||
|
description = "Run code-fix eval for a single model (local, no upload)"
|
||||||
|
run = "EVAL_MODEL={{arg(name='model')}} npm --prefix packages/evals run eval"
|
||||||
|
|
||||||
[tasks."eval:upload"]
|
[tasks."eval:upload"]
|
||||||
description = "Run code-fix evals and upload to Braintrust"
|
description = "Run code-fix evals for all models and upload to Braintrust"
|
||||||
run = "npm --prefix packages/evals run eval:upload"
|
run = "npm --prefix packages/evals run eval:upload"
|
||||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
ANTHROPIC_API_KEY=
|
|
||||||
BRAINTRUST_API_KEY=
|
BRAINTRUST_API_KEY=
|
||||||
BRAINTRUST_PROJECT_ID=
|
BRAINTRUST_PROJECT_ID=
|
||||||
|
ANTHROPIC_API_KEY=
|
||||||
|
# Provider API keys for non-Claude eval models are configured in the Braintrust
|
||||||
|
# dashboard under Settings → AI providers (not needed locally). Claude models use ANTHROPIC_API_KEY above.
|
||||||
|
|||||||
@@ -9,19 +9,26 @@ using skill documentation as context. It uses
|
|||||||
|
|
||||||
Two-step **LLM-as-judge** pattern powered by Braintrust's `Eval()`:
|
Two-step **LLM-as-judge** pattern powered by Braintrust's `Eval()`:
|
||||||
|
|
||||||
1. The **eval model** (default: `claude-sonnet-4-5-20250929`) receives a prompt
|
1. The **eval model** receives a prompt with skill context and produces a code
|
||||||
with skill context and produces a code fix.
|
fix. Claude models are called directly via the Anthropic SDK; all other eval models go through the **Braintrust AI proxy** — a single
|
||||||
2. Three independent **judge scorers** (default: `claude-opus-4-6`) evaluate the
|
OpenAI-compatible endpoint that routes to any provider (Anthropic, OpenAI,
|
||||||
fix via structured output (Zod schemas via AI SDK's `Output.object()`).
|
Google, etc.).
|
||||||
|
2. Five independent **judge scorers** (`claude-opus-4-6` via direct Anthropic
|
||||||
|
API) evaluate the fix via structured output (Zod schemas via AI SDK's
|
||||||
|
`Output.object()`).
|
||||||
|
|
||||||
|
The eval runs once per model in the model matrix, creating a separate Braintrust
|
||||||
|
experiment per model for side-by-side comparison.
|
||||||
|
|
||||||
Key files:
|
Key files:
|
||||||
|
|
||||||
```
|
```
|
||||||
src/
|
src/
|
||||||
code-fix.eval.ts # Braintrust Eval() entry point
|
code-fix.eval.ts # Braintrust Eval() entry point (loops over models)
|
||||||
dataset.ts # Maps extracted test cases to EvalCase format
|
dataset.ts # Maps extracted test cases to EvalCase format
|
||||||
scorer.ts # Three AI SDK-based scorers (Correctness, Completeness, Best Practice)
|
scorer.ts # Five AI SDK-based scorers (quality, safety, minimality)
|
||||||
models.ts # Model provider factory (Anthropic / OpenAI)
|
models.ts # Braintrust proxy + direct Anthropic provider
|
||||||
|
models.config.ts # Model matrix (add/remove models here)
|
||||||
dataset/
|
dataset/
|
||||||
types.ts # CodeFixTestCase interface
|
types.ts # CodeFixTestCase interface
|
||||||
extract.ts # Auto-extracts test cases from skill references
|
extract.ts # Auto-extracts test cases from skill references
|
||||||
@@ -35,14 +42,18 @@ src/
|
|||||||
(`dataset/extract.ts`) finds consecutive `**Incorrect:**` / `**Correct:**` code
|
(`dataset/extract.ts`) finds consecutive `**Incorrect:**` / `**Correct:**` code
|
||||||
block pairs under `##` sections. Each pair becomes one test case.
|
block pairs under `##` sections. Each pair becomes one test case.
|
||||||
|
|
||||||
Three independent scorers evaluate each fix (0–1 scale):
|
Five independent scorers evaluate each fix (0–1 scale):
|
||||||
|
|
||||||
- **Correctness** — does the fix address the core issue?
|
- **Correctness** — does the fix address the core issue?
|
||||||
- **Completeness** — does the fix include all necessary changes?
|
- **Completeness** — does the fix include all necessary changes?
|
||||||
- **Best Practice** — does the fix follow Supabase conventions?
|
- **Best Practice** — does the fix follow Supabase conventions?
|
||||||
|
- **Regression Safety** — does the fix avoid introducing new problems (broken
|
||||||
|
functionality, removed security measures, new vulnerabilities)?
|
||||||
|
- **Minimality** — is the fix tightly scoped to the identified issue without
|
||||||
|
unnecessary rewrites or over-engineering?
|
||||||
|
|
||||||
Braintrust aggregates the scores and provides a dashboard for tracking
|
Each model in the matrix generates a separate Braintrust experiment. The
|
||||||
regressions over time.
|
dashboard supports side-by-side comparison of experiments.
|
||||||
|
|
||||||
## Adding Test Cases
|
## Adding Test Cases
|
||||||
|
|
||||||
@@ -81,6 +92,67 @@ Rules:
|
|||||||
- Files prefixed with `_` (like `_sections.md`, `_template.md`) are skipped
|
- Files prefixed with `_` (like `_sections.md`, `_template.md`) are skipped
|
||||||
- Each pair gets an ID like `supabase/db-rls-mandatory#0` (skill/filename#index)
|
- Each pair gets an ID like `supabase/db-rls-mandatory#0` (skill/filename#index)
|
||||||
|
|
||||||
|
## Adding/Removing Models
|
||||||
|
|
||||||
|
Edit the `EVAL_MODELS` array in `src/models.config.ts`:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
export const EVAL_MODELS: EvalModelConfig[] = [
|
||||||
|
{ id: "claude-sonnet-4-5-20250929", label: "Claude Sonnet 4.5", provider: "anthropic", ci: true },
|
||||||
|
{ id: "gpt-5.3", label: "GPT 5.3", provider: "openai", ci: true },
|
||||||
|
// Add new models here
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
Provider API keys must be configured in the Braintrust dashboard under
|
||||||
|
Settings → AI providers.
|
||||||
|
|
||||||
|
## Running Evals
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run all models locally (no Braintrust upload)
|
||||||
|
mise run eval
|
||||||
|
|
||||||
|
# Run a single model
|
||||||
|
mise run eval:model claude-sonnet-4-5-20250929
|
||||||
|
|
||||||
|
# Run and upload to Braintrust dashboard
|
||||||
|
mise run eval:upload
|
||||||
|
```
|
||||||
|
|
||||||
|
Or directly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd packages/evals
|
||||||
|
|
||||||
|
# Local run (all models)
|
||||||
|
npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||||
|
|
||||||
|
# Single model
|
||||||
|
EVAL_MODEL=claude-sonnet-4-5-20250929 npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||||
|
|
||||||
|
# Filter to one test case (across all models)
|
||||||
|
npx braintrust eval --no-send-logs src/code-fix.eval.ts --filter 'input.testCase.id=db-migrations-idempotent'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment
|
||||||
|
|
||||||
|
API keys are loaded by mise from `packages/evals/.env` (configured in root
|
||||||
|
`mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
|
||||||
|
|
||||||
|
```
|
||||||
|
BRAINTRUST_API_KEY=... # Required: proxy routing + dashboard upload
|
||||||
|
BRAINTRUST_PROJECT_ID=... # Required: Braintrust project identifier
|
||||||
|
ANTHROPIC_API_KEY=sk-ant-... # Required: judge model (Claude Opus 4.6) and any Claude eval models (direct Anthropic API)
|
||||||
|
```
|
||||||
|
|
||||||
|
Optional overrides:
|
||||||
|
|
||||||
|
```
|
||||||
|
EVAL_MODEL=claude-sonnet-4-5-20250929 # Run only this model (skips matrix)
|
||||||
|
EVAL_JUDGE_MODEL=claude-opus-4-6 # Judge model for scorers
|
||||||
|
```
|
||||||
|
|
||||||
## Modifying Prompts
|
## Modifying Prompts
|
||||||
|
|
||||||
- `src/prompts/code-fix.ts` — what the eval model sees
|
- `src/prompts/code-fix.ts` — what the eval model sees
|
||||||
@@ -97,46 +169,3 @@ Each scorer in `src/scorer.ts` is independent. To add a new dimension:
|
|||||||
|
|
||||||
1. Create a new `EvalScorer` function in `scorer.ts`
|
1. Create a new `EvalScorer` function in `scorer.ts`
|
||||||
2. Add it to the `scores` array in `code-fix.eval.ts`
|
2. Add it to the `scores` array in `code-fix.eval.ts`
|
||||||
|
|
||||||
## Running Evals
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run locally (no Braintrust upload)
|
|
||||||
mise run eval
|
|
||||||
|
|
||||||
# Run and upload to Braintrust dashboard
|
|
||||||
mise run eval:upload
|
|
||||||
```
|
|
||||||
|
|
||||||
Or directly:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd packages/evals
|
|
||||||
|
|
||||||
# Local run
|
|
||||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
|
||||||
|
|
||||||
# Upload to Braintrust
|
|
||||||
npx braintrust eval src/code-fix.eval.ts
|
|
||||||
```
|
|
||||||
|
|
||||||
In CI, evals run via `braintrustdata/eval-action@v1` and are gated by the
|
|
||||||
`run-evals` PR label.
|
|
||||||
|
|
||||||
## Environment
|
|
||||||
|
|
||||||
API keys are loaded by mise from `packages/evals/.env` (configured in root
|
|
||||||
`mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
|
|
||||||
|
|
||||||
```
|
|
||||||
ANTHROPIC_API_KEY=sk-ant-... # Required: eval model + judge model
|
|
||||||
BRAINTRUST_API_KEY=... # Required for upload to Braintrust dashboard
|
|
||||||
BRAINTRUST_PROJECT_ID=... # Required for upload to Braintrust dashboard
|
|
||||||
```
|
|
||||||
|
|
||||||
Optional overrides:
|
|
||||||
|
|
||||||
```
|
|
||||||
EVAL_MODEL=claude-sonnet-4-5-20250929 # Model under test
|
|
||||||
EVAL_JUDGE_MODEL=claude-opus-4-6 # Judge model for scorers
|
|
||||||
```
|
|
||||||
|
|||||||
@@ -2,7 +2,9 @@ import assert from "node:assert";
|
|||||||
import { generateText } from "ai";
|
import { generateText } from "ai";
|
||||||
import { Eval } from "braintrust";
|
import { Eval } from "braintrust";
|
||||||
import { dataset } from "./dataset.js";
|
import { dataset } from "./dataset.js";
|
||||||
import { getModel } from "./models.js";
|
import type { EvalModelConfig } from "./models.config.js";
|
||||||
|
import { EVAL_MODELS } from "./models.config.js";
|
||||||
|
import { getProxyModel } from "./models.js";
|
||||||
import {
|
import {
|
||||||
buildCodeFixPrompt,
|
buildCodeFixPrompt,
|
||||||
buildCodeFixSystemPrompt,
|
buildCodeFixSystemPrompt,
|
||||||
@@ -11,26 +13,70 @@ import {
|
|||||||
bestPracticeScorer,
|
bestPracticeScorer,
|
||||||
completenessScorer,
|
completenessScorer,
|
||||||
correctnessScorer,
|
correctnessScorer,
|
||||||
|
minimalityScorer,
|
||||||
|
regressionSafetyScorer,
|
||||||
} from "./scorer.js";
|
} from "./scorer.js";
|
||||||
|
|
||||||
|
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||||
assert(process.env.ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY is not set");
|
assert(process.env.ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY is not set");
|
||||||
|
|
||||||
const modelId = process.env.EVAL_MODEL || "claude-sonnet-4-5-20250929";
|
/**
|
||||||
|
* Resolve which models to run based on environment:
|
||||||
|
* - EVAL_MODEL set → single model only (local dev / cost control)
|
||||||
|
* - CI without EVAL_ALL_MODELS → ci:true models only
|
||||||
|
* - Otherwise → all models
|
||||||
|
*/
|
||||||
|
function getModelsToRun(): EvalModelConfig[] {
|
||||||
|
const singleModel = process.env.EVAL_MODEL;
|
||||||
|
if (singleModel) {
|
||||||
|
const found = EVAL_MODELS.find((m) => m.id === singleModel);
|
||||||
|
return [
|
||||||
|
found ?? {
|
||||||
|
id: singleModel,
|
||||||
|
label: singleModel,
|
||||||
|
provider: "unknown",
|
||||||
|
ci: false,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
Eval("CodeFix", {
|
if (process.env.CI && !process.env.EVAL_ALL_MODELS) {
|
||||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
return EVAL_MODELS.filter((m) => m.ci);
|
||||||
trialCount: process.env.CI ? 3 : 1,
|
}
|
||||||
data: () => dataset(),
|
|
||||||
task: async (input) => {
|
return EVAL_MODELS;
|
||||||
const model = getModel(modelId);
|
}
|
||||||
const response = await generateText({
|
|
||||||
model,
|
const models = getModelsToRun();
|
||||||
system: buildCodeFixSystemPrompt(),
|
|
||||||
prompt: buildCodeFixPrompt(input.testCase),
|
for (const modelConfig of models) {
|
||||||
temperature: 0.2,
|
Eval("CodeFix", {
|
||||||
maxRetries: 2,
|
experimentName: modelConfig.id,
|
||||||
});
|
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||||
return { llmOutput: response.text };
|
trialCount: process.env.CI ? 3 : 1,
|
||||||
},
|
metadata: {
|
||||||
scores: [correctnessScorer, completenessScorer, bestPracticeScorer],
|
model: modelConfig.id,
|
||||||
});
|
label: modelConfig.label,
|
||||||
|
provider: modelConfig.provider,
|
||||||
|
},
|
||||||
|
data: () => dataset(),
|
||||||
|
task: async (input) => {
|
||||||
|
const model = getProxyModel(modelConfig.id);
|
||||||
|
const response = await generateText({
|
||||||
|
model,
|
||||||
|
system: buildCodeFixSystemPrompt(),
|
||||||
|
prompt: buildCodeFixPrompt(input.testCase),
|
||||||
|
temperature: 0.2,
|
||||||
|
maxRetries: 2,
|
||||||
|
});
|
||||||
|
return { llmOutput: response.text };
|
||||||
|
},
|
||||||
|
scores: [
|
||||||
|
correctnessScorer,
|
||||||
|
completenessScorer,
|
||||||
|
bestPracticeScorer,
|
||||||
|
regressionSafetyScorer,
|
||||||
|
minimalityScorer,
|
||||||
|
],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|||||||
@@ -10,23 +10,41 @@ export type Expected = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export type Metadata = {
|
export type Metadata = {
|
||||||
|
name: string;
|
||||||
skillName: string;
|
skillName: string;
|
||||||
section: string;
|
section: string;
|
||||||
|
referenceFile: string;
|
||||||
tags: string[];
|
tags: string[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type Output = { llmOutput: string };
|
export type Output = { llmOutput: string };
|
||||||
|
|
||||||
|
/**
 * Extract the feature category from a reference filename.
 *
 * Strips a trailing `.md` extension and returns everything before the first
 * hyphen. A name without a hyphen is returned whole (minus the extension).
 *
 * e.g. "db-migrations-idempotent.md" → "db"
 *      "auth-core-sessions.md" → "auth"
 *
 * @param filename - Reference filename, with or without the `.md` extension.
 * @returns The leading hyphen-delimited segment of the filename stem.
 */
function featureCategory(filename: string): string {
  // `split` always yields at least one element, so the default is never used at
  // runtime; it only keeps the result typed as `string` (not `string | undefined`)
  // when `noUncheckedIndexedAccess` is enabled.
  const [category = ""] = filename.replace(/\.md$/, "").split("-");
  return category;
}
|
||||||
|
|
||||||
export function dataset(): EvalCase<Input, Expected, Metadata>[] {
|
export function dataset(): EvalCase<Input, Expected, Metadata>[] {
|
||||||
return extractCodeFixDataset().map((tc) => ({
|
return extractCodeFixDataset().map((tc) => ({
|
||||||
|
id: tc.id,
|
||||||
input: { testCase: tc },
|
input: { testCase: tc },
|
||||||
|
tags: [
|
||||||
|
featureCategory(tc.referenceFilename),
|
||||||
|
tc.referenceFilename.replace(/\.md$/, ""),
|
||||||
|
],
|
||||||
expected: {
|
expected: {
|
||||||
correctCode: tc.goodExample.code,
|
correctCode: tc.goodExample.code,
|
||||||
correctLanguage: tc.goodExample.language,
|
correctLanguage: tc.goodExample.language,
|
||||||
},
|
},
|
||||||
metadata: {
|
metadata: {
|
||||||
|
name: tc.title,
|
||||||
skillName: tc.skillName,
|
skillName: tc.skillName,
|
||||||
section: tc.section,
|
section: tc.section,
|
||||||
|
referenceFile: tc.referenceFilename,
|
||||||
tags: tc.tags,
|
tags: tc.tags,
|
||||||
},
|
},
|
||||||
}));
|
}));
|
||||||
|
|||||||
47
packages/evals/src/models.config.ts
Normal file
47
packages/evals/src/models.config.ts
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
export interface EvalModelConfig {
|
||||||
|
/** Model ID passed to the Braintrust proxy (or directly to the Anthropic SDK for Claude models) */
|
||||||
|
id: string;
|
||||||
|
/** Human-readable label for dashboards */
|
||||||
|
label: string;
|
||||||
|
/** Provider name for display/grouping */
|
||||||
|
provider: string;
|
||||||
|
/** Whether to include in CI runs by default */
|
||||||
|
ci: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Models to evaluate. Add/remove entries to change the eval matrix.
|
||||||
|
* Set `ci: false` to exclude expensive models from automated CI runs.
|
||||||
|
*/
|
||||||
|
export const EVAL_MODELS: EvalModelConfig[] = [
|
||||||
|
{
|
||||||
|
id: "claude-sonnet-4-5-20250929",
|
||||||
|
label: "Claude Sonnet 4.5",
|
||||||
|
provider: "anthropic",
|
||||||
|
ci: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "gpt-5.3",
|
||||||
|
label: "GPT 5.3",
|
||||||
|
provider: "openai",
|
||||||
|
ci: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "gpt-5.2",
|
||||||
|
label: "GPT 5.2",
|
||||||
|
provider: "openai",
|
||||||
|
ci: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "gemini-3-pro",
|
||||||
|
label: "Gemini 3.0 Pro",
|
||||||
|
provider: "google",
|
||||||
|
ci: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "claude-opus-4-6",
|
||||||
|
label: "Claude Opus 4.6",
|
||||||
|
provider: "anthropic",
|
||||||
|
ci: false,
|
||||||
|
},
|
||||||
|
];
|
||||||
@@ -1,48 +1,49 @@
|
|||||||
import type { AnthropicProvider } from "@ai-sdk/anthropic";
|
import type { AnthropicProvider } from "@ai-sdk/anthropic";
|
||||||
import { anthropic } from "@ai-sdk/anthropic";
|
import { anthropic } from "@ai-sdk/anthropic";
|
||||||
import type { OpenAIProvider } from "@ai-sdk/openai";
|
import { createOpenAI } from "@ai-sdk/openai";
|
||||||
import { openai } from "@ai-sdk/openai";
|
|
||||||
import type { LanguageModel } from "ai";
|
import type { LanguageModel } from "ai";
|
||||||
|
|
||||||
/** Model ID accepted by the Anthropic provider (string literal union + string). */
|
/** Model ID accepted by the Anthropic provider (string literal union + string). */
|
||||||
export type AnthropicModelId = Parameters<AnthropicProvider["chat"]>[0];
|
export type AnthropicModelId = Parameters<AnthropicProvider["chat"]>[0];
|
||||||
|
|
||||||
/** Model ID accepted by the OpenAI provider (string literal union + string). */
|
/**
|
||||||
export type OpenAIModelId = Parameters<OpenAIProvider["chat"]>[0];
|
* Braintrust AI proxy — routes to any provider (Anthropic, OpenAI, Google)
|
||||||
|
* via a single OpenAI-compatible endpoint.
|
||||||
|
*
|
||||||
|
* Provider API keys are configured in the Braintrust dashboard at
|
||||||
|
* project or org level. The x-bt-parent header scopes the request to
|
||||||
|
* the project so project-level keys are resolved.
|
||||||
|
*/
|
||||||
|
const braintrustProxy = createOpenAI({
|
||||||
|
baseURL: "https://api.braintrust.dev/v1/proxy",
|
||||||
|
apiKey: process.env.BRAINTRUST_API_KEY ?? "",
|
||||||
|
headers: process.env.BRAINTRUST_PROJECT_ID
|
||||||
|
? { "x-bt-parent": `project_id:${process.env.BRAINTRUST_PROJECT_ID}` }
|
||||||
|
: undefined,
|
||||||
|
});
|
||||||
|
|
||||||
/** Any model ID accepted by the eval harness. */
|
/**
|
||||||
export type SupportedModelId = AnthropicModelId | OpenAIModelId;
|
* Get a model for the eval task. Claude models use the Anthropic SDK
|
||||||
|
* directly (via ANTHROPIC_API_KEY). All other models route through the
|
||||||
const MODEL_MAP: Record<string, () => LanguageModel> = {
|
* Braintrust proxy (keys configured at the project or org level in Braintrust).
|
||||||
"claude-opus-4-6": () => anthropic("claude-opus-4-6"),
|
*/
|
||||||
"claude-sonnet-4-5-20250929": () => anthropic("claude-sonnet-4-5-20250929"),
|
export function getProxyModel(modelId: string): LanguageModel {
|
||||||
"claude-haiku-4-5-20251001": () => anthropic("claude-haiku-4-5-20251001"),
|
|
||||||
"claude-opus-4-5-20251101": () => anthropic("claude-opus-4-5-20251101"),
|
|
||||||
"claude-sonnet-4-20250514": () => anthropic("claude-sonnet-4-20250514"),
|
|
||||||
"gpt-4o": () => openai("gpt-4o"),
|
|
||||||
"gpt-4o-mini": () => openai("gpt-4o-mini"),
|
|
||||||
"o3-mini": () => openai("o3-mini"),
|
|
||||||
};
|
|
||||||
|
|
||||||
export function getModel(modelId: SupportedModelId): LanguageModel {
|
|
||||||
const factory = MODEL_MAP[modelId];
|
|
||||||
if (factory) return factory();
|
|
||||||
|
|
||||||
// Fall back to provider detection from model ID prefix
|
|
||||||
if (modelId.startsWith("claude")) {
|
if (modelId.startsWith("claude")) {
|
||||||
return anthropic(modelId as AnthropicModelId);
|
return anthropic(modelId as AnthropicModelId);
|
||||||
}
|
}
|
||||||
if (
|
return braintrustProxy(modelId);
|
||||||
modelId.startsWith("gpt") ||
|
}
|
||||||
modelId.startsWith("o1") ||
|
|
||||||
modelId.startsWith("o3")
|
/**
|
||||||
) {
|
* Get a model using direct provider SDKs. Used for the judge model which
|
||||||
return openai(modelId as OpenAIModelId);
|
* is Claude by default and uses ANTHROPIC_API_KEY directly; non-Claude IDs fall back to the Braintrust proxy.
|
||||||
|
*/
|
||||||
|
export function getModel(modelId: string): LanguageModel {
|
||||||
|
if (modelId.startsWith("claude")) {
|
||||||
|
return anthropic(modelId as AnthropicModelId);
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new Error(
|
return getProxyModel(modelId);
|
||||||
`Unknown model: ${modelId}. Available: ${Object.keys(MODEL_MAP).join(", ")}`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function getJudgeModel(): LanguageModel {
|
export function getJudgeModel(): LanguageModel {
|
||||||
|
|||||||
@@ -124,3 +124,64 @@ Score 1 if the fix follows best practices, 0.5 if it mostly follows best practic
|
|||||||
metadata: { reasoning: result.reasoning },
|
metadata: { reasoning: result.reasoning },
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export const regressionSafetyScorer: EvalScorer<
|
||||||
|
Input,
|
||||||
|
TaskOutput,
|
||||||
|
Expected
|
||||||
|
> = async ({ input, output }) => {
|
||||||
|
const context = buildContext(input.testCase, output.llmOutput);
|
||||||
|
const result = await judge(`${context}
|
||||||
|
|
||||||
|
## Task
|
||||||
|
|
||||||
|
Evaluate **regression safety**: Does the LLM's fix avoid introducing new problems?
|
||||||
|
|
||||||
|
Carefully check whether the fix:
|
||||||
|
- Breaks existing functionality that was working in the original code
|
||||||
|
- Removes security measures (RLS policies, auth checks, input validation) that were already present
|
||||||
|
- Changes function signatures, return types, or column names in ways that would break callers
|
||||||
|
- Introduces SQL injection, XSS, or other security vulnerabilities not present in the original
|
||||||
|
- Drops data, removes columns, or alters schemas destructively without necessity
|
||||||
|
- Changes behavior beyond the scope of the identified issue
|
||||||
|
|
||||||
|
The fix should repair the identified problem WITHOUT creating new ones. A fix that solves the original issue but breaks something else is dangerous in production.
|
||||||
|
|
||||||
|
Score 1 if the fix introduces no new problems. Score 0.5 if the fix introduces minor issues (e.g., slightly different naming that could confuse but not break). Score 0 if the fix introduces a new bug, security vulnerability, or breaking change.`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
name: "Regression Safety",
|
||||||
|
score: result.score,
|
||||||
|
metadata: { reasoning: result.reasoning },
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
export const minimalityScorer: EvalScorer<
|
||||||
|
Input,
|
||||||
|
TaskOutput,
|
||||||
|
Expected
|
||||||
|
> = async ({ input, output }) => {
|
||||||
|
const context = buildContext(input.testCase, output.llmOutput);
|
||||||
|
const result = await judge(`${context}
|
||||||
|
|
||||||
|
## Task
|
||||||
|
|
||||||
|
Evaluate **minimality**: Does the LLM's fix make only the changes necessary to address the identified issue?
|
||||||
|
|
||||||
|
Check whether the fix:
|
||||||
|
- Rewrites or restructures code beyond what is needed to fix the problem
|
||||||
|
- Adds features, abstractions, or utilities not present in the reference solution
|
||||||
|
- Changes formatting, variable names, or style in unrelated parts of the code
|
||||||
|
- Adds excessive comments, logging, or error handling not required by the fix
|
||||||
|
- Over-engineers the solution (e.g., adding configuration options, generalization, or layers of abstraction when a simple targeted fix suffices)
|
||||||
|
|
||||||
|
Compare the scope of changes in the LLM's fix against the reference. The reference represents the ideal minimal fix. The LLM's fix should be similarly focused.
|
||||||
|
|
||||||
|
Score 1 if the fix is tightly scoped to the identified issue (similar scope to the reference). Score 0.5 if the fix includes some unnecessary changes but the core fix is present. Score 0 if the fix significantly over-reaches — rewriting large portions of code, adding unrelated features, or restructuring beyond what is needed.`);
|
||||||
|
|
||||||
|
return {
|
||||||
|
name: "Minimality",
|
||||||
|
score: result.score,
|
||||||
|
metadata: { reasoning: result.reasoning },
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user