mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
initial skills evals
This commit is contained in:
49
.github/workflows/evals.yml
vendored
Normal file
49
.github/workflows/evals.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# Runs the skill evals through the Braintrust eval action, either on manual
# dispatch or on pull requests that carry the `run-evals` label.
name: Skill Evals

on:
  workflow_dispatch:
  pull_request:
    # `labeled` is listed so that adding a label to an open PR triggers the workflow.
    types: [opened, synchronize, labeled]
    # Only changes to skills or to the eval package itself are relevant.
    paths:
      - "skills/**"
      - "packages/evals/**"

# One group per PR (or per ref when there is no PR); a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

permissions:
  pull-requests: write
  contents: read

jobs:
  eval:
    name: Run evals
    # Always run on manual dispatch; on pull requests run only when the
    # `run-evals` label is present.
    if: >
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request' &&
      contains(github.event.pull_request.labels.*.name, 'run-evals'))
    runs-on: ubuntu-latest
    environment: evals
    timeout-minutes: 30

    # Exposed to every step of the job.
    env:
      BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

    steps:
      - uses: actions/checkout@v4

      - uses: jdx/mise-action@v3
        with:
          install: true

      # Installs both the root package and the evals package.
      - name: Install dependencies
        run: npm install && npm --prefix packages/evals install

      - name: Run Evals
        uses: braintrustdata/eval-action@v1
        with:
          api_key: ${{ secrets.BRAINTRUST_API_KEY }}
          runtime: node
          root: packages/evals
|
||||||
10
AGENTS.md
10
AGENTS.md
@@ -25,6 +25,16 @@ skills/
|
|||||||
packages/
|
packages/
|
||||||
skills-build/ # Generic build system for all skills
|
skills-build/ # Generic build system for all skills
|
||||||
evals/ # LLM evaluation system for skills
|
evals/ # LLM evaluation system for skills
|
||||||
|
AGENTS.md # Agent guide for developing evals
|
||||||
|
CLAUDE.md # Symlink to AGENTS.md
|
||||||
|
scenarios/
|
||||||
|
workflow-scenarios.json # Handwritten workflow test scenarios
|
||||||
|
src/
|
||||||
|
cli.ts # Entry point
|
||||||
|
prompts/ # Eval and judge prompts
|
||||||
|
scorer/ # Zod schemas and judge execution
|
||||||
|
dataset/ # Test case extraction from skill references
|
||||||
|
runner/ # Eval orchestrator and runners
|
||||||
```
|
```
|
||||||
|
|
||||||
## Commands
|
## Commands
|
||||||
|
|||||||
15
mise.toml
15
mise.toml
@@ -46,16 +46,11 @@ sources = ["test/**", "skills/**"]
|
|||||||
# ── Eval tasks ────────────────────────────────────────────────────────
|
# ── Eval tasks ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
[tasks.eval]
|
[tasks.eval]
|
||||||
description = "Run all evals"
|
description = "Run code-fix evals (local, no upload)"
|
||||||
run = "tsx packages/evals/src/cli.ts"
|
run = "npm --prefix packages/evals run eval"
|
||||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
||||||
|
|
||||||
[tasks."eval:code-fix"]
|
[tasks."eval:upload"]
|
||||||
description = "Run code-fix evals"
|
description = "Run code-fix evals and upload to Braintrust"
|
||||||
run = "tsx packages/evals/src/cli.ts --type code-fix"
|
run = "npm --prefix packages/evals run eval:upload"
|
||||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
|
||||||
|
|
||||||
[tasks."eval:workflow"]
|
|
||||||
description = "Run workflow evals"
|
|
||||||
run = "tsx packages/evals/src/cli.ts --type workflow"
|
|
||||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
||||||
|
|||||||
3
packages/evals/.env.example
Normal file
3
packages/evals/.env.example
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
ANTHROPIC_API_KEY=
|
||||||
|
BRAINTRUST_API_KEY=
|
||||||
|
BRAINTRUST_PROJECT_ID=
|
||||||
142
packages/evals/AGENTS.md
Normal file
142
packages/evals/AGENTS.md
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
# Evals — Agent Guide
|
||||||
|
|
||||||
|
This package evaluates whether LLMs correctly apply Supabase best practices
|
||||||
|
using skill documentation as context. It uses
|
||||||
|
[Braintrust](https://www.braintrust.dev/) for eval orchestration and the
|
||||||
|
[Vercel AI SDK](https://sdk.vercel.ai/) for LLM calls.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
Two-step **LLM-as-judge** pattern powered by Braintrust's `Eval()`:
|
||||||
|
|
||||||
|
1. The **eval model** (default: `claude-sonnet-4-5-20250929`) receives a prompt
|
||||||
|
with skill context and produces a code fix.
|
||||||
|
2. Three independent **judge scorers** (default: `claude-opus-4-6`) evaluate the
|
||||||
|
fix via structured output (Zod schemas via AI SDK's `Output.object()`).
|
||||||
|
|
||||||
|
Key files:
|
||||||
|
|
||||||
|
```
|
||||||
|
src/
|
||||||
|
code-fix.eval.ts # Braintrust Eval() entry point
|
||||||
|
dataset.ts # Maps extracted test cases to EvalCase format
|
||||||
|
scorer.ts # Three AI SDK-based scorers (Correctness, Completeness, Best Practice)
|
||||||
|
models.ts # Model provider factory (Anthropic / OpenAI)
|
||||||
|
dataset/
|
||||||
|
types.ts # CodeFixTestCase interface
|
||||||
|
extract.ts # Auto-extracts test cases from skill references
|
||||||
|
prompts/
|
||||||
|
code-fix.ts # System + user prompts for the eval model
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
**Test cases are auto-extracted** from `skills/*/references/*.md`. The extractor
|
||||||
|
(`dataset/extract.ts`) finds consecutive `**Incorrect:**` / `**Correct:**` code
|
||||||
|
block pairs under `##` sections. Each pair becomes one test case.
|
||||||
|
|
||||||
|
Three independent scorers evaluate each fix (0–1 scale):
|
||||||
|
|
||||||
|
- **Correctness** — does the fix address the core issue?
|
||||||
|
- **Completeness** — does the fix include all necessary changes?
|
||||||
|
- **Best Practice** — does the fix follow Supabase conventions?
|
||||||
|
|
||||||
|
Braintrust aggregates the scores and provides a dashboard for tracking
|
||||||
|
regressions over time.
|
||||||
|
|
||||||
|
## Adding Test Cases
|
||||||
|
|
||||||
|
No code changes needed. Add paired Incorrect/Correct blocks to any skill
|
||||||
|
reference file. The extractor picks them up automatically.
|
||||||
|
|
||||||
|
Required format in a reference `.md` file:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
## Section Title
|
||||||
|
|
||||||
|
Explanation of the issue.
|
||||||
|
|
||||||
|
**Incorrect:**
|
||||||
|
|
||||||
|
\```sql
|
||||||
|
-- bad code
|
||||||
|
\```
|
||||||
|
|
||||||
|
**Correct:**
|
||||||
|
|
||||||
|
\```sql
|
||||||
|
-- good code
|
||||||
|
\```
|
||||||
|
```
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
- Pairs must be consecutive — an Incorrect block immediately followed by a
|
||||||
|
Correct block
|
||||||
|
- Labels are matched case-insensitively by substring. Bad labels: `Incorrect`, `Wrong`, `Bad`.
|
||||||
|
Good labels: `Correct`, `Good`, `Usage`, `Implementation`, `Example`,
|
||||||
|
`Recommended`
|
||||||
|
- The optional parenthetical in the label becomes the `description` field:
|
||||||
|
`**Incorrect (missing RLS):**`
|
||||||
|
- Files prefixed with `_` (like `_sections.md`, `_template.md`) are skipped
|
||||||
|
- Each pair gets an ID like `supabase/db-rls-mandatory#0` (skill/filename#index)
|
||||||
|
|
||||||
|
## Modifying Prompts
|
||||||
|
|
||||||
|
- `src/prompts/code-fix.ts` — what the eval model sees
|
||||||
|
- `src/scorer.ts` — judge prompts for each scorer dimension
|
||||||
|
|
||||||
|
Temperature settings:
|
||||||
|
|
||||||
|
- Eval model: `0.2` (in `code-fix.eval.ts`)
|
||||||
|
- Judge model: `0.1` (in `scorer.ts`)
|
||||||
|
|
||||||
|
## Modifying Scoring
|
||||||
|
|
||||||
|
Each scorer in `src/scorer.ts` is independent. To add a new dimension:
|
||||||
|
|
||||||
|
1. Create a new `EvalScorer` function in `scorer.ts`
|
||||||
|
2. Add it to the `scores` array in `code-fix.eval.ts`
|
||||||
|
|
||||||
|
## Running Evals
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run locally (no Braintrust upload)
|
||||||
|
mise run eval
|
||||||
|
|
||||||
|
# Run and upload to Braintrust dashboard
|
||||||
|
mise run eval:upload
|
||||||
|
```
|
||||||
|
|
||||||
|
Or directly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd packages/evals
|
||||||
|
|
||||||
|
# Local run
|
||||||
|
npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||||
|
|
||||||
|
# Upload to Braintrust
|
||||||
|
npx braintrust eval src/code-fix.eval.ts
|
||||||
|
```
|
||||||
|
|
||||||
|
In CI, evals run via `braintrustdata/eval-action@v1` and are gated by the
|
||||||
|
`run-evals` PR label.
|
||||||
|
|
||||||
|
## Environment
|
||||||
|
|
||||||
|
API keys are loaded by mise from `packages/evals/.env` (configured in root
|
||||||
|
`mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
|
||||||
|
|
||||||
|
```
|
||||||
|
ANTHROPIC_API_KEY=sk-ant-... # Required: eval model + judge model
|
||||||
|
BRAINTRUST_API_KEY=... # Required for upload to Braintrust dashboard
|
||||||
|
BRAINTRUST_PROJECT_ID=... # Required for upload to Braintrust dashboard
|
||||||
|
```
|
||||||
|
|
||||||
|
Optional overrides:
|
||||||
|
|
||||||
|
```
|
||||||
|
EVAL_MODEL=claude-sonnet-4-5-20250929 # Model under test
|
||||||
|
EVAL_JUDGE_MODEL=claude-opus-4-6 # Judge model for scorers
|
||||||
|
```
|
||||||
1
packages/evals/CLAUDE.md
Symbolic link
1
packages/evals/CLAUDE.md
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
AGENTS.md
|
||||||
46
packages/evals/README.md
Normal file
46
packages/evals/README.md
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# Evals
|
||||||
|
|
||||||
|
LLM evaluation system for Supabase agent skills, powered by [Braintrust](https://www.braintrust.dev/). Tests whether models can correctly apply Supabase best practices using skill documentation as context.
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
Each eval follows a two-step **LLM-as-judge** pattern orchestrated by Braintrust's `Eval()`:
|
||||||
|
|
||||||
|
1. **Generate** — The eval model (e.g. Sonnet 4.5) receives a prompt with skill context and produces a code fix.
|
||||||
|
2. **Judge** — Three independent scorers using a stronger model (Opus 4.6 by default) evaluate the fix via the Vercel AI SDK with structured output.
|
||||||
|
|
||||||
|
Test cases are extracted automatically from skill reference files (`skills/*/references/*.md`). Each file contains paired **Incorrect** / **Correct** code blocks — the model receives the bad code and must produce the fix.
|
||||||
|
|
||||||
|
**Scoring dimensions (each 0–1):**
|
||||||
|
|
||||||
|
| Scorer | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| Correctness | Does the fix address the core issue? |
|
||||||
|
| Completeness | Does it include all necessary changes? |
|
||||||
|
| Best Practice | Does it follow Supabase best practices? |
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run locally (no Braintrust upload)
|
||||||
|
mise run eval
|
||||||
|
|
||||||
|
# Run and upload to Braintrust dashboard
|
||||||
|
mise run eval:upload
|
||||||
|
```
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
API keys are loaded via mise from `packages/evals/.env` (see root `mise.toml`).
|
||||||
|
|
||||||
|
```
|
||||||
|
ANTHROPIC_API_KEY Required: eval model + judge model
|
||||||
|
BRAINTRUST_API_KEY Required for Braintrust dashboard upload
|
||||||
|
BRAINTRUST_PROJECT_ID Required for Braintrust dashboard upload
|
||||||
|
EVAL_MODEL Override default eval model (claude-sonnet-4-5-20250929)
|
||||||
|
EVAL_JUDGE_MODEL Override default judge model (claude-opus-4-6)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Adding Test Cases
|
||||||
|
|
||||||
|
Add paired Incorrect/Correct code blocks to any skill reference file. The extractor picks them up automatically on the next run.
|
||||||
2301
packages/evals/package-lock.json
generated
Normal file
2301
packages/evals/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
24
packages/evals/package.json
Normal file
24
packages/evals/package.json
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"name": "evals",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"type": "module",
|
||||||
|
"author": "Supabase",
|
||||||
|
"license": "MIT",
|
||||||
|
"description": "LLM evaluation system for Supabase agent skills",
|
||||||
|
"scripts": {
|
||||||
|
"eval": "braintrust eval --no-send-logs src/code-fix.eval.ts",
|
||||||
|
"eval:upload": "braintrust eval src/code-fix.eval.ts"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@ai-sdk/anthropic": "^3.0.44",
|
||||||
|
"@ai-sdk/openai": "^3.0.29",
|
||||||
|
"ai": "^6.0.86",
|
||||||
|
"braintrust": "^1.0.2",
|
||||||
|
"zod": "^3.23.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^20.10.0",
|
||||||
|
"tsx": "^4.7.0",
|
||||||
|
"typescript": "^5.3.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
36
packages/evals/src/code-fix.eval.ts
Normal file
36
packages/evals/src/code-fix.eval.ts
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
import assert from "node:assert";
|
||||||
|
import { generateText } from "ai";
|
||||||
|
import { Eval } from "braintrust";
|
||||||
|
import { dataset } from "./dataset.js";
|
||||||
|
import { getModel } from "./models.js";
|
||||||
|
import {
|
||||||
|
buildCodeFixPrompt,
|
||||||
|
buildCodeFixSystemPrompt,
|
||||||
|
} from "./prompts/code-fix.js";
|
||||||
|
import {
|
||||||
|
bestPracticeScorer,
|
||||||
|
completenessScorer,
|
||||||
|
correctnessScorer,
|
||||||
|
} from "./scorer.js";
|
||||||
|
|
||||||
|
// Fail at module load when the Anthropic key is missing.
assert(process.env.ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY is not set");

// Model under test; EVAL_MODEL overrides the default.
const modelId = process.env.EVAL_MODEL || "claude-sonnet-4-5-20250929";

// Registers the "CodeFix" eval with Braintrust.
Eval("CodeFix", {
  projectId: process.env.BRAINTRUST_PROJECT_ID,
  // Three trials per case when CI is set, one otherwise.
  trialCount: process.env.CI ? 3 : 1,
  data: () => dataset(),
  // Asks the model under test for a fix and returns the raw response text
  // as `llmOutput` for the scorers.
  task: async (input) => {
    const model = getModel(modelId);
    const response = await generateText({
      model,
      system: buildCodeFixSystemPrompt(),
      prompt: buildCodeFixPrompt(input.testCase),
      temperature: 0.2,
      maxRetries: 2,
    });
    return { llmOutput: response.text };
  },
  scores: [correctnessScorer, completenessScorer, bestPracticeScorer],
});
|
||||||
33
packages/evals/src/dataset.ts
Normal file
33
packages/evals/src/dataset.ts
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import type { EvalCase } from "braintrust";
|
||||||
|
import { extractCodeFixDataset } from "./dataset/extract.js";
|
||||||
|
import type { CodeFixTestCase } from "./dataset/types.js";
|
||||||
|
|
||||||
|
/** Task input: the extracted test case handed to the eval task. */
export type Input = { testCase: CodeFixTestCase };

/** Reference answer for a case. */
export type Expected = {
  correctCode: string;
  correctLanguage?: string;
};

/** Per-case metadata attached to each eval case. */
export type Metadata = {
  skillName: string;
  section: string;
  tags: string[];
};

/** Task output: the text produced for a case. */
export type Output = { llmOutput: string };
|
||||||
|
|
||||||
|
/**
 * Builds the eval dataset: one `EvalCase` per extracted code-fix test case.
 *
 * The whole test case is the input, the good example supplies the expected
 * code and language, and skill name, section and tags become metadata.
 */
export function dataset(): EvalCase<Input, Expected, Metadata>[] {
  const cases: EvalCase<Input, Expected, Metadata>[] = [];

  for (const testCase of extractCodeFixDataset()) {
    const { goodExample, skillName, section, tags } = testCase;
    cases.push({
      input: { testCase },
      expected: {
        correctCode: goodExample.code,
        correctLanguage: goodExample.language,
      },
      metadata: { skillName, section, tags },
    });
  }

  return cases;
}
|
||||||
277
packages/evals/src/dataset/extract.ts
Normal file
277
packages/evals/src/dataset/extract.ts
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||||
|
import { basename, join, resolve } from "node:path";
|
||||||
|
import type { CodeFixTestCase } from "./types.js";
|
||||||
|
|
||||||
|
/**
 * Locates the nearest `skills` entry by walking upwards from the current
 * working directory.
 *
 * At most ten directory levels are inspected; the walk also stops when the
 * filesystem root is reached.
 *
 * @returns Path of the first `skills` entry found.
 * @throws Error when no `skills` entry is found.
 */
function findSkillsRoot(): string {
  const maxLevels = 10;
  let current = process.cwd();
  let remaining = maxLevels;

  while (remaining-- > 0) {
    const skillsDir = join(current, "skills");
    if (existsSync(skillsDir)) {
      return skillsDir;
    }

    const up = resolve(current, "..");
    // resolve("..") on the root returns the root itself.
    if (up === current) {
      break;
    }
    current = up;
  }

  throw new Error(
    "Could not find skills/ directory. Run from the repository root or a subdirectory.",
  );
}
|
||||||
|
|
||||||
|
const SKILLS_ROOT = findSkillsRoot();
|
||||||
|
|
||||||
|
// --- Duplicated from skills-build/src/parser.ts for isolation ---
|
||||||
|
|
||||||
|
/** One labelled code example parsed from a reference document. */
interface CodeExample {
  /** Label text from the `**Label:**` line, without any parenthetical. */
  label: string;
  /** Text of the optional parenthetical in the label line. */
  description?: string;
  /** Lines collected from inside the fenced code block(s), joined with newlines. */
  code: string;
  /** Info string following the opening fence, when present. */
  language?: string;
}
|
||||||
|
|
||||||
|
/**
 * Splits a markdown document into its leading `---` frontmatter and the
 * remaining body.
 *
 * Only flat `key: value` lines are understood; values wrapped in matching
 * single or double quotes are unquoted. Lines without a colon are ignored.
 *
 * The closing delimiter must be a `---` at the start of its own line, so a
 * `---` that appears inside a value does not end the frontmatter early.
 *
 * @param content Full file content.
 * @returns The parsed key/value pairs and the trimmed body. When the content
 *   does not start with `---` or has no closing delimiter, the frontmatter is
 *   empty and the body is the untouched content.
 */
function parseFrontmatter(content: string): {
  frontmatter: Record<string, string>;
  body: string;
} {
  const frontmatter: Record<string, string> = {};

  if (!content.startsWith("---")) {
    return { frontmatter, body: content };
  }

  // Closing delimiter: a line break, `---`, optional trailing blanks, then a
  // line break or the end of input. Search starts right after the opening one.
  const closing = /\r?\n---[ \t]*(?=\r?\n|$)/g;
  closing.lastIndex = 3;
  const end = closing.exec(content);
  if (end === null) {
    return { frontmatter, body: content };
  }

  const frontmatterContent = content.slice(3, end.index).trim();
  const body = content.slice(end.index + end[0].length).trim();

  for (const line of frontmatterContent.split(/\r?\n/)) {
    const colonIndex = line.indexOf(":");
    if (colonIndex === -1) continue;

    const key = line.slice(0, colonIndex).trim();
    let value = line.slice(colonIndex + 1).trim();

    if (
      (value.startsWith('"') && value.endsWith('"')) ||
      (value.startsWith("'") && value.endsWith("'"))
    ) {
      value = value.slice(1, -1);
    }

    frontmatter[key] = value;
  }

  return { frontmatter, body };
}
|
||||||
|
|
||||||
|
/**
 * Returns the trimmed text of the first `## ` heading in `body`, or `null`
 * when the body contains no such heading.
 */
function extractTitle(body: string): string | null {
  const heading = /^##\s+(.+)$/m.exec(body);
  if (heading === null) {
    return null;
  }
  return heading[1].trim();
}
|
||||||
|
|
||||||
|
/** A `##` section of a reference document together with its code examples. */
interface Section {
  /** Heading text without the leading `##`. */
  title: string;
  /** Text between the heading and the first label or code fence, trimmed. */
  explanation: string;
  /** Labelled code examples in document order. */
  examples: CodeExample[];
}
|
||||||
|
|
||||||
|
/**
 * Splits a reference document body into `##` sections and collects the
 * labelled code examples found in each one.
 *
 * The body is scanned line by line:
 * - a `## ` heading (outside a code block) closes the previous section and
 *   starts collecting explanation text;
 * - a `**Label:**` / `**Label (description):**` line starts a new example;
 * - fenced code block content is accumulated for the current example.
 *
 * Sections without a title or without any example are dropped, as are
 * examples that have no label or no code lines.
 *
 * @param body Markdown body (frontmatter already removed).
 * @returns Sections in document order.
 */
function extractSections(body: string): Section[] {
  const sections: Section[] = [];
  const lines = body.split("\n");

  // Parser state for the section and example currently being built.
  let currentTitle = "";
  let explanationLines: string[] = [];
  let currentExamples: CodeExample[] = [];
  let currentLabel = "";
  let currentDescription = "";
  let inCodeBlock = false;
  let codeBlockLang = "";
  let codeBlockContent: string[] = [];
  let collectingExplanation = false;

  // Stores the pending example (only when it has both a label and code) and
  // resets the per-example state either way.
  function flushExample() {
    if (currentLabel && codeBlockContent.length > 0) {
      currentExamples.push({
        label: currentLabel,
        description: currentDescription || undefined,
        code: codeBlockContent.join("\n"),
        language: codeBlockLang || undefined,
      });
    }
    currentLabel = "";
    currentDescription = "";
    codeBlockContent = [];
    codeBlockLang = "";
  }

  // Stores the pending section (only when it has a title and at least one
  // example) and resets the per-section collections.
  function flushSection() {
    if (currentTitle && currentExamples.length > 0) {
      sections.push({
        title: currentTitle,
        explanation: explanationLines.join("\n").trim(),
        examples: currentExamples,
      });
    }
    currentExamples = [];
    explanationLines = [];
  }

  for (const line of lines) {
    // New section heading; headings inside code blocks are plain content.
    if (line.startsWith("## ") && !inCodeBlock) {
      flushExample();
      flushSection();
      currentTitle = line.replace(/^##\s+/, "").trim();
      collectingExplanation = true;
      continue;
    }

    // Group 1 is the label, group 2 the optional parenthetical description.
    const labelMatch = line.match(
      /^\*\*([^*]+?)(?:\s*\(([^)]+)\))?\s*:\*\*\s*$/,
    );
    if (labelMatch && !inCodeBlock) {
      collectingExplanation = false;
      flushExample();
      currentLabel = labelMatch[1].trim();
      currentDescription = labelMatch[2]?.trim() || "";
      continue;
    }

    // Opening fence: the rest of the line is taken as the language.
    if (line.startsWith("```") && !inCodeBlock) {
      collectingExplanation = false;
      inCodeBlock = true;
      codeBlockLang = line.slice(3).trim();
      continue;
    }

    // Closing fence. Content is kept until the next flush, so several fenced
    // blocks under one label end up concatenated, with the last fence's language.
    if (line.startsWith("```") && inCodeBlock) {
      inCodeBlock = false;
      continue;
    }

    if (inCodeBlock) {
      codeBlockContent.push(line);
    } else if (collectingExplanation) {
      explanationLines.push(line);
    }
  }

  // Flush whatever is still pending at the end of the document.
  flushExample();
  flushSection();

  return sections;
}
|
||||||
|
|
||||||
|
// --- Duplicated from skills-build/src/validate.ts ---
|
||||||
|
|
||||||
|
/**
 * Reports whether a label marks an anti-pattern example, i.e. contains
 * "incorrect", "wrong" or "bad" (case-insensitive substring match).
 */
function isBadExample(label: string): boolean {
  const normalized = label.toLowerCase();
  for (const marker of ["incorrect", "wrong", "bad"]) {
    if (normalized.includes(marker)) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether a label marks a reference ("good") example.
 *
 * Matching is a case-insensitive substring test against "correct", "good",
 * "usage", "implementation", "example" and "recommended". Labels that carry
 * a bad marker ("incorrect", "wrong", "bad") are rejected first, because
 * "incorrect" contains "correct" and would otherwise count as good.
 *
 * @param label Label text of a code example.
 * @returns `true` only for labels that are good and not bad.
 */
function isGoodExample(label: string): boolean {
  const lower = label.toLowerCase();

  const badMarkers = ["incorrect", "wrong", "bad"];
  if (badMarkers.some((marker) => lower.includes(marker))) {
    return false;
  }

  const goodMarkers = [
    "correct",
    "good",
    "usage",
    "implementation",
    "example",
    "recommended",
  ];
  return goodMarkers.some((marker) => lower.includes(marker));
}
|
||||||
|
|
||||||
|
// --- Extraction logic ---
|
||||||
|
|
||||||
|
/**
 * Pairs each bad example with the example that directly follows it, provided
 * that follower is a good example. Examples that do not form such an adjacent
 * pair are left out.
 */
function pairExamples(
  examples: CodeExample[],
): Array<{ bad: CodeExample; good: CodeExample }> {
  const matched: Array<{ bad: CodeExample; good: CodeExample }> = [];

  let index = 0;
  while (index + 1 < examples.length) {
    const candidate = examples[index];
    const follower = examples[index + 1];
    if (isBadExample(candidate.label) && isGoodExample(follower.label)) {
      matched.push({ bad: candidate, good: follower });
    }
    index += 1;
  }

  return matched;
}
|
||||||
|
|
||||||
|
/**
 * Lists the names of all directories under `SKILLS_ROOT` that contain a
 * `SKILL.md` file. Returns an empty list when `SKILLS_ROOT` does not exist.
 */
function discoverSkillNames(): string[] {
  const names: string[] = [];
  if (!existsSync(SKILLS_ROOT)) {
    return names;
  }

  for (const entry of readdirSync(SKILLS_ROOT, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;
    if (!existsSync(join(SKILLS_ROOT, entry.name, "SKILL.md"))) continue;
    names.push(entry.name);
  }

  return names;
}
|
||||||
|
|
||||||
|
/**
 * Returns the paths of the `.md` files directly inside `dir`, skipping files
 * whose name starts with `_`. A missing directory yields an empty list.
 */
function getMarkdownFiles(dir: string): string[] {
  const paths: string[] = [];
  if (!existsSync(dir)) {
    return paths;
  }

  for (const name of readdirSync(dir)) {
    const skipped = !name.endsWith(".md") || name.startsWith("_");
    if (!skipped) {
      paths.push(join(dir, name));
    }
  }

  return paths;
}
|
||||||
|
|
||||||
|
/**
 * Extracts code-fix test cases from skill reference files.
 *
 * For every markdown file in `<SKILLS_ROOT>/<skill>/references` the body is
 * split into sections and each adjacent bad/good example pair becomes one
 * test case. Pair indices count up per file and are part of the case ID
 * (`<skill>/<file-basename>#<index>`).
 *
 * @param skillName Restrict extraction to this skill; when omitted, every
 *   discovered skill is processed.
 * @returns All test cases in discovery order.
 */
export function extractCodeFixDataset(skillName?: string): CodeFixTestCase[] {
  const skills = skillName ? [skillName] : discoverSkillNames();
  const testCases: CodeFixTestCase[] = [];

  for (const skill of skills) {
    const referencesDir = join(SKILLS_ROOT, skill, "references");
    const files = getMarkdownFiles(referencesDir);

    for (const filePath of files) {
      const content = readFileSync(filePath, "utf-8");
      const { frontmatter, body } = parseFrontmatter(content);
      const fileBase = basename(filePath, ".md");
      const fileTitle = frontmatter.title || extractTitle(body) || fileBase;
      // Drop empty entries so an empty `tags:` value or a stray comma does
      // not produce "" tags.
      const tags = (frontmatter.tags ?? "")
        .split(",")
        .map((t) => t.trim())
        .filter((t) => t.length > 0);
      // Filename prefix before the first "-".
      const section = fileBase.split("-")[0];

      const sections = extractSections(body);
      let pairIndex = 0;

      for (const sec of sections) {
        const pairs = pairExamples(sec.examples);

        for (const { bad, good } of pairs) {
          testCases.push({
            id: `${skill}/${fileBase}#${pairIndex}`,
            skillName: skill,
            referenceFile: filePath,
            referenceFilename: basename(filePath),
            title: sec.title || fileTitle,
            explanation: sec.explanation,
            section,
            tags,
            pairIndex,
            badExample: {
              label: bad.label,
              description: bad.description,
              code: bad.code,
              language: bad.language,
            },
            goodExample: {
              label: good.label,
              description: good.description,
              code: good.code,
              language: good.language,
            },
          });
          pairIndex++;
        }
      }
    }
  }

  return testCases;
}
|
||||||
24
packages/evals/src/dataset/types.ts
Normal file
24
packages/evals/src/dataset/types.ts
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
/** One bad/good code pair taken from a skill reference file. */
export interface CodeFixTestCase {
  /** Unique ID, e.g. "supabase/db-rls-mandatory#0" */
  id: string;
  skillName: string;
  /** Path of the reference file this case belongs to. */
  referenceFile: string;
  /** File name of the reference file. */
  referenceFilename: string;
  title: string;
  explanation: string;
  section: string;
  tags: string[];
  /** Index of the pair; also the `#N` suffix of `id`. */
  pairIndex: number;
  /** The problematic code. */
  badExample: {
    label: string;
    description?: string;
    code: string;
    language?: string;
  };
  /** The reference solution. */
  goodExample: {
    label: string;
    description?: string;
    code: string;
    language?: string;
  };
}
|
||||||
51
packages/evals/src/models.ts
Normal file
51
packages/evals/src/models.ts
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
import type { AnthropicProvider } from "@ai-sdk/anthropic";
|
||||||
|
import { anthropic } from "@ai-sdk/anthropic";
|
||||||
|
import type { OpenAIProvider } from "@ai-sdk/openai";
|
||||||
|
import { openai } from "@ai-sdk/openai";
|
||||||
|
import type { LanguageModel } from "ai";
|
||||||
|
|
||||||
|
/** Model ID accepted by the Anthropic provider (string literal union + string). */
export type AnthropicModelId = Parameters<AnthropicProvider["chat"]>[0];

/** Model ID accepted by the OpenAI provider (string literal union + string). */
export type OpenAIModelId = Parameters<OpenAIProvider["chat"]>[0];

/** Any model ID accepted by the eval harness. */
export type SupportedModelId = AnthropicModelId | OpenAIModelId;

/**
 * Explicitly registered models. Each entry is a factory, so a model instance
 * is only created when that ID is requested.
 */
const MODEL_MAP: Record<string, () => LanguageModel> = {
  "claude-opus-4-6": () => anthropic("claude-opus-4-6"),
  "claude-sonnet-4-5-20250929": () => anthropic("claude-sonnet-4-5-20250929"),
  "claude-haiku-4-5-20251001": () => anthropic("claude-haiku-4-5-20251001"),
  "claude-opus-4-5-20251101": () => anthropic("claude-opus-4-5-20251101"),
  "claude-sonnet-4-20250514": () => anthropic("claude-sonnet-4-20250514"),
  "gpt-4o": () => openai("gpt-4o"),
  "gpt-4o-mini": () => openai("gpt-4o-mini"),
  "o3-mini": () => openai("o3-mini"),
};
|
||||||
|
|
||||||
|
/**
 * Resolves a model ID to a language model instance.
 *
 * Registered IDs are served from `MODEL_MAP`. Unregistered IDs are routed by
 * prefix: `claude*` goes to Anthropic; `gpt*`, `o1*` and `o3*` go to OpenAI.
 *
 * @throws Error when the ID is neither registered nor matches a known prefix.
 */
export function getModel(modelId: SupportedModelId): LanguageModel {
  const registered = MODEL_MAP[modelId];
  if (registered) {
    return registered();
  }

  // Not registered: infer the provider from the ID prefix.
  if (modelId.startsWith("claude")) {
    return anthropic(modelId as AnthropicModelId);
  }

  const openAiPrefixes = ["gpt", "o1", "o3"];
  const isOpenAi = openAiPrefixes.some((prefix) => modelId.startsWith(prefix));
  if (isOpenAi) {
    return openai(modelId as OpenAIModelId);
  }

  const available = Object.keys(MODEL_MAP).join(", ");
  throw new Error(`Unknown model: ${modelId}. Available: ${available}`);
}
|
||||||
|
|
||||||
|
/**
 * Returns the judge model: the one named by `EVAL_JUDGE_MODEL` when that
 * variable is set and non-empty, otherwise `claude-opus-4-6`.
 */
export function getJudgeModel(): LanguageModel {
  const override = process.env.EVAL_JUDGE_MODEL;
  return getModel(override ? override : "claude-opus-4-6");
}
|
||||||
33
packages/evals/src/prompts/code-fix.ts
Normal file
33
packages/evals/src/prompts/code-fix.ts
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import type { CodeFixTestCase } from "../dataset/types.js";
|
||||||
|
|
||||||
|
/**
 * Returns the fixed system prompt for the model under test: a role statement,
 * the list of practice areas, and the instruction to answer with a single
 * code block.
 */
export function buildCodeFixSystemPrompt(): string {
  const intro =
    "You are a senior Supabase developer and database architect. You fix code to follow Supabase best practices including:";
  const practiceAreas = [
    "- Row Level Security (RLS) policies",
    "- Proper authentication patterns",
    "- Safe migration workflows",
    "- Correct SDK usage patterns",
    "- Edge Function best practices",
    "- Connection pooling configuration",
    "- Security-first defaults",
  ];
  const closing =
    "When fixing code, ensure the fix is complete, production-ready, and follows the latest Supabase conventions. Return only the corrected code inside a single code block.";

  return [intro, ...practiceAreas, "", closing].join("\n");
}
|
||||||
|
|
||||||
|
/**
 * Builds the user prompt for one test case: the topic, the explanation, the
 * problematic code in a fenced block, an optional issue hint taken from the
 * bad example's description, and the instruction to return only the fix.
 */
export function buildCodeFixPrompt(testCase: CodeFixTestCase): string {
  const { title, explanation, badExample } = testCase;

  const fenceLang = badExample.language || "";
  const langSuffix = badExample.language ? ` (${badExample.language})` : "";
  // The hint brings its own leading newline; without a description this
  // line stays empty.
  const hintLine = badExample.description
    ? `\nIssue hint: ${badExample.description}`
    : "";

  return [
    `The following code has a problem related to: ${title}`,
    "",
    `Context: ${explanation}`,
    "",
    `Here is the problematic code${langSuffix}:`,
    "",
    `\`\`\`${fenceLang}`,
    badExample.code,
    "```",
    hintLine,
    "",
    "Fix this code to follow Supabase best practices. Return ONLY the corrected code inside a single code block. Do not include any explanation outside the code block.",
  ].join("\n");
}
|
||||||
126
packages/evals/src/scorer.ts
Normal file
126
packages/evals/src/scorer.ts
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
import { generateText, Output } from "ai";
|
||||||
|
import type { EvalScorer } from "braintrust";
|
||||||
|
import { z } from "zod";
|
||||||
|
import type { CodeFixTestCase } from "./dataset/types.js";
|
||||||
|
import type { Expected, Input, Output as TaskOutput } from "./dataset.js";
|
||||||
|
import { getModel } from "./models.js";
|
||||||
|
|
||||||
|
// Judge model ID; EVAL_JUDGE_MODEL overrides the default.
const judgeModelId = process.env.EVAL_JUDGE_MODEL || "claude-opus-4-6";

// Shape of the structured judge verdict. Note that the 0-1 range is only
// stated in the field description; the schema itself accepts any number.
const scoreSchema = z.object({
  score: z
    .number()
    .describe("Score from 0 to 1 (0 = bad, 0.5 = partial, 1 = good)"),
  reasoning: z.string().describe("Brief reasoning for the score"),
});

// System prompt shared by every judge call.
const SYSTEM_PROMPT =
  "You are a precise, consistent evaluator of Supabase code fixes. You assess whether LLM-generated code correctly addresses Supabase anti-patterns by comparing against reference solutions. You are fair: functionally equivalent solutions that differ in style or approach from the reference still receive high scores. You are strict: partial fixes, missing security measures, or incorrect patterns receive low scores. Always provide specific evidence for your scoring.";
|
||||||
|
|
||||||
|
/**
 * Render the shared markdown context shown to the judge: the topic and
 * explanation, the original incorrect code, the reference correct code, and
 * the LLM's attempted fix, in that order.
 *
 * @param tc - Test case supplying the title, explanation and both examples.
 * @param llmOutput - Raw text of the attempted fix, appended verbatim.
 * @returns The markdown document as a single newline-joined string.
 */
function buildContext(tc: CodeFixTestCase, llmOutput: string): string {
  const fence = "```";
  const { badExample, goodExample } = tc;

  const lines = [
    "## Reference Information",
    "",
    `**Topic:** ${tc.title}`,
    `**Explanation:** ${tc.explanation}`,
    "",
    "## Original Incorrect Code",
    "",
    // A missing language leaves the fence without an info string.
    `${fence}${badExample.language || ""}`,
    `${badExample.code}`,
    fence,
    "",
    "## Reference Correct Code (ground truth)",
    "",
    `${fence}${goodExample.language || ""}`,
    `${goodExample.code}`,
    fence,
    "",
    "## LLM's Attempted Fix",
    "",
    `${llmOutput}`,
  ];

  return lines.join("\n");
}
|
||||||
|
|
||||||
|
/**
 * Send a grading prompt to the judge model and return its structured verdict.
 *
 * The request uses `SYSTEM_PROMPT` as the system message and asks for output
 * matching `scoreSchema`.
 *
 * @param prompt - Complete grading prompt (context plus task rubric).
 * @returns The judge's reasoning and its score, clamped to the range [0, 1].
 * @throws Error if the call completes without structured output.
 */
async function judge(
  prompt: string,
): Promise<{ score: number; reasoning: string }> {
  const model = getModel(judgeModelId);
  const { output } = await generateText({
    model,
    system: SYSTEM_PROMPT,
    prompt,
    output: Output.object({ schema: scoreSchema }),
    temperature: 0.1,
    maxRetries: 2,
  });
  if (!output) throw new Error("Judge returned no structured output");

  // scoreSchema accepts any number, but every rubric is defined on 0..1 and
  // the scorers forward this value unchanged. Clamp so an off-scale reply
  // from the model cannot produce an out-of-range score downstream.
  const score = Math.min(1, Math.max(0, output.score));
  return { score, reasoning: output.reasoning };
}
|
||||||
|
|
||||||
|
/**
 * "Correctness" scorer: asks the judge whether the attempted fix addresses
 * the core issue of the incorrect code, and reports the judge's score with
 * its reasoning attached as metadata.
 */
export const correctnessScorer: EvalScorer<Input, TaskOutput, Expected> =
  async ({ input, output }) => {
    const rubric = [
      "## Task",
      "",
      "Evaluate **correctness**: Does the LLM's fix address the core issue identified in the incorrect code?",
      "",
      "The fix does not need to be character-identical to the reference, but it must solve the same problem. Functionally equivalent or improved solutions should score well.",
      "",
      "Score 1 if the fix fully addresses the core issue, 0.5 if it partially addresses it, 0 if it fails to address the core issue or introduces new problems.",
    ].join("\n");

    // Shared context first, then a blank line, then the task rubric.
    const prompt = `${buildContext(input.testCase, output.llmOutput)}\n\n${rubric}`;
    const { score, reasoning } = await judge(prompt);

    return { name: "Correctness", score, metadata: { reasoning } };
  };
|
||||||
|
|
||||||
|
/**
 * "Completeness" scorer: asks the judge whether the attempted fix contains
 * every necessary change shown in the reference, and reports the judge's
 * score with its reasoning attached as metadata.
 */
export const completenessScorer: EvalScorer<Input, TaskOutput, Expected> =
  async ({ input, output }) => {
    const rubric = [
      "## Task",
      "",
      "Evaluate **completeness**: Does the LLM's fix include ALL necessary changes shown in the reference?",
      "",
      "Check for missing RLS enablement, missing policy clauses, missing columns, incomplete migrations, or any partial fixes. The fix should be production-ready.",
      "",
      "Score 1 if all necessary changes are present, 0.5 if most changes are present but some are missing, 0 if significant changes are missing.",
    ].join("\n");

    // Shared context first, then a blank line, then the task rubric.
    const prompt = `${buildContext(input.testCase, output.llmOutput)}\n\n${rubric}`;
    const { score, reasoning } = await judge(prompt);

    return { name: "Completeness", score, metadata: { reasoning } };
  };
|
||||||
|
|
||||||
|
/**
 * "Best Practice" scorer: asks the judge whether the attempted fix follows
 * Supabase best practices as demonstrated in the reference, and reports the
 * judge's score with its reasoning attached as metadata.
 */
export const bestPracticeScorer: EvalScorer<Input, TaskOutput, Expected> =
  async ({ input, output }) => {
    const rubric = [
      "## Task",
      "",
      "Evaluate **best practices**: Does the LLM's fix follow Supabase best practices as demonstrated in the reference?",
      "",
      "Consider: RLS patterns, auth.users references, migration conventions, connection pooling, edge function patterns, SDK usage, and security-first defaults. Alternative correct approaches that achieve the same security/correctness goal are acceptable.",
      "",
      "Score 1 if the fix follows best practices, 0.5 if it mostly follows best practices with minor deviations, 0 if it uses anti-patterns or ignores conventions.",
    ].join("\n");

    // Shared context first, then a blank line, then the task rubric.
    const prompt = `${buildContext(input.testCase, output.llmOutput)}\n\n${rubric}`;
    const { score, reasoning } = await judge(prompt);

    return { name: "Best Practice", score, metadata: { reasoning } };
  };
|
||||||
16
packages/evals/tsconfig.json
Normal file
16
packages/evals/tsconfig.json
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2022",
|
||||||
|
"module": "ESNext",
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"strict": true,
|
||||||
|
"skipLibCheck": true,
|
||||||
|
"outDir": "dist",
|
||||||
|
"rootDir": "src",
|
||||||
|
"declaration": true,
|
||||||
|
"resolveJsonModule": true
|
||||||
|
},
|
||||||
|
"include": ["src/**/*"],
|
||||||
|
"exclude": ["node_modules", "dist"]
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user