mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
workflow evals with one scenario
This commit is contained in:
12
mise.toml
12
mise.toml
@@ -46,15 +46,11 @@ sources = ["test/**", "skills/**"]
|
|||||||
# ── Eval tasks ────────────────────────────────────────────────────────
|
# ── Eval tasks ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
[tasks.eval]
|
[tasks.eval]
|
||||||
description = "Run code-fix evals for all configured models (local, no upload)"
|
description = "Run workflow evals"
|
||||||
run = "npm --prefix packages/evals run eval"
|
run = "npm --prefix packages/evals run eval"
|
||||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
sources = ["packages/evals/src/**", "packages/evals/evals/**"]
|
||||||
|
|
||||||
[tasks."eval:model"]
|
|
||||||
description = "Run code-fix eval for a single model (local, no upload)"
|
|
||||||
run = "EVAL_MODEL={{arg(name='model')}} npm --prefix packages/evals run eval"
|
|
||||||
|
|
||||||
[tasks."eval:upload"]
|
[tasks."eval:upload"]
|
||||||
description = "Run code-fix evals for all models and upload to Braintrust"
|
description = "Run workflow evals and upload to Braintrust"
|
||||||
run = "npm --prefix packages/evals run eval:upload"
|
run = "npm --prefix packages/evals run eval:upload"
|
||||||
sources = ["packages/evals/src/**", "skills/**/references/**"]
|
sources = ["packages/evals/src/**", "packages/evals/evals/**"]
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
BRAINTRUST_API_KEY=
|
|
||||||
BRAINTRUST_PROJECT_ID=
|
|
||||||
ANTHROPIC_API_KEY=
|
ANTHROPIC_API_KEY=
|
||||||
# Provider API keys for eval models are configured in the Braintrust dashboard
|
# Required for Braintrust upload (BRAINTRUST_UPLOAD=true)
|
||||||
# under Settings → AI providers (not needed locally).
|
# BRAINTRUST_API_KEY=
|
||||||
|
# BRAINTRUST_PROJECT_ID=
|
||||||
|
|||||||
@@ -1,171 +1,108 @@
|
|||||||
# Evals — Agent Guide
|
# Evals — Agent Guide
|
||||||
|
|
||||||
This package evaluates whether LLMs correctly apply Supabase best practices
|
This package evaluates whether AI agents correctly implement Supabase tasks
|
||||||
using skill documentation as context. It uses
|
when using skill documentation. Modeled after
|
||||||
[Braintrust](https://www.braintrust.dev/) for eval orchestration and the
|
[Vercel's next-evals-oss](https://github.com/vercel-labs/next-evals-oss): each
|
||||||
[Vercel AI SDK](https://sdk.vercel.ai/) for LLM calls.
|
eval is a self-contained project with a task prompt, the agent works on it, and
|
||||||
|
hidden tests check the result. Binary pass/fail.
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
Two-step **LLM-as-judge** pattern powered by Braintrust's `Eval()`:
|
|
||||||
|
|
||||||
1. The **eval model** receives a prompt with skill context and produces a code
|
|
||||||
fix. All eval model calls go through the **Braintrust AI proxy** — a single
|
|
||||||
OpenAI-compatible endpoint that routes to any provider (Anthropic, OpenAI,
|
|
||||||
Google, etc.).
|
|
||||||
2. Five independent **judge scorers** (`claude-opus-4-6` via direct Anthropic
|
|
||||||
API) evaluate the fix via structured output (Zod schemas via AI SDK's
|
|
||||||
`Output.object()`).
|
|
||||||
|
|
||||||
The eval runs once per model in the model matrix, creating a separate Braintrust
|
|
||||||
experiment per model for side-by-side comparison.
|
|
||||||
|
|
||||||
Key files:
|
|
||||||
|
|
||||||
```
|
```
|
||||||
src/
|
1. Create temp dir with project skeleton (PROMPT.md, supabase/ dir)
|
||||||
code-fix.eval.ts # Braintrust Eval() entry point (loops over models)
|
2. Symlink supabase skill into workspace (or skip for baseline)
|
||||||
dataset.ts # Maps extracted test cases to EvalCase format
|
3. Run: claude -p "prompt" --cwd /tmp/eval-xxx
|
||||||
scorer.ts # Five AI SDK-based scorers (quality, safety, minimality)
|
4. Agent reads skill, creates migrations/code in the workspace
|
||||||
models.ts # Braintrust proxy + direct Anthropic provider
|
5. Copy hidden EVAL.ts into workspace, run vitest
|
||||||
models.config.ts # Model matrix (add/remove models here)
|
6. Capture pass/fail
|
||||||
dataset/
|
|
||||||
types.ts # CodeFixTestCase interface
|
|
||||||
extract.ts # Auto-extracts test cases from skill references
|
|
||||||
prompts/
|
|
||||||
code-fix.ts # System + user prompts for the eval model
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## How It Works
|
The agent is **Claude Code** invoked via `claude -p` (print mode). It operates
|
||||||
|
on a real filesystem in a temp directory and can read/write files freely.
|
||||||
|
|
||||||
**Test cases are auto-extracted** from `skills/*/references/*.md`. The extractor
|
## Eval Structure
|
||||||
(`dataset/extract.ts`) finds consecutive `**Incorrect:**` / `**Correct:**` code
|
|
||||||
block pairs under `##` sections. Each pair becomes one test case.
|
|
||||||
|
|
||||||
Five independent scorers evaluate each fix (0–1 scale):
|
Each eval lives in `evals/{scenario-name}/`:
|
||||||
|
|
||||||
- **Correctness** — does the fix address the core issue?
|
```
|
||||||
- **Completeness** — does the fix include all necessary changes?
|
evals/auth-rls-new-project/
|
||||||
- **Best Practice** — does the fix follow Supabase conventions?
|
PROMPT.md # Task description (visible to agent)
|
||||||
- **Regression Safety** — does the fix avoid introducing new problems (broken
|
EVAL.ts # Vitest assertions (hidden from agent during run)
|
||||||
functionality, removed security measures, new vulnerabilities)?
|
package.json # Minimal project manifest
|
||||||
- **Minimality** — is the fix tightly scoped to the identified issue without
|
supabase/
|
||||||
unnecessary rewrites or over-engineering?
|
config.toml # Pre-initialized supabase config
|
||||||
|
migrations/ # Empty — agent creates files here
|
||||||
Each model in the matrix generates a separate Braintrust experiment. The
|
|
||||||
dashboard supports side-by-side comparison of experiments.
|
|
||||||
|
|
||||||
## Adding Test Cases
|
|
||||||
|
|
||||||
No code changes needed. Add paired Incorrect/Correct blocks to any skill
|
|
||||||
reference file. The extractor picks them up automatically.
|
|
||||||
|
|
||||||
Required format in a reference `.md` file:
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
## Section Title
|
|
||||||
|
|
||||||
Explanation of the issue.
|
|
||||||
|
|
||||||
**Incorrect:**
|
|
||||||
|
|
||||||
\```sql
|
|
||||||
-- bad code
|
|
||||||
\```
|
|
||||||
|
|
||||||
**Correct:**
|
|
||||||
|
|
||||||
\```sql
|
|
||||||
-- good code
|
|
||||||
\```
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Rules:
|
**EVAL.ts** is never copied to the workspace until after the agent finishes.
|
||||||
|
This prevents the agent from "teaching to the test."
|
||||||
- Pairs must be consecutive — an Incorrect block immediately followed by a
|
|
||||||
Correct block
|
|
||||||
- Labels are matched case-insensitively. Bad labels: `Incorrect`, `Wrong`, `Bad`.
|
|
||||||
Good labels: `Correct`, `Good`, `Usage`, `Implementation`, `Example`,
|
|
||||||
`Recommended`
|
|
||||||
- The optional parenthetical in the label becomes the `description` field:
|
|
||||||
`**Incorrect (missing RLS):**`
|
|
||||||
- Files prefixed with `_` (like `_sections.md`, `_template.md`) are skipped
|
|
||||||
- Each pair gets an ID like `supabase/db-rls-mandatory#0` (skill/filename#index)
|
|
||||||
|
|
||||||
## Adding/Removing Models
|
|
||||||
|
|
||||||
Edit the `EVAL_MODELS` array in `src/models.config.ts`:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
export const EVAL_MODELS: EvalModelConfig[] = [
|
|
||||||
{ id: "claude-sonnet-4-5-20250929", label: "Claude Sonnet 4.5", provider: "anthropic", ci: true },
|
|
||||||
{ id: "gpt-5.3", label: "GPT 5.3", provider: "openai", ci: true },
|
|
||||||
// Add new models here
|
|
||||||
];
|
|
||||||
```
|
|
||||||
|
|
||||||
Provider API keys must be configured in the Braintrust dashboard under
|
|
||||||
Settings → AI providers.
|
|
||||||
|
|
||||||
## Running Evals
|
## Running Evals
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Run all models locally (no Braintrust upload)
|
# Run all scenarios with Claude Sonnet 4.5 (default)
|
||||||
mise run eval
|
mise run eval
|
||||||
|
|
||||||
# Run a single model
|
# Run a specific scenario
|
||||||
mise run eval:model model=claude-sonnet-4-5-20250929
|
EVAL_SCENARIO=auth-rls-new-project mise run eval
|
||||||
|
|
||||||
# Run and upload to Braintrust dashboard
|
# Override model
|
||||||
mise run eval:upload
|
EVAL_MODEL=claude-opus-4-6 mise run eval
|
||||||
|
|
||||||
|
# Run with baseline comparison (with-skill vs without-skill)
|
||||||
|
EVAL_BASELINE=true mise run eval
|
||||||
```
|
```
|
||||||
|
|
||||||
Or directly:
|
Or directly:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd packages/evals
|
cd packages/evals
|
||||||
|
npx tsx src/runner.ts
|
||||||
|
|
||||||
# Local run (all models)
|
# Single scenario with baseline
|
||||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
EVAL_SCENARIO=auth-rls-new-project EVAL_BASELINE=true npx tsx src/runner.ts
|
||||||
|
|
||||||
# Single model
|
|
||||||
EVAL_MODEL=claude-sonnet-4-5-20250929 npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
|
||||||
|
|
||||||
# Filter to one test case (across all models)
|
|
||||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts --filter 'input.testCase.id=db-migrations-idempotent'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Baseline Comparison
|
||||||
|
|
||||||
|
Set `EVAL_BASELINE=true` to run each scenario twice:
|
||||||
|
|
||||||
|
- **With skill**: The supabase skill is symlinked into the workspace. Claude
|
||||||
|
Code discovers it and uses reference files for guidance.
|
||||||
|
- **Baseline**: No skill available. The agent relies on innate knowledge.
|
||||||
|
|
||||||
|
Compare pass rates to measure how much the skill improves agent output.
|
||||||
|
|
||||||
|
## Adding Scenarios
|
||||||
|
|
||||||
|
1. Create `evals/{scenario-name}/` with `PROMPT.md`, `EVAL.ts`, `package.json`
|
||||||
|
2. Add any starter files the agent should see (e.g., `supabase/config.toml`)
|
||||||
|
3. Write vitest assertions in `EVAL.ts` that check the agent's output files
|
||||||
|
4. Document the scenario in `scenarios/SCENARIOS.md`
|
||||||
|
|
||||||
## Environment
|
## Environment
|
||||||
|
|
||||||
API keys are loaded by mise from `packages/evals/.env` (configured in root
|
|
||||||
`mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
|
|
||||||
|
|
||||||
```
|
```
|
||||||
BRAINTRUST_API_KEY=... # Required: proxy routing + dashboard upload
|
ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication
|
||||||
BRAINTRUST_PROJECT_ID=... # Required: Braintrust project identifier
|
EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929)
|
||||||
ANTHROPIC_API_KEY=sk-ant-... # Required: judge model (Claude Opus 4.6)
|
EVAL_SCENARIO=... # Optional: run single scenario
|
||||||
|
EVAL_BASELINE=true # Optional: run baseline comparison
|
||||||
|
BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust
|
||||||
```
|
```
|
||||||
|
|
||||||
Optional overrides:
|
## Key Files
|
||||||
|
|
||||||
```
|
```
|
||||||
EVAL_MODEL=claude-sonnet-4-5-20250929 # Run only this model (skips matrix)
|
src/
|
||||||
EVAL_JUDGE_MODEL=claude-opus-4-6 # Judge model for scorers
|
runner.ts # Main orchestrator
|
||||||
|
types.ts # Core interfaces
|
||||||
|
runner/
|
||||||
|
scaffold.ts # Creates temp workspace from eval template
|
||||||
|
agent.ts # Invokes claude -p as subprocess
|
||||||
|
test.ts # Runs vitest EVAL.ts against workspace
|
||||||
|
results.ts # Collects results and prints summary
|
||||||
|
evals/
|
||||||
|
auth-rls-new-project/ # Scenario 1
|
||||||
|
scenarios/
|
||||||
|
SCENARIOS.md # Scenario descriptions
|
||||||
```
|
```
|
||||||
|
|
||||||
## Modifying Prompts
|
|
||||||
|
|
||||||
- `src/prompts/code-fix.ts` — what the eval model sees
|
|
||||||
- `src/scorer.ts` — judge prompts for each scorer dimension
|
|
||||||
|
|
||||||
Temperature settings:
|
|
||||||
|
|
||||||
- Eval model: `0.2` (in `code-fix.eval.ts`)
|
|
||||||
- Judge model: `0.1` (in `scorer.ts`)
|
|
||||||
|
|
||||||
## Modifying Scoring
|
|
||||||
|
|
||||||
Each scorer in `src/scorer.ts` is independent. To add a new dimension:
|
|
||||||
|
|
||||||
1. Create a new `EvalScorer` function in `scorer.ts`
|
|
||||||
2. Add it to the `scores` array in `code-fix.eval.ts`
|
|
||||||
|
|||||||
@@ -1,46 +1,51 @@
|
|||||||
# Evals
|
# Evals
|
||||||
|
|
||||||
LLM evaluation system for Supabase agent skills, powered by [Braintrust](https://www.braintrust.dev/). Tests whether models can correctly apply Supabase best practices using skill documentation as context.
|
Agent evaluation system for Supabase skills. Tests whether AI agents (starting
|
||||||
|
with Claude Code) correctly implement Supabase tasks when given access to skill
|
||||||
|
documentation.
|
||||||
|
|
||||||
## How It Works
|
## How It Works
|
||||||
|
|
||||||
Each eval follows a two-step **LLM-as-judge** pattern orchestrated by Braintrust's `Eval()`:
|
Each eval is a self-contained project directory with a task prompt. The agent
|
||||||
|
works on it autonomously, then hidden vitest assertions check the result.
|
||||||
|
Binary pass/fail.
|
||||||
|
|
||||||
1. **Generate** — The eval model (e.g. Sonnet 4.5) receives a prompt with skill context and produces a code fix.
|
```
|
||||||
2. **Judge** — Three independent scorers using a stronger model (Opus 4.6 by default) evaluate the fix via the Vercel AI SDK with structured output.
|
1. Create temp workspace from eval template
|
||||||
|
2. Agent (claude -p) reads prompt and creates files
|
||||||
Test cases are extracted automatically from skill reference files (`skills/*/references/*.md`). Each file contains paired **Incorrect** / **Correct** code blocks — the model receives the bad code and must produce the fix.
|
3. Hidden EVAL.ts runs vitest assertions against the output
|
||||||
|
4. Pass/fail
|
||||||
**Scoring dimensions (each 0–1):**
|
```
|
||||||
|
|
||||||
| Scorer | Description |
|
|
||||||
|--------|-------------|
|
|
||||||
| Correctness | Does the fix address the core issue? |
|
|
||||||
| Completeness | Does it include all necessary changes? |
|
|
||||||
| Best Practice | Does it follow Supabase best practices? |
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Run locally (no Braintrust upload)
|
# Run all scenarios
|
||||||
mise run eval
|
mise run eval
|
||||||
|
|
||||||
# Run and upload to Braintrust dashboard
|
# Run a specific scenario
|
||||||
mise run eval:upload
|
EVAL_SCENARIO=auth-rls-new-project mise run eval
|
||||||
|
|
||||||
|
# Run with baseline comparison (with-skill vs without-skill)
|
||||||
|
EVAL_BASELINE=true mise run eval
|
||||||
|
|
||||||
|
# Override model
|
||||||
|
EVAL_MODEL=claude-opus-4-6 mise run eval
|
||||||
```
|
```
|
||||||
|
|
||||||
### Environment Variables
|
### Environment Variables
|
||||||
|
|
||||||
API keys are loaded via mise from `packages/evals/.env` (see root `mise.toml`).
|
|
||||||
|
|
||||||
```
|
```
|
||||||
ANTHROPIC_API_KEY Required: eval model + judge model
|
ANTHROPIC_API_KEY Required: Claude Code authentication
|
||||||
BRAINTRUST_API_KEY Required for Braintrust dashboard upload
|
EVAL_MODEL Override model (default: claude-sonnet-4-5-20250929)
|
||||||
BRAINTRUST_PROJECT_ID Required for Braintrust dashboard upload
|
EVAL_SCENARIO Run single scenario by name
|
||||||
EVAL_MODEL Override default eval model (claude-sonnet-4-5-20250929)
|
EVAL_BASELINE=true Run baseline comparison (no skill)
|
||||||
EVAL_JUDGE_MODEL Override default judge model (claude-opus-4-6)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Adding Test Cases
|
## Adding Scenarios
|
||||||
|
|
||||||
Add paired Incorrect/Correct code blocks to any skill reference file. The extractor picks them up automatically on the next run.
|
1. Create `evals/{name}/` with `PROMPT.md`, `EVAL.ts`, and starter files
|
||||||
|
2. Write vitest assertions in `EVAL.ts`
|
||||||
|
3. Document in `scenarios/SCENARIOS.md`
|
||||||
|
|
||||||
|
See [AGENTS.md](AGENTS.md) for full details.
|
||||||
|
|||||||
107
packages/evals/evals/auth-rls-new-project/EVAL.ts
Normal file
107
packages/evals/evals/auth-rls-new-project/EVAL.ts
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { expect, test } from "vitest";
|
||||||
|
|
||||||
|
const supabaseDir = join(process.cwd(), "supabase");
|
||||||
|
const migrationsDir = join(supabaseDir, "migrations");
|
||||||
|
|
||||||
|
/** Find the first .sql migration file (agent may name it differently). */
|
||||||
|
function findMigrationFile(): string | null {
|
||||||
|
if (!existsSync(migrationsDir)) return null;
|
||||||
|
const files = readdirSync(migrationsDir).filter((f) => f.endsWith(".sql"));
|
||||||
|
return files.length > 0 ? join(migrationsDir, files[0]) : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getMigrationSQL(): string {
|
||||||
|
const file = findMigrationFile();
|
||||||
|
if (!file) throw new Error("No migration file found in supabase/migrations/");
|
||||||
|
return readFileSync(file, "utf-8");
|
||||||
|
}
|
||||||
|
|
||||||
|
test("supabase project initialized (config.toml exists)", () => {
|
||||||
|
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("migration file exists in supabase/migrations/", () => {
|
||||||
|
expect(findMigrationFile()).not.toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
test("creates tasks table", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
expect(sql).toMatch(/create\s+table/);
|
||||||
|
expect(sql).toMatch(/tasks/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("enables RLS on tasks table", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
expect(sql).toMatch(/alter\s+table.*tasks.*enable\s+row\s+level\s+security/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("has foreign key to auth.users", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
expect(sql).toMatch(/references\s+auth\.users/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("uses ON DELETE CASCADE for auth FK", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("uses (select auth.uid()) not bare auth.uid() in policies", () => {
|
||||||
|
const sql = getMigrationSQL();
|
||||||
|
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||||
|
for (const policy of policyBlocks) {
|
||||||
|
if (policy.includes("auth.uid()")) {
|
||||||
|
// The subselect form: (select auth.uid())
|
||||||
|
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("policies use TO authenticated", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||||
|
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||||
|
for (const policy of policyBlocks) {
|
||||||
|
expect(policy).toMatch(/to\s+authenticated/);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("uses timestamptz not plain timestamp for time columns", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||||
|
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||||
|
// Only fail if the migration defines time columns with plain timestamp
|
||||||
|
if (
|
||||||
|
sql.includes("created_at") ||
|
||||||
|
sql.includes("updated_at") ||
|
||||||
|
sql.includes("due_date")
|
||||||
|
) {
|
||||||
|
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("creates index on user_id column", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
expect(sql).toMatch(/create\s+index/);
|
||||||
|
expect(sql).toMatch(/user_id/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("migration is idempotent (uses IF NOT EXISTS)", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("overall quality: demonstrates Supabase best practices", () => {
|
||||||
|
const sql = getMigrationSQL().toLowerCase();
|
||||||
|
// A high-quality migration should contain most of these patterns
|
||||||
|
const signals = [
|
||||||
|
/enable\s+row\s+level\s+security/,
|
||||||
|
/\(select\s+auth\.uid\(\)\)/,
|
||||||
|
/to\s+authenticated/,
|
||||||
|
/on\s+delete\s+cascade/,
|
||||||
|
/create\s+index/,
|
||||||
|
];
|
||||||
|
const matches = signals.filter((r) => r.test(sql));
|
||||||
|
expect(matches.length).toBeGreaterThanOrEqual(4);
|
||||||
|
});
|
||||||
16
packages/evals/evals/auth-rls-new-project/PROMPT.md
Normal file
16
packages/evals/evals/auth-rls-new-project/PROMPT.md
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
I'm starting a new Supabase project from scratch for a task management app. Users should sign up with email/password, and each user should only see their own tasks.
|
||||||
|
|
||||||
|
Set up the project:
|
||||||
|
|
||||||
|
1. Initialize the Supabase project with the CLI (`npx supabase init`)
|
||||||
|
2. Start the local Supabase stack (`npx supabase start`)
|
||||||
|
3. Create a SQL migration for a tasks table with columns: title (text), description (text), status (text), and due_date
|
||||||
|
|
||||||
|
The migration must:
|
||||||
|
|
||||||
|
- Create the tasks table with proper column types
|
||||||
|
- Link tasks to authenticated users
|
||||||
|
- Enable Row Level Security
|
||||||
|
- Create policies so users can only CRUD their own tasks
|
||||||
|
- Add appropriate indexes
|
||||||
|
- Be idempotent (safe to run multiple times)
|
||||||
5
packages/evals/evals/auth-rls-new-project/package.json
Normal file
5
packages/evals/evals/auth-rls-new-project/package.json
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"name": "auth-rls-new-project",
|
||||||
|
"private": true,
|
||||||
|
"type": "module"
|
||||||
|
}
|
||||||
1216
packages/evals/package-lock.json
generated
1216
packages/evals/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -4,21 +4,18 @@
|
|||||||
"type": "module",
|
"type": "module",
|
||||||
"author": "Supabase",
|
"author": "Supabase",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"description": "LLM evaluation system for Supabase agent skills",
|
"description": "Agent evaluation system for Supabase skills",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"eval": "braintrust eval --no-send-logs src/code-fix.eval.ts",
|
"eval": "tsx src/runner.ts",
|
||||||
"eval:upload": "braintrust eval src/code-fix.eval.ts"
|
"eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@ai-sdk/anthropic": "^3.0.44",
|
"braintrust": "^1.0.2"
|
||||||
"@ai-sdk/openai": "^3.0.29",
|
|
||||||
"ai": "^6.0.86",
|
|
||||||
"braintrust": "^1.0.2",
|
|
||||||
"zod": "^3.23.0"
|
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^20.10.0",
|
"@types/node": "^20.10.0",
|
||||||
"tsx": "^4.7.0",
|
"tsx": "^4.7.0",
|
||||||
"typescript": "^5.3.0"
|
"typescript": "^5.3.0",
|
||||||
|
"vitest": "^3.1.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
51
packages/evals/scenarios/SCENARIOS.md
Normal file
51
packages/evals/scenarios/SCENARIOS.md
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
# Supabase Skills Eval Scenarios
|
||||||
|
|
||||||
|
## Scenario 1: auth-rls-new-project
|
||||||
|
|
||||||
|
**Description:** Set up a new Supabase project from scratch and add
|
||||||
|
authentication with RLS. The agent must initialize the project with the CLI,
|
||||||
|
start the local Supabase stack, then create a tasks table with proper security
|
||||||
|
(RLS policies, auth FK, indexes) in a single idempotent migration.
|
||||||
|
|
||||||
|
**Setup:** The workspace starts empty (no `supabase/` directory). The agent is
|
||||||
|
expected to run `npx supabase init` and `npx supabase start` before creating
|
||||||
|
the migration.
|
||||||
|
|
||||||
|
**Expected skill files read:**
|
||||||
|
|
||||||
|
- `SKILL.md` (skill body with reference file index)
|
||||||
|
- `references/dev-getting-started.md`
|
||||||
|
- `references/db-rls-mandatory.md`
|
||||||
|
- `references/db-rls-policy-types.md`
|
||||||
|
- `references/db-rls-common-mistakes.md`
|
||||||
|
- `references/db-schema-auth-fk.md`
|
||||||
|
- `references/db-schema-timestamps.md`
|
||||||
|
- `references/db-migrations-idempotent.md`
|
||||||
|
|
||||||
|
**Expected result:**
|
||||||
|
|
||||||
|
The agent initializes a Supabase project and creates a migration file that:
|
||||||
|
|
||||||
|
- Creates tasks table with `timestamptz` columns
|
||||||
|
- Has `user_id` FK to `auth.users(id)` with `ON DELETE CASCADE`
|
||||||
|
- Enables RLS (`ALTER TABLE tasks ENABLE ROW LEVEL SECURITY`)
|
||||||
|
- Creates per-operation policies using `(select auth.uid())` with `TO authenticated`
|
||||||
|
- Creates index on `user_id`
|
||||||
|
- Uses `IF NOT EXISTS` for idempotency
|
||||||
|
|
||||||
|
**Scorer:** Binary pass/fail (12 vitest assertions)
|
||||||
|
|
||||||
|
| Test | What it checks |
|
||||||
|
| --- | --- |
|
||||||
|
| supabase project initialized | `supabase/config.toml` exists after agent runs |
|
||||||
|
| migration file exists | Agent created a `.sql` file in `supabase/migrations/` |
|
||||||
|
| creates tasks table | SQL contains `CREATE TABLE ... tasks` |
|
||||||
|
| enables RLS | `ALTER TABLE tasks ENABLE ROW LEVEL SECURITY` |
|
||||||
|
| FK to auth.users | `REFERENCES auth.users` |
|
||||||
|
| ON DELETE CASCADE | Cascade delete on auth FK |
|
||||||
|
| (select auth.uid()) | Subselect form in policies (performance) |
|
||||||
|
| TO authenticated | Policies scoped to authenticated role |
|
||||||
|
| timestamptz | No plain `timestamp` for time columns |
|
||||||
|
| index on user_id | `CREATE INDEX` on the FK column |
|
||||||
|
| IF NOT EXISTS | Idempotent migration |
|
||||||
|
| overall quality | At least 4/5 best-practice signals present |
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
import assert from "node:assert";
|
|
||||||
import { generateText } from "ai";
|
|
||||||
import { Eval } from "braintrust";
|
|
||||||
import { dataset } from "./dataset.js";
|
|
||||||
import type { EvalModelConfig } from "./models.config.js";
|
|
||||||
import { EVAL_MODELS } from "./models.config.js";
|
|
||||||
import { getProxyModel } from "./models.js";
|
|
||||||
import {
|
|
||||||
buildCodeFixPrompt,
|
|
||||||
buildCodeFixSystemPrompt,
|
|
||||||
} from "./prompts/code-fix.js";
|
|
||||||
import {
|
|
||||||
bestPracticeScorer,
|
|
||||||
completenessScorer,
|
|
||||||
correctnessScorer,
|
|
||||||
minimalityScorer,
|
|
||||||
regressionSafetyScorer,
|
|
||||||
} from "./scorer.js";
|
|
||||||
|
|
||||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
|
||||||
assert(process.env.ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY is not set");
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Resolve which models to run based on environment:
|
|
||||||
* - EVAL_MODEL set → single model only (local dev / cost control)
|
|
||||||
* - CI without EVAL_ALL_MODELS → ci:true models only
|
|
||||||
* - Otherwise → all models
|
|
||||||
*/
|
|
||||||
function getModelsToRun(): EvalModelConfig[] {
|
|
||||||
const singleModel = process.env.EVAL_MODEL;
|
|
||||||
if (singleModel) {
|
|
||||||
const found = EVAL_MODELS.find((m) => m.id === singleModel);
|
|
||||||
return [
|
|
||||||
found ?? {
|
|
||||||
id: singleModel,
|
|
||||||
label: singleModel,
|
|
||||||
provider: "unknown",
|
|
||||||
ci: false,
|
|
||||||
},
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (process.env.CI && !process.env.EVAL_ALL_MODELS) {
|
|
||||||
return EVAL_MODELS.filter((m) => m.ci);
|
|
||||||
}
|
|
||||||
|
|
||||||
return EVAL_MODELS;
|
|
||||||
}
|
|
||||||
|
|
||||||
const models = getModelsToRun();
|
|
||||||
|
|
||||||
for (const modelConfig of models) {
|
|
||||||
Eval("CodeFix", {
|
|
||||||
experimentName: modelConfig.id,
|
|
||||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
|
||||||
trialCount: process.env.CI ? 3 : 1,
|
|
||||||
metadata: {
|
|
||||||
model: modelConfig.id,
|
|
||||||
label: modelConfig.label,
|
|
||||||
provider: modelConfig.provider,
|
|
||||||
},
|
|
||||||
data: () => dataset(),
|
|
||||||
task: async (input) => {
|
|
||||||
const model = getProxyModel(modelConfig.id);
|
|
||||||
const response = await generateText({
|
|
||||||
model,
|
|
||||||
system: buildCodeFixSystemPrompt(),
|
|
||||||
prompt: buildCodeFixPrompt(input.testCase),
|
|
||||||
temperature: 0.2,
|
|
||||||
maxRetries: 2,
|
|
||||||
});
|
|
||||||
return { llmOutput: response.text };
|
|
||||||
},
|
|
||||||
scores: [
|
|
||||||
correctnessScorer,
|
|
||||||
completenessScorer,
|
|
||||||
bestPracticeScorer,
|
|
||||||
regressionSafetyScorer,
|
|
||||||
minimalityScorer,
|
|
||||||
],
|
|
||||||
});
|
|
||||||
}
|
|
||||||
@@ -1,51 +0,0 @@
|
|||||||
import type { EvalCase } from "braintrust";
|
|
||||||
import { extractCodeFixDataset } from "./dataset/extract.js";
|
|
||||||
import type { CodeFixTestCase } from "./dataset/types.js";
|
|
||||||
|
|
||||||
export type Input = { testCase: CodeFixTestCase };
|
|
||||||
|
|
||||||
export type Expected = {
|
|
||||||
correctCode: string;
|
|
||||||
correctLanguage?: string;
|
|
||||||
};
|
|
||||||
|
|
||||||
export type Metadata = {
|
|
||||||
name: string;
|
|
||||||
skillName: string;
|
|
||||||
section: string;
|
|
||||||
referenceFile: string;
|
|
||||||
tags: string[];
|
|
||||||
};
|
|
||||||
|
|
||||||
export type Output = { llmOutput: string };
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract the feature category from a reference filename.
|
|
||||||
* e.g. "db-migrations-idempotent.md" → "db"
|
|
||||||
* "auth-core-sessions.md" → "auth"
|
|
||||||
*/
|
|
||||||
function featureCategory(filename: string): string {
|
|
||||||
return filename.replace(/\.md$/, "").split("-")[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
export function dataset(): EvalCase<Input, Expected, Metadata>[] {
|
|
||||||
return extractCodeFixDataset().map((tc) => ({
|
|
||||||
id: tc.id,
|
|
||||||
input: { testCase: tc },
|
|
||||||
tags: [
|
|
||||||
featureCategory(tc.referenceFilename),
|
|
||||||
tc.referenceFilename.replace(/\.md$/, ""),
|
|
||||||
],
|
|
||||||
expected: {
|
|
||||||
correctCode: tc.goodExample.code,
|
|
||||||
correctLanguage: tc.goodExample.language,
|
|
||||||
},
|
|
||||||
metadata: {
|
|
||||||
name: tc.title,
|
|
||||||
skillName: tc.skillName,
|
|
||||||
section: tc.section,
|
|
||||||
referenceFile: tc.referenceFilename,
|
|
||||||
tags: tc.tags,
|
|
||||||
},
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
@@ -1,277 +0,0 @@
|
|||||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
|
||||||
import { basename, join, resolve } from "node:path";
|
|
||||||
import type { CodeFixTestCase } from "./types.js";
|
|
||||||
|
|
||||||
function findSkillsRoot(): string {
|
|
||||||
let dir = process.cwd();
|
|
||||||
for (let i = 0; i < 10; i++) {
|
|
||||||
const candidate = join(dir, "skills");
|
|
||||||
if (existsSync(candidate)) return candidate;
|
|
||||||
const parent = resolve(dir, "..");
|
|
||||||
if (parent === dir) break;
|
|
||||||
dir = parent;
|
|
||||||
}
|
|
||||||
throw new Error(
|
|
||||||
"Could not find skills/ directory. Run from the repository root or a subdirectory.",
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const SKILLS_ROOT = findSkillsRoot();
|
|
||||||
|
|
||||||
// --- Duplicated from skills-build/src/parser.ts for isolation ---
|
|
||||||
|
|
||||||
interface CodeExample {
|
|
||||||
label: string;
|
|
||||||
description?: string;
|
|
||||||
code: string;
|
|
||||||
language?: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseFrontmatter(content: string): {
|
|
||||||
frontmatter: Record<string, string>;
|
|
||||||
body: string;
|
|
||||||
} {
|
|
||||||
const frontmatter: Record<string, string> = {};
|
|
||||||
|
|
||||||
if (!content.startsWith("---")) {
|
|
||||||
return { frontmatter, body: content };
|
|
||||||
}
|
|
||||||
|
|
||||||
const endIndex = content.indexOf("---", 3);
|
|
||||||
if (endIndex === -1) {
|
|
||||||
return { frontmatter, body: content };
|
|
||||||
}
|
|
||||||
|
|
||||||
const frontmatterContent = content.slice(3, endIndex).trim();
|
|
||||||
const body = content.slice(endIndex + 3).trim();
|
|
||||||
|
|
||||||
for (const line of frontmatterContent.split("\n")) {
|
|
||||||
const colonIndex = line.indexOf(":");
|
|
||||||
if (colonIndex === -1) continue;
|
|
||||||
|
|
||||||
const key = line.slice(0, colonIndex).trim();
|
|
||||||
let value = line.slice(colonIndex + 1).trim();
|
|
||||||
|
|
||||||
if (
|
|
||||||
(value.startsWith('"') && value.endsWith('"')) ||
|
|
||||||
(value.startsWith("'") && value.endsWith("'"))
|
|
||||||
) {
|
|
||||||
value = value.slice(1, -1);
|
|
||||||
}
|
|
||||||
|
|
||||||
frontmatter[key] = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
return { frontmatter, body };
|
|
||||||
}
|
|
||||||
|
|
||||||
function extractTitle(body: string): string | null {
|
|
||||||
const match = body.match(/^##\s+(.+)$/m);
|
|
||||||
return match ? match[1].trim() : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface Section {
|
|
||||||
title: string;
|
|
||||||
explanation: string;
|
|
||||||
examples: CodeExample[];
|
|
||||||
}
|
|
||||||
|
|
||||||
function extractSections(body: string): Section[] {
|
|
||||||
const sections: Section[] = [];
|
|
||||||
const lines = body.split("\n");
|
|
||||||
|
|
||||||
let currentTitle = "";
|
|
||||||
let explanationLines: string[] = [];
|
|
||||||
let currentExamples: CodeExample[] = [];
|
|
||||||
let currentLabel = "";
|
|
||||||
let currentDescription = "";
|
|
||||||
let inCodeBlock = false;
|
|
||||||
let codeBlockLang = "";
|
|
||||||
let codeBlockContent: string[] = [];
|
|
||||||
let collectingExplanation = false;
|
|
||||||
|
|
||||||
function flushExample() {
|
|
||||||
if (currentLabel && codeBlockContent.length > 0) {
|
|
||||||
currentExamples.push({
|
|
||||||
label: currentLabel,
|
|
||||||
description: currentDescription || undefined,
|
|
||||||
code: codeBlockContent.join("\n"),
|
|
||||||
language: codeBlockLang || undefined,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
currentLabel = "";
|
|
||||||
currentDescription = "";
|
|
||||||
codeBlockContent = [];
|
|
||||||
codeBlockLang = "";
|
|
||||||
}
|
|
||||||
|
|
||||||
function flushSection() {
|
|
||||||
if (currentTitle && currentExamples.length > 0) {
|
|
||||||
sections.push({
|
|
||||||
title: currentTitle,
|
|
||||||
explanation: explanationLines.join("\n").trim(),
|
|
||||||
examples: currentExamples,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
currentExamples = [];
|
|
||||||
explanationLines = [];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const line of lines) {
|
|
||||||
if (line.startsWith("## ") && !inCodeBlock) {
|
|
||||||
flushExample();
|
|
||||||
flushSection();
|
|
||||||
currentTitle = line.replace(/^##\s+/, "").trim();
|
|
||||||
collectingExplanation = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
const labelMatch = line.match(
|
|
||||||
/^\*\*([^*]+?)(?:\s*\(([^)]+)\))?\s*:\*\*\s*$/,
|
|
||||||
);
|
|
||||||
if (labelMatch && !inCodeBlock) {
|
|
||||||
collectingExplanation = false;
|
|
||||||
flushExample();
|
|
||||||
currentLabel = labelMatch[1].trim();
|
|
||||||
currentDescription = labelMatch[2]?.trim() || "";
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (line.startsWith("```") && !inCodeBlock) {
|
|
||||||
collectingExplanation = false;
|
|
||||||
inCodeBlock = true;
|
|
||||||
codeBlockLang = line.slice(3).trim();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (line.startsWith("```") && inCodeBlock) {
|
|
||||||
inCodeBlock = false;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (inCodeBlock) {
|
|
||||||
codeBlockContent.push(line);
|
|
||||||
} else if (collectingExplanation) {
|
|
||||||
explanationLines.push(line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
flushExample();
|
|
||||||
flushSection();
|
|
||||||
|
|
||||||
return sections;
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Duplicated from skills-build/src/validate.ts ---
|
|
||||||
|
|
||||||
function isBadExample(label: string): boolean {
|
|
||||||
const lower = label.toLowerCase();
|
|
||||||
return (
|
|
||||||
lower.includes("incorrect") ||
|
|
||||||
lower.includes("wrong") ||
|
|
||||||
lower.includes("bad")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function isGoodExample(label: string): boolean {
|
|
||||||
const lower = label.toLowerCase();
|
|
||||||
return (
|
|
||||||
lower.includes("correct") ||
|
|
||||||
lower.includes("good") ||
|
|
||||||
lower.includes("usage") ||
|
|
||||||
lower.includes("implementation") ||
|
|
||||||
lower.includes("example") ||
|
|
||||||
lower.includes("recommended")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Extraction logic ---
|
|
||||||
|
|
||||||
function pairExamples(
|
|
||||||
examples: CodeExample[],
|
|
||||||
): Array<{ bad: CodeExample; good: CodeExample }> {
|
|
||||||
const pairs: Array<{ bad: CodeExample; good: CodeExample }> = [];
|
|
||||||
|
|
||||||
for (let i = 0; i < examples.length - 1; i++) {
|
|
||||||
if (
|
|
||||||
isBadExample(examples[i].label) &&
|
|
||||||
isGoodExample(examples[i + 1].label)
|
|
||||||
) {
|
|
||||||
pairs.push({ bad: examples[i], good: examples[i + 1] });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return pairs;
|
|
||||||
}
|
|
||||||
|
|
||||||
function discoverSkillNames(): string[] {
|
|
||||||
if (!existsSync(SKILLS_ROOT)) return [];
|
|
||||||
|
|
||||||
return readdirSync(SKILLS_ROOT, { withFileTypes: true })
|
|
||||||
.filter((d) => d.isDirectory())
|
|
||||||
.filter((d) => existsSync(join(SKILLS_ROOT, d.name, "SKILL.md")))
|
|
||||||
.map((d) => d.name);
|
|
||||||
}
|
|
||||||
|
|
||||||
function getMarkdownFiles(dir: string): string[] {
|
|
||||||
if (!existsSync(dir)) return [];
|
|
||||||
|
|
||||||
return readdirSync(dir)
|
|
||||||
.filter((f) => f.endsWith(".md") && !f.startsWith("_"))
|
|
||||||
.map((f) => join(dir, f));
|
|
||||||
}
|
|
||||||
|
|
||||||
export function extractCodeFixDataset(skillName?: string): CodeFixTestCase[] {
|
|
||||||
const skills = skillName ? [skillName] : discoverSkillNames();
|
|
||||||
const testCases: CodeFixTestCase[] = [];
|
|
||||||
|
|
||||||
for (const skill of skills) {
|
|
||||||
const referencesDir = join(SKILLS_ROOT, skill, "references");
|
|
||||||
const files = getMarkdownFiles(referencesDir);
|
|
||||||
|
|
||||||
for (const filePath of files) {
|
|
||||||
const content = readFileSync(filePath, "utf-8");
|
|
||||||
const { frontmatter, body } = parseFrontmatter(content);
|
|
||||||
const fileTitle =
|
|
||||||
frontmatter.title || extractTitle(body) || basename(filePath, ".md");
|
|
||||||
const tags = frontmatter.tags?.split(",").map((t) => t.trim()) || [];
|
|
||||||
const section = basename(filePath, ".md").split("-")[0];
|
|
||||||
|
|
||||||
const sections = extractSections(body);
|
|
||||||
let pairIndex = 0;
|
|
||||||
|
|
||||||
for (const sec of sections) {
|
|
||||||
const pairs = pairExamples(sec.examples);
|
|
||||||
|
|
||||||
for (const { bad, good } of pairs) {
|
|
||||||
testCases.push({
|
|
||||||
id: `${skill}/${basename(filePath, ".md")}#${pairIndex}`,
|
|
||||||
skillName: skill,
|
|
||||||
referenceFile: filePath,
|
|
||||||
referenceFilename: basename(filePath),
|
|
||||||
title: sec.title || fileTitle,
|
|
||||||
explanation: sec.explanation,
|
|
||||||
section,
|
|
||||||
tags,
|
|
||||||
pairIndex,
|
|
||||||
badExample: {
|
|
||||||
label: bad.label,
|
|
||||||
description: bad.description,
|
|
||||||
code: bad.code,
|
|
||||||
language: bad.language,
|
|
||||||
},
|
|
||||||
goodExample: {
|
|
||||||
label: good.label,
|
|
||||||
description: good.description,
|
|
||||||
code: good.code,
|
|
||||||
language: good.language,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
pairIndex++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return testCases;
|
|
||||||
}
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
/** One bad→good example pair extracted from a skill reference doc. */
export interface CodeFixTestCase {
  /** Unique ID, e.g. "supabase/db-rls-mandatory#0" */
  id: string;
  /** Skill directory the pair came from. */
  skillName: string;
  /** Path of the reference markdown file. */
  referenceFile: string;
  /** Basename of the reference file, e.g. "db-rls-mandatory.md". */
  referenceFilename: string;
  /** Section heading (falls back to the file-level title). */
  title: string;
  /** Prose preceding the examples in the section. */
  explanation: string;
  /** First dash-separated segment of the filename, e.g. "db". */
  section: string;
  /** Tags parsed from the file frontmatter. */
  tags: string[];
  /** Zero-based index of this pair within its file. */
  pairIndex: number;
  /** The anti-pattern snippet the model must fix. */
  badExample: {
    label: string;
    description?: string;
    code: string;
    language?: string;
  };
  /** The known-good snippet used as the expected answer. */
  goodExample: {
    label: string;
    description?: string;
    code: string;
    language?: string;
  };
}
|
|
||||||
@@ -1,47 +0,0 @@
|
|||||||
export interface EvalModelConfig {
|
|
||||||
/** Model ID passed to the Braintrust proxy */
|
|
||||||
id: string;
|
|
||||||
/** Human-readable label for dashboards */
|
|
||||||
label: string;
|
|
||||||
/** Provider name for display/grouping */
|
|
||||||
provider: string;
|
|
||||||
/** Whether to include in CI runs by default */
|
|
||||||
ci: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Models to evaluate. Add/remove entries to change the eval matrix.
|
|
||||||
* Set `ci: false` to exclude expensive models from automated CI runs.
|
|
||||||
*/
|
|
||||||
export const EVAL_MODELS: EvalModelConfig[] = [
|
|
||||||
{
|
|
||||||
id: "claude-sonnet-4-5-20250929",
|
|
||||||
label: "Claude Sonnet 4.5",
|
|
||||||
provider: "anthropic",
|
|
||||||
ci: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "gpt-5.3",
|
|
||||||
label: "GPT 5.3",
|
|
||||||
provider: "openai",
|
|
||||||
ci: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "gpt-5.2",
|
|
||||||
label: "GPT 5.2",
|
|
||||||
provider: "openai",
|
|
||||||
ci: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "gemini-3-pro",
|
|
||||||
label: "Gemini 3.0 Pro",
|
|
||||||
provider: "google",
|
|
||||||
ci: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
id: "claude-opus-4-6",
|
|
||||||
label: "Claude Opus 4.6",
|
|
||||||
provider: "anthropic",
|
|
||||||
ci: false,
|
|
||||||
},
|
|
||||||
];
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
import type { AnthropicProvider } from "@ai-sdk/anthropic";
|
|
||||||
import { anthropic } from "@ai-sdk/anthropic";
|
|
||||||
import { createOpenAI } from "@ai-sdk/openai";
|
|
||||||
import type { LanguageModel } from "ai";
|
|
||||||
|
|
||||||
/** Model ID accepted by the Anthropic provider (string literal union + string). */
|
|
||||||
export type AnthropicModelId = Parameters<AnthropicProvider["chat"]>[0];
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Braintrust AI proxy — routes to any provider (Anthropic, OpenAI, Google)
|
|
||||||
* via a single OpenAI-compatible endpoint.
|
|
||||||
*
|
|
||||||
* Provider API keys are configured in the Braintrust dashboard at
|
|
||||||
* project or org level. The x-bt-parent header scopes the request to
|
|
||||||
* the project so project-level keys are resolved.
|
|
||||||
*/
|
|
||||||
const braintrustProxy = createOpenAI({
|
|
||||||
baseURL: "https://api.braintrust.dev/v1/proxy",
|
|
||||||
apiKey: process.env.BRAINTRUST_API_KEY ?? "",
|
|
||||||
headers: process.env.BRAINTRUST_PROJECT_ID
|
|
||||||
? { "x-bt-parent": `project_id:${process.env.BRAINTRUST_PROJECT_ID}` }
|
|
||||||
: undefined,
|
|
||||||
});
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a model for the eval task. Claude models use the Anthropic SDK
|
|
||||||
* directly (via ANTHROPIC_API_KEY). All other models route through the
|
|
||||||
* Braintrust proxy (keys configured at the org level in Braintrust).
|
|
||||||
*/
|
|
||||||
export function getProxyModel(modelId: string): LanguageModel {
|
|
||||||
if (modelId.startsWith("claude")) {
|
|
||||||
return anthropic(modelId as AnthropicModelId);
|
|
||||||
}
|
|
||||||
return braintrustProxy(modelId);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a model using direct provider SDKs. Used for the judge model which
|
|
||||||
* is always Claude and uses ANTHROPIC_API_KEY directly (no proxy).
|
|
||||||
*/
|
|
||||||
export function getModel(modelId: string): LanguageModel {
|
|
||||||
if (modelId.startsWith("claude")) {
|
|
||||||
return anthropic(modelId as AnthropicModelId);
|
|
||||||
}
|
|
||||||
|
|
||||||
return getProxyModel(modelId);
|
|
||||||
}
|
|
||||||
|
|
||||||
export function getJudgeModel(): LanguageModel {
|
|
||||||
const judgeModelId = process.env.EVAL_JUDGE_MODEL || "claude-opus-4-6";
|
|
||||||
return getModel(judgeModelId);
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
import type { CodeFixTestCase } from "../dataset/types.js";
|
|
||||||
|
|
||||||
export function buildCodeFixSystemPrompt(): string {
|
|
||||||
return `You are a senior Supabase developer and database architect. You fix code to follow Supabase best practices including:
|
|
||||||
- Row Level Security (RLS) policies
|
|
||||||
- Proper authentication patterns
|
|
||||||
- Safe migration workflows
|
|
||||||
- Correct SDK usage patterns
|
|
||||||
- Edge Function best practices
|
|
||||||
- Connection pooling configuration
|
|
||||||
- Security-first defaults
|
|
||||||
|
|
||||||
When fixing code, ensure the fix is complete, production-ready, and follows the latest Supabase conventions. Return only the corrected code inside a single code block.`;
|
|
||||||
}
|
|
||||||
|
|
||||||
export function buildCodeFixPrompt(testCase: CodeFixTestCase): string {
|
|
||||||
const langHint = testCase.badExample.language
|
|
||||||
? ` (${testCase.badExample.language})`
|
|
||||||
: "";
|
|
||||||
|
|
||||||
return `The following code has a problem related to: ${testCase.title}
|
|
||||||
|
|
||||||
Context: ${testCase.explanation}
|
|
||||||
|
|
||||||
Here is the problematic code${langHint}:
|
|
||||||
|
|
||||||
\`\`\`${testCase.badExample.language || ""}
|
|
||||||
${testCase.badExample.code}
|
|
||||||
\`\`\`
|
|
||||||
${testCase.badExample.description ? `\nIssue hint: ${testCase.badExample.description}` : ""}
|
|
||||||
|
|
||||||
Fix this code to follow Supabase best practices. Return ONLY the corrected code inside a single code block. Do not include any explanation outside the code block.`;
|
|
||||||
}
|
|
||||||
189
packages/evals/src/runner.ts
Normal file
189
packages/evals/src/runner.ts
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||||
|
import { join, resolve } from "node:path";
|
||||||
|
import { runAgent } from "./runner/agent.js";
|
||||||
|
import { uploadToBraintrust } from "./runner/braintrust.js";
|
||||||
|
import { preflight } from "./runner/preflight.js";
|
||||||
|
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||||
|
import { createWorkspace } from "./runner/scaffold.js";
|
||||||
|
import { runTests } from "./runner/test.js";
|
||||||
|
import type { EvalRunResult, EvalScenario } from "./types.js";
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Configuration from environment
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Agent model used unless EVAL_MODEL overrides it.
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";
// Hard cap on a single agent run.
const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes

// Runtime configuration, all supplied via environment variables.
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
// When set, run only the scenario whose directory name matches.
const scenarioFilter = process.env.EVAL_SCENARIO;
// When "true", also run each scenario without the skill for comparison.
const runBaseline = process.env.EVAL_BASELINE === "true";
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Discover scenarios
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function findEvalsDir(): string {
|
||||||
|
let dir = process.cwd();
|
||||||
|
for (let i = 0; i < 10; i++) {
|
||||||
|
const candidate = join(dir, "packages", "evals", "evals");
|
||||||
|
if (existsSync(candidate)) return candidate;
|
||||||
|
const parent = resolve(dir, "..");
|
||||||
|
if (parent === dir) break;
|
||||||
|
dir = parent;
|
||||||
|
}
|
||||||
|
throw new Error("Could not find packages/evals/evals/ directory");
|
||||||
|
}
|
||||||
|
|
||||||
|
function discoverScenarios(): EvalScenario[] {
|
||||||
|
const evalsDir = findEvalsDir();
|
||||||
|
const dirs = readdirSync(evalsDir, { withFileTypes: true }).filter(
|
||||||
|
(d) => d.isDirectory() && existsSync(join(evalsDir, d.name, "PROMPT.md")),
|
||||||
|
);
|
||||||
|
|
||||||
|
return dirs.map((d) => ({
|
||||||
|
id: d.name,
|
||||||
|
name: d.name,
|
||||||
|
tags: [],
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Run a single eval
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async function runEval(
|
||||||
|
scenario: EvalScenario,
|
||||||
|
skillEnabled: boolean,
|
||||||
|
): Promise<EvalRunResult> {
|
||||||
|
const evalsDir = findEvalsDir();
|
||||||
|
const evalDir = join(evalsDir, scenario.id);
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`\n--- ${scenario.id} (${skillEnabled ? "with-skill" : "baseline"}) ---`,
|
||||||
|
);
|
||||||
|
|
||||||
|
// 1. Create isolated workspace
|
||||||
|
const { workspacePath, cleanup } = createWorkspace({
|
||||||
|
evalDir,
|
||||||
|
skillEnabled,
|
||||||
|
});
|
||||||
|
console.log(` Workspace: ${workspacePath}`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// 2. Read the prompt
|
||||||
|
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
|
||||||
|
|
||||||
|
// 3. Run the agent
|
||||||
|
console.log(` Running agent (${model})...`);
|
||||||
|
const agentResult = await runAgent({
|
||||||
|
cwd: workspacePath,
|
||||||
|
prompt,
|
||||||
|
model,
|
||||||
|
timeout: AGENT_TIMEOUT,
|
||||||
|
skillEnabled,
|
||||||
|
});
|
||||||
|
console.log(
|
||||||
|
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
|
||||||
|
);
|
||||||
|
|
||||||
|
// 4. Run the hidden tests
|
||||||
|
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
|
||||||
|
? join(evalDir, "EVAL.tsx")
|
||||||
|
: join(evalDir, "EVAL.ts");
|
||||||
|
|
||||||
|
console.log(" Running tests...");
|
||||||
|
const testResult = await runTests({
|
||||||
|
workspacePath,
|
||||||
|
evalFilePath,
|
||||||
|
});
|
||||||
|
console.log(
|
||||||
|
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
|
||||||
|
);
|
||||||
|
|
||||||
|
// 5. Collect modified files
|
||||||
|
const filesModified = listModifiedFiles(workspacePath, evalDir);
|
||||||
|
|
||||||
|
return {
|
||||||
|
scenario: scenario.id,
|
||||||
|
agent: "claude-code",
|
||||||
|
model,
|
||||||
|
skillEnabled,
|
||||||
|
status: testResult.passed ? "passed" : "failed",
|
||||||
|
duration: agentResult.duration,
|
||||||
|
testOutput: testResult.output,
|
||||||
|
agentOutput: agentResult.output,
|
||||||
|
testsPassed: testResult.passedCount,
|
||||||
|
testsTotal: testResult.totalCount,
|
||||||
|
filesModified,
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
const err = error as Error;
|
||||||
|
return {
|
||||||
|
scenario: scenario.id,
|
||||||
|
agent: "claude-code",
|
||||||
|
model,
|
||||||
|
skillEnabled,
|
||||||
|
status: "error",
|
||||||
|
duration: 0,
|
||||||
|
testOutput: "",
|
||||||
|
agentOutput: "",
|
||||||
|
testsPassed: 0,
|
||||||
|
testsTotal: 0,
|
||||||
|
filesModified: [],
|
||||||
|
error: err.message,
|
||||||
|
};
|
||||||
|
} finally {
|
||||||
|
cleanup();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Main
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
preflight();
|
||||||
|
|
||||||
|
console.log("Supabase Skills Evals");
|
||||||
|
console.log(`Model: ${model}`);
|
||||||
|
console.log(`Baseline: ${runBaseline}`);
|
||||||
|
|
||||||
|
let scenarios = discoverScenarios();
|
||||||
|
|
||||||
|
if (scenarioFilter) {
|
||||||
|
scenarios = scenarios.filter((s) => s.id === scenarioFilter);
|
||||||
|
if (scenarios.length === 0) {
|
||||||
|
console.error(`Scenario not found: ${scenarioFilter}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
|
||||||
|
|
||||||
|
const results: EvalRunResult[] = [];
|
||||||
|
|
||||||
|
for (const scenario of scenarios) {
|
||||||
|
// Run with skill enabled
|
||||||
|
const withSkill = await runEval(scenario, true);
|
||||||
|
results.push(withSkill);
|
||||||
|
|
||||||
|
// Optionally run baseline (no skill)
|
||||||
|
if (runBaseline) {
|
||||||
|
const baseline = await runEval(scenario, false);
|
||||||
|
results.push(baseline);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printSummary(results);
|
||||||
|
|
||||||
|
if (process.env.BRAINTRUST_UPLOAD === "true") {
|
||||||
|
console.log("\nUploading to Braintrust...");
|
||||||
|
await uploadToBraintrust(results);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch((err) => {
|
||||||
|
console.error("Fatal error:", err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
82
packages/evals/src/runner/agent.ts
Normal file
82
packages/evals/src/runner/agent.ts
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
import { spawn } from "node:child_process";
|
||||||
|
|
||||||
|
export interface AgentRunResult {
|
||||||
|
output: string;
|
||||||
|
duration: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Invoke Claude Code in print mode as a subprocess.
|
||||||
|
*
|
||||||
|
* The agent operates in the workspace directory and can read/write files.
|
||||||
|
* When the skill is installed (symlinked into workspace), Claude Code
|
||||||
|
* discovers it automatically and uses it for guidance.
|
||||||
|
*/
|
||||||
|
export async function runAgent(opts: {
|
||||||
|
cwd: string;
|
||||||
|
prompt: string;
|
||||||
|
model: string;
|
||||||
|
timeout: number;
|
||||||
|
skillEnabled: boolean;
|
||||||
|
}): Promise<AgentRunResult> {
|
||||||
|
const start = Date.now();
|
||||||
|
|
||||||
|
const args = [
|
||||||
|
"-p", // Print mode (non-interactive)
|
||||||
|
"--output-format",
|
||||||
|
"text",
|
||||||
|
"--model",
|
||||||
|
opts.model,
|
||||||
|
"--no-session-persistence",
|
||||||
|
"--dangerously-skip-permissions",
|
||||||
|
"--tools",
|
||||||
|
"Edit,Write,Bash,Read,Glob,Grep",
|
||||||
|
];
|
||||||
|
|
||||||
|
// Disable skills for baseline runs so the agent relies on innate knowledge
|
||||||
|
if (!opts.skillEnabled) {
|
||||||
|
args.push("--disable-slash-commands");
|
||||||
|
}
|
||||||
|
|
||||||
|
const env = { ...process.env };
|
||||||
|
// Remove all Claude-related env vars to avoid nested-session detection
|
||||||
|
for (const key of Object.keys(env)) {
|
||||||
|
if (key === "CLAUDECODE" || key.startsWith("CLAUDE_")) {
|
||||||
|
delete env[key];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Promise<AgentRunResult>((resolve) => {
|
||||||
|
const child = spawn("claude", args, {
|
||||||
|
cwd: opts.cwd,
|
||||||
|
env,
|
||||||
|
stdio: ["pipe", "pipe", "pipe"],
|
||||||
|
});
|
||||||
|
|
||||||
|
// Pipe prompt via stdin and close — this is the standard way to
|
||||||
|
// pass multi-line prompts to `claude -p`.
|
||||||
|
child.stdin.write(opts.prompt);
|
||||||
|
child.stdin.end();
|
||||||
|
|
||||||
|
let stdout = "";
|
||||||
|
let stderr = "";
|
||||||
|
child.stdout.on("data", (d: Buffer) => {
|
||||||
|
stdout += d.toString();
|
||||||
|
});
|
||||||
|
child.stderr.on("data", (d: Buffer) => {
|
||||||
|
stderr += d.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
const timer = setTimeout(() => {
|
||||||
|
child.kill();
|
||||||
|
}, opts.timeout);
|
||||||
|
|
||||||
|
child.on("close", () => {
|
||||||
|
clearTimeout(timer);
|
||||||
|
resolve({
|
||||||
|
output: stdout || stderr,
|
||||||
|
duration: Date.now() - start,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
53
packages/evals/src/runner/braintrust.ts
Normal file
53
packages/evals/src/runner/braintrust.ts
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import assert from "node:assert";
|
||||||
|
import { init } from "braintrust";
|
||||||
|
import type { EvalRunResult } from "../types.js";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Upload eval results to Braintrust as an experiment.
|
||||||
|
*
|
||||||
|
* Each EvalRunResult becomes a row in the experiment with:
|
||||||
|
* - input: scenario name + config
|
||||||
|
* - output: agent output summary
|
||||||
|
* - scores: pass (0 or 1)
|
||||||
|
* - metadata: model, skill toggle, duration, files modified
|
||||||
|
*/
|
||||||
|
export async function uploadToBraintrust(
|
||||||
|
results: EvalRunResult[],
|
||||||
|
): Promise<void> {
|
||||||
|
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||||
|
assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
|
||||||
|
|
||||||
|
const experiment = await init({
|
||||||
|
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||||
|
});
|
||||||
|
|
||||||
|
for (const r of results) {
|
||||||
|
experiment.log({
|
||||||
|
input: {
|
||||||
|
scenario: r.scenario,
|
||||||
|
skillEnabled: r.skillEnabled,
|
||||||
|
},
|
||||||
|
output: {
|
||||||
|
status: r.status,
|
||||||
|
filesModified: r.filesModified,
|
||||||
|
testOutput: r.testOutput,
|
||||||
|
},
|
||||||
|
scores: {
|
||||||
|
pass: r.status === "passed" ? 1 : 0,
|
||||||
|
},
|
||||||
|
metadata: {
|
||||||
|
agent: r.agent,
|
||||||
|
model: r.model,
|
||||||
|
skillEnabled: r.skillEnabled,
|
||||||
|
duration: r.duration,
|
||||||
|
testsPassed: r.testsPassed,
|
||||||
|
testsTotal: r.testsTotal,
|
||||||
|
...(r.error ? { error: r.error } : {}),
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
const summary = await experiment.summarize();
|
||||||
|
console.log(`\nBraintrust experiment: ${summary.experimentUrl}`);
|
||||||
|
await experiment.close();
|
||||||
|
}
|
||||||
43
packages/evals/src/runner/preflight.ts
Normal file
43
packages/evals/src/runner/preflight.ts
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import { execFileSync } from "node:child_process";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verify the host environment has everything needed before spending
|
||||||
|
* API credits on an eval run.
|
||||||
|
*
|
||||||
|
* Checks: Node >= 20, Docker running, claude CLI available.
|
||||||
|
*/
|
||||||
|
export function preflight(): void {
|
||||||
|
const errors: string[] = [];
|
||||||
|
|
||||||
|
// Node.js >= 20
|
||||||
|
const [major] = process.versions.node.split(".").map(Number);
|
||||||
|
if (major < 20) {
|
||||||
|
errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Docker daemon running
|
||||||
|
try {
|
||||||
|
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
|
||||||
|
} catch {
|
||||||
|
errors.push("Docker is not running (required by supabase CLI)");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Claude CLI available
|
||||||
|
try {
|
||||||
|
execFileSync("claude", ["--version"], {
|
||||||
|
stdio: "ignore",
|
||||||
|
timeout: 10_000,
|
||||||
|
});
|
||||||
|
} catch {
|
||||||
|
errors.push("claude CLI not found on PATH");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (errors.length > 0) {
|
||||||
|
console.error("Preflight checks failed:\n");
|
||||||
|
for (const e of errors) {
|
||||||
|
console.error(` - ${e}`);
|
||||||
|
}
|
||||||
|
console.error("");
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
68
packages/evals/src/runner/results.ts
Normal file
68
packages/evals/src/runner/results.ts
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
import { mkdirSync, readdirSync, statSync, writeFileSync } from "node:fs";
|
||||||
|
import { join, resolve } from "node:path";
|
||||||
|
import type { EvalRunResult } from "../types.js";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List files created or modified by the agent in the workspace.
|
||||||
|
* Compares against the original eval directory to find new files.
|
||||||
|
*/
|
||||||
|
export function listModifiedFiles(
|
||||||
|
workspacePath: string,
|
||||||
|
originalEvalDir: string,
|
||||||
|
): string[] {
|
||||||
|
const modified: string[] = [];
|
||||||
|
|
||||||
|
function walk(dir: string, prefix: string) {
|
||||||
|
const entries = readdirSync(dir, { withFileTypes: true });
|
||||||
|
for (const entry of entries) {
|
||||||
|
if (
|
||||||
|
entry.name === "node_modules" ||
|
||||||
|
entry.name === "skills" ||
|
||||||
|
entry.name === "EVAL.ts" ||
|
||||||
|
entry.name === "EVAL.tsx"
|
||||||
|
)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
|
||||||
|
const fullPath = join(dir, entry.name);
|
||||||
|
|
||||||
|
if (entry.isDirectory()) {
|
||||||
|
walk(fullPath, relPath);
|
||||||
|
} else {
|
||||||
|
// Check if file is new (not in original eval dir)
|
||||||
|
const originalPath = join(originalEvalDir, relPath);
|
||||||
|
try {
|
||||||
|
statSync(originalPath);
|
||||||
|
} catch {
|
||||||
|
// File doesn't exist in original — it was created by the agent
|
||||||
|
modified.push(relPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
walk(workspacePath, "");
|
||||||
|
return modified;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Print a summary table of eval results. */
|
||||||
|
export function printSummary(results: EvalRunResult[]): void {
|
||||||
|
console.log("\n=== Eval Results ===\n");
|
||||||
|
|
||||||
|
for (const r of results) {
|
||||||
|
const icon = r.status === "passed" ? "PASS" : "FAIL";
|
||||||
|
const skill = r.skillEnabled ? "with-skill" : "baseline";
|
||||||
|
console.log(
|
||||||
|
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s`,
|
||||||
|
);
|
||||||
|
if (r.filesModified.length > 0) {
|
||||||
|
console.log(` Files: ${r.filesModified.join(", ")}`);
|
||||||
|
}
|
||||||
|
if (r.status === "error" && r.error) {
|
||||||
|
console.log(` Error: ${r.error}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const passed = results.filter((r) => r.status === "passed").length;
|
||||||
|
console.log(`\nTotal: ${passed}/${results.length} passed`);
|
||||||
|
}
|
||||||
65
packages/evals/src/runner/scaffold.ts
Normal file
65
packages/evals/src/runner/scaffold.ts
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
import {
|
||||||
|
cpSync,
|
||||||
|
existsSync,
|
||||||
|
mkdtempSync,
|
||||||
|
readdirSync,
|
||||||
|
rmSync,
|
||||||
|
symlinkSync,
|
||||||
|
} from "node:fs";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { join, resolve } from "node:path";
|
||||||
|
|
||||||
|
/** Walk up from cwd to find the repository root (contains skills/ and packages/). */
|
||||||
|
function findRepoRoot(): string {
|
||||||
|
let dir = process.cwd();
|
||||||
|
for (let i = 0; i < 10; i++) {
|
||||||
|
if (existsSync(join(dir, "skills")) && existsSync(join(dir, "packages"))) {
|
||||||
|
return dir;
|
||||||
|
}
|
||||||
|
const parent = resolve(dir, "..");
|
||||||
|
if (parent === dir) break;
|
||||||
|
dir = parent;
|
||||||
|
}
|
||||||
|
throw new Error("Could not find repository root (skills/ + packages/)");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an isolated workspace for an eval run.
|
||||||
|
*
|
||||||
|
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
|
||||||
|
* 2. Optionally symlink the supabase skill so Claude Code can discover it
|
||||||
|
*
|
||||||
|
* Returns the path to the workspace and a cleanup function.
|
||||||
|
*/
|
||||||
|
export function createWorkspace(opts: {
|
||||||
|
evalDir: string;
|
||||||
|
skillEnabled: boolean;
|
||||||
|
}): { workspacePath: string; cleanup: () => void } {
|
||||||
|
const repoRoot = findRepoRoot();
|
||||||
|
const workspacePath = mkdtempSync(join(tmpdir(), "supabase-eval-"));
|
||||||
|
|
||||||
|
// Copy eval directory, excluding EVAL.ts (hidden from agent)
|
||||||
|
const entries = readdirSync(opts.evalDir, { withFileTypes: true });
|
||||||
|
for (const entry of entries) {
|
||||||
|
if (entry.name === "EVAL.ts" || entry.name === "EVAL.tsx") continue;
|
||||||
|
const src = join(opts.evalDir, entry.name);
|
||||||
|
const dest = join(workspacePath, entry.name);
|
||||||
|
cpSync(src, dest, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make the skill available to the agent by symlinking the skills dir
|
||||||
|
if (opts.skillEnabled) {
|
||||||
|
const skillsDir = join(repoRoot, "skills");
|
||||||
|
if (existsSync(skillsDir)) {
|
||||||
|
const destSkills = join(workspacePath, "skills");
|
||||||
|
symlinkSync(skillsDir, destSkills);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
workspacePath,
|
||||||
|
cleanup: () => {
|
||||||
|
rmSync(workspacePath, { recursive: true, force: true });
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
97
packages/evals/src/runner/test.ts
Normal file
97
packages/evals/src/runner/test.ts
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
import { execFile } from "node:child_process";
|
||||||
|
import { copyFileSync, existsSync, writeFileSync } from "node:fs";
|
||||||
|
import { dirname, join } from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import { promisify } from "node:util";
|
||||||
|
|
||||||
|
// ESM has no __filename/__dirname globals — derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Promise-based wrapper around child_process.execFile.
const exec = promisify(execFile);
|
||||||
|
|
||||||
|
/** Outcome of running the hidden EVAL.ts vitest suite against a workspace. */
export interface TestResult {
  /** True when at least one test ran and every test passed */
  passed: boolean;
  /** Raw combined stdout/stderr from the vitest run */
  output: string;
  /** Number of tests that passed */
  passedCount: number;
  /** Total number of tests */
  totalCount: number;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run the hidden EVAL.ts tests against the agent's workspace.
|
||||||
|
*
|
||||||
|
* 1. Copy EVAL.ts into the workspace (agent is done, safe to expose)
|
||||||
|
* 2. Run vitest against it
|
||||||
|
* 3. Parse the output for pass/fail
|
||||||
|
*/
|
||||||
|
export async function runTests(opts: {
|
||||||
|
workspacePath: string;
|
||||||
|
evalFilePath: string;
|
||||||
|
}): Promise<TestResult> {
|
||||||
|
// Copy the hidden test file into the workspace
|
||||||
|
const evalFileName = opts.evalFilePath.endsWith(".tsx")
|
||||||
|
? "EVAL.tsx"
|
||||||
|
: "EVAL.ts";
|
||||||
|
const destPath = join(opts.workspacePath, evalFileName);
|
||||||
|
copyFileSync(opts.evalFilePath, destPath);
|
||||||
|
|
||||||
|
// Write a minimal vitest config that overrides the default include pattern
|
||||||
|
// so EVAL.ts (without .test. or .spec.) is picked up.
|
||||||
|
const vitestConfigPath = join(opts.workspacePath, "vitest.config.mjs");
|
||||||
|
if (!existsSync(vitestConfigPath)) {
|
||||||
|
writeFileSync(
|
||||||
|
vitestConfigPath,
|
||||||
|
`export default { test: { include: ["EVAL.{ts,tsx}"] } };\n`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the vitest binary from the evals package (always available)
|
||||||
|
const evalsVitest = join(
|
||||||
|
__dirname,
|
||||||
|
"..",
|
||||||
|
"..",
|
||||||
|
"node_modules",
|
||||||
|
".bin",
|
||||||
|
"vitest",
|
||||||
|
);
|
||||||
|
const vitestBin = join(opts.workspacePath, "node_modules", ".bin", "vitest");
|
||||||
|
const cmd = existsSync(vitestBin) ? vitestBin : evalsVitest;
|
||||||
|
const args = ["run", evalFileName, "--reporter=verbose", "--no-color"];
|
||||||
|
|
||||||
|
try {
|
||||||
|
const { stdout, stderr } = await exec(cmd, args, {
|
||||||
|
cwd: opts.workspacePath,
|
||||||
|
timeout: 60_000,
|
||||||
|
env: { ...process.env },
|
||||||
|
maxBuffer: 5 * 1024 * 1024,
|
||||||
|
});
|
||||||
|
|
||||||
|
const output = `${stdout}\n${stderr}`;
|
||||||
|
return parseTestOutput(output);
|
||||||
|
} catch (error) {
|
||||||
|
const err = error as Error & { stdout?: string; stderr?: string };
|
||||||
|
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
|
||||||
|
return parseTestOutput(output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function parseTestOutput(output: string): TestResult {
|
||||||
|
// Parse vitest output for pass/fail counts
|
||||||
|
// Format: "Tests N passed (M)" or "Tests N failed | M passed (T)"
|
||||||
|
const testsLine = output.match(
|
||||||
|
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
|
||||||
|
);
|
||||||
|
|
||||||
|
let passedCount = 0;
|
||||||
|
let totalCount = 0;
|
||||||
|
|
||||||
|
if (testsLine) {
|
||||||
|
passedCount = Number.parseInt(testsLine[2], 10);
|
||||||
|
totalCount = Number.parseInt(testsLine[3], 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
const passed = totalCount > 0 && passedCount === totalCount;
|
||||||
|
|
||||||
|
return { passed, output, passedCount, totalCount };
|
||||||
|
}
|
||||||
@@ -1,187 +0,0 @@
|
|||||||
import { generateText, Output } from "ai";
|
|
||||||
import type { EvalScorer } from "braintrust";
|
|
||||||
import { z } from "zod";
|
|
||||||
import type { CodeFixTestCase } from "./dataset/types.js";
|
|
||||||
import type { Expected, Input, Output as TaskOutput } from "./dataset.js";
|
|
||||||
import { getModel } from "./models.js";
|
|
||||||
|
|
||||||
// Judge model is configurable via EVAL_JUDGE_MODEL; falls back to a default.
const judgeModelId = process.env.EVAL_JUDGE_MODEL || "claude-opus-4-6";

// Structured output contract for the judge: a 0–1 score plus its rationale.
const scoreSchema = z.object({
  score: z
    .number()
    .describe("Score from 0 to 1 (0 = bad, 0.5 = partial, 1 = good)"),
  reasoning: z.string().describe("Brief reasoning for the score"),
});

// System prompt shared by every judge call below.
const SYSTEM_PROMPT =
  "You are a precise, consistent evaluator of Supabase code fixes. You assess whether LLM-generated code correctly addresses Supabase anti-patterns by comparing against reference solutions. You are fair: functionally equivalent solutions that differ in style or approach from the reference still receive high scores. You are strict: partial fixes, missing security measures, or incorrect patterns receive low scores. Always provide specific evidence for your scoring.";
|
|
||||||
|
|
||||||
/**
 * Build the shared markdown context passed to every judge prompt: the
 * reference topic/explanation, the original (incorrect) code, the reference
 * (correct) code, and the LLM's attempted fix.
 */
function buildContext(tc: CodeFixTestCase, llmOutput: string): string {
  return `## Reference Information

**Topic:** ${tc.title}
**Explanation:** ${tc.explanation}

## Original Incorrect Code

\`\`\`${tc.badExample.language || ""}
${tc.badExample.code}
\`\`\`

## Reference Correct Code (ground truth)

\`\`\`${tc.goodExample.language || ""}
${tc.goodExample.code}
\`\`\`

## LLM's Attempted Fix

${llmOutput}`;
}
|
|
||||||
|
|
||||||
/**
 * Ask the judge model to score a prompt built from buildContext plus a
 * task-specific rubric.
 *
 * @param prompt - Full judging prompt (shared context + rubric).
 * @returns The judge's score (0–1) and brief reasoning.
 * @throws Error when the model returns no structured output.
 */
async function judge(
  prompt: string,
): Promise<{ score: number; reasoning: string }> {
  const model = getModel(judgeModelId);
  const { output } = await generateText({
    model,
    system: SYSTEM_PROMPT,
    prompt,
    // Structured output validated against scoreSchema.
    output: Output.object({ schema: scoreSchema }),
    temperature: 0.1, // low temperature to reduce run-to-run score variance
    maxRetries: 2,
  });
  if (!output) throw new Error("Judge returned no structured output");
  return output;
}
|
|
||||||
|
|
||||||
/**
 * LLM-as-judge scorer: does the fix address the core issue identified in the
 * incorrect code? Functionally equivalent alternatives to the reference are
 * accepted. Emits a 0/0.5/1 score with the judge's reasoning as metadata.
 */
export const correctnessScorer: EvalScorer<
  Input,
  TaskOutput,
  Expected
> = async ({ input, output }) => {
  const context = buildContext(input.testCase, output.llmOutput);
  const result = await judge(`${context}

## Task

Evaluate **correctness**: Does the LLM's fix address the core issue identified in the incorrect code?

The fix does not need to be character-identical to the reference, but it must solve the same problem. Functionally equivalent or improved solutions should score well.

Score 1 if the fix fully addresses the core issue, 0.5 if it partially addresses it, 0 if it fails to address the core issue or introduces new problems.`);

  return {
    name: "Correctness",
    score: result.score,
    metadata: { reasoning: result.reasoning },
  };
};
|
|
||||||
|
|
||||||
/**
 * LLM-as-judge scorer: does the fix include ALL necessary changes shown in
 * the reference (no partial fixes)? Emits a 0/0.5/1 score with the judge's
 * reasoning as metadata.
 */
export const completenessScorer: EvalScorer<
  Input,
  TaskOutput,
  Expected
> = async ({ input, output }) => {
  const context = buildContext(input.testCase, output.llmOutput);
  const result = await judge(`${context}

## Task

Evaluate **completeness**: Does the LLM's fix include ALL necessary changes shown in the reference?

Check for missing RLS enablement, missing policy clauses, missing columns, incomplete migrations, or any partial fixes. The fix should be production-ready.

Score 1 if all necessary changes are present, 0.5 if most changes are present but some are missing, 0 if significant changes are missing.`);

  return {
    name: "Completeness",
    score: result.score,
    metadata: { reasoning: result.reasoning },
  };
};
|
|
||||||
|
|
||||||
/**
 * LLM-as-judge scorer: does the fix follow Supabase best practices as
 * demonstrated in the reference (RLS patterns, migration conventions, SDK
 * usage, etc.)? Emits a 0/0.5/1 score with the judge's reasoning as metadata.
 */
export const bestPracticeScorer: EvalScorer<
  Input,
  TaskOutput,
  Expected
> = async ({ input, output }) => {
  const context = buildContext(input.testCase, output.llmOutput);
  const result = await judge(`${context}

## Task

Evaluate **best practices**: Does the LLM's fix follow Supabase best practices as demonstrated in the reference?

Consider: RLS patterns, auth.users references, migration conventions, connection pooling, edge function patterns, SDK usage, and security-first defaults. Alternative correct approaches that achieve the same security/correctness goal are acceptable.

Score 1 if the fix follows best practices, 0.5 if it mostly follows best practices with minor deviations, 0 if it uses anti-patterns or ignores conventions.`);

  return {
    name: "Best Practice",
    score: result.score,
    metadata: { reasoning: result.reasoning },
  };
};
|
|
||||||
|
|
||||||
/**
 * LLM-as-judge scorer: does the fix avoid introducing new problems (broken
 * functionality, removed security measures, breaking signature changes, new
 * vulnerabilities, destructive schema changes)? Emits a 0/0.5/1 score with
 * the judge's reasoning as metadata.
 */
export const regressionSafetyScorer: EvalScorer<
  Input,
  TaskOutput,
  Expected
> = async ({ input, output }) => {
  const context = buildContext(input.testCase, output.llmOutput);
  const result = await judge(`${context}

## Task

Evaluate **regression safety**: Does the LLM's fix avoid introducing new problems?

Carefully check whether the fix:
- Breaks existing functionality that was working in the original code
- Removes security measures (RLS policies, auth checks, input validation) that were already present
- Changes function signatures, return types, or column names in ways that would break callers
- Introduces SQL injection, XSS, or other security vulnerabilities not present in the original
- Drops data, removes columns, or alters schemas destructively without necessity
- Changes behavior beyond the scope of the identified issue

The fix should repair the identified problem WITHOUT creating new ones. A fix that solves the original issue but breaks something else is dangerous in production.

Score 1 if the fix introduces no new problems. Score 0.5 if the fix introduces minor issues (e.g., slightly different naming that could confuse but not break). Score 0 if the fix introduces a new bug, security vulnerability, or breaking change.`);

  return {
    name: "Regression Safety",
    score: result.score,
    metadata: { reasoning: result.reasoning },
  };
};
|
|
||||||
|
|
||||||
/**
 * LLM-as-judge scorer: is the fix tightly scoped to the identified issue,
 * with no unnecessary rewrites, added features, or over-engineering beyond
 * the reference's scope? Emits a 0/0.5/1 score with the judge's reasoning
 * as metadata.
 */
export const minimalityScorer: EvalScorer<
  Input,
  TaskOutput,
  Expected
> = async ({ input, output }) => {
  const context = buildContext(input.testCase, output.llmOutput);
  const result = await judge(`${context}

## Task

Evaluate **minimality**: Does the LLM's fix make only the changes necessary to address the identified issue?

Check whether the fix:
- Rewrites or restructures code beyond what is needed to fix the problem
- Adds features, abstractions, or utilities not present in the reference solution
- Changes formatting, variable names, or style in unrelated parts of the code
- Adds excessive comments, logging, or error handling not required by the fix
- Over-engineers the solution (e.g., adding configuration options, generalization, or layers of abstraction when a simple targeted fix suffices)

Compare the scope of changes in the LLM's fix against the reference. The reference represents the ideal minimal fix. The LLM's fix should be similarly focused.

Score 1 if the fix is tightly scoped to the identified issue (similar scope to the reference). Score 0.5 if the fix includes some unnecessary changes but the core fix is present. Score 0 if the fix significantly over-reaches — rewriting large portions of code, adding unrelated features, or restructuring beyond what is needed.`);

  return {
    name: "Minimality",
    score: result.score,
    metadata: { reasoning: result.reasoning },
  };
};
|
|
||||||
35
packages/evals/src/types.ts
Normal file
35
packages/evals/src/types.ts
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
/** A workflow eval scenario discovered on disk. */
export interface EvalScenario {
  /** Directory name under evals/ */
  id: string;
  /** Human-readable name */
  name: string;
  /** Tags for filtering */
  tags: string[];
}
|
||||||
|
|
||||||
|
/** Configuration for a single agent run of a scenario. */
export interface AgentConfig {
  /** Agent identifier */
  agent: "claude-code";
  /** Model to use */
  model: string;
  /** Whether the supabase skill is available */
  skillEnabled: boolean;
}
|
||||||
|
|
||||||
|
/** Outcome of one scenario run by one agent configuration. */
export interface EvalRunResult {
  /** Scenario identifier */
  scenario: string;
  /** Agent that performed the run */
  agent: string;
  /** Model the agent used */
  model: string;
  /** Whether the supabase skill was available for this run */
  skillEnabled: boolean;
  // NOTE(review): presumably "error" means the run itself broke while
  // "failed" means the tests failed — confirm against the runner.
  status: "passed" | "failed" | "error";
  /** Wall-clock duration in milliseconds */
  duration: number;
  /** Captured output from the test run */
  testOutput: string;
  /** Captured output from the agent session */
  agentOutput: string;
  /** Number of vitest tests that passed */
  testsPassed: number;
  /** Total number of vitest tests */
  testsTotal: number;
  /** Files the agent created or modified in the workspace */
  filesModified: string[];
  /** Error message, present when status is "error" */
  error?: string;
}
|
||||||
@@ -12,5 +12,5 @@
|
|||||||
"resolveJsonModule": true
|
"resolveJsonModule": true
|
||||||
},
|
},
|
||||||
"include": ["src/**/*"],
|
"include": ["src/**/*"],
|
||||||
"exclude": ["node_modules", "dist"]
|
"exclude": ["node_modules", "dist", "evals"]
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user