mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
workflow evals with one scenario
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
BRAINTRUST_API_KEY=
|
||||
BRAINTRUST_PROJECT_ID=
|
||||
ANTHROPIC_API_KEY=
|
||||
# Provider API keys for eval models are configured in the Braintrust dashboard
|
||||
# under Settings → AI providers (not needed locally).
|
||||
# Required for Braintrust upload (BRAINTRUST_UPLOAD=true)
|
||||
# BRAINTRUST_API_KEY=
|
||||
# BRAINTRUST_PROJECT_ID=
|
||||
|
||||
@@ -1,171 +1,108 @@
|
||||
# Evals — Agent Guide
|
||||
|
||||
This package evaluates whether LLMs correctly apply Supabase best practices
|
||||
using skill documentation as context. It uses
|
||||
[Braintrust](https://www.braintrust.dev/) for eval orchestration and the
|
||||
[Vercel AI SDK](https://sdk.vercel.ai/) for LLM calls.
|
||||
This package evaluates whether AI agents correctly implement Supabase tasks
|
||||
when using skill documentation. Modeled after
|
||||
[Vercel's next-evals-oss](https://github.com/vercel-labs/next-evals-oss): each
|
||||
eval is a self-contained project with a task prompt, the agent works on it, and
|
||||
hidden tests check the result. Binary pass/fail.
|
||||
|
||||
## Architecture
|
||||
|
||||
Two-step **LLM-as-judge** pattern powered by Braintrust's `Eval()`:
|
||||
|
||||
1. The **eval model** receives a prompt with skill context and produces a code
|
||||
fix. All eval model calls go through the **Braintrust AI proxy** — a single
|
||||
OpenAI-compatible endpoint that routes to any provider (Anthropic, OpenAI,
|
||||
Google, etc.).
|
||||
2. Five independent **judge scorers** (`claude-opus-4-6` via direct Anthropic
|
||||
API) evaluate the fix via structured output (Zod schemas via AI SDK's
|
||||
`Output.object()`).
|
||||
|
||||
The eval runs once per model in the model matrix, creating a separate Braintrust
|
||||
experiment per model for side-by-side comparison.
|
||||
|
||||
Key files:
|
||||
|
||||
```
|
||||
src/
|
||||
code-fix.eval.ts # Braintrust Eval() entry point (loops over models)
|
||||
dataset.ts # Maps extracted test cases to EvalCase format
|
||||
scorer.ts # Five AI SDK-based scorers (quality, safety, minimality)
|
||||
models.ts # Braintrust proxy + direct Anthropic provider
|
||||
models.config.ts # Model matrix (add/remove models here)
|
||||
dataset/
|
||||
types.ts # CodeFixTestCase interface
|
||||
extract.ts # Auto-extracts test cases from skill references
|
||||
prompts/
|
||||
code-fix.ts # System + user prompts for the eval model
|
||||
1. Create temp dir with project skeleton (PROMPT.md, supabase/ dir)
|
||||
2. Symlink supabase skill into workspace (or skip for baseline)
|
||||
3. Run: claude -p "prompt" --cwd /tmp/eval-xxx
|
||||
4. Agent reads skill, creates migrations/code in the workspace
|
||||
5. Copy hidden EVAL.ts into workspace, run vitest
|
||||
6. Capture pass/fail
|
||||
```
|
||||
|
||||
## How It Works
|
||||
The agent is **Claude Code** invoked via `claude -p` (print mode). It operates
|
||||
on a real filesystem in a temp directory and can read/write files freely.
|
||||
|
||||
**Test cases are auto-extracted** from `skills/*/references/*.md`. The extractor
|
||||
(`dataset/extract.ts`) finds consecutive `**Incorrect:**` / `**Correct:**` code
|
||||
block pairs under `##` sections. Each pair becomes one test case.
|
||||
## Eval Structure
|
||||
|
||||
Five independent scorers evaluate each fix (0–1 scale):
|
||||
Each eval lives in `evals/{scenario-name}/`:
|
||||
|
||||
- **Correctness** — does the fix address the core issue?
|
||||
- **Completeness** — does the fix include all necessary changes?
|
||||
- **Best Practice** — does the fix follow Supabase conventions?
|
||||
- **Regression Safety** — does the fix avoid introducing new problems (broken
|
||||
functionality, removed security measures, new vulnerabilities)?
|
||||
- **Minimality** — is the fix tightly scoped to the identified issue without
|
||||
unnecessary rewrites or over-engineering?
|
||||
|
||||
Each model in the matrix generates a separate Braintrust experiment. The
|
||||
dashboard supports side-by-side comparison of experiments.
|
||||
|
||||
## Adding Test Cases
|
||||
|
||||
No code changes needed. Add paired Incorrect/Correct blocks to any skill
|
||||
reference file. The extractor picks them up automatically.
|
||||
|
||||
Required format in a reference `.md` file:
|
||||
|
||||
```markdown
|
||||
## Section Title
|
||||
|
||||
Explanation of the issue.
|
||||
|
||||
**Incorrect:**
|
||||
|
||||
\```sql
|
||||
-- bad code
|
||||
\```
|
||||
|
||||
**Correct:**
|
||||
|
||||
\```sql
|
||||
-- good code
|
||||
\```
|
||||
```
|
||||
evals/auth-rls-new-project/
|
||||
PROMPT.md # Task description (visible to agent)
|
||||
EVAL.ts # Vitest assertions (hidden from agent during run)
|
||||
package.json # Minimal project manifest
|
||||
supabase/
|
||||
config.toml # Pre-initialized supabase config
|
||||
migrations/ # Empty — agent creates files here
|
||||
```
|
||||
|
||||
Rules:
|
||||
|
||||
- Pairs must be consecutive — an Incorrect block immediately followed by a
|
||||
Correct block
|
||||
- Labels are matched case-insensitively. Bad labels: `Incorrect`, `Wrong`, `Bad`.
|
||||
Good labels: `Correct`, `Good`, `Usage`, `Implementation`, `Example`,
|
||||
`Recommended`
|
||||
- The optional parenthetical in the label becomes the `description` field:
|
||||
`**Incorrect (missing RLS):**`
|
||||
- Files prefixed with `_` (like `_sections.md`, `_template.md`) are skipped
|
||||
- Each pair gets an ID like `supabase/db-rls-mandatory#0` (skill/filename#index)
|
||||
|
||||
## Adding/Removing Models
|
||||
|
||||
Edit the `EVAL_MODELS` array in `src/models.config.ts`:
|
||||
|
||||
```typescript
|
||||
export const EVAL_MODELS: EvalModelConfig[] = [
|
||||
{ id: "claude-sonnet-4-5-20250929", label: "Claude Sonnet 4.5", provider: "anthropic", ci: true },
|
||||
{ id: "gpt-5.3", label: "GPT 5.3", provider: "openai", ci: true },
|
||||
// Add new models here
|
||||
];
|
||||
```
|
||||
|
||||
Provider API keys must be configured in the Braintrust dashboard under
|
||||
Settings → AI providers.
|
||||
**EVAL.ts** is never copied to the workspace until after the agent finishes.
|
||||
This prevents the agent from "teaching to the test."
|
||||
|
||||
## Running Evals
|
||||
|
||||
```bash
|
||||
# Run all models locally (no Braintrust upload)
|
||||
# Run all scenarios with Claude Sonnet 4.5 (default)
|
||||
mise run eval
|
||||
|
||||
# Run a single model
|
||||
mise run eval:model model=claude-sonnet-4-5-20250929
|
||||
# Run a specific scenario
|
||||
EVAL_SCENARIO=auth-rls-new-project mise run eval
|
||||
|
||||
# Run and upload to Braintrust dashboard
|
||||
mise run eval:upload
|
||||
# Override model
|
||||
EVAL_MODEL=claude-opus-4-6 mise run eval
|
||||
|
||||
# Run with baseline comparison (with-skill vs without-skill)
|
||||
EVAL_BASELINE=true mise run eval
|
||||
```
|
||||
|
||||
Or directly:
|
||||
|
||||
```bash
|
||||
cd packages/evals
|
||||
npx tsx src/runner.ts
|
||||
|
||||
# Local run (all models)
|
||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||
|
||||
# Single model
|
||||
EVAL_MODEL=claude-sonnet-4-5-20250929 npx braintrust eval --no-send-logs src/code-fix.eval.ts
|
||||
|
||||
# Filter to one test case (across all models)
|
||||
npx braintrust eval --no-send-logs src/code-fix.eval.ts --filter 'input.testCase.id=db-migrations-idempotent'
|
||||
# Single scenario with baseline
|
||||
EVAL_SCENARIO=auth-rls-new-project EVAL_BASELINE=true npx tsx src/runner.ts
|
||||
```
|
||||
|
||||
## Baseline Comparison
|
||||
|
||||
Set `EVAL_BASELINE=true` to run each scenario twice:
|
||||
|
||||
- **With skill**: The supabase skill is symlinked into the workspace. Claude
|
||||
Code discovers it and uses reference files for guidance.
|
||||
- **Baseline**: No skill available. The agent relies on innate knowledge.
|
||||
|
||||
Compare pass rates to measure how much the skill improves agent output.
|
||||
|
||||
## Adding Scenarios
|
||||
|
||||
1. Create `evals/{scenario-name}/` with `PROMPT.md`, `EVAL.ts`, `package.json`
|
||||
2. Add any starter files the agent should see (e.g., `supabase/config.toml`)
|
||||
3. Write vitest assertions in `EVAL.ts` that check the agent's output files
|
||||
4. Document the scenario in `scenarios/SCENARIOS.md`
|
||||
|
||||
## Environment
|
||||
|
||||
API keys are loaded by mise from `packages/evals/.env` (configured in root
|
||||
`mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
|
||||
|
||||
```
|
||||
BRAINTRUST_API_KEY=... # Required: proxy routing + dashboard upload
|
||||
BRAINTRUST_PROJECT_ID=... # Required: Braintrust project identifier
|
||||
ANTHROPIC_API_KEY=sk-ant-... # Required: judge model (Claude Opus 4.6)
|
||||
ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication
|
||||
EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929)
|
||||
EVAL_SCENARIO=... # Optional: run single scenario
|
||||
EVAL_BASELINE=true # Optional: run baseline comparison
|
||||
BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust
|
||||
```
|
||||
|
||||
Optional overrides:
|
||||
## Key Files
|
||||
|
||||
```
|
||||
EVAL_MODEL=claude-sonnet-4-5-20250929 # Run only this model (skips matrix)
|
||||
EVAL_JUDGE_MODEL=claude-opus-4-6 # Judge model for scorers
|
||||
src/
|
||||
runner.ts # Main orchestrator
|
||||
types.ts # Core interfaces
|
||||
runner/
|
||||
scaffold.ts # Creates temp workspace from eval template
|
||||
agent.ts # Invokes claude -p as subprocess
|
||||
test.ts # Runs vitest EVAL.ts against workspace
|
||||
results.ts # Collects results and prints summary
|
||||
evals/
|
||||
auth-rls-new-project/ # Scenario 1
|
||||
scenarios/
|
||||
SCENARIOS.md # Scenario descriptions
|
||||
```
|
||||
|
||||
## Modifying Prompts
|
||||
|
||||
- `src/prompts/code-fix.ts` — what the eval model sees
|
||||
- `src/scorer.ts` — judge prompts for each scorer dimension
|
||||
|
||||
Temperature settings:
|
||||
|
||||
- Eval model: `0.2` (in `code-fix.eval.ts`)
|
||||
- Judge model: `0.1` (in `scorer.ts`)
|
||||
|
||||
## Modifying Scoring
|
||||
|
||||
Each scorer in `src/scorer.ts` is independent. To add a new dimension:
|
||||
|
||||
1. Create a new `EvalScorer` function in `scorer.ts`
|
||||
2. Add it to the `scores` array in `code-fix.eval.ts`
|
||||
|
||||
@@ -1,46 +1,51 @@
|
||||
# Evals
|
||||
|
||||
LLM evaluation system for Supabase agent skills, powered by [Braintrust](https://www.braintrust.dev/). Tests whether models can correctly apply Supabase best practices using skill documentation as context.
|
||||
Agent evaluation system for Supabase skills. Tests whether AI agents (starting
|
||||
with Claude Code) correctly implement Supabase tasks when given access to skill
|
||||
documentation.
|
||||
|
||||
## How It Works
|
||||
|
||||
Each eval follows a two-step **LLM-as-judge** pattern orchestrated by Braintrust's `Eval()`:
|
||||
Each eval is a self-contained project directory with a task prompt. The agent
|
||||
works on it autonomously, then hidden vitest assertions check the result.
|
||||
Binary pass/fail.
|
||||
|
||||
1. **Generate** — The eval model (e.g. Sonnet 4.5) receives a prompt with skill context and produces a code fix.
|
||||
2. **Judge** — Three independent scorers using a stronger model (Opus 4.6 by default) evaluate the fix via the Vercel AI SDK with structured output.
|
||||
|
||||
Test cases are extracted automatically from skill reference files (`skills/*/references/*.md`). Each file contains paired **Incorrect** / **Correct** code blocks — the model receives the bad code and must produce the fix.
|
||||
|
||||
**Scoring dimensions (each 0–1):**
|
||||
|
||||
| Scorer | Description |
|
||||
|--------|-------------|
|
||||
| Correctness | Does the fix address the core issue? |
|
||||
| Completeness | Does it include all necessary changes? |
|
||||
| Best Practice | Does it follow Supabase best practices? |
|
||||
```
|
||||
1. Create temp workspace from eval template
|
||||
2. Agent (claude -p) reads prompt and creates files
|
||||
3. Hidden EVAL.ts runs vitest assertions against the output
|
||||
4. Pass/fail
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Run locally (no Braintrust upload)
|
||||
# Run all scenarios
|
||||
mise run eval
|
||||
|
||||
# Run and upload to Braintrust dashboard
|
||||
mise run eval:upload
|
||||
# Run a specific scenario
|
||||
EVAL_SCENARIO=auth-rls-new-project mise run eval
|
||||
|
||||
# Run with baseline comparison (with-skill vs without-skill)
|
||||
EVAL_BASELINE=true mise run eval
|
||||
|
||||
# Override model
|
||||
EVAL_MODEL=claude-opus-4-6 mise run eval
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
API keys are loaded via mise from `packages/evals/.env` (see root `mise.toml`).
|
||||
|
||||
```
|
||||
ANTHROPIC_API_KEY Required: eval model + judge model
|
||||
BRAINTRUST_API_KEY Required for Braintrust dashboard upload
|
||||
BRAINTRUST_PROJECT_ID Required for Braintrust dashboard upload
|
||||
EVAL_MODEL Override default eval model (claude-sonnet-4-5-20250929)
|
||||
EVAL_JUDGE_MODEL Override default judge model (claude-opus-4-6)
|
||||
ANTHROPIC_API_KEY Required: Claude Code authentication
|
||||
EVAL_MODEL Override model (default: claude-sonnet-4-5-20250929)
|
||||
EVAL_SCENARIO Run single scenario by name
|
||||
EVAL_BASELINE=true Run baseline comparison (no skill)
|
||||
```
|
||||
|
||||
## Adding Test Cases
|
||||
## Adding Scenarios
|
||||
|
||||
Add paired Incorrect/Correct code blocks to any skill reference file. The extractor picks them up automatically on the next run.
|
||||
1. Create `evals/{name}/` with `PROMPT.md`, `EVAL.ts`, and starter files
|
||||
2. Write vitest assertions in `EVAL.ts`
|
||||
3. Document in `scenarios/SCENARIOS.md`
|
||||
|
||||
See [AGENTS.md](AGENTS.md) for full details.
|
||||
|
||||
107
packages/evals/evals/auth-rls-new-project/EVAL.ts
Normal file
107
packages/evals/evals/auth-rls-new-project/EVAL.ts
Normal file
@@ -0,0 +1,107 @@
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { expect, test } from "vitest";
|
||||
|
||||
const supabaseDir = join(process.cwd(), "supabase");
|
||||
const migrationsDir = join(supabaseDir, "migrations");
|
||||
|
||||
/** Find the first .sql migration file (agent may name it differently). */
|
||||
function findMigrationFile(): string | null {
|
||||
if (!existsSync(migrationsDir)) return null;
|
||||
const files = readdirSync(migrationsDir).filter((f) => f.endsWith(".sql"));
|
||||
return files.length > 0 ? join(migrationsDir, files[0]) : null;
|
||||
}
|
||||
|
||||
function getMigrationSQL(): string {
|
||||
const file = findMigrationFile();
|
||||
if (!file) throw new Error("No migration file found in supabase/migrations/");
|
||||
return readFileSync(file, "utf-8");
|
||||
}
|
||||
|
||||
test("supabase project initialized (config.toml exists)", () => {
|
||||
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true);
|
||||
});
|
||||
|
||||
test("migration file exists in supabase/migrations/", () => {
|
||||
expect(findMigrationFile()).not.toBeNull();
|
||||
});
|
||||
|
||||
test("creates tasks table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table/);
|
||||
expect(sql).toMatch(/tasks/);
|
||||
});
|
||||
|
||||
test("enables RLS on tasks table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/alter\s+table.*tasks.*enable\s+row\s+level\s+security/);
|
||||
});
|
||||
|
||||
test("has foreign key to auth.users", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
});
|
||||
|
||||
test("uses ON DELETE CASCADE for auth FK", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("uses (select auth.uid()) not bare auth.uid() in policies", () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
// The subselect form: (select auth.uid())
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("policies use TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
expect(policy).toMatch(/to\s+authenticated/);
|
||||
}
|
||||
});
|
||||
|
||||
test("uses timestamptz not plain timestamp for time columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
// Only fail if the migration defines time columns with plain timestamp
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("due_date")
|
||||
) {
|
||||
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||
}
|
||||
});
|
||||
|
||||
test("creates index on user_id column", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
expect(sql).toMatch(/user_id/);
|
||||
});
|
||||
|
||||
test("migration is idempotent (uses IF NOT EXISTS)", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("overall quality: demonstrates Supabase best practices", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// A high-quality migration should contain most of these patterns
|
||||
const signals = [
|
||||
/enable\s+row\s+level\s+security/,
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
/to\s+authenticated/,
|
||||
/on\s+delete\s+cascade/,
|
||||
/create\s+index/,
|
||||
];
|
||||
const matches = signals.filter((r) => r.test(sql));
|
||||
expect(matches.length).toBeGreaterThanOrEqual(4);
|
||||
});
|
||||
16
packages/evals/evals/auth-rls-new-project/PROMPT.md
Normal file
16
packages/evals/evals/auth-rls-new-project/PROMPT.md
Normal file
@@ -0,0 +1,16 @@
|
||||
I'm starting a new Supabase project from scratch for a task management app. Users should sign up with email/password, and each user should only see their own tasks.
|
||||
|
||||
Set up the project:
|
||||
|
||||
1. Initialize the Supabase project with the CLI (`npx supabase init`)
|
||||
2. Start the local Supabase stack (`npx supabase start`)
|
||||
3. Create a SQL migration for a tasks table with columns: title (text), description (text), status (text), and due_date
|
||||
|
||||
The migration must:
|
||||
|
||||
- Create the tasks table with proper column types
|
||||
- Link tasks to authenticated users
|
||||
- Enable Row Level Security
|
||||
- Create policies so users can only CRUD their own tasks
|
||||
- Add appropriate indexes
|
||||
- Be idempotent (safe to run multiple times)
|
||||
5
packages/evals/evals/auth-rls-new-project/package.json
Normal file
5
packages/evals/evals/auth-rls-new-project/package.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "auth-rls-new-project",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
1216
packages/evals/package-lock.json
generated
1216
packages/evals/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -4,21 +4,18 @@
|
||||
"type": "module",
|
||||
"author": "Supabase",
|
||||
"license": "MIT",
|
||||
"description": "LLM evaluation system for Supabase agent skills",
|
||||
"description": "Agent evaluation system for Supabase skills",
|
||||
"scripts": {
|
||||
"eval": "braintrust eval --no-send-logs src/code-fix.eval.ts",
|
||||
"eval:upload": "braintrust eval src/code-fix.eval.ts"
|
||||
"eval": "tsx src/runner.ts",
|
||||
"eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ai-sdk/anthropic": "^3.0.44",
|
||||
"@ai-sdk/openai": "^3.0.29",
|
||||
"ai": "^6.0.86",
|
||||
"braintrust": "^1.0.2",
|
||||
"zod": "^3.23.0"
|
||||
"braintrust": "^1.0.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.10.0",
|
||||
"tsx": "^4.7.0",
|
||||
"typescript": "^5.3.0"
|
||||
"typescript": "^5.3.0",
|
||||
"vitest": "^3.1.0"
|
||||
}
|
||||
}
|
||||
|
||||
51
packages/evals/scenarios/SCENARIOS.md
Normal file
51
packages/evals/scenarios/SCENARIOS.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# Supabase Skills Eval Scenarios
|
||||
|
||||
## Scenario 1: auth-rls-new-project
|
||||
|
||||
**Description:** Set up a new Supabase project from scratch and add
|
||||
authentication with RLS. The agent must initialize the project with the CLI,
|
||||
start the local Supabase stack, then create a tasks table with proper security
|
||||
(RLS policies, auth FK, indexes) in a single idempotent migration.
|
||||
|
||||
**Setup:** The workspace starts empty (no `supabase/` directory). The agent is
|
||||
expected to run `npx supabase init` and `npx supabase start` before creating
|
||||
the migration.
|
||||
|
||||
**Expected skill files read:**
|
||||
|
||||
- `SKILL.md` (skill body with reference file index)
|
||||
- `references/dev-getting-started.md`
|
||||
- `references/db-rls-mandatory.md`
|
||||
- `references/db-rls-policy-types.md`
|
||||
- `references/db-rls-common-mistakes.md`
|
||||
- `references/db-schema-auth-fk.md`
|
||||
- `references/db-schema-timestamps.md`
|
||||
- `references/db-migrations-idempotent.md`
|
||||
|
||||
**Expected result:**
|
||||
|
||||
The agent initializes a Supabase project and creates a migration file that:
|
||||
|
||||
- Creates tasks table with `timestamptz` columns
|
||||
- Has `user_id` FK to `auth.users(id)` with `ON DELETE CASCADE`
|
||||
- Enables RLS (`ALTER TABLE tasks ENABLE ROW LEVEL SECURITY`)
|
||||
- Creates per-operation policies using `(select auth.uid())` with `TO authenticated`
|
||||
- Creates index on `user_id`
|
||||
- Uses `IF NOT EXISTS` for idempotency
|
||||
|
||||
**Scorer:** Binary pass/fail (12 vitest assertions)
|
||||
|
||||
| Test | What it checks |
|
||||
| --- | --- |
|
||||
| supabase project initialized | `supabase/config.toml` exists after agent runs |
|
||||
| migration file exists | Agent created a `.sql` file in `supabase/migrations/` |
|
||||
| creates tasks table | SQL contains `CREATE TABLE ... tasks` |
|
||||
| enables RLS | `ALTER TABLE tasks ENABLE ROW LEVEL SECURITY` |
|
||||
| FK to auth.users | `REFERENCES auth.users` |
|
||||
| ON DELETE CASCADE | Cascade delete on auth FK |
|
||||
| (select auth.uid()) | Subselect form in policies (performance) |
|
||||
| TO authenticated | Policies scoped to authenticated role |
|
||||
| timestamptz | No plain `timestamp` for time columns |
|
||||
| index on user_id | `CREATE INDEX` on the FK column |
|
||||
| IF NOT EXISTS | Idempotent migration |
|
||||
| overall quality | At least 4/5 best-practice signals present |
|
||||
@@ -1,82 +0,0 @@
|
||||
import assert from "node:assert";
|
||||
import { generateText } from "ai";
|
||||
import { Eval } from "braintrust";
|
||||
import { dataset } from "./dataset.js";
|
||||
import type { EvalModelConfig } from "./models.config.js";
|
||||
import { EVAL_MODELS } from "./models.config.js";
|
||||
import { getProxyModel } from "./models.js";
|
||||
import {
|
||||
buildCodeFixPrompt,
|
||||
buildCodeFixSystemPrompt,
|
||||
} from "./prompts/code-fix.js";
|
||||
import {
|
||||
bestPracticeScorer,
|
||||
completenessScorer,
|
||||
correctnessScorer,
|
||||
minimalityScorer,
|
||||
regressionSafetyScorer,
|
||||
} from "./scorer.js";
|
||||
|
||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||
assert(process.env.ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY is not set");
|
||||
|
||||
/**
|
||||
* Resolve which models to run based on environment:
|
||||
* - EVAL_MODEL set → single model only (local dev / cost control)
|
||||
* - CI without EVAL_ALL_MODELS → ci:true models only
|
||||
* - Otherwise → all models
|
||||
*/
|
||||
function getModelsToRun(): EvalModelConfig[] {
|
||||
const singleModel = process.env.EVAL_MODEL;
|
||||
if (singleModel) {
|
||||
const found = EVAL_MODELS.find((m) => m.id === singleModel);
|
||||
return [
|
||||
found ?? {
|
||||
id: singleModel,
|
||||
label: singleModel,
|
||||
provider: "unknown",
|
||||
ci: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
if (process.env.CI && !process.env.EVAL_ALL_MODELS) {
|
||||
return EVAL_MODELS.filter((m) => m.ci);
|
||||
}
|
||||
|
||||
return EVAL_MODELS;
|
||||
}
|
||||
|
||||
const models = getModelsToRun();
|
||||
|
||||
for (const modelConfig of models) {
|
||||
Eval("CodeFix", {
|
||||
experimentName: modelConfig.id,
|
||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||
trialCount: process.env.CI ? 3 : 1,
|
||||
metadata: {
|
||||
model: modelConfig.id,
|
||||
label: modelConfig.label,
|
||||
provider: modelConfig.provider,
|
||||
},
|
||||
data: () => dataset(),
|
||||
task: async (input) => {
|
||||
const model = getProxyModel(modelConfig.id);
|
||||
const response = await generateText({
|
||||
model,
|
||||
system: buildCodeFixSystemPrompt(),
|
||||
prompt: buildCodeFixPrompt(input.testCase),
|
||||
temperature: 0.2,
|
||||
maxRetries: 2,
|
||||
});
|
||||
return { llmOutput: response.text };
|
||||
},
|
||||
scores: [
|
||||
correctnessScorer,
|
||||
completenessScorer,
|
||||
bestPracticeScorer,
|
||||
regressionSafetyScorer,
|
||||
minimalityScorer,
|
||||
],
|
||||
});
|
||||
}
|
||||
@@ -1,51 +0,0 @@
|
||||
import type { EvalCase } from "braintrust";
|
||||
import { extractCodeFixDataset } from "./dataset/extract.js";
|
||||
import type { CodeFixTestCase } from "./dataset/types.js";
|
||||
|
||||
export type Input = { testCase: CodeFixTestCase };
|
||||
|
||||
export type Expected = {
|
||||
correctCode: string;
|
||||
correctLanguage?: string;
|
||||
};
|
||||
|
||||
export type Metadata = {
|
||||
name: string;
|
||||
skillName: string;
|
||||
section: string;
|
||||
referenceFile: string;
|
||||
tags: string[];
|
||||
};
|
||||
|
||||
export type Output = { llmOutput: string };
|
||||
|
||||
/**
|
||||
* Extract the feature category from a reference filename.
|
||||
* e.g. "db-migrations-idempotent.md" → "db"
|
||||
* "auth-core-sessions.md" → "auth"
|
||||
*/
|
||||
function featureCategory(filename: string): string {
|
||||
return filename.replace(/\.md$/, "").split("-")[0];
|
||||
}
|
||||
|
||||
export function dataset(): EvalCase<Input, Expected, Metadata>[] {
|
||||
return extractCodeFixDataset().map((tc) => ({
|
||||
id: tc.id,
|
||||
input: { testCase: tc },
|
||||
tags: [
|
||||
featureCategory(tc.referenceFilename),
|
||||
tc.referenceFilename.replace(/\.md$/, ""),
|
||||
],
|
||||
expected: {
|
||||
correctCode: tc.goodExample.code,
|
||||
correctLanguage: tc.goodExample.language,
|
||||
},
|
||||
metadata: {
|
||||
name: tc.title,
|
||||
skillName: tc.skillName,
|
||||
section: tc.section,
|
||||
referenceFile: tc.referenceFilename,
|
||||
tags: tc.tags,
|
||||
},
|
||||
}));
|
||||
}
|
||||
@@ -1,277 +0,0 @@
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { basename, join, resolve } from "node:path";
|
||||
import type { CodeFixTestCase } from "./types.js";
|
||||
|
||||
function findSkillsRoot(): string {
|
||||
let dir = process.cwd();
|
||||
for (let i = 0; i < 10; i++) {
|
||||
const candidate = join(dir, "skills");
|
||||
if (existsSync(candidate)) return candidate;
|
||||
const parent = resolve(dir, "..");
|
||||
if (parent === dir) break;
|
||||
dir = parent;
|
||||
}
|
||||
throw new Error(
|
||||
"Could not find skills/ directory. Run from the repository root or a subdirectory.",
|
||||
);
|
||||
}
|
||||
|
||||
const SKILLS_ROOT = findSkillsRoot();
|
||||
|
||||
// --- Duplicated from skills-build/src/parser.ts for isolation ---
|
||||
|
||||
interface CodeExample {
|
||||
label: string;
|
||||
description?: string;
|
||||
code: string;
|
||||
language?: string;
|
||||
}
|
||||
|
||||
function parseFrontmatter(content: string): {
|
||||
frontmatter: Record<string, string>;
|
||||
body: string;
|
||||
} {
|
||||
const frontmatter: Record<string, string> = {};
|
||||
|
||||
if (!content.startsWith("---")) {
|
||||
return { frontmatter, body: content };
|
||||
}
|
||||
|
||||
const endIndex = content.indexOf("---", 3);
|
||||
if (endIndex === -1) {
|
||||
return { frontmatter, body: content };
|
||||
}
|
||||
|
||||
const frontmatterContent = content.slice(3, endIndex).trim();
|
||||
const body = content.slice(endIndex + 3).trim();
|
||||
|
||||
for (const line of frontmatterContent.split("\n")) {
|
||||
const colonIndex = line.indexOf(":");
|
||||
if (colonIndex === -1) continue;
|
||||
|
||||
const key = line.slice(0, colonIndex).trim();
|
||||
let value = line.slice(colonIndex + 1).trim();
|
||||
|
||||
if (
|
||||
(value.startsWith('"') && value.endsWith('"')) ||
|
||||
(value.startsWith("'") && value.endsWith("'"))
|
||||
) {
|
||||
value = value.slice(1, -1);
|
||||
}
|
||||
|
||||
frontmatter[key] = value;
|
||||
}
|
||||
|
||||
return { frontmatter, body };
|
||||
}
|
||||
|
||||
function extractTitle(body: string): string | null {
|
||||
const match = body.match(/^##\s+(.+)$/m);
|
||||
return match ? match[1].trim() : null;
|
||||
}
|
||||
|
||||
interface Section {
|
||||
title: string;
|
||||
explanation: string;
|
||||
examples: CodeExample[];
|
||||
}
|
||||
|
||||
function extractSections(body: string): Section[] {
|
||||
const sections: Section[] = [];
|
||||
const lines = body.split("\n");
|
||||
|
||||
let currentTitle = "";
|
||||
let explanationLines: string[] = [];
|
||||
let currentExamples: CodeExample[] = [];
|
||||
let currentLabel = "";
|
||||
let currentDescription = "";
|
||||
let inCodeBlock = false;
|
||||
let codeBlockLang = "";
|
||||
let codeBlockContent: string[] = [];
|
||||
let collectingExplanation = false;
|
||||
|
||||
function flushExample() {
|
||||
if (currentLabel && codeBlockContent.length > 0) {
|
||||
currentExamples.push({
|
||||
label: currentLabel,
|
||||
description: currentDescription || undefined,
|
||||
code: codeBlockContent.join("\n"),
|
||||
language: codeBlockLang || undefined,
|
||||
});
|
||||
}
|
||||
currentLabel = "";
|
||||
currentDescription = "";
|
||||
codeBlockContent = [];
|
||||
codeBlockLang = "";
|
||||
}
|
||||
|
||||
function flushSection() {
|
||||
if (currentTitle && currentExamples.length > 0) {
|
||||
sections.push({
|
||||
title: currentTitle,
|
||||
explanation: explanationLines.join("\n").trim(),
|
||||
examples: currentExamples,
|
||||
});
|
||||
}
|
||||
currentExamples = [];
|
||||
explanationLines = [];
|
||||
}
|
||||
|
||||
for (const line of lines) {
|
||||
if (line.startsWith("## ") && !inCodeBlock) {
|
||||
flushExample();
|
||||
flushSection();
|
||||
currentTitle = line.replace(/^##\s+/, "").trim();
|
||||
collectingExplanation = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
const labelMatch = line.match(
|
||||
/^\*\*([^*]+?)(?:\s*\(([^)]+)\))?\s*:\*\*\s*$/,
|
||||
);
|
||||
if (labelMatch && !inCodeBlock) {
|
||||
collectingExplanation = false;
|
||||
flushExample();
|
||||
currentLabel = labelMatch[1].trim();
|
||||
currentDescription = labelMatch[2]?.trim() || "";
|
||||
continue;
|
||||
}
|
||||
|
||||
if (line.startsWith("```") && !inCodeBlock) {
|
||||
collectingExplanation = false;
|
||||
inCodeBlock = true;
|
||||
codeBlockLang = line.slice(3).trim();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (line.startsWith("```") && inCodeBlock) {
|
||||
inCodeBlock = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (inCodeBlock) {
|
||||
codeBlockContent.push(line);
|
||||
} else if (collectingExplanation) {
|
||||
explanationLines.push(line);
|
||||
}
|
||||
}
|
||||
|
||||
flushExample();
|
||||
flushSection();
|
||||
|
||||
return sections;
|
||||
}
|
||||
|
||||
// --- Duplicated from skills-build/src/validate.ts ---
|
||||
|
||||
function isBadExample(label: string): boolean {
|
||||
const lower = label.toLowerCase();
|
||||
return (
|
||||
lower.includes("incorrect") ||
|
||||
lower.includes("wrong") ||
|
||||
lower.includes("bad")
|
||||
);
|
||||
}
|
||||
|
||||
function isGoodExample(label: string): boolean {
|
||||
const lower = label.toLowerCase();
|
||||
return (
|
||||
lower.includes("correct") ||
|
||||
lower.includes("good") ||
|
||||
lower.includes("usage") ||
|
||||
lower.includes("implementation") ||
|
||||
lower.includes("example") ||
|
||||
lower.includes("recommended")
|
||||
);
|
||||
}
|
||||
|
||||
// --- Extraction logic ---
|
||||
|
||||
function pairExamples(
|
||||
examples: CodeExample[],
|
||||
): Array<{ bad: CodeExample; good: CodeExample }> {
|
||||
const pairs: Array<{ bad: CodeExample; good: CodeExample }> = [];
|
||||
|
||||
for (let i = 0; i < examples.length - 1; i++) {
|
||||
if (
|
||||
isBadExample(examples[i].label) &&
|
||||
isGoodExample(examples[i + 1].label)
|
||||
) {
|
||||
pairs.push({ bad: examples[i], good: examples[i + 1] });
|
||||
}
|
||||
}
|
||||
|
||||
return pairs;
|
||||
}
|
||||
|
||||
function discoverSkillNames(): string[] {
|
||||
if (!existsSync(SKILLS_ROOT)) return [];
|
||||
|
||||
return readdirSync(SKILLS_ROOT, { withFileTypes: true })
|
||||
.filter((d) => d.isDirectory())
|
||||
.filter((d) => existsSync(join(SKILLS_ROOT, d.name, "SKILL.md")))
|
||||
.map((d) => d.name);
|
||||
}
|
||||
|
||||
function getMarkdownFiles(dir: string): string[] {
|
||||
if (!existsSync(dir)) return [];
|
||||
|
||||
return readdirSync(dir)
|
||||
.filter((f) => f.endsWith(".md") && !f.startsWith("_"))
|
||||
.map((f) => join(dir, f));
|
||||
}
|
||||
|
||||
export function extractCodeFixDataset(skillName?: string): CodeFixTestCase[] {
|
||||
const skills = skillName ? [skillName] : discoverSkillNames();
|
||||
const testCases: CodeFixTestCase[] = [];
|
||||
|
||||
for (const skill of skills) {
|
||||
const referencesDir = join(SKILLS_ROOT, skill, "references");
|
||||
const files = getMarkdownFiles(referencesDir);
|
||||
|
||||
for (const filePath of files) {
|
||||
const content = readFileSync(filePath, "utf-8");
|
||||
const { frontmatter, body } = parseFrontmatter(content);
|
||||
const fileTitle =
|
||||
frontmatter.title || extractTitle(body) || basename(filePath, ".md");
|
||||
const tags = frontmatter.tags?.split(",").map((t) => t.trim()) || [];
|
||||
const section = basename(filePath, ".md").split("-")[0];
|
||||
|
||||
const sections = extractSections(body);
|
||||
let pairIndex = 0;
|
||||
|
||||
for (const sec of sections) {
|
||||
const pairs = pairExamples(sec.examples);
|
||||
|
||||
for (const { bad, good } of pairs) {
|
||||
testCases.push({
|
||||
id: `${skill}/${basename(filePath, ".md")}#${pairIndex}`,
|
||||
skillName: skill,
|
||||
referenceFile: filePath,
|
||||
referenceFilename: basename(filePath),
|
||||
title: sec.title || fileTitle,
|
||||
explanation: sec.explanation,
|
||||
section,
|
||||
tags,
|
||||
pairIndex,
|
||||
badExample: {
|
||||
label: bad.label,
|
||||
description: bad.description,
|
||||
code: bad.code,
|
||||
language: bad.language,
|
||||
},
|
||||
goodExample: {
|
||||
label: good.label,
|
||||
description: good.description,
|
||||
code: good.code,
|
||||
language: good.language,
|
||||
},
|
||||
});
|
||||
pairIndex++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return testCases;
|
||||
}
|
||||
@@ -1,24 +0,0 @@
|
||||
export interface CodeFixTestCase {
|
||||
/** Unique ID, e.g. "supabase/db-rls-mandatory#0" */
|
||||
id: string;
|
||||
skillName: string;
|
||||
referenceFile: string;
|
||||
referenceFilename: string;
|
||||
title: string;
|
||||
explanation: string;
|
||||
section: string;
|
||||
tags: string[];
|
||||
pairIndex: number;
|
||||
badExample: {
|
||||
label: string;
|
||||
description?: string;
|
||||
code: string;
|
||||
language?: string;
|
||||
};
|
||||
goodExample: {
|
||||
label: string;
|
||||
description?: string;
|
||||
code: string;
|
||||
language?: string;
|
||||
};
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
export interface EvalModelConfig {
|
||||
/** Model ID passed to the Braintrust proxy */
|
||||
id: string;
|
||||
/** Human-readable label for dashboards */
|
||||
label: string;
|
||||
/** Provider name for display/grouping */
|
||||
provider: string;
|
||||
/** Whether to include in CI runs by default */
|
||||
ci: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Models to evaluate. Add/remove entries to change the eval matrix.
|
||||
* Set `ci: false` to exclude expensive models from automated CI runs.
|
||||
*/
|
||||
export const EVAL_MODELS: EvalModelConfig[] = [
|
||||
{
|
||||
id: "claude-sonnet-4-5-20250929",
|
||||
label: "Claude Sonnet 4.5",
|
||||
provider: "anthropic",
|
||||
ci: true,
|
||||
},
|
||||
{
|
||||
id: "gpt-5.3",
|
||||
label: "GPT 5.3",
|
||||
provider: "openai",
|
||||
ci: true,
|
||||
},
|
||||
{
|
||||
id: "gpt-5.2",
|
||||
label: "GPT 5.2",
|
||||
provider: "openai",
|
||||
ci: true,
|
||||
},
|
||||
{
|
||||
id: "gemini-3-pro",
|
||||
label: "Gemini 3.0 Pro",
|
||||
provider: "google",
|
||||
ci: true,
|
||||
},
|
||||
{
|
||||
id: "claude-opus-4-6",
|
||||
label: "Claude Opus 4.6",
|
||||
provider: "anthropic",
|
||||
ci: false,
|
||||
},
|
||||
];
|
||||
@@ -1,52 +0,0 @@
|
||||
import type { AnthropicProvider } from "@ai-sdk/anthropic";
|
||||
import { anthropic } from "@ai-sdk/anthropic";
|
||||
import { createOpenAI } from "@ai-sdk/openai";
|
||||
import type { LanguageModel } from "ai";
|
||||
|
||||
/** Model ID accepted by the Anthropic provider (string literal union + string). */
|
||||
export type AnthropicModelId = Parameters<AnthropicProvider["chat"]>[0];
|
||||
|
||||
/**
|
||||
* Braintrust AI proxy — routes to any provider (Anthropic, OpenAI, Google)
|
||||
* via a single OpenAI-compatible endpoint.
|
||||
*
|
||||
* Provider API keys are configured in the Braintrust dashboard at
|
||||
* project or org level. The x-bt-parent header scopes the request to
|
||||
* the project so project-level keys are resolved.
|
||||
*/
|
||||
const braintrustProxy = createOpenAI({
|
||||
baseURL: "https://api.braintrust.dev/v1/proxy",
|
||||
apiKey: process.env.BRAINTRUST_API_KEY ?? "",
|
||||
headers: process.env.BRAINTRUST_PROJECT_ID
|
||||
? { "x-bt-parent": `project_id:${process.env.BRAINTRUST_PROJECT_ID}` }
|
||||
: undefined,
|
||||
});
|
||||
|
||||
/**
|
||||
* Get a model for the eval task. Claude models use the Anthropic SDK
|
||||
* directly (via ANTHROPIC_API_KEY). All other models route through the
|
||||
* Braintrust proxy (keys configured at the org level in Braintrust).
|
||||
*/
|
||||
export function getProxyModel(modelId: string): LanguageModel {
|
||||
if (modelId.startsWith("claude")) {
|
||||
return anthropic(modelId as AnthropicModelId);
|
||||
}
|
||||
return braintrustProxy(modelId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a model using direct provider SDKs. Used for the judge model which
|
||||
* is always Claude and uses ANTHROPIC_API_KEY directly (no proxy).
|
||||
*/
|
||||
export function getModel(modelId: string): LanguageModel {
|
||||
if (modelId.startsWith("claude")) {
|
||||
return anthropic(modelId as AnthropicModelId);
|
||||
}
|
||||
|
||||
return getProxyModel(modelId);
|
||||
}
|
||||
|
||||
export function getJudgeModel(): LanguageModel {
|
||||
const judgeModelId = process.env.EVAL_JUDGE_MODEL || "claude-opus-4-6";
|
||||
return getModel(judgeModelId);
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
import type { CodeFixTestCase } from "../dataset/types.js";
|
||||
|
||||
export function buildCodeFixSystemPrompt(): string {
|
||||
return `You are a senior Supabase developer and database architect. You fix code to follow Supabase best practices including:
|
||||
- Row Level Security (RLS) policies
|
||||
- Proper authentication patterns
|
||||
- Safe migration workflows
|
||||
- Correct SDK usage patterns
|
||||
- Edge Function best practices
|
||||
- Connection pooling configuration
|
||||
- Security-first defaults
|
||||
|
||||
When fixing code, ensure the fix is complete, production-ready, and follows the latest Supabase conventions. Return only the corrected code inside a single code block.`;
|
||||
}
|
||||
|
||||
export function buildCodeFixPrompt(testCase: CodeFixTestCase): string {
|
||||
const langHint = testCase.badExample.language
|
||||
? ` (${testCase.badExample.language})`
|
||||
: "";
|
||||
|
||||
return `The following code has a problem related to: ${testCase.title}
|
||||
|
||||
Context: ${testCase.explanation}
|
||||
|
||||
Here is the problematic code${langHint}:
|
||||
|
||||
\`\`\`${testCase.badExample.language || ""}
|
||||
${testCase.badExample.code}
|
||||
\`\`\`
|
||||
${testCase.badExample.description ? `\nIssue hint: ${testCase.badExample.description}` : ""}
|
||||
|
||||
Fix this code to follow Supabase best practices. Return ONLY the corrected code inside a single code block. Do not include any explanation outside the code block.`;
|
||||
}
|
||||
189
packages/evals/src/runner.ts
Normal file
189
packages/evals/src/runner.ts
Normal file
@@ -0,0 +1,189 @@
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join, resolve } from "node:path";
|
||||
import { runAgent } from "./runner/agent.js";
|
||||
import { uploadToBraintrust } from "./runner/braintrust.js";
|
||||
import { preflight } from "./runner/preflight.js";
|
||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||
import { createWorkspace } from "./runner/scaffold.js";
|
||||
import { runTests } from "./runner/test.js";
|
||||
import type { EvalRunResult, EvalScenario } from "./types.js";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Configuration from environment
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const DEFAULT_MODEL = "claude-sonnet-4-5-20250929";
|
||||
const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes
|
||||
|
||||
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
|
||||
const scenarioFilter = process.env.EVAL_SCENARIO;
|
||||
const runBaseline = process.env.EVAL_BASELINE === "true";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Discover scenarios
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function findEvalsDir(): string {
|
||||
let dir = process.cwd();
|
||||
for (let i = 0; i < 10; i++) {
|
||||
const candidate = join(dir, "packages", "evals", "evals");
|
||||
if (existsSync(candidate)) return candidate;
|
||||
const parent = resolve(dir, "..");
|
||||
if (parent === dir) break;
|
||||
dir = parent;
|
||||
}
|
||||
throw new Error("Could not find packages/evals/evals/ directory");
|
||||
}
|
||||
|
||||
function discoverScenarios(): EvalScenario[] {
|
||||
const evalsDir = findEvalsDir();
|
||||
const dirs = readdirSync(evalsDir, { withFileTypes: true }).filter(
|
||||
(d) => d.isDirectory() && existsSync(join(evalsDir, d.name, "PROMPT.md")),
|
||||
);
|
||||
|
||||
return dirs.map((d) => ({
|
||||
id: d.name,
|
||||
name: d.name,
|
||||
tags: [],
|
||||
}));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Run a single eval
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async function runEval(
|
||||
scenario: EvalScenario,
|
||||
skillEnabled: boolean,
|
||||
): Promise<EvalRunResult> {
|
||||
const evalsDir = findEvalsDir();
|
||||
const evalDir = join(evalsDir, scenario.id);
|
||||
|
||||
console.log(
|
||||
`\n--- ${scenario.id} (${skillEnabled ? "with-skill" : "baseline"}) ---`,
|
||||
);
|
||||
|
||||
// 1. Create isolated workspace
|
||||
const { workspacePath, cleanup } = createWorkspace({
|
||||
evalDir,
|
||||
skillEnabled,
|
||||
});
|
||||
console.log(` Workspace: ${workspacePath}`);
|
||||
|
||||
try {
|
||||
// 2. Read the prompt
|
||||
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
|
||||
|
||||
// 3. Run the agent
|
||||
console.log(` Running agent (${model})...`);
|
||||
const agentResult = await runAgent({
|
||||
cwd: workspacePath,
|
||||
prompt,
|
||||
model,
|
||||
timeout: AGENT_TIMEOUT,
|
||||
skillEnabled,
|
||||
});
|
||||
console.log(
|
||||
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
|
||||
);
|
||||
|
||||
// 4. Run the hidden tests
|
||||
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
|
||||
? join(evalDir, "EVAL.tsx")
|
||||
: join(evalDir, "EVAL.ts");
|
||||
|
||||
console.log(" Running tests...");
|
||||
const testResult = await runTests({
|
||||
workspacePath,
|
||||
evalFilePath,
|
||||
});
|
||||
console.log(
|
||||
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
|
||||
);
|
||||
|
||||
// 5. Collect modified files
|
||||
const filesModified = listModifiedFiles(workspacePath, evalDir);
|
||||
|
||||
return {
|
||||
scenario: scenario.id,
|
||||
agent: "claude-code",
|
||||
model,
|
||||
skillEnabled,
|
||||
status: testResult.passed ? "passed" : "failed",
|
||||
duration: agentResult.duration,
|
||||
testOutput: testResult.output,
|
||||
agentOutput: agentResult.output,
|
||||
testsPassed: testResult.passedCount,
|
||||
testsTotal: testResult.totalCount,
|
||||
filesModified,
|
||||
};
|
||||
} catch (error) {
|
||||
const err = error as Error;
|
||||
return {
|
||||
scenario: scenario.id,
|
||||
agent: "claude-code",
|
||||
model,
|
||||
skillEnabled,
|
||||
status: "error",
|
||||
duration: 0,
|
||||
testOutput: "",
|
||||
agentOutput: "",
|
||||
testsPassed: 0,
|
||||
testsTotal: 0,
|
||||
filesModified: [],
|
||||
error: err.message,
|
||||
};
|
||||
} finally {
|
||||
cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async function main() {
|
||||
preflight();
|
||||
|
||||
console.log("Supabase Skills Evals");
|
||||
console.log(`Model: ${model}`);
|
||||
console.log(`Baseline: ${runBaseline}`);
|
||||
|
||||
let scenarios = discoverScenarios();
|
||||
|
||||
if (scenarioFilter) {
|
||||
scenarios = scenarios.filter((s) => s.id === scenarioFilter);
|
||||
if (scenarios.length === 0) {
|
||||
console.error(`Scenario not found: ${scenarioFilter}`);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
|
||||
|
||||
const results: EvalRunResult[] = [];
|
||||
|
||||
for (const scenario of scenarios) {
|
||||
// Run with skill enabled
|
||||
const withSkill = await runEval(scenario, true);
|
||||
results.push(withSkill);
|
||||
|
||||
// Optionally run baseline (no skill)
|
||||
if (runBaseline) {
|
||||
const baseline = await runEval(scenario, false);
|
||||
results.push(baseline);
|
||||
}
|
||||
}
|
||||
|
||||
printSummary(results);
|
||||
|
||||
if (process.env.BRAINTRUST_UPLOAD === "true") {
|
||||
console.log("\nUploading to Braintrust...");
|
||||
await uploadToBraintrust(results);
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error("Fatal error:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
82
packages/evals/src/runner/agent.ts
Normal file
82
packages/evals/src/runner/agent.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
import { spawn } from "node:child_process";
|
||||
|
||||
export interface AgentRunResult {
|
||||
output: string;
|
||||
duration: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Invoke Claude Code in print mode as a subprocess.
|
||||
*
|
||||
* The agent operates in the workspace directory and can read/write files.
|
||||
* When the skill is installed (symlinked into workspace), Claude Code
|
||||
* discovers it automatically and uses it for guidance.
|
||||
*/
|
||||
export async function runAgent(opts: {
|
||||
cwd: string;
|
||||
prompt: string;
|
||||
model: string;
|
||||
timeout: number;
|
||||
skillEnabled: boolean;
|
||||
}): Promise<AgentRunResult> {
|
||||
const start = Date.now();
|
||||
|
||||
const args = [
|
||||
"-p", // Print mode (non-interactive)
|
||||
"--output-format",
|
||||
"text",
|
||||
"--model",
|
||||
opts.model,
|
||||
"--no-session-persistence",
|
||||
"--dangerously-skip-permissions",
|
||||
"--tools",
|
||||
"Edit,Write,Bash,Read,Glob,Grep",
|
||||
];
|
||||
|
||||
// Disable skills for baseline runs so the agent relies on innate knowledge
|
||||
if (!opts.skillEnabled) {
|
||||
args.push("--disable-slash-commands");
|
||||
}
|
||||
|
||||
const env = { ...process.env };
|
||||
// Remove all Claude-related env vars to avoid nested-session detection
|
||||
for (const key of Object.keys(env)) {
|
||||
if (key === "CLAUDECODE" || key.startsWith("CLAUDE_")) {
|
||||
delete env[key];
|
||||
}
|
||||
}
|
||||
|
||||
return new Promise<AgentRunResult>((resolve) => {
|
||||
const child = spawn("claude", args, {
|
||||
cwd: opts.cwd,
|
||||
env,
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
// Pipe prompt via stdin and close — this is the standard way to
|
||||
// pass multi-line prompts to `claude -p`.
|
||||
child.stdin.write(opts.prompt);
|
||||
child.stdin.end();
|
||||
|
||||
let stdout = "";
|
||||
let stderr = "";
|
||||
child.stdout.on("data", (d: Buffer) => {
|
||||
stdout += d.toString();
|
||||
});
|
||||
child.stderr.on("data", (d: Buffer) => {
|
||||
stderr += d.toString();
|
||||
});
|
||||
|
||||
const timer = setTimeout(() => {
|
||||
child.kill();
|
||||
}, opts.timeout);
|
||||
|
||||
child.on("close", () => {
|
||||
clearTimeout(timer);
|
||||
resolve({
|
||||
output: stdout || stderr,
|
||||
duration: Date.now() - start,
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
53
packages/evals/src/runner/braintrust.ts
Normal file
53
packages/evals/src/runner/braintrust.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
import assert from "node:assert";
|
||||
import { init } from "braintrust";
|
||||
import type { EvalRunResult } from "../types.js";
|
||||
|
||||
/**
|
||||
* Upload eval results to Braintrust as an experiment.
|
||||
*
|
||||
* Each EvalRunResult becomes a row in the experiment with:
|
||||
* - input: scenario name + config
|
||||
* - output: agent output summary
|
||||
* - scores: pass (0 or 1)
|
||||
* - metadata: model, skill toggle, duration, files modified
|
||||
*/
|
||||
export async function uploadToBraintrust(
|
||||
results: EvalRunResult[],
|
||||
): Promise<void> {
|
||||
assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
|
||||
assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
|
||||
|
||||
const experiment = await init({
|
||||
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
||||
});
|
||||
|
||||
for (const r of results) {
|
||||
experiment.log({
|
||||
input: {
|
||||
scenario: r.scenario,
|
||||
skillEnabled: r.skillEnabled,
|
||||
},
|
||||
output: {
|
||||
status: r.status,
|
||||
filesModified: r.filesModified,
|
||||
testOutput: r.testOutput,
|
||||
},
|
||||
scores: {
|
||||
pass: r.status === "passed" ? 1 : 0,
|
||||
},
|
||||
metadata: {
|
||||
agent: r.agent,
|
||||
model: r.model,
|
||||
skillEnabled: r.skillEnabled,
|
||||
duration: r.duration,
|
||||
testsPassed: r.testsPassed,
|
||||
testsTotal: r.testsTotal,
|
||||
...(r.error ? { error: r.error } : {}),
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
const summary = await experiment.summarize();
|
||||
console.log(`\nBraintrust experiment: ${summary.experimentUrl}`);
|
||||
await experiment.close();
|
||||
}
|
||||
43
packages/evals/src/runner/preflight.ts
Normal file
43
packages/evals/src/runner/preflight.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
import { execFileSync } from "node:child_process";
|
||||
|
||||
/**
|
||||
* Verify the host environment has everything needed before spending
|
||||
* API credits on an eval run.
|
||||
*
|
||||
* Checks: Node >= 20, Docker running, claude CLI available.
|
||||
*/
|
||||
export function preflight(): void {
|
||||
const errors: string[] = [];
|
||||
|
||||
// Node.js >= 20
|
||||
const [major] = process.versions.node.split(".").map(Number);
|
||||
if (major < 20) {
|
||||
errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
|
||||
}
|
||||
|
||||
// Docker daemon running
|
||||
try {
|
||||
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
|
||||
} catch {
|
||||
errors.push("Docker is not running (required by supabase CLI)");
|
||||
}
|
||||
|
||||
// Claude CLI available
|
||||
try {
|
||||
execFileSync("claude", ["--version"], {
|
||||
stdio: "ignore",
|
||||
timeout: 10_000,
|
||||
});
|
||||
} catch {
|
||||
errors.push("claude CLI not found on PATH");
|
||||
}
|
||||
|
||||
if (errors.length > 0) {
|
||||
console.error("Preflight checks failed:\n");
|
||||
for (const e of errors) {
|
||||
console.error(` - ${e}`);
|
||||
}
|
||||
console.error("");
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
68
packages/evals/src/runner/results.ts
Normal file
68
packages/evals/src/runner/results.ts
Normal file
@@ -0,0 +1,68 @@
|
||||
import { mkdirSync, readdirSync, statSync, writeFileSync } from "node:fs";
|
||||
import { join, resolve } from "node:path";
|
||||
import type { EvalRunResult } from "../types.js";
|
||||
|
||||
/**
|
||||
* List files created or modified by the agent in the workspace.
|
||||
* Compares against the original eval directory to find new files.
|
||||
*/
|
||||
export function listModifiedFiles(
|
||||
workspacePath: string,
|
||||
originalEvalDir: string,
|
||||
): string[] {
|
||||
const modified: string[] = [];
|
||||
|
||||
function walk(dir: string, prefix: string) {
|
||||
const entries = readdirSync(dir, { withFileTypes: true });
|
||||
for (const entry of entries) {
|
||||
if (
|
||||
entry.name === "node_modules" ||
|
||||
entry.name === "skills" ||
|
||||
entry.name === "EVAL.ts" ||
|
||||
entry.name === "EVAL.tsx"
|
||||
)
|
||||
continue;
|
||||
|
||||
const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
|
||||
const fullPath = join(dir, entry.name);
|
||||
|
||||
if (entry.isDirectory()) {
|
||||
walk(fullPath, relPath);
|
||||
} else {
|
||||
// Check if file is new (not in original eval dir)
|
||||
const originalPath = join(originalEvalDir, relPath);
|
||||
try {
|
||||
statSync(originalPath);
|
||||
} catch {
|
||||
// File doesn't exist in original — it was created by the agent
|
||||
modified.push(relPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
walk(workspacePath, "");
|
||||
return modified;
|
||||
}
|
||||
|
||||
/** Print a summary table of eval results. */
|
||||
export function printSummary(results: EvalRunResult[]): void {
|
||||
console.log("\n=== Eval Results ===\n");
|
||||
|
||||
for (const r of results) {
|
||||
const icon = r.status === "passed" ? "PASS" : "FAIL";
|
||||
const skill = r.skillEnabled ? "with-skill" : "baseline";
|
||||
console.log(
|
||||
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s`,
|
||||
);
|
||||
if (r.filesModified.length > 0) {
|
||||
console.log(` Files: ${r.filesModified.join(", ")}`);
|
||||
}
|
||||
if (r.status === "error" && r.error) {
|
||||
console.log(` Error: ${r.error}`);
|
||||
}
|
||||
}
|
||||
|
||||
const passed = results.filter((r) => r.status === "passed").length;
|
||||
console.log(`\nTotal: ${passed}/${results.length} passed`);
|
||||
}
|
||||
65
packages/evals/src/runner/scaffold.ts
Normal file
65
packages/evals/src/runner/scaffold.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
import {
|
||||
cpSync,
|
||||
existsSync,
|
||||
mkdtempSync,
|
||||
readdirSync,
|
||||
rmSync,
|
||||
symlinkSync,
|
||||
} from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join, resolve } from "node:path";
|
||||
|
||||
/** Walk up from cwd to find the repository root (contains skills/ and packages/). */
|
||||
function findRepoRoot(): string {
|
||||
let dir = process.cwd();
|
||||
for (let i = 0; i < 10; i++) {
|
||||
if (existsSync(join(dir, "skills")) && existsSync(join(dir, "packages"))) {
|
||||
return dir;
|
||||
}
|
||||
const parent = resolve(dir, "..");
|
||||
if (parent === dir) break;
|
||||
dir = parent;
|
||||
}
|
||||
throw new Error("Could not find repository root (skills/ + packages/)");
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an isolated workspace for an eval run.
|
||||
*
|
||||
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
|
||||
* 2. Optionally symlink the supabase skill so Claude Code can discover it
|
||||
*
|
||||
* Returns the path to the workspace and a cleanup function.
|
||||
*/
|
||||
export function createWorkspace(opts: {
|
||||
evalDir: string;
|
||||
skillEnabled: boolean;
|
||||
}): { workspacePath: string; cleanup: () => void } {
|
||||
const repoRoot = findRepoRoot();
|
||||
const workspacePath = mkdtempSync(join(tmpdir(), "supabase-eval-"));
|
||||
|
||||
// Copy eval directory, excluding EVAL.ts (hidden from agent)
|
||||
const entries = readdirSync(opts.evalDir, { withFileTypes: true });
|
||||
for (const entry of entries) {
|
||||
if (entry.name === "EVAL.ts" || entry.name === "EVAL.tsx") continue;
|
||||
const src = join(opts.evalDir, entry.name);
|
||||
const dest = join(workspacePath, entry.name);
|
||||
cpSync(src, dest, { recursive: true });
|
||||
}
|
||||
|
||||
// Make the skill available to the agent by symlinking the skills dir
|
||||
if (opts.skillEnabled) {
|
||||
const skillsDir = join(repoRoot, "skills");
|
||||
if (existsSync(skillsDir)) {
|
||||
const destSkills = join(workspacePath, "skills");
|
||||
symlinkSync(skillsDir, destSkills);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
workspacePath,
|
||||
cleanup: () => {
|
||||
rmSync(workspacePath, { recursive: true, force: true });
|
||||
},
|
||||
};
|
||||
}
|
||||
97
packages/evals/src/runner/test.ts
Normal file
97
packages/evals/src/runner/test.ts
Normal file
@@ -0,0 +1,97 @@
|
||||
import { execFile } from "node:child_process";
|
||||
import { copyFileSync, existsSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { promisify } from "node:util";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
const exec = promisify(execFile);
|
||||
|
||||
export interface TestResult {
|
||||
passed: boolean;
|
||||
output: string;
|
||||
/** Number of tests that passed */
|
||||
passedCount: number;
|
||||
/** Total number of tests */
|
||||
totalCount: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the hidden EVAL.ts tests against the agent's workspace.
|
||||
*
|
||||
* 1. Copy EVAL.ts into the workspace (agent is done, safe to expose)
|
||||
* 2. Run vitest against it
|
||||
* 3. Parse the output for pass/fail
|
||||
*/
|
||||
export async function runTests(opts: {
|
||||
workspacePath: string;
|
||||
evalFilePath: string;
|
||||
}): Promise<TestResult> {
|
||||
// Copy the hidden test file into the workspace
|
||||
const evalFileName = opts.evalFilePath.endsWith(".tsx")
|
||||
? "EVAL.tsx"
|
||||
: "EVAL.ts";
|
||||
const destPath = join(opts.workspacePath, evalFileName);
|
||||
copyFileSync(opts.evalFilePath, destPath);
|
||||
|
||||
// Write a minimal vitest config that overrides the default include pattern
|
||||
// so EVAL.ts (without .test. or .spec.) is picked up.
|
||||
const vitestConfigPath = join(opts.workspacePath, "vitest.config.mjs");
|
||||
if (!existsSync(vitestConfigPath)) {
|
||||
writeFileSync(
|
||||
vitestConfigPath,
|
||||
`export default { test: { include: ["EVAL.{ts,tsx}"] } };\n`,
|
||||
);
|
||||
}
|
||||
|
||||
// Use the vitest binary from the evals package (always available)
|
||||
const evalsVitest = join(
|
||||
__dirname,
|
||||
"..",
|
||||
"..",
|
||||
"node_modules",
|
||||
".bin",
|
||||
"vitest",
|
||||
);
|
||||
const vitestBin = join(opts.workspacePath, "node_modules", ".bin", "vitest");
|
||||
const cmd = existsSync(vitestBin) ? vitestBin : evalsVitest;
|
||||
const args = ["run", evalFileName, "--reporter=verbose", "--no-color"];
|
||||
|
||||
try {
|
||||
const { stdout, stderr } = await exec(cmd, args, {
|
||||
cwd: opts.workspacePath,
|
||||
timeout: 60_000,
|
||||
env: { ...process.env },
|
||||
maxBuffer: 5 * 1024 * 1024,
|
||||
});
|
||||
|
||||
const output = `${stdout}\n${stderr}`;
|
||||
return parseTestOutput(output);
|
||||
} catch (error) {
|
||||
const err = error as Error & { stdout?: string; stderr?: string };
|
||||
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
|
||||
return parseTestOutput(output);
|
||||
}
|
||||
}
|
||||
|
||||
function parseTestOutput(output: string): TestResult {
|
||||
// Parse vitest output for pass/fail counts
|
||||
// Format: "Tests N passed (M)" or "Tests N failed | M passed (T)"
|
||||
const testsLine = output.match(
|
||||
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
|
||||
);
|
||||
|
||||
let passedCount = 0;
|
||||
let totalCount = 0;
|
||||
|
||||
if (testsLine) {
|
||||
passedCount = Number.parseInt(testsLine[2], 10);
|
||||
totalCount = Number.parseInt(testsLine[3], 10);
|
||||
}
|
||||
|
||||
const passed = totalCount > 0 && passedCount === totalCount;
|
||||
|
||||
return { passed, output, passedCount, totalCount };
|
||||
}
|
||||
@@ -1,187 +0,0 @@
|
||||
import { generateText, Output } from "ai";
|
||||
import type { EvalScorer } from "braintrust";
|
||||
import { z } from "zod";
|
||||
import type { CodeFixTestCase } from "./dataset/types.js";
|
||||
import type { Expected, Input, Output as TaskOutput } from "./dataset.js";
|
||||
import { getModel } from "./models.js";
|
||||
|
||||
const judgeModelId = process.env.EVAL_JUDGE_MODEL || "claude-opus-4-6";
|
||||
|
||||
const scoreSchema = z.object({
|
||||
score: z
|
||||
.number()
|
||||
.describe("Score from 0 to 1 (0 = bad, 0.5 = partial, 1 = good)"),
|
||||
reasoning: z.string().describe("Brief reasoning for the score"),
|
||||
});
|
||||
|
||||
const SYSTEM_PROMPT =
|
||||
"You are a precise, consistent evaluator of Supabase code fixes. You assess whether LLM-generated code correctly addresses Supabase anti-patterns by comparing against reference solutions. You are fair: functionally equivalent solutions that differ in style or approach from the reference still receive high scores. You are strict: partial fixes, missing security measures, or incorrect patterns receive low scores. Always provide specific evidence for your scoring.";
|
||||
|
||||
function buildContext(tc: CodeFixTestCase, llmOutput: string): string {
|
||||
return `## Reference Information
|
||||
|
||||
**Topic:** ${tc.title}
|
||||
**Explanation:** ${tc.explanation}
|
||||
|
||||
## Original Incorrect Code
|
||||
|
||||
\`\`\`${tc.badExample.language || ""}
|
||||
${tc.badExample.code}
|
||||
\`\`\`
|
||||
|
||||
## Reference Correct Code (ground truth)
|
||||
|
||||
\`\`\`${tc.goodExample.language || ""}
|
||||
${tc.goodExample.code}
|
||||
\`\`\`
|
||||
|
||||
## LLM's Attempted Fix
|
||||
|
||||
${llmOutput}`;
|
||||
}
|
||||
|
||||
async function judge(
|
||||
prompt: string,
|
||||
): Promise<{ score: number; reasoning: string }> {
|
||||
const model = getModel(judgeModelId);
|
||||
const { output } = await generateText({
|
||||
model,
|
||||
system: SYSTEM_PROMPT,
|
||||
prompt,
|
||||
output: Output.object({ schema: scoreSchema }),
|
||||
temperature: 0.1,
|
||||
maxRetries: 2,
|
||||
});
|
||||
if (!output) throw new Error("Judge returned no structured output");
|
||||
return output;
|
||||
}
|
||||
|
||||
export const correctnessScorer: EvalScorer<
|
||||
Input,
|
||||
TaskOutput,
|
||||
Expected
|
||||
> = async ({ input, output }) => {
|
||||
const context = buildContext(input.testCase, output.llmOutput);
|
||||
const result = await judge(`${context}
|
||||
|
||||
## Task
|
||||
|
||||
Evaluate **correctness**: Does the LLM's fix address the core issue identified in the incorrect code?
|
||||
|
||||
The fix does not need to be character-identical to the reference, but it must solve the same problem. Functionally equivalent or improved solutions should score well.
|
||||
|
||||
Score 1 if the fix fully addresses the core issue, 0.5 if it partially addresses it, 0 if it fails to address the core issue or introduces new problems.`);
|
||||
|
||||
return {
|
||||
name: "Correctness",
|
||||
score: result.score,
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
|
||||
export const completenessScorer: EvalScorer<
|
||||
Input,
|
||||
TaskOutput,
|
||||
Expected
|
||||
> = async ({ input, output }) => {
|
||||
const context = buildContext(input.testCase, output.llmOutput);
|
||||
const result = await judge(`${context}
|
||||
|
||||
## Task
|
||||
|
||||
Evaluate **completeness**: Does the LLM's fix include ALL necessary changes shown in the reference?
|
||||
|
||||
Check for missing RLS enablement, missing policy clauses, missing columns, incomplete migrations, or any partial fixes. The fix should be production-ready.
|
||||
|
||||
Score 1 if all necessary changes are present, 0.5 if most changes are present but some are missing, 0 if significant changes are missing.`);
|
||||
|
||||
return {
|
||||
name: "Completeness",
|
||||
score: result.score,
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
|
||||
export const bestPracticeScorer: EvalScorer<
|
||||
Input,
|
||||
TaskOutput,
|
||||
Expected
|
||||
> = async ({ input, output }) => {
|
||||
const context = buildContext(input.testCase, output.llmOutput);
|
||||
const result = await judge(`${context}
|
||||
|
||||
## Task
|
||||
|
||||
Evaluate **best practices**: Does the LLM's fix follow Supabase best practices as demonstrated in the reference?
|
||||
|
||||
Consider: RLS patterns, auth.users references, migration conventions, connection pooling, edge function patterns, SDK usage, and security-first defaults. Alternative correct approaches that achieve the same security/correctness goal are acceptable.
|
||||
|
||||
Score 1 if the fix follows best practices, 0.5 if it mostly follows best practices with minor deviations, 0 if it uses anti-patterns or ignores conventions.`);
|
||||
|
||||
return {
|
||||
name: "Best Practice",
|
||||
score: result.score,
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
|
||||
export const regressionSafetyScorer: EvalScorer<
|
||||
Input,
|
||||
TaskOutput,
|
||||
Expected
|
||||
> = async ({ input, output }) => {
|
||||
const context = buildContext(input.testCase, output.llmOutput);
|
||||
const result = await judge(`${context}
|
||||
|
||||
## Task
|
||||
|
||||
Evaluate **regression safety**: Does the LLM's fix avoid introducing new problems?
|
||||
|
||||
Carefully check whether the fix:
|
||||
- Breaks existing functionality that was working in the original code
|
||||
- Removes security measures (RLS policies, auth checks, input validation) that were already present
|
||||
- Changes function signatures, return types, or column names in ways that would break callers
|
||||
- Introduces SQL injection, XSS, or other security vulnerabilities not present in the original
|
||||
- Drops data, removes columns, or alters schemas destructively without necessity
|
||||
- Changes behavior beyond the scope of the identified issue
|
||||
|
||||
The fix should repair the identified problem WITHOUT creating new ones. A fix that solves the original issue but breaks something else is dangerous in production.
|
||||
|
||||
Score 1 if the fix introduces no new problems. Score 0.5 if the fix introduces minor issues (e.g., slightly different naming that could confuse but not break). Score 0 if the fix introduces a new bug, security vulnerability, or breaking change.`);
|
||||
|
||||
return {
|
||||
name: "Regression Safety",
|
||||
score: result.score,
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
|
||||
export const minimalityScorer: EvalScorer<
|
||||
Input,
|
||||
TaskOutput,
|
||||
Expected
|
||||
> = async ({ input, output }) => {
|
||||
const context = buildContext(input.testCase, output.llmOutput);
|
||||
const result = await judge(`${context}
|
||||
|
||||
## Task
|
||||
|
||||
Evaluate **minimality**: Does the LLM's fix make only the changes necessary to address the identified issue?
|
||||
|
||||
Check whether the fix:
|
||||
- Rewrites or restructures code beyond what is needed to fix the problem
|
||||
- Adds features, abstractions, or utilities not present in the reference solution
|
||||
- Changes formatting, variable names, or style in unrelated parts of the code
|
||||
- Adds excessive comments, logging, or error handling not required by the fix
|
||||
- Over-engineers the solution (e.g., adding configuration options, generalization, or layers of abstraction when a simple targeted fix suffices)
|
||||
|
||||
Compare the scope of changes in the LLM's fix against the reference. The reference represents the ideal minimal fix. The LLM's fix should be similarly focused.
|
||||
|
||||
Score 1 if the fix is tightly scoped to the identified issue (similar scope to the reference). Score 0.5 if the fix includes some unnecessary changes but the core fix is present. Score 0 if the fix significantly over-reaches — rewriting large portions of code, adding unrelated features, or restructuring beyond what is needed.`);
|
||||
|
||||
return {
|
||||
name: "Minimality",
|
||||
score: result.score,
|
||||
metadata: { reasoning: result.reasoning },
|
||||
};
|
||||
};
|
||||
35
packages/evals/src/types.ts
Normal file
35
packages/evals/src/types.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
/**
 * One self-contained eval scenario. Per the package design, a scenario is
 * a directory under evals/ containing a task project; `id` is that
 * directory's name and is used to locate it on disk.
 */
export interface EvalScenario {
  /** Directory name under evals/ */
  id: string;
  /** Human-readable name */
  name: string;
  /** Tags for filtering */
  tags: string[];
}
|
||||
|
||||
/**
 * Configuration for a single agent run. The skill-enabled flag is the
 * experimental variable: runs with and without the Supabase skill are
 * compared against the same scenarios.
 */
export interface AgentConfig {
  /** Agent identifier — "claude-code" is the only supported agent today */
  agent: "claude-code";
  /** Model to use */
  model: string;
  /** Whether the supabase skill is available */
  skillEnabled: boolean;
}
|
||||
|
||||
/**
 * Outcome of one agent run on one scenario: the run configuration it was
 * produced under, the binary verdict, and captured output for debugging.
 */
export interface EvalRunResult {
  // Scenario identifier — presumably EvalScenario.id; confirm against the runner.
  scenario: string;
  // Agent identifier — mirrors AgentConfig.agent.
  agent: string;
  // Model the agent ran with — mirrors AgentConfig.model.
  model: string;
  // Whether the supabase skill was available for this run.
  skillEnabled: boolean;
  // "error" is distinct from "failed" — NOTE(review): looks like "failed"
  // means tests ran and failed while "error" means the run itself broke;
  // confirm in the runner.
  status: "passed" | "failed" | "error";
  // Wall-clock run time — units not shown here (likely ms); confirm in the runner.
  duration: number;
  // Captured stdout/stderr of the test run — assumed raw text; verify.
  testOutput: string;
  // Captured transcript/output of the agent itself — assumed raw text; verify.
  agentOutput: string;
  /** Number of vitest tests that passed */
  testsPassed: number;
  /** Total number of vitest tests */
  testsTotal: number;
  /** Files the agent created or modified in the workspace */
  filesModified: string[];
  // Present only when status is "error" — NOTE(review): inferred from the
  // optional marker; confirm.
  error?: string;
}
|
||||
@@ -12,5 +12,5 @@
|
||||
"resolveJsonModule": true
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
"exclude": ["node_modules", "dist", "evals"]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user