initial skills evals

2026-03-27 10:09:26 +08:00 · 2026-02-18 12:02:28 +00:00
parent 69575f4c87
commit 27d7af255d
17 changed files with 3177 additions and 10 deletions
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -0,0 +1,49 @@
 # Runs the skill evals through the Braintrust eval action.
 name: Skill Evals
 # Triggered manually, or by pull-request activity that touches the skills or
 # the eval package itself.
 on:
  workflow_dispatch:
  pull_request:
    types: [opened, synchronize, labeled]
    paths:
      - "skills/**"
      - "packages/evals/**"
 # One run per pull request (or per ref for manual runs); a newer run cancels
 # the one still in progress.
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
 permissions:
  pull-requests: write
  contents: read
 jobs:
  eval:
    name: Run evals
    # Pull-request runs are opt-in: they only proceed when the PR carries the
    # `run-evals` label. Manual dispatches always run.
    if: >
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'pull_request' &&
       contains(github.event.pull_request.labels.*.name, 'run-evals'))
    runs-on: ubuntu-latest
    environment: evals
    timeout-minutes: 30
    env:
      BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - uses: jdx/mise-action@v3
        with:
          install: true
      # Installs both the root package and the eval package dependencies.
      - name: Install dependencies
        run: npm install && npm --prefix packages/evals install
      - name: Run Evals
        uses: braintrustdata/eval-action@v1
        with:
          api_key: ${{ secrets.BRAINTRUST_API_KEY }}
          runtime: node
          root: packages/evals
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -25,6 +25,16 @@ skills/
 packages/
  skills-build/           # Generic build system for all skills
  evals/                  # LLM evaluation system for skills
    AGENTS.md             # Agent guide for developing evals
    CLAUDE.md             # Symlink to AGENTS.md
    scenarios/
      workflow-scenarios.json  # Handwritten workflow test scenarios
    src/
      cli.ts              # Entry point
      prompts/            # Eval and judge prompts
      scorer/             # Zod schemas and judge execution
      dataset/            # Test case extraction from skill references
      runner/             # Eval orchestrator and runners
 ```
 ## Commands
--- a/mise.toml
+++ b/mise.toml
@@ -46,16 +46,11 @@ sources = ["test/**", "skills/**"]
 # ── Eval tasks ────────────────────────────────────────────────────────
 [tasks.eval]
-description = "Run all evals"
+description = "Run code-fix evals (local, no upload)"
-run = "tsx packages/evals/src/cli.ts"
+run = "npm --prefix packages/evals run eval"
 sources = ["packages/evals/src/**", "skills/**/references/**"]
-[tasks."eval:code-fix"]
+[tasks."eval:upload"]
-description = "Run code-fix evals"
+description = "Run code-fix evals and upload to Braintrust"
-run = "tsx packages/evals/src/cli.ts --type code-fix"
+run = "npm --prefix packages/evals run eval:upload"
 sources = ["packages/evals/src/**", "skills/**/references/**"]
 [tasks."eval:workflow"]
 description = "Run workflow evals"
 run = "tsx packages/evals/src/cli.ts --type workflow"
 sources = ["packages/evals/src/**", "skills/**/references/**"]
--- a/packages/evals/.env.example
+++ b/packages/evals/.env.example
@@ -0,0 +1,3 @@
 ANTHROPIC_API_KEY=
 BRAINTRUST_API_KEY=
 BRAINTRUST_PROJECT_ID=
--- a/packages/evals/AGENTS.md
+++ b/packages/evals/AGENTS.md
@@ -0,0 +1,142 @@
 # Evals — Agent Guide
 This package evaluates whether LLMs correctly apply Supabase best practices
 using skill documentation as context. It uses
 [Braintrust](https://www.braintrust.dev/) for eval orchestration and the
 [Vercel AI SDK](https://sdk.vercel.ai/) for LLM calls.
 ## Architecture
 Two-step **LLM-as-judge** pattern powered by Braintrust's `Eval()`:
 1. The **eval model** (default: `claude-sonnet-4-5-20250929`) receives a prompt
   with skill context and produces a code fix.
 2. Three independent **judge scorers** (default: `claude-opus-4-6`) evaluate the
   fix via structured output (Zod schemas via AI SDK's `Output.object()`).
 Key files:
 ```
 src/
  code-fix.eval.ts        # Braintrust Eval() entry point
  dataset.ts              # Maps extracted test cases to EvalCase format
  scorer.ts               # Three AI SDK-based scorers (Correctness, Completeness, Best Practice)
  models.ts               # Model provider factory (Anthropic / OpenAI)
  dataset/
    types.ts              # CodeFixTestCase interface
    extract.ts            # Auto-extracts test cases from skill references
  prompts/
    code-fix.ts           # System + user prompts for the eval model
 ```
 ## How It Works
 **Test cases are auto-extracted** from `skills/*/references/*.md`. The extractor
 (`dataset/extract.ts`) finds consecutive `**Incorrect:**` / `**Correct:**` code
 block pairs under `##` sections. Each pair becomes one test case.
 Three independent scorers evaluate each fix (0–1 scale):
 - **Correctness** — does the fix address the core issue?
 - **Completeness** — does the fix include all necessary changes?
 - **Best Practice** — does the fix follow Supabase conventions?
 Braintrust aggregates the scores and provides a dashboard for tracking
 regressions over time.
 ## Adding Test Cases
 No code changes needed. Add paired Incorrect/Correct blocks to any skill
 reference file. The extractor picks them up automatically.
 Required format in a reference `.md` file:
 ```markdown
 ## Section Title
 Explanation of the issue.
 **Incorrect:**
 \```sql
 -- bad code
 \```
 **Correct:**
 \```sql
 -- good code
 \```
 ```
 Rules:
 - Pairs must be consecutive — an Incorrect block immediately followed by a
  Correct block
 - Labels are matched case-insensitively by substring (a label only needs to
  contain the keyword). Bad labels: `Incorrect`, `Wrong`, `Bad`.
  Good labels: `Correct`, `Good`, `Usage`, `Implementation`, `Example`,
  `Recommended`
 - The optional parenthetical in the label becomes the `description` field:
  `**Incorrect (missing RLS):**`
 - Files prefixed with `_` (like `_sections.md`, `_template.md`) are skipped
 - Each pair gets an ID like `supabase/db-rls-mandatory#0` (skill/filename#index)
 ## Modifying Prompts
 - `src/prompts/code-fix.ts` — what the eval model sees
 - `src/scorer.ts` — judge prompts for each scorer dimension
 Temperature settings:
 - Eval model: `0.2` (in `code-fix.eval.ts`)
 - Judge model: `0.1` (in `scorer.ts`)
 ## Modifying Scoring
 Each scorer in `src/scorer.ts` is independent. To add a new dimension:
 1. Create a new `EvalScorer` function in `scorer.ts`
 2. Add it to the `scores` array in `code-fix.eval.ts`
 ## Running Evals
 ```bash
 # Run locally (no Braintrust upload)
 mise run eval
 # Run and upload to Braintrust dashboard
 mise run eval:upload
 ```
 Or directly:
 ```bash
 cd packages/evals
 # Local run
 npx braintrust eval --no-send-logs src/code-fix.eval.ts
 # Upload to Braintrust
 npx braintrust eval src/code-fix.eval.ts
 ```
 In CI, evals run via `braintrustdata/eval-action@v1` and are gated by the
 `run-evals` PR label.
 ## Environment
 API keys are loaded by mise from `packages/evals/.env` (configured in root
 `mise.toml`). Copy `.env.example` to `.env` and fill in the keys.
 ```
 ANTHROPIC_API_KEY=sk-ant-...    # Required: eval model + judge model
 BRAINTRUST_API_KEY=...          # Required for upload to Braintrust dashboard
 BRAINTRUST_PROJECT_ID=...       # Required for upload to Braintrust dashboard
 ```
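 Optional overrides (OpenAI model IDs such as `gpt-4o` or `o3-mini` are also
 accepted; they additionally require `OPENAI_API_KEY`):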
 ```
 EVAL_MODEL=claude-sonnet-4-5-20250929    # Model under test
 EVAL_JUDGE_MODEL=claude-opus-4-6         # Judge model for scorers
 ```
--- a/packages/evals/CLAUDE.md
+++ b/packages/evals/CLAUDE.md
@@ -0,0 +1 @@
 AGENTS.md
--- a/packages/evals/README.md
+++ b/packages/evals/README.md
@@ -0,0 +1,46 @@
 # Evals
 LLM evaluation system for Supabase agent skills, powered by [Braintrust](https://www.braintrust.dev/). Tests whether models can correctly apply Supabase best practices using skill documentation as context.
 ## How It Works
 Each eval follows a two-step **LLM-as-judge** pattern orchestrated by Braintrust's `Eval()`:
 1. **Generate** — The eval model (e.g. Sonnet 4.5) receives a prompt with skill context and produces a code fix.
 2. **Judge** — Three independent scorers using a stronger model (Opus 4.6 by default) evaluate the fix via the Vercel AI SDK with structured output.
 Test cases are extracted automatically from skill reference files (`skills/*/references/*.md`). Each file contains paired **Incorrect** / **Correct** code blocks — the model receives the bad code and must produce the fix.
 **Scoring dimensions (each 0–1):**
 | Scorer | Description |
 |--------|-------------|
 | Correctness | Does the fix address the core issue? |
 | Completeness | Does it include all necessary changes? |
 | Best Practice | Does it follow Supabase best practices? |
 ## Usage
 ```bash
 # Run locally (no Braintrust upload)
 mise run eval
 # Run and upload to Braintrust dashboard
 mise run eval:upload
 ```
 ### Environment Variables
 API keys are loaded via mise from `packages/evals/.env` (see root `mise.toml`).
 ```
 ANTHROPIC_API_KEY         Required: eval model + judge model
 BRAINTRUST_API_KEY        Required for Braintrust dashboard upload
 BRAINTRUST_PROJECT_ID     Required for Braintrust dashboard upload
 EVAL_MODEL                Override default eval model (claude-sonnet-4-5-20250929)
 EVAL_JUDGE_MODEL          Override default judge model (claude-opus-4-6)
 ```
 ## Adding Test Cases
 Add paired Incorrect/Correct code blocks to any skill reference file. The extractor picks them up automatically on the next run.
--- a/packages/evals/package-lock.json
+++ b/packages/evals/package-lock.json
--- a/packages/evals/package.json
+++ b/packages/evals/package.json
@@ -0,0 +1,24 @@
 {
 	"name": "evals",
 	"version": "1.0.0",
 	"type": "module",
 	"author": "Supabase",
 	"license": "MIT",
 	"description": "LLM evaluation system for Supabase agent skills",
 	"scripts": {
 		"eval": "braintrust eval --no-send-logs src/code-fix.eval.ts",
 		"eval:upload": "braintrust eval src/code-fix.eval.ts"
 	},
 	"dependencies": {
 		"@ai-sdk/anthropic": "^3.0.44",
 		"@ai-sdk/openai": "^3.0.29",
 		"ai": "^6.0.86",
 		"braintrust": "^1.0.2",
 		"zod": "^3.23.0"
 	},
 	"devDependencies": {
 		"@types/node": "^20.10.0",
 		"tsx": "^4.7.0",
 		"typescript": "^5.3.0"
 	}
 }
--- a/packages/evals/src/code-fix.eval.ts
+++ b/packages/evals/src/code-fix.eval.ts
@@ -0,0 +1,36 @@
 import assert from "node:assert";
 import { generateText } from "ai";
 import { Eval } from "braintrust";
 import { dataset } from "./dataset.js";
 import { getModel } from "./models.js";
 import {
 	buildCodeFixPrompt,
 	buildCodeFixSystemPrompt,
 } from "./prompts/code-fix.js";
 import {
 	bestPracticeScorer,
 	completenessScorer,
 	correctnessScorer,
 } from "./scorer.js";
 // Fail fast at load time: the default eval and judge models are both Anthropic models.
 assert(process.env.ANTHROPIC_API_KEY, "ANTHROPIC_API_KEY is not set");
 // Model under test; EVAL_MODEL overrides the default.
 const modelId = process.env.EVAL_MODEL || "claude-sonnet-4-5-20250929";
 // Registers the "CodeFix" eval: every dataset case is sent to the model under
 // test and the returned text is graded by the three judge scorers.
 Eval("CodeFix", {
 	projectId: process.env.BRAINTRUST_PROJECT_ID,
 	// Three trials per case when CI is set, one otherwise.
 	trialCount: process.env.CI ? 3 : 1,
 	data: () => dataset(),
 	task: async (input) => {
 		const { text } = await generateText({
 			model: getModel(modelId),
 			system: buildCodeFixSystemPrompt(),
 			prompt: buildCodeFixPrompt(input.testCase),
 			temperature: 0.2,
 			maxRetries: 2,
 		});
 		return { llmOutput: text };
 	},
 	scores: [correctnessScorer, completenessScorer, bestPracticeScorer],
 });
--- a/packages/evals/src/dataset.ts
+++ b/packages/evals/src/dataset.ts
@@ -0,0 +1,33 @@
 import type { EvalCase } from "braintrust";
 import { extractCodeFixDataset } from "./dataset/extract.js";
 import type { CodeFixTestCase } from "./dataset/types.js";
 /** Input handed to the eval task: one extracted code-fix test case. */
 export type Input = { testCase: CodeFixTestCase };
 /** Reference answer for a case, taken from the test case's good example. */
 export type Expected = {
 	correctCode: string;
 	correctLanguage?: string;
 };
 /** Per-case metadata copied from the test case. */
 export type Metadata = {
 	skillName: string;
 	section: string;
 	tags: string[];
 };
 /** Value returned by the eval task; `llmOutput` holds the model's response text. */
 export type Output = { llmOutput: string };
 /**
  * Builds the eval dataset from the extracted code-fix test cases.
  *
  * @returns One eval case per test case: the test case itself as input, the
  *   good example's code and language as the expected value, and the skill
  *   name, section and tags as metadata.
  */
 export function dataset(): EvalCase<Input, Expected, Metadata>[] {
 	const cases: EvalCase<Input, Expected, Metadata>[] = [];
 	for (const testCase of extractCodeFixDataset()) {
 		const { goodExample, skillName, section, tags } = testCase;
 		cases.push({
 			input: { testCase },
 			expected: {
 				correctCode: goodExample.code,
 				correctLanguage: goodExample.language,
 			},
 			metadata: { skillName, section, tags },
 		});
 	}
 	return cases;
 }
--- a/packages/evals/src/dataset/extract.ts
+++ b/packages/evals/src/dataset/extract.ts
@@ -0,0 +1,277 @@
 import { existsSync, readdirSync, readFileSync } from "node:fs";
 import { basename, join, resolve } from "node:path";
 import type { CodeFixTestCase } from "./types.js";
 /**
  * Locates the `skills` directory by walking up from the current working
  * directory.
  *
  * Checks at most ten directory levels and stops early at the filesystem root.
  *
  * @returns Path of the first `skills` entry found.
  * @throws Error when no `skills` entry exists within the searched levels.
  */
 function findSkillsRoot(): string {
 	const MAX_LEVELS = 10;
 	let current = process.cwd();
 	let level = 0;
 	while (level < MAX_LEVELS) {
 		const skillsDir = join(current, "skills");
 		if (existsSync(skillsDir)) {
 			return skillsDir;
 		}
 		const up = resolve(current, "..");
 		// Resolving ".." at the filesystem root yields the root again.
 		if (up === current) {
 			break;
 		}
 		current = up;
 		level += 1;
 	}
 	throw new Error(
 		"Could not find skills/ directory. Run from the repository root or a subdirectory.",
 	);
 }
 // Resolved once at module load; importing this module throws if no skills/ directory is found.
 const SKILLS_ROOT = findSkillsRoot();
 // --- Duplicated from skills-build/src/parser.ts for isolation ---
 /** One labelled code block extracted from a reference file. */
 interface CodeExample {
 	/** Label text from the `**Label:**` line, without the parenthetical. */
 	label: string;
 	/** Text inside the optional parentheses of the label line. */
 	description?: string;
 	/** Lines of the fenced code block joined with newlines. */
 	code: string;
 	/** Text following the opening code fence, if any. */
 	language?: string;
 }
 /**
  * Splits a markdown document into its leading `---`-delimited frontmatter and
  * the remaining body.
  *
  * Frontmatter is parsed as flat `key: value` lines (no nesting). A value
  * wrapped in matching single or double quotes has the quotes removed. Lines
  * without a colon are ignored.
  *
  * @param content Raw file contents.
  * @returns The parsed key/value pairs and the trimmed body. When the content
  *   does not start with `---`, or no closing `---` line exists, the
  *   frontmatter is empty and the body is the unmodified content.
  */
 function parseFrontmatter(content: string): {
 	frontmatter: Record<string, string>;
 	body: string;
 } {
 	const frontmatter: Record<string, string> = {};
 	if (!content.startsWith("---")) {
 		return { frontmatter, body: content };
 	}
 	// The closing delimiter must sit on a line of its own. A plain substring
 	// search would stop at a "---" that merely appears inside a value
 	// (e.g. `title: a --- b`) and cut the frontmatter short.
 	const closingDelimiter = /\r?\n---[ \t]*(?=\r?\n|$)/g;
 	closingDelimiter.lastIndex = 3;
 	const closing = closingDelimiter.exec(content);
 	if (closing === null) {
 		return { frontmatter, body: content };
 	}
 	const frontmatterContent = content.slice(3, closing.index).trim();
 	const body = content.slice(closing.index + closing[0].length).trim();
 	for (const line of frontmatterContent.split("\n")) {
 		const colonIndex = line.indexOf(":");
 		if (colonIndex === -1) continue;
 		const key = line.slice(0, colonIndex).trim();
 		let value = line.slice(colonIndex + 1).trim();
 		if (
 			(value.startsWith('"') && value.endsWith('"')) ||
 			(value.startsWith("'") && value.endsWith("'"))
 		) {
 			value = value.slice(1, -1);
 		}
 		frontmatter[key] = value;
 	}
 	return { frontmatter, body };
 }
 /**
  * Returns the text of the first `##` heading in the body.
  *
  * @param body Markdown text to search.
  * @returns The trimmed heading text, or `null` when there is no `##` heading.
  */
 function extractTitle(body: string): string | null {
 	const heading = /^##\s+(.+)$/m.exec(body);
 	if (heading === null) {
 		return null;
 	}
 	return heading[1].trim();
 }
 /** A `##` section of a reference file together with its labelled code examples. */
 interface Section {
 	title: string;
 	explanation: string;
 	examples: CodeExample[];
 }
 /**
  * Parses a markdown body into `##` sections and the labelled code examples
  * inside each of them.
  *
  * A line of the form `**Label:**` or `**Label (description):**` starts a new
  * example; the fenced code that follows becomes its code. Text between a
  * heading and the first label or code fence is the section's explanation.
  * Headings and labels inside a code fence are treated as code. Sections
  * without a title or without any example are dropped.
  *
  * @param body Markdown text (frontmatter already removed).
  * @returns The sections in document order.
  */
 function extractSections(body: string): Section[] {
 	const LABEL_PATTERN = /^\*\*([^*]+?)(?:\s*\(([^)]+)\))?\s*:\*\*\s*$/;
 	const result: Section[] = [];
 	// State of the section currently being read.
 	let title = "";
 	let explanation: string[] = [];
 	let examples: CodeExample[] = [];
 	let inExplanation = false;
 	// State of the example currently being read.
 	let label = "";
 	let description = "";
 	let fenceLang = "";
 	let fenceLines: string[] = [];
 	let insideFence = false;
 	// Stores the pending example (if it has a label and code) and resets it.
 	const commitExample = (): void => {
 		if (label && fenceLines.length > 0) {
 			examples.push({
 				label,
 				description: description || undefined,
 				code: fenceLines.join("\n"),
 				language: fenceLang || undefined,
 			});
 		}
 		label = "";
 		description = "";
 		fenceLines = [];
 		fenceLang = "";
 	};
 	// Stores the pending section (if it has a title and examples) and resets it.
 	const commitSection = (): void => {
 		if (title && examples.length > 0) {
 			result.push({
 				title,
 				explanation: explanation.join("\n").trim(),
 				examples,
 			});
 		}
 		examples = [];
 		explanation = [];
 	};
 	for (const line of body.split("\n")) {
 		// Inside a fence only the closing fence is special; everything else is code.
 		if (insideFence) {
 			if (line.startsWith("```")) {
 				insideFence = false;
 			} else {
 				fenceLines.push(line);
 			}
 			continue;
 		}
 		if (line.startsWith("## ")) {
 			commitExample();
 			commitSection();
 			title = line.replace(/^##\s+/, "").trim();
 			inExplanation = true;
 			continue;
 		}
 		const labelParts = LABEL_PATTERN.exec(line);
 		if (labelParts !== null) {
 			inExplanation = false;
 			commitExample();
 			label = labelParts[1].trim();
 			description = labelParts[2]?.trim() || "";
 			continue;
 		}
 		if (line.startsWith("```")) {
 			inExplanation = false;
 			insideFence = true;
 			fenceLang = line.slice(3).trim();
 			continue;
 		}
 		if (inExplanation) {
 			explanation.push(line);
 		}
 	}
 	commitExample();
 	commitSection();
 	return result;
 }
 // --- Duplicated from skills-build/src/validate.ts ---
 /**
  * Reports whether a label marks the "before" (anti-pattern) half of a pair.
  *
  * @param label Label text of a code example.
  * @returns `true` when the label contains `incorrect`, `wrong` or `bad`
  *   (case-insensitive).
  */
 function isBadExample(label: string): boolean {
 	const lower = label.toLowerCase();
 	return (
 		lower.includes("incorrect") ||
 		lower.includes("wrong") ||
 		lower.includes("bad")
 	);
 }
 /**
  * Reports whether a label marks the "after" (recommended) half of a pair.
  *
  * @param label Label text of a code example.
  * @returns `true` when the label contains one of the good keywords
  *   (case-insensitive) and is not a bad label.
  */
 function isGoodExample(label: string): boolean {
 	// "incorrect" contains "correct", so a bad label would otherwise also count
 	// as good and two consecutive Incorrect blocks would be paired together.
 	if (isBadExample(label)) return false;
 	const lower = label.toLowerCase();
 	return (
 		lower.includes("correct") ||
 		lower.includes("good") ||
 		lower.includes("usage") ||
 		lower.includes("implementation") ||
 		lower.includes("example") ||
 		lower.includes("recommended")
 	);
 }
 // --- Extraction logic ---
 /**
  * Finds every bad example that is immediately followed by a good example.
  *
  * @param examples Examples of one section, in document order.
  * @returns One `{ bad, good }` entry per adjacent bad/good pair.
  */
 function pairExamples(
 	examples: CodeExample[],
 ): Array<{ bad: CodeExample; good: CodeExample }> {
 	// The last example has no successor, so it can never start a pair.
 	return examples.slice(0, -1).flatMap((candidate, index) => {
 		const successor = examples[index + 1];
 		const isPair =
 			isBadExample(candidate.label) && isGoodExample(successor.label);
 		return isPair ? [{ bad: candidate, good: successor }] : [];
 	});
 }
 /**
  * Lists the skills available under the skills root.
  *
  * @returns Names of the sub-directories that contain a `SKILL.md` file, or an
  *   empty list when the skills root does not exist.
  */
 function discoverSkillNames(): string[] {
 	if (!existsSync(SKILLS_ROOT)) {
 		return [];
 	}
 	const names: string[] = [];
 	for (const entry of readdirSync(SKILLS_ROOT, { withFileTypes: true })) {
 		if (!entry.isDirectory()) continue;
 		if (!existsSync(join(SKILLS_ROOT, entry.name, "SKILL.md"))) continue;
 		names.push(entry.name);
 	}
 	return names;
 }
 /**
  * Lists the markdown files directly inside a directory.
  *
  * @param dir Directory to scan (not recursive).
  * @returns Paths of the `.md` entries whose name does not start with `_`, or
  *   an empty list when the directory does not exist.
  */
 function getMarkdownFiles(dir: string): string[] {
 	if (!existsSync(dir)) {
 		return [];
 	}
 	const paths: string[] = [];
 	for (const name of readdirSync(dir)) {
 		if (name.startsWith("_") || !name.endsWith(".md")) continue;
 		paths.push(join(dir, name));
 	}
 	return paths;
 }
 /**
  * Extracts code-fix test cases from the reference files of one or all skills.
  *
  * Every adjacent bad/good example pair inside a `##` section of a reference
  * markdown file becomes one test case. Pair indices count up per file across
  * all of its sections.
  *
  * @param skillName Restrict extraction to this skill; when omitted, every
  *   discovered skill is processed.
  * @returns The extracted test cases.
  */
 export function extractCodeFixDataset(skillName?: string): CodeFixTestCase[] {
 	const skills = skillName ? [skillName] : discoverSkillNames();
 	const testCases: CodeFixTestCase[] = [];
 	for (const skill of skills) {
 		const referencesDir = join(SKILLS_ROOT, skill, "references");
 		for (const filePath of getMarkdownFiles(referencesDir)) {
 			const content = readFileSync(filePath, "utf-8");
 			const { frontmatter, body } = parseFrontmatter(content);
 			const fileStem = basename(filePath, ".md");
 			const fileTitle = frontmatter.title || extractTitle(body) || fileStem;
 			// Drop empty entries: a blank `tags:` value or a trailing comma would
 			// otherwise produce "" tags.
 			const tags = (frontmatter.tags ?? "")
 				.split(",")
 				.map((t) => t.trim())
 				.filter((t) => t.length > 0);
 			// The section is the filename prefix before the first hyphen.
 			const section = fileStem.split("-")[0];
 			let pairIndex = 0;
 			for (const sec of extractSections(body)) {
 				for (const { bad, good } of pairExamples(sec.examples)) {
 					testCases.push({
 						id: `${skill}/${fileStem}#${pairIndex}`,
 						skillName: skill,
 						referenceFile: filePath,
 						referenceFilename: basename(filePath),
 						title: sec.title || fileTitle,
 						explanation: sec.explanation,
 						section,
 						tags,
 						pairIndex,
 						badExample: {
 							label: bad.label,
 							description: bad.description,
 							code: bad.code,
 							language: bad.language,
 						},
 						goodExample: {
 							label: good.label,
 							description: good.description,
 							code: good.code,
 							language: good.language,
 						},
 					});
 					pairIndex++;
 				}
 			}
 		}
 	}
 	return testCases;
 }
--- a/packages/evals/src/dataset/types.ts
+++ b/packages/evals/src/dataset/types.ts
@@ -0,0 +1,24 @@
 /** One bad/good code example pair extracted from a skill reference file. */
 export interface CodeFixTestCase {
 	/** Unique ID, e.g. "supabase/db-rls-mandatory#0" */
 	id: string;
 	/** Name of the skill the reference file belongs to. */
 	skillName: string;
 	/** Path of the reference markdown file the pair was extracted from. */
 	referenceFile: string;
 	/** File name of the reference file, including its extension. */
 	referenceFilename: string;
 	/** Title of the `##` section containing the pair, or the file title as fallback. */
 	title: string;
 	/** Text between the section heading and its first label or code fence. */
 	explanation: string;
 	/** Part of the reference file name before the first hyphen. */
 	section: string;
 	/** Entries of the comma-separated `tags` frontmatter value. */
 	tags: string[];
 	/** Zero-based index of the pair within its reference file. */
 	pairIndex: number;
 	/** The example the model is asked to fix. */
 	badExample: {
 		label: string;
 		description?: string;
 		code: string;
 		language?: string;
 	};
 	/** The reference solution. */
 	goodExample: {
 		label: string;
 		description?: string;
 		code: string;
 		language?: string;
 	};
 }
--- a/packages/evals/src/models.ts
+++ b/packages/evals/src/models.ts
@@ -0,0 +1,51 @@
 import type { AnthropicProvider } from "@ai-sdk/anthropic";
 import { anthropic } from "@ai-sdk/anthropic";
 import type { OpenAIProvider } from "@ai-sdk/openai";
 import { openai } from "@ai-sdk/openai";
 import type { LanguageModel } from "ai";
 /**
  * Model ID accepted by the Anthropic provider (string literal union + string).
  * Derived from the first parameter of the provider's `chat` factory.
  */
 export type AnthropicModelId = Parameters<AnthropicProvider["chat"]>[0];
 /**
  * Model ID accepted by the OpenAI provider (string literal union + string).
  * Derived from the first parameter of the provider's `chat` factory.
  */
 export type OpenAIModelId = Parameters<OpenAIProvider["chat"]>[0];
 /** Any model ID accepted by the eval harness. */
 export type SupportedModelId = AnthropicModelId | OpenAIModelId;
 /**
  * Known model IDs mapped to a factory that creates the provider model on
  * demand. Anthropic IDs come first, then OpenAI IDs.
  */
 const MODEL_MAP: Record<string, () => LanguageModel> = Object.fromEntries([
 	...(
 		[
 			"claude-opus-4-6",
 			"claude-sonnet-4-5-20250929",
 			"claude-haiku-4-5-20251001",
 			"claude-opus-4-5-20251101",
 			"claude-sonnet-4-20250514",
 		] as const
 	).map((id): [string, () => LanguageModel] => [id, () => anthropic(id)]),
 	...(["gpt-4o", "gpt-4o-mini", "o3-mini"] as const).map(
 		(id): [string, () => LanguageModel] => [id, () => openai(id)],
 	),
 ]);
 /**
  * Resolves a model ID to a provider model instance.
  *
  * IDs listed in `MODEL_MAP` use their registered factory. Any other ID is
  * routed by prefix: `claude*` to Anthropic, `gpt*` and the o-series
  * (`o1*`, `o3*`, `o4*`, ...) to OpenAI.
  *
  * @param modelId Model ID to resolve.
  * @returns The provider model for the ID.
  * @throws Error when the ID is not registered and matches no known prefix.
  */
 export function getModel(modelId: SupportedModelId): LanguageModel {
 	// Own-property check: a plain object lookup would also resolve inherited
 	// keys such as "constructor" or "toString" and call them as factories.
 	const factory = Object.prototype.hasOwnProperty.call(MODEL_MAP, modelId)
 		? MODEL_MAP[modelId]
 		: undefined;
 	if (factory) return factory();
 	// Fall back to provider detection from model ID prefix
 	if (modelId.startsWith("claude")) {
 		return anthropic(modelId as AnthropicModelId);
 	}
 	// "o" followed by a digit covers the whole o-series rather than only o1/o3.
 	if (modelId.startsWith("gpt") || /^o\d/.test(modelId)) {
 		return openai(modelId as OpenAIModelId);
 	}
 	throw new Error(
 		`Unknown model: ${modelId}. Available: ${Object.keys(MODEL_MAP).join(", ")}`,
 	);
 }
 /**
  * Resolves the judge model.
  *
  * @returns The model named by `EVAL_JUDGE_MODEL`, or `claude-opus-4-6` when
  *   the variable is unset or empty.
  */
 export function getJudgeModel(): LanguageModel {
 	return getModel(process.env.EVAL_JUDGE_MODEL || "claude-opus-4-6");
 }
--- a/packages/evals/src/prompts/code-fix.ts
+++ b/packages/evals/src/prompts/code-fix.ts
@@ -0,0 +1,33 @@
 import type { CodeFixTestCase } from "../dataset/types.js";
 /**
  * Builds the system prompt for the model under test.
  *
  * @returns A fixed prompt that sets the Supabase-expert persona, lists the
  *   best-practice areas and asks for the corrected code in a single code block.
  */
 export function buildCodeFixSystemPrompt(): string {
 	const systemPrompt = `You are a senior Supabase developer and database architect. You fix code to follow Supabase best practices including:
 - Row Level Security (RLS) policies
 - Proper authentication patterns
 - Safe migration workflows
 - Correct SDK usage patterns
 - Edge Function best practices
 - Connection pooling configuration
 - Security-first defaults
 When fixing code, ensure the fix is complete, production-ready, and follows the latest Supabase conventions. Return only the corrected code inside a single code block.`;
 	return systemPrompt;
 }
 /**
  * Builds the user prompt that asks the model to fix a test case's bad example.
  *
  * The prompt names the topic and explanation, shows the bad code in a fence
  * tagged with its language (when known) and appends the label description as
  * an issue hint when one exists.
  *
  * @param testCase Test case whose bad example should be fixed.
  * @returns The prompt text.
  */
 export function buildCodeFixPrompt(testCase: CodeFixTestCase): string {
 	const { code, description, language } = testCase.badExample;
 	const langHint = language ? ` (${language})` : "";
 	const fenceLanguage = language || "";
 	const issueHint = description ? `\nIssue hint: ${description}` : "";
 	return `The following code has a problem related to: ${testCase.title}
 Context: ${testCase.explanation}
 Here is the problematic code${langHint}:
 \`\`\`${fenceLanguage}
 ${code}
 \`\`\`
 ${issueHint}
 Fix this code to follow Supabase best practices. Return ONLY the corrected code inside a single code block. Do not include any explanation outside the code block.`;
 }
--- a/packages/evals/src/scorer.ts
+++ b/packages/evals/src/scorer.ts
@@ -0,0 +1,126 @@
 import { generateText, Output } from "ai";
 import type { EvalScorer } from "braintrust";
 import { z } from "zod";
 import type { CodeFixTestCase } from "./dataset/types.js";
 import type { Expected, Input, Output as TaskOutput } from "./dataset.js";
 import { getModel } from "./models.js";
 // Judge model ID; EVAL_JUDGE_MODEL overrides the default.
 const judgeModelId = process.env.EVAL_JUDGE_MODEL || "claude-opus-4-6";
 // Structured output every judge call must return: a numeric score plus the
 // reasoning behind it. The 0–1 range is only stated in the description, not
 // enforced by the schema.
 const scoreSchema = z.object({
 	score: z
 		.number()
 		.describe("Score from 0 to 1 (0 = bad, 0.5 = partial, 1 = good)"),
 	reasoning: z.string().describe("Brief reasoning for the score"),
 });
 // System prompt shared by all judge calls.
 const SYSTEM_PROMPT =
 	"You are a precise, consistent evaluator of Supabase code fixes. You assess whether LLM-generated code correctly addresses Supabase anti-patterns by comparing against reference solutions. You are fair: functionally equivalent solutions that differ in style or approach from the reference still receive high scores. You are strict: partial fixes, missing security measures, or incorrect patterns receive low scores. Always provide specific evidence for your scoring.";
 /**
  * Builds the shared context every judge prompt starts with.
  *
  * @param tc Test case supplying the topic, explanation, bad code and
  *   reference solution.
  * @param llmOutput The attempted fix to be judged.
  * @returns Markdown with the reference information, the original incorrect
  *   code, the reference correct code and the attempted fix, in that order.
  */
 function buildContext(tc: CodeFixTestCase, llmOutput: string): string {
 	const { badExample, goodExample } = tc;
 	const badLanguage = badExample.language || "";
 	const goodLanguage = goodExample.language || "";
 	return `## Reference Information
 **Topic:** ${tc.title}
 **Explanation:** ${tc.explanation}
 ## Original Incorrect Code
 \`\`\`${badLanguage}
 ${badExample.code}
 \`\`\`
 ## Reference Correct Code (ground truth)
 \`\`\`${goodLanguage}
 ${goodExample.code}
 \`\`\`
 ## LLM's Attempted Fix
 ${llmOutput}`;
 }
 /**
  * Asks the judge model to grade a fix and returns its structured verdict.
  *
  * @param prompt Full judge prompt (context plus the scoring task).
  * @returns The judge's score, limited to the 0–1 range, and its reasoning.
  * @throws Error when the judge returns no structured output.
  */
 async function judge(
 	prompt: string,
 ): Promise<{ score: number; reasoning: string }> {
 	const model = getModel(judgeModelId);
 	const { output } = await generateText({
 		model,
 		system: SYSTEM_PROMPT,
 		prompt,
 		output: Output.object({ schema: scoreSchema }),
 		temperature: 0.1,
 		maxRetries: 2,
 	});
 	if (!output) throw new Error("Judge returned no structured output");
 	// The schema only requires a number, so a judge that over- or undershoots
 	// would hand back a score outside the documented 0–1 scale.
 	const score = Math.min(1, Math.max(0, output.score));
 	return { ...output, score };
 }
 /**
  * Scorer "Correctness": asks the judge whether the fix addresses the core
  * issue of the incorrect code. The judge's reasoning is returned as metadata.
  */
 export const correctnessScorer: EvalScorer<
 	Input,
 	TaskOutput,
 	Expected
 > = async ({ input, output }) => {
 	const prompt = `${buildContext(input.testCase, output.llmOutput)}
 ## Task
 Evaluate **correctness**: Does the LLM's fix address the core issue identified in the incorrect code?
 The fix does not need to be character-identical to the reference, but it must solve the same problem. Functionally equivalent or improved solutions should score well.
 Score 1 if the fix fully addresses the core issue, 0.5 if it partially addresses it, 0 if it fails to address the core issue or introduces new problems.`;
 	const { score, reasoning } = await judge(prompt);
 	return { name: "Correctness", score, metadata: { reasoning } };
 };
 /**
  * Scorer "Completeness": asks the judge whether the fix contains all the
  * changes shown in the reference. The judge's reasoning is returned as metadata.
  */
 export const completenessScorer: EvalScorer<
 	Input,
 	TaskOutput,
 	Expected
 > = async ({ input, output }) => {
 	const prompt = `${buildContext(input.testCase, output.llmOutput)}
 ## Task
 Evaluate **completeness**: Does the LLM's fix include ALL necessary changes shown in the reference?
 Check for missing RLS enablement, missing policy clauses, missing columns, incomplete migrations, or any partial fixes. The fix should be production-ready.
 Score 1 if all necessary changes are present, 0.5 if most changes are present but some are missing, 0 if significant changes are missing.`;
 	const { score, reasoning } = await judge(prompt);
 	return { name: "Completeness", score, metadata: { reasoning } };
 };
 /**
  * Scorer "Best Practice": asks the judge whether the fix follows the Supabase
  * best practices shown in the reference. The judge's reasoning is returned as
  * metadata.
  */
 export const bestPracticeScorer: EvalScorer<
 	Input,
 	TaskOutput,
 	Expected
 > = async ({ input, output }) => {
 	const prompt = `${buildContext(input.testCase, output.llmOutput)}
 ## Task
 Evaluate **best practices**: Does the LLM's fix follow Supabase best practices as demonstrated in the reference?
 Consider: RLS patterns, auth.users references, migration conventions, connection pooling, edge function patterns, SDK usage, and security-first defaults. Alternative correct approaches that achieve the same security/correctness goal are acceptable.
 Score 1 if the fix follows best practices, 0.5 if it mostly follows best practices with minor deviations, 0 if it uses anti-patterns or ignores conventions.`;
 	const { score, reasoning } = await judge(prompt);
 	return { name: "Best Practice", score, metadata: { reasoning } };
 };
--- a/packages/evals/tsconfig.json
+++ b/packages/evals/tsconfig.json
@@ -0,0 +1,16 @@
 {
 	"compilerOptions": {
 		"target": "ES2022",
 		"module": "ESNext",
 		"moduleResolution": "bundler",
 		"esModuleInterop": true,
 		"strict": true,
 		"skipLibCheck": true,
 		"outDir": "dist",
 		"rootDir": "src",
 		"declaration": true,
 		"resolveJsonModule": true
 	},
 	"include": ["src/**/*"],
 	"exclude": ["node_modules", "dist"]
 }