mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
replace vitest for braintrust assertions
This commit is contained in:
85
packages/evals/evals/auth-fk-cascade-delete/EVAL.ts
Normal file
85
packages/evals/evals/auth-fk-cascade-delete/EVAL.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-schema-auth-fk.md",
|
||||
"db-security-functions.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates profiles table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /profiles/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK references auth.users",
|
||||
check: () =>
|
||||
/references\s+auth\.users/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "ON DELETE CASCADE present",
|
||||
check: () => /on\s+delete\s+cascade/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on profiles",
|
||||
check: () =>
|
||||
/alter\s+table.*profiles.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "trigger function uses SECURITY DEFINER",
|
||||
check: () => /security\s+definer/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "trigger function sets search_path",
|
||||
check: () =>
|
||||
/set\s+search_path\s*=\s*''/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "trigger created on auth.users",
|
||||
check: () =>
|
||||
/create\s+trigger[\s\S]*?on\s+auth\.users/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "policies scoped to authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
/alter\s+table.*profiles.*enable\s+row\s+level\s+security/.test(sql),
|
||||
/security\s+definer/.test(sql),
|
||||
/set\s+search_path\s*=\s*''/.test(sql),
|
||||
/create\s+trigger[\s\S]*?on\s+auth\.users/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
7
packages/evals/evals/auth-fk-cascade-delete/PROMPT.md
Normal file
7
packages/evals/evals/auth-fk-cascade-delete/PROMPT.md
Normal file
@@ -0,0 +1,7 @@
|
||||
I'm building a Supabase app and need to set up a `profiles` table. Every user who signs up should automatically get a profile row containing their `id`, `email`, and `full_name` (pulled from signup metadata).
|
||||
|
||||
Please create a SQL migration in `supabase/migrations/` that:
|
||||
|
||||
1. Creates the `profiles` table linked to Supabase Auth users
|
||||
2. Sets up a trigger so a profile row is created automatically whenever a new user signs up
|
||||
3. Enables Row Level Security so users can only read and update their own profile
|
||||
5
packages/evals/evals/auth-fk-cascade-delete/package.json
Normal file
5
packages/evals/evals/auth-fk-cascade-delete/package.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "auth-fk-cascade-delete",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
111
packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml
Normal file
111
packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml
Normal file
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "auth-fk-cascade-delete"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
@@ -1,97 +1,150 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"dev-getting-started.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-migrations-idempotent.md",
|
||||
];
|
||||
|
||||
import { existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { expect, test } from "vitest";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import {
|
||||
anonSeeesNoRows,
|
||||
findMigrationFiles,
|
||||
getMigrationSQL,
|
||||
supabaseDir,
|
||||
getSupabaseDir,
|
||||
queryTable,
|
||||
tableExists,
|
||||
} from "../eval-utils.ts";
|
||||
|
||||
test("supabase project initialized (config.toml exists)", () => {
|
||||
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true);
|
||||
});
|
||||
|
||||
test("migration file exists in supabase/migrations/", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates tasks table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table/);
|
||||
expect(sql).toMatch(/tasks/);
|
||||
});
|
||||
|
||||
test("enables RLS on tasks table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/alter\s+table.*tasks.*enable\s+row\s+level\s+security/);
|
||||
});
|
||||
|
||||
test("has foreign key to auth.users", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
});
|
||||
|
||||
test("uses ON DELETE CASCADE for auth FK", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("uses (select auth.uid()) not bare auth.uid() in policies", () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
// The subselect form: (select auth.uid())
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("policies use TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
expect(policy).toMatch(/to\s+authenticated/);
|
||||
}
|
||||
});
|
||||
|
||||
test("uses timestamptz not plain timestamp for time columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
// Only fail if the migration defines time columns with plain timestamp
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("due_date")
|
||||
) {
|
||||
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||
}
|
||||
});
|
||||
|
||||
test("creates index on user_id column", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
expect(sql).toMatch(/user_id/);
|
||||
});
|
||||
|
||||
test("migration is idempotent (uses IF NOT EXISTS)", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("overall quality: demonstrates Supabase best practices", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// A high-quality migration should contain most of these patterns
|
||||
const signals = [
|
||||
/enable\s+row\s+level\s+security/,
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
/to\s+authenticated/,
|
||||
/on\s+delete\s+cascade/,
|
||||
/create\s+index/,
|
||||
];
|
||||
const matches = signals.filter((r) => r.test(sql));
|
||||
expect(matches.length).toBeGreaterThanOrEqual(4);
|
||||
});
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "supabase project initialized (config.toml exists)",
|
||||
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
|
||||
},
|
||||
{
|
||||
name: "migration file exists in supabase/migrations/",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates tasks table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /tasks/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "enables RLS on tasks table",
|
||||
check: () =>
|
||||
/alter\s+table.*tasks.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "has foreign key to auth.users",
|
||||
check: () =>
|
||||
/references\s+auth\.users/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "uses ON DELETE CASCADE for auth FK",
|
||||
check: () => /on\s+delete\s+cascade/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "uses (select auth.uid()) not bare auth.uid() in policies",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz not plain timestamp for time columns",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("due_date")
|
||||
) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates index on user_id column",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+index/.test(sql) && /user_id/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "does not use SERIAL or BIGSERIAL for primary key",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return !/\bserial\b/.test(sql) && !/\bbigserial\b/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "migration is idempotent (uses IF NOT EXISTS)",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const signals = [
|
||||
/enable\s+row\s+level\s+security/,
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
/to\s+authenticated/,
|
||||
/on\s+delete\s+cascade/,
|
||||
/create\s+index/,
|
||||
];
|
||||
return signals.filter((r) => r.test(sql)).length >= 4;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tasks table exists in the database after migration",
|
||||
check: () => tableExists("tasks"),
|
||||
timeout: 10_000,
|
||||
},
|
||||
{
|
||||
name: "tasks table is queryable with service role",
|
||||
check: async () => {
|
||||
const { error } = await queryTable("tasks", "service_role");
|
||||
return error === null;
|
||||
},
|
||||
timeout: 10_000,
|
||||
},
|
||||
{
|
||||
name: "tasks table returns no rows for anon (RLS is active)",
|
||||
check: () => anonSeeesNoRows("tasks"),
|
||||
timeout: 10_000,
|
||||
},
|
||||
];
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
I'm starting a new Supabase project from scratch for a task management app. Users should sign up with email/password, and each user should only see their own tasks.
|
||||
I'm building a task management app. Users sign up with email/password and should only see their own tasks.
|
||||
|
||||
Set up the project:
|
||||
A Supabase project is already initialized and running locally. The `supabase/` directory and `config.toml` are already set up — do not run `supabase init` or `supabase start`.
|
||||
|
||||
1. Initialize the Supabase project with the CLI (`npx supabase init`)
|
||||
2. Start the local Supabase stack (`npx supabase start`)
|
||||
3. Create a SQL migration for a tasks table with columns: title (text), description (text), status (text), and due_date
|
||||
Create a SQL migration for a tasks table:
|
||||
|
||||
The migration must:
|
||||
|
||||
- Create the tasks table with proper column types
|
||||
- Link tasks to authenticated users
|
||||
- Enable Row Level Security
|
||||
- Create policies so users can only CRUD their own tasks
|
||||
- Add appropriate indexes
|
||||
- Be idempotent (safe to run multiple times)
|
||||
1. Create a new migration file with `npx supabase migration new`
|
||||
2. Write the migration SQL with:
|
||||
- A `tasks` table with columns: title (text), description (text), status (text), due_date (timestamptz)
|
||||
- Link tasks to authenticated users (foreign key to `auth.users`)
|
||||
- Enable Row Level Security
|
||||
- RLS policies so users can only CRUD their own tasks
|
||||
- Appropriate indexes
|
||||
- Idempotent (safe to run multiple times)
|
||||
3. Apply the migration with `npx supabase db push`
|
||||
|
||||
128
packages/evals/evals/cli-hallucinated-commands/EVAL.ts
Normal file
128
packages/evals/evals/cli-hallucinated-commands/EVAL.ts
Normal file
@@ -0,0 +1,128 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"dev-getting-started.md",
|
||||
"edge-fun-quickstart.md",
|
||||
];
|
||||
|
||||
import { readdirSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
const cwd = process.cwd();
|
||||
|
||||
function findReferenceFile(): string | null {
|
||||
const candidates = readdirSync(cwd).filter((f) => {
|
||||
const lower = f.toLowerCase();
|
||||
return (
|
||||
lower === "cli_reference.md" ||
|
||||
lower === "cli-reference.md" ||
|
||||
lower === "clireference.md"
|
||||
);
|
||||
});
|
||||
return candidates.length > 0 ? join(cwd, candidates[0]) : null;
|
||||
}
|
||||
|
||||
function getReferenceContent(): string {
|
||||
const file = findReferenceFile();
|
||||
if (!file) throw new Error("CLI_REFERENCE.md not found in project root");
|
||||
return readFileSync(file, "utf-8");
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "CLI_REFERENCE.md exists in project root",
|
||||
check: () => findReferenceFile() !== null,
|
||||
},
|
||||
{
|
||||
name: "no hallucinated functions log command",
|
||||
check: () => {
|
||||
const content = getReferenceContent();
|
||||
return (
|
||||
!/`supabase\s+functions\s+log`/.test(content) &&
|
||||
!/^\s*npx\s+supabase\s+functions\s+log\b/m.test(content) &&
|
||||
!/^\s*supabase\s+functions\s+log\b/m.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "no hallucinated db query command",
|
||||
check: () => {
|
||||
const content = getReferenceContent();
|
||||
return (
|
||||
!/`supabase\s+db\s+query`/.test(content) &&
|
||||
!/^\s*npx\s+supabase\s+db\s+query\b/m.test(content) &&
|
||||
!/^\s*supabase\s+db\s+query\b/m.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "mentions supabase functions serve for local development",
|
||||
check: () =>
|
||||
/supabase\s+functions\s+serve/.test(getReferenceContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "mentions supabase functions deploy",
|
||||
check: () =>
|
||||
/supabase\s+functions\s+deploy/.test(getReferenceContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "mentions psql or SQL Editor or connection string for ad-hoc SQL",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
return (
|
||||
/\bpsql\b/.test(content) ||
|
||||
/sql\s+editor/.test(content) ||
|
||||
/connection\s+string/.test(content) ||
|
||||
/supabase\s+db\s+dump/.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "mentions supabase db push or supabase db reset for migrations",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
return (
|
||||
/supabase\s+db\s+push/.test(content) ||
|
||||
/supabase\s+db\s+reset/.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "mentions supabase start for local stack",
|
||||
check: () => /supabase\s+start/.test(getReferenceContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "mentions Dashboard or Logs Explorer for production log viewing",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
return /\bdashboard\b/.test(content) || /logs\s+explorer/.test(content);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: uses real CLI commands throughout",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
const signals = [
|
||||
/supabase\s+start/,
|
||||
/supabase\s+stop/,
|
||||
/supabase\s+functions\s+serve/,
|
||||
/supabase\s+functions\s+deploy/,
|
||||
/supabase\s+db\s+(push|reset|diff)/,
|
||||
/\bpsql\b|\bsql\s+editor\b|\bconnection\s+string\b/,
|
||||
/\bdashboard\b|\blogs\s+explorer\b/,
|
||||
];
|
||||
const hallucinations = [
|
||||
/`supabase\s+functions\s+log`/,
|
||||
/^\s*npx\s+supabase\s+functions\s+log\b/m,
|
||||
/^\s*supabase\s+functions\s+log\b/m,
|
||||
/`supabase\s+db\s+query`/,
|
||||
/^\s*npx\s+supabase\s+db\s+query\b/m,
|
||||
/^\s*supabase\s+db\s+query\b/m,
|
||||
];
|
||||
const positiveMatches = signals.filter((r) => r.test(content)).length;
|
||||
const hallucinationMatches = hallucinations.filter((r) =>
|
||||
r.test(content),
|
||||
).length;
|
||||
return positiveMatches >= 5 && hallucinationMatches === 0;
|
||||
},
|
||||
},
|
||||
];
|
||||
9
packages/evals/evals/cli-hallucinated-commands/PROMPT.md
Normal file
9
packages/evals/evals/cli-hallucinated-commands/PROMPT.md
Normal file
@@ -0,0 +1,9 @@
|
||||
I'm onboarding a new developer to my Supabase project. Create a `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet of Supabase CLI commands we use day-to-day. It should cover:
|
||||
|
||||
1. Starting and stopping the local dev stack
|
||||
2. Managing database migrations (push, reset, diff)
|
||||
3. Working with the `process-order` Edge Function (local dev and deploy)
|
||||
4. How to view Edge Function logs (both local dev and production)
|
||||
5. How to run ad-hoc SQL queries against the database (local and remote)
|
||||
|
||||
Include the actual commands with brief explanations.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "cli-hallucinated-commands",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "cli-hallucinated-commands"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
@@ -0,0 +1,29 @@
|
||||
import { createClient } from "jsr:@supabase/supabase-js@2";
|
||||
|
||||
Deno.serve(async (req) => {
|
||||
try {
|
||||
const { orderId } = await req.json();
|
||||
|
||||
const supabase = createClient(
|
||||
Deno.env.get("SUPABASE_URL") ?? "",
|
||||
Deno.env.get("SUPABASE_ANON_KEY") ?? "",
|
||||
);
|
||||
|
||||
const { data, error } = await supabase
|
||||
.from("orders")
|
||||
.select("*")
|
||||
.eq("id", orderId)
|
||||
.single();
|
||||
|
||||
if (error) throw error;
|
||||
|
||||
return new Response(JSON.stringify({ order: data }), {
|
||||
headers: { "Content-Type": "application/json" },
|
||||
});
|
||||
} catch (err) {
|
||||
return new Response(JSON.stringify({ error: String(err) }), {
|
||||
status: 500,
|
||||
headers: { "Content-Type": "application/json" },
|
||||
});
|
||||
}
|
||||
});
|
||||
@@ -1,333 +1,354 @@
|
||||
import { expect, test } from "vitest";
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-performance.md",
|
||||
"db-security-functions.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-schema-realtime.md",
|
||||
"db-perf-indexes.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"realtime-setup-auth.md",
|
||||
"realtime-broadcast-database.md",
|
||||
"realtime-setup-channels.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
test("migration file exists", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates rooms table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?rooms/);
|
||||
});
|
||||
|
||||
test("creates room_members table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Accept room_members, members, memberships, room_users, etc.
|
||||
const hasMembership =
|
||||
/create\s+table[\s\S]*?room_members/.test(sql) ||
|
||||
/create\s+table[\s\S]*?room_users/.test(sql) ||
|
||||
/create\s+table[\s\S]*?memberships/.test(sql);
|
||||
expect(hasMembership).toBe(true);
|
||||
});
|
||||
|
||||
test("creates content table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Accept content, contents, items, room_content, room_items, documents, etc.
|
||||
const hasContent =
|
||||
/create\s+table[\s\S]*?content/.test(sql) ||
|
||||
/create\s+table[\s\S]*?items/.test(sql) ||
|
||||
/create\s+table[\s\S]*?documents/.test(sql) ||
|
||||
/create\s+table[\s\S]*?posts/.test(sql) ||
|
||||
/create\s+table[\s\S]*?messages/.test(sql);
|
||||
expect(hasContent).toBe(true);
|
||||
});
|
||||
|
||||
test("room_members has role column with owner/editor/viewer", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/role/);
|
||||
// Should define the three roles somewhere (enum, check constraint, or comment)
|
||||
expect(sql).toMatch(/owner/);
|
||||
expect(sql).toMatch(/editor/);
|
||||
expect(sql).toMatch(/viewer/);
|
||||
});
|
||||
|
||||
test("enables RLS on all application tables", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Must enable RLS on rooms
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
// Must enable RLS on membership table
|
||||
const hasMembershipRls =
|
||||
/alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
expect(hasMembershipRls).toBe(true);
|
||||
// Must enable RLS on content table (accept various names)
|
||||
const hasContentRls =
|
||||
/alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
expect(hasContentRls).toBe(true);
|
||||
});
|
||||
|
||||
test("FK to auth.users with ON DELETE CASCADE", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("content has room_id FK referencing rooms", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Content table should have a foreign key to rooms
|
||||
expect(sql).toMatch(/room_id[\s\S]*?references[\s\S]*?rooms/);
|
||||
});
|
||||
|
||||
test("policies use (select auth.uid())", () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("policies use TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// Filter to only application table policies (not realtime.messages which may use different roles)
|
||||
const appPolicies = policyBlocks.filter(
|
||||
(p) => !p.includes("realtime.messages"),
|
||||
);
|
||||
expect(appPolicies.length).toBeGreaterThan(0);
|
||||
for (const policy of appPolicies) {
|
||||
expect(policy).toMatch(/to\s+authenticated/);
|
||||
}
|
||||
});
|
||||
|
||||
test("private schema with security_definer helper function", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Private schema should be created
|
||||
expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
|
||||
// A function in the private schema with SECURITY DEFINER
|
||||
expect(sql).toMatch(/private\./);
|
||||
expect(sql).toMatch(/security\s+definer/);
|
||||
expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
|
||||
});
|
||||
|
||||
test("role-based write policies: content INSERT/UPDATE restricted to owner or editor", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// Find INSERT or UPDATE policies on the content table
|
||||
const writePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
(/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
// At least one write policy should check for owner or editor role
|
||||
const checksRole = writePolicies.some(
|
||||
(p) => p.includes("owner") || p.includes("editor"),
|
||||
);
|
||||
expect(checksRole).toBe(true);
|
||||
});
|
||||
|
||||
test("viewer role is read-only (no write access to content)", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// Find content write policies (INSERT, UPDATE, DELETE)
|
||||
const contentWritePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
/for\s+(insert|update|delete)/.test(p) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
// None of the write policies should grant access to viewer role
|
||||
// They should either explicitly check for owner/editor OR exclude viewer
|
||||
if (contentWritePolicies.length > 0) {
|
||||
const anyGrantsViewer = contentWritePolicies.some((p) => {
|
||||
// If the policy doesn't mention any role, it's too permissive
|
||||
const mentionsRole =
|
||||
p.includes("owner") || p.includes("editor") || p.includes("viewer");
|
||||
if (!mentionsRole) return true; // no role check = viewer could write
|
||||
// If it specifically includes viewer in a write context, that's wrong
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates rooms table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?rooms/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "creates room_members table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
p.includes("viewer") && !p.includes("owner") && !p.includes("editor")
|
||||
/create\s+table[\s\S]*?room_members/.test(sql) ||
|
||||
/create\s+table[\s\S]*?room_users/.test(sql) ||
|
||||
/create\s+table[\s\S]*?memberships/.test(sql)
|
||||
);
|
||||
});
|
||||
expect(anyGrantsViewer).toBe(false);
|
||||
}
|
||||
});
|
||||
|
||||
test("indexes on membership lookup columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
// Should index user_id and/or room_id on the membership table
|
||||
const membershipIndexes = indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.toLowerCase().includes("user_id") ||
|
||||
idx.toLowerCase().includes("room_id"),
|
||||
);
|
||||
expect(membershipIndexes.length).toBeGreaterThanOrEqual(1);
|
||||
});
|
||||
|
||||
test("uses timestamptz not plain timestamp", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||
const hasPlainTimestamp =
|
||||
/(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
// Only fail if the migration defines time columns with plain timestamp
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||
}
|
||||
});
|
||||
|
||||
test("idempotent DDL", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("realtime publication enabled for content table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should add the content table to supabase_realtime publication
|
||||
expect(sql).toMatch(/alter\s+publication\s+supabase_realtime\s+add\s+table/);
|
||||
});
|
||||
|
||||
test("broadcast trigger for content changes", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should use realtime.broadcast_changes() or realtime.send() in a trigger
|
||||
const usesBroadcastChanges = /realtime\.broadcast_changes/.test(sql);
|
||||
const usesRealtimeSend = /realtime\.send/.test(sql);
|
||||
expect(usesBroadcastChanges || usesRealtimeSend).toBe(true);
|
||||
// Should create a trigger on the content table
|
||||
expect(sql).toMatch(/create\s+trigger/);
|
||||
});
|
||||
|
||||
test("broadcast trigger function uses security definer", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Find function definitions that reference realtime.broadcast_changes or realtime.send
|
||||
const functionBlocks =
|
||||
sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? [];
|
||||
const realtimeFunctions = functionBlocks.filter(
|
||||
(f) =>
|
||||
f.toLowerCase().includes("realtime.broadcast_changes") ||
|
||||
f.toLowerCase().includes("realtime.send"),
|
||||
);
|
||||
expect(realtimeFunctions.length).toBeGreaterThan(0);
|
||||
// The trigger function should have security definer and search_path
|
||||
const hasSecurityDefiner = realtimeFunctions.some(
|
||||
(f) =>
|
||||
/security\s+definer/.test(f.toLowerCase()) &&
|
||||
/set\s+search_path\s*=\s*''/.test(f.toLowerCase()),
|
||||
);
|
||||
expect(hasSecurityDefiner).toBe(true);
|
||||
});
|
||||
|
||||
test("RLS policies on realtime.messages", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
expect(realtimePolicies.length).toBeGreaterThan(0);
|
||||
// At least one policy should target authenticated users
|
||||
const hasAuthPolicy = realtimePolicies.some(
|
||||
(p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p),
|
||||
);
|
||||
expect(hasAuthPolicy).toBe(true);
|
||||
});
|
||||
|
||||
test("realtime policy checks extension column", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
// At least one realtime policy should reference the extension column
|
||||
const checksExtension = realtimePolicies.some(
|
||||
(p) =>
|
||||
p.includes("extension") &&
|
||||
(p.includes("broadcast") || p.includes("presence")),
|
||||
);
|
||||
expect(checksExtension).toBe(true);
|
||||
});
|
||||
|
||||
test("overall quality score", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
|
||||
const signals = [
|
||||
// 1. RLS enabled on rooms
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
// 2. RLS enabled on membership table
|
||||
/alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
// 3. RLS enabled on content table
|
||||
/alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
// 4. FK to auth.users with cascade
|
||||
/references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
|
||||
// 5. Private schema created
|
||||
/create\s+schema[\s\S]*?private/.test(sql),
|
||||
// 6. security_definer with search_path
|
||||
/security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql),
|
||||
// 7. Subselect auth.uid()
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
// 8. TO authenticated on policies
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.filter((p) => !p.includes("realtime.messages")).length > 0 &&
|
||||
policyBlocks
|
||||
.filter((p) => !p.includes("realtime.messages"))
|
||||
.every((p) => /to\s+authenticated/.test(p)),
|
||||
// 9. Indexes on lookup columns
|
||||
/create\s+index/.test(sql),
|
||||
// 10. timestamptz usage (accepts both timestamptz and timestamp with time zone)
|
||||
/timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql),
|
||||
// 11. IF NOT EXISTS for idempotency
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
// 12. Role-based policies (owner/editor/viewer)
|
||||
sql.includes("owner") && sql.includes("editor") && sql.includes("viewer"),
|
||||
// 13. Realtime publication
|
||||
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql),
|
||||
// 14. Broadcast trigger (broadcast_changes or realtime.send)
|
||||
/realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql),
|
||||
// 15. Trigger creation
|
||||
/create\s+trigger/.test(sql),
|
||||
// 16. RLS on realtime.messages
|
||||
policyBlocks.some((p) => p.includes("realtime.messages")),
|
||||
// 17. Extension check in realtime policy
|
||||
policyBlocks
|
||||
.filter((p) => p.includes("realtime.messages"))
|
||||
.some((p) => p.includes("extension")),
|
||||
// 18. room_id FK on content
|
||||
/room_id[\s\S]*?references[\s\S]*?rooms/.test(sql),
|
||||
];
|
||||
const passed = signals.filter(Boolean).length;
|
||||
expect(passed).toBeGreaterThanOrEqual(13);
|
||||
});
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates content table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/create\s+table[\s\S]*?content/.test(sql) ||
|
||||
/create\s+table[\s\S]*?items/.test(sql) ||
|
||||
/create\s+table[\s\S]*?documents/.test(sql) ||
|
||||
/create\s+table[\s\S]*?posts/.test(sql) ||
|
||||
/create\s+table[\s\S]*?messages/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "room_members has role column with owner/editor/viewer",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/role/.test(sql) &&
|
||||
/owner/.test(sql) &&
|
||||
/editor/.test(sql) &&
|
||||
/viewer/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "enables RLS on all application tables",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const roomsRls =
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
const membershipRls =
|
||||
/alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
const contentRls =
|
||||
/alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
return roomsRls && membershipRls && contentRls;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "content has room_id FK referencing rooms",
|
||||
check: () =>
|
||||
/room_id[\s\S]*?references[\s\S]*?rooms/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "policies use (select auth.uid())",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
if (policyBlocks.length === 0) return false;
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const appPolicies = policyBlocks.filter(
|
||||
(p) => !p.includes("realtime.messages"),
|
||||
);
|
||||
return (
|
||||
appPolicies.length > 0 &&
|
||||
appPolicies.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "private schema with security_definer helper function",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/create\s+schema[\s\S]*?private/.test(sql) &&
|
||||
/private\./.test(sql) &&
|
||||
/security\s+definer/.test(sql) &&
|
||||
/set\s+search_path\s*=\s*''/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "role-based write policies: content INSERT/UPDATE restricted to owner or editor",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const writePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
(/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
return writePolicies.some(
|
||||
(p) => p.includes("owner") || p.includes("editor"),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "viewer role is read-only (no write access to content)",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const contentWritePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
/for\s+(insert|update|delete)/.test(p) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
if (contentWritePolicies.length === 0) return true;
|
||||
return !contentWritePolicies.some((p) => {
|
||||
const mentionsRole =
|
||||
p.includes("owner") || p.includes("editor") || p.includes("viewer");
|
||||
if (!mentionsRole) return true;
|
||||
return (
|
||||
p.includes("viewer") && !p.includes("owner") && !p.includes("editor")
|
||||
);
|
||||
});
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "indexes on membership lookup columns",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/create\s+index/.test(sql)) return false;
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.toLowerCase().includes("user_id") ||
|
||||
idx.toLowerCase().includes("room_id"),
|
||||
).length >= 1
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz not plain timestamp",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent DDL",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "realtime publication enabled for content table",
|
||||
check: () =>
|
||||
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "broadcast trigger for content changes",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
(/realtime\.broadcast_changes/.test(sql) ||
|
||||
/realtime\.send/.test(sql)) &&
|
||||
/create\s+trigger/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "broadcast trigger function uses security definer",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const functionBlocks =
|
||||
sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? [];
|
||||
const realtimeFunctions = functionBlocks.filter(
|
||||
(f) =>
|
||||
f.toLowerCase().includes("realtime.broadcast_changes") ||
|
||||
f.toLowerCase().includes("realtime.send"),
|
||||
);
|
||||
if (realtimeFunctions.length === 0) return false;
|
||||
return realtimeFunctions.some(
|
||||
(f) =>
|
||||
/security\s+definer/.test(f.toLowerCase()) &&
|
||||
/set\s+search_path\s*=\s*''/.test(f.toLowerCase()),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "RLS policies on realtime.messages",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
if (realtimePolicies.length === 0) return false;
|
||||
return realtimePolicies.some(
|
||||
(p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "realtime policy checks extension column",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
return realtimePolicies.some(
|
||||
(p) =>
|
||||
p.includes("extension") &&
|
||||
(p.includes("broadcast") || p.includes("presence")),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality score",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
/create\s+schema[\s\S]*?private/.test(sql),
|
||||
/security\s+definer/.test(sql) &&
|
||||
/set\s+search_path\s*=\s*''/.test(sql),
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.filter((p) => !p.includes("realtime.messages")).length >
|
||||
0 &&
|
||||
policyBlocks
|
||||
.filter((p) => !p.includes("realtime.messages"))
|
||||
.every((p) => /to\s+authenticated/.test(p)),
|
||||
/create\s+index/.test(sql),
|
||||
/timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql),
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
sql.includes("owner") &&
|
||||
sql.includes("editor") &&
|
||||
sql.includes("viewer"),
|
||||
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql),
|
||||
/realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql),
|
||||
/create\s+trigger/.test(sql),
|
||||
policyBlocks.some((p) => p.includes("realtime.messages")),
|
||||
policyBlocks
|
||||
.filter((p) => p.includes("realtime.messages"))
|
||||
.some((p) => p.includes("extension")),
|
||||
/room_id[\s\S]*?references[\s\S]*?rooms/.test(sql),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 13;
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
# Direct connection to the database — used for migrations
|
||||
# Replace with your Supabase project's direct connection string
|
||||
DATABASE_URL="postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres"
|
||||
134
packages/evals/evals/connection-pooling-prisma/EVAL.ts
Normal file
134
packages/evals/evals/connection-pooling-prisma/EVAL.ts
Normal file
@@ -0,0 +1,134 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-conn-pooling.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"db-schema-auth-fk.md",
|
||||
];
|
||||
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
const cwd = process.cwd();
|
||||
|
||||
function findPrismaSchema(): string | null {
|
||||
const candidates = [
|
||||
join(cwd, "prisma", "schema.prisma"),
|
||||
join(cwd, "schema.prisma"),
|
||||
];
|
||||
for (const p of candidates) {
|
||||
if (existsSync(p)) return p;
|
||||
}
|
||||
const prismaDir = join(cwd, "prisma");
|
||||
if (existsSync(prismaDir)) {
|
||||
const files = readdirSync(prismaDir).filter((f) => f.endsWith(".prisma"));
|
||||
if (files.length > 0) return join(prismaDir, files[0]);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function getPrismaSchema(): string {
|
||||
const file = findPrismaSchema();
|
||||
if (!file) throw new Error("No .prisma schema file found");
|
||||
return readFileSync(file, "utf-8");
|
||||
}
|
||||
|
||||
function findEnvFiles(): string[] {
|
||||
const found: string[] = [];
|
||||
for (const name of [
|
||||
".env",
|
||||
".env.example",
|
||||
".env.local",
|
||||
".env.production",
|
||||
".env.development",
|
||||
]) {
|
||||
const p = join(cwd, name);
|
||||
if (existsSync(p)) found.push(p);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
function getAllEnvContent(): string {
|
||||
return findEnvFiles()
|
||||
.map((f) => readFileSync(f, "utf-8"))
|
||||
.join("\n");
|
||||
}
|
||||
|
||||
function getAllOutputContent(): string {
|
||||
const parts: string[] = [];
|
||||
const schema = findPrismaSchema();
|
||||
if (schema) parts.push(readFileSync(schema, "utf-8"));
|
||||
parts.push(getAllEnvContent());
|
||||
const mdFiles = readdirSync(cwd).filter((f) => f.endsWith(".md"));
|
||||
for (const f of mdFiles) {
|
||||
parts.push(readFileSync(join(cwd, f), "utf-8"));
|
||||
}
|
||||
return parts.join("\n");
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "prisma schema file exists",
|
||||
check: () => findPrismaSchema() !== null,
|
||||
},
|
||||
{
|
||||
name: "prisma schema references pooler port 6543",
|
||||
check: () => /6543/.test(getAllOutputContent()),
|
||||
},
|
||||
{
|
||||
name: "pgbouncer=true param present",
|
||||
check: () =>
|
||||
/pgbouncer\s*=\s*true/.test(getAllOutputContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "DIRECT_URL provided for migrations",
|
||||
check: () => {
|
||||
const allContent = `${getPrismaSchema().toLowerCase()}\n${getAllEnvContent().toLowerCase()}`;
|
||||
return /directurl/.test(allContent) || /direct_url/.test(allContent);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "datasource block references directUrl or DIRECT_URL env var",
|
||||
check: () => {
|
||||
const schema = getPrismaSchema().toLowerCase();
|
||||
const datasourceBlock =
|
||||
schema.match(/datasource\s+\w+\s*\{[\s\S]*?\}/)?.[0] ?? "";
|
||||
return (
|
||||
/directurl/.test(datasourceBlock) || /direct_url/.test(datasourceBlock)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "connection limit set to 1 for serverless",
|
||||
check: () => {
|
||||
const content = getAllOutputContent().toLowerCase();
|
||||
return (
|
||||
/connection_limit\s*=\s*1/.test(content) ||
|
||||
/connection_limit:\s*1/.test(content) ||
|
||||
/connectionlimit\s*=\s*1/.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "explanation distinguishes port 6543 vs 5432",
|
||||
check: () => {
|
||||
const content = getAllOutputContent();
|
||||
return /6543/.test(content) && /5432/.test(content);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates correct Prisma + Supabase pooler setup",
|
||||
check: () => {
|
||||
const schema = getPrismaSchema().toLowerCase();
|
||||
const envContent = getAllEnvContent().toLowerCase();
|
||||
const allContent = `${schema}\n${envContent}`;
|
||||
const signals = [
|
||||
/6543/,
|
||||
/pgbouncer\s*=\s*true/,
|
||||
/directurl|direct_url/,
|
||||
/connection_limit\s*=\s*1|connection_limit:\s*1/,
|
||||
/5432/,
|
||||
];
|
||||
return signals.filter((r) => r.test(allContent)).length >= 4;
|
||||
},
|
||||
},
|
||||
];
|
||||
3
packages/evals/evals/connection-pooling-prisma/PROMPT.md
Normal file
3
packages/evals/evals/connection-pooling-prisma/PROMPT.md
Normal file
@@ -0,0 +1,3 @@
|
||||
I'm deploying my Supabase app on Vercel using Prisma. I keep getting "prepared statement already exists" errors in production. My current `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string on port 5432 with no pooler settings.
|
||||
|
||||
Fix the Prisma configuration so it works correctly with Supabase's connection pooler for serverless deployments. Make any changes needed to `prisma/schema.prisma` and update the `.env.example` file with the correct connection string format.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "connection-pooling-prisma",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
// This is your Prisma schema file,
|
||||
// learn more about it in the docs: https://pris.ly/d/prisma-schema
|
||||
|
||||
generator client {
|
||||
provider = "prisma-client-js"
|
||||
}
|
||||
|
||||
datasource db {
|
||||
provider = "postgresql"
|
||||
url = env("DATABASE_URL")
|
||||
}
|
||||
|
||||
model User {
|
||||
id String @id @default(cuid())
|
||||
email String @unique
|
||||
name String?
|
||||
createdAt DateTime @default(now())
|
||||
posts Post[]
|
||||
}
|
||||
|
||||
model Post {
|
||||
id String @id @default(cuid())
|
||||
title String
|
||||
content String?
|
||||
published Boolean @default(false)
|
||||
author User @relation(fields: [authorId], references: [id])
|
||||
authorId String
|
||||
createdAt DateTime @default(now())
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "connection-pooling-prisma"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
@@ -1,26 +1,31 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"edge-fun-quickstart.md",
|
||||
"edge-fun-project-structure.md",
|
||||
"edge-pat-cors.md",
|
||||
"edge-pat-error-handling.md",
|
||||
"dev-getting-started.md",
|
||||
];
|
||||
|
||||
import { existsSync, readdirSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { expect, test } from "vitest";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import {
|
||||
findFunctionFile,
|
||||
findSharedCorsFile,
|
||||
functionsDir,
|
||||
getFunctionCode,
|
||||
getFunctionsDir,
|
||||
getSharedCode,
|
||||
supabaseDir,
|
||||
getSupabaseDir,
|
||||
} from "../eval-utils.ts";
|
||||
|
||||
const FUNCTION_NAME = "hello-world";
|
||||
const helloWorldDir = join(functionsDir, FUNCTION_NAME);
|
||||
|
||||
/** Read function code + all shared modules combined. */
|
||||
function getAllCode(): string {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
return `${code}\n${getSharedCode()}`;
|
||||
}
|
||||
|
||||
/** Extract the code after the first `catch` keyword to the end of the function. */
|
||||
function getCatchBlockCode(): string {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
const catchIndex = code.search(/\bcatch\b/);
|
||||
@@ -28,121 +33,123 @@ function getCatchBlockCode(): string {
|
||||
return code.slice(catchIndex);
|
||||
}
|
||||
|
||||
test("supabase project initialized", () => {
|
||||
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true);
|
||||
});
|
||||
|
||||
test("function directory exists", () => {
|
||||
expect(existsSync(helloWorldDir)).toBe(true);
|
||||
});
|
||||
|
||||
test("function index file exists", () => {
|
||||
expect(findFunctionFile(FUNCTION_NAME)).not.toBeNull();
|
||||
});
|
||||
|
||||
test("uses Deno.serve", () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
expect(code).toMatch(/Deno\.serve/);
|
||||
});
|
||||
|
||||
test("returns JSON response", () => {
|
||||
// Check both the function file and shared modules for JSON response patterns
|
||||
const allCode = getAllCode();
|
||||
const hasContentTypeHeader =
|
||||
/content-type['"]\s*:\s*['"]application\/json/i.test(allCode);
|
||||
const hasResponseJson = /Response\.json/i.test(allCode);
|
||||
const hasJsonStringify = /JSON\.stringify/i.test(allCode);
|
||||
expect(hasContentTypeHeader || hasResponseJson || hasJsonStringify).toBe(
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("handles OPTIONS preflight", () => {
|
||||
// OPTIONS handling may be in the function itself or in a shared CORS helper
|
||||
const allCode = getAllCode();
|
||||
expect(allCode).toMatch(/['"]OPTIONS['"]/);
|
||||
expect(allCode).toMatch(/\.method/);
|
||||
});
|
||||
|
||||
test("defines CORS headers", () => {
|
||||
const allCode = getAllCode();
|
||||
expect(allCode).toMatch(/Access-Control-Allow-Origin/);
|
||||
});
|
||||
|
||||
test("CORS allows required headers", () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
// Must include authorization and apikey in allowed headers
|
||||
expect(allCode).toMatch(/access-control-allow-headers/);
|
||||
expect(allCode).toMatch(/authorization/);
|
||||
expect(allCode).toMatch(/apikey/);
|
||||
});
|
||||
|
||||
test("error response has CORS headers", () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
expect(catchCode.length).toBeGreaterThan(0);
|
||||
// The catch block should either directly reference CORS headers, or call
|
||||
// a shared helper that includes them (e.g. errorResponse, corsHeaders).
|
||||
const sharedCode = getSharedCode();
|
||||
// Direct CORS reference in catch block
|
||||
const directCors =
|
||||
/corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode);
|
||||
// Calls a shared helper that itself includes CORS headers
|
||||
const callsSharedHelper =
|
||||
/errorResponse|jsonResponse|json_response|error_response/i.test(
|
||||
catchCode,
|
||||
) && /Access-Control-Allow-Origin/i.test(sharedCode);
|
||||
expect(directCors || callsSharedHelper).toBe(true);
|
||||
});
|
||||
|
||||
test("has try-catch for error handling", () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
expect(code).toMatch(/\btry\s*\{/);
|
||||
expect(code).toMatch(/\bcatch\b/);
|
||||
});
|
||||
|
||||
test("returns proper error status code", () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
expect(catchCode.length).toBeGreaterThan(0);
|
||||
// Error response should use status 400 or 500 (not default 200).
|
||||
// Match object-style { status: 500 } or function-call-style fn('msg', 500)
|
||||
const hasObjectStatus = /status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode);
|
||||
const hasFnArgStatus = /[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test(
|
||||
catchCode,
|
||||
);
|
||||
expect(hasObjectStatus || hasFnArgStatus).toBe(true);
|
||||
});
|
||||
|
||||
test("shared CORS module exists", () => {
|
||||
expect(findSharedCorsFile()).not.toBeNull();
|
||||
});
|
||||
|
||||
test("function imports from shared", () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
// Should import from ../_shared/ relative path
|
||||
expect(code).toMatch(/from\s+['"]\.\.\/(_shared|_utils)/);
|
||||
});
|
||||
|
||||
test("function uses hyphenated name", () => {
|
||||
// The function directory should use hyphens, not underscores
|
||||
const dirs = existsSync(functionsDir) ? readdirSync(functionsDir) : [];
|
||||
const helloDir = dirs.find((d) => d.includes("hello") && d.includes("world"));
|
||||
expect(helloDir).toBeDefined();
|
||||
expect(helloDir).toMatch(/^hello-world$/);
|
||||
});
|
||||
|
||||
test("overall quality: demonstrates Edge Function best practices", () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
// A high-quality Edge Function should contain most of these patterns
|
||||
const signals = [
|
||||
/deno\.serve/, // Modern Deno.serve API
|
||||
/['"]options['"]/, // OPTIONS preflight handling
|
||||
/access-control-allow-origin/, // CORS headers defined
|
||||
/\btry\s*\{/, // Error handling with try-catch
|
||||
/status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/, // Proper error status codes
|
||||
/from\s+['"]\.\.\/(_shared|_utils)/, // Imports from shared directory
|
||||
/authorization/, // Allows authorization header in CORS
|
||||
/apikey/, // Allows apikey header in CORS
|
||||
];
|
||||
const matches = signals.filter((r) => r.test(allCode));
|
||||
expect(matches.length).toBeGreaterThanOrEqual(6);
|
||||
});
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "supabase project initialized",
|
||||
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
|
||||
},
|
||||
{
|
||||
name: "function directory exists",
|
||||
check: () => existsSync(join(getFunctionsDir(), FUNCTION_NAME)),
|
||||
},
|
||||
{
|
||||
name: "function index file exists",
|
||||
check: () => findFunctionFile(FUNCTION_NAME) !== null,
|
||||
},
|
||||
{
|
||||
name: "uses Deno.serve",
|
||||
check: () => /Deno\.serve/.test(getFunctionCode(FUNCTION_NAME)),
|
||||
},
|
||||
{
|
||||
name: "returns JSON response",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return (
|
||||
/content-type['"]\s*:\s*['"]application\/json/i.test(allCode) ||
|
||||
/Response\.json/i.test(allCode) ||
|
||||
/JSON\.stringify/i.test(allCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "handles OPTIONS preflight",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return /['"]OPTIONS['"]/.test(allCode) && /\.method/.test(allCode);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "defines CORS headers",
|
||||
check: () => /Access-Control-Allow-Origin/.test(getAllCode()),
|
||||
},
|
||||
{
|
||||
name: "CORS allows required headers",
|
||||
check: () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
return (
|
||||
/access-control-allow-headers/.test(allCode) &&
|
||||
/authorization/.test(allCode) &&
|
||||
/apikey/.test(allCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "error response has CORS headers",
|
||||
check: () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
if (catchCode.length === 0) return false;
|
||||
const sharedCode = getSharedCode();
|
||||
const directCors =
|
||||
/corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode);
|
||||
const callsSharedHelper =
|
||||
/errorResponse|jsonResponse|json_response|error_response/i.test(
|
||||
catchCode,
|
||||
) && /Access-Control-Allow-Origin/i.test(sharedCode);
|
||||
return directCors || callsSharedHelper;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "has try-catch for error handling",
|
||||
check: () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
return /\btry\s*\{/.test(code) && /\bcatch\b/.test(code);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "returns proper error status code",
|
||||
check: () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
if (catchCode.length === 0) return false;
|
||||
return (
|
||||
/status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode) ||
|
||||
/[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test(catchCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "shared CORS module exists",
|
||||
check: () => findSharedCorsFile() !== null,
|
||||
},
|
||||
{
|
||||
name: "function imports from shared",
|
||||
check: () =>
|
||||
/from\s+['"]\.\.\/(_shared|_utils)/.test(getFunctionCode(FUNCTION_NAME)),
|
||||
},
|
||||
{
|
||||
name: "function uses hyphenated name",
|
||||
check: () => {
|
||||
const dirs = existsSync(getFunctionsDir()) ? readdirSync(getFunctionsDir()) : [];
|
||||
const helloDir = dirs.find(
|
||||
(d) => d.includes("hello") && d.includes("world"),
|
||||
);
|
||||
return helloDir !== undefined && /^hello-world$/.test(helloDir);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Edge Function best practices",
|
||||
check: () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
const signals = [
|
||||
/deno\.serve/,
|
||||
/['"]options['"]/,
|
||||
/access-control-allow-origin/,
|
||||
/\btry\s*\{/,
|
||||
/status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/,
|
||||
/from\s+['"]\.\.\/(_shared|_utils)/,
|
||||
/authorization/,
|
||||
/apikey/,
|
||||
];
|
||||
return signals.filter((r) => r.test(allCode)).length >= 6;
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
@@ -2,12 +2,90 @@ import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Common paths
|
||||
// Runtime DB helpers (use only in async tests)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const supabaseDir = join(process.cwd(), "supabase");
|
||||
export const migrationsDir = join(supabaseDir, "migrations");
|
||||
export const functionsDir = join(supabaseDir, "functions");
|
||||
const SUPABASE_URL = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321";
|
||||
const SERVICE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY ?? "";
|
||||
const ANON_KEY = process.env.SUPABASE_ANON_KEY ?? "";
|
||||
|
||||
/** Execute a raw SQL query via PostgREST's /rpc endpoint or via the REST API. */
|
||||
async function pgRest(
|
||||
table: string,
|
||||
options: { select?: string; role?: "service_role" | "anon" } = {},
|
||||
): Promise<{ data: Record<string, unknown>[]; error: string | null }> {
|
||||
const key = options.role === "anon" ? ANON_KEY : SERVICE_KEY;
|
||||
const select = options.select ?? "*";
|
||||
const res = await fetch(`${SUPABASE_URL}/rest/v1/${table}?select=${select}`, {
|
||||
headers: {
|
||||
apikey: key,
|
||||
Authorization: `Bearer ${key}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
const body = await res.text();
|
||||
return { data: [], error: `HTTP ${res.status}: ${body}` };
|
||||
}
|
||||
|
||||
const data = (await res.json()) as Record<string, unknown>[];
|
||||
return { data, error: null };
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether a table is visible through the PostgREST API.
|
||||
* Uses the service role key (bypasses RLS).
|
||||
*/
|
||||
export async function tableExists(tableName: string): Promise<boolean> {
|
||||
const { error } = await pgRest(tableName);
|
||||
// A 404 or PGRST116 means the table/view doesn't exist in the schema cache.
|
||||
return error === null || !error.includes("404");
|
||||
}
|
||||
|
||||
/**
|
||||
* Query rows from a table.
|
||||
* @param tableName - table to query
|
||||
* @param role - "service_role" bypasses RLS; "anon" respects RLS policies
|
||||
*/
|
||||
export async function queryTable(
|
||||
tableName: string,
|
||||
role: "service_role" | "anon" = "service_role",
|
||||
): Promise<{ data: Record<string, unknown>[]; error: string | null }> {
|
||||
return pgRest(tableName, { role });
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the table exists AND is empty when queried as anon
|
||||
* (i.e., RLS is blocking access as expected for an unauthenticated user).
|
||||
*/
|
||||
export async function anonSeeesNoRows(tableName: string): Promise<boolean> {
|
||||
const { data, error } = await pgRest(tableName, { role: "anon" });
|
||||
return error === null && data.length === 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Common paths
|
||||
//
|
||||
// These are FUNCTIONS, not constants, so they re-evaluate process.cwd() on
|
||||
// every call. The runner does `process.chdir(workspacePath)` before running
|
||||
// assertions, so all path helpers resolve relative to the correct workspace.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Returns the supabase/ directory under the current working directory. */
|
||||
export function getSupabaseDir(): string {
|
||||
return join(process.cwd(), "supabase");
|
||||
}
|
||||
|
||||
/** Returns the supabase/migrations/ directory. */
|
||||
export function getMigrationsDir(): string {
|
||||
return join(getSupabaseDir(), "migrations");
|
||||
}
|
||||
|
||||
/** Returns the supabase/functions/ directory. */
|
||||
export function getFunctionsDir(): string {
|
||||
return join(getSupabaseDir(), "functions");
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Migration helpers
|
||||
@@ -15,10 +93,11 @@ export const functionsDir = join(supabaseDir, "functions");
|
||||
|
||||
/** Find all .sql migration files (agent may create one or more). */
|
||||
export function findMigrationFiles(): string[] {
|
||||
if (!existsSync(migrationsDir)) return [];
|
||||
return readdirSync(migrationsDir)
|
||||
const dir = getMigrationsDir();
|
||||
if (!existsSync(dir)) return [];
|
||||
return readdirSync(dir)
|
||||
.filter((f) => f.endsWith(".sql"))
|
||||
.map((f) => join(migrationsDir, f));
|
||||
.map((f) => join(dir, f));
|
||||
}
|
||||
|
||||
/** Read and concatenate all migration SQL files. */
|
||||
@@ -39,7 +118,7 @@ export function getMigrationSQL(): string {
|
||||
* @param functionName - directory name under supabase/functions/ (e.g. "hello-world")
|
||||
*/
|
||||
export function findFunctionFile(functionName: string): string | null {
|
||||
const fnDir = join(functionsDir, functionName);
|
||||
const fnDir = join(getFunctionsDir(), functionName);
|
||||
if (!existsSync(fnDir)) return null;
|
||||
const files = readdirSync(fnDir).filter(
|
||||
(f) => f.startsWith("index.") && (f.endsWith(".ts") || f.endsWith(".tsx")),
|
||||
@@ -61,12 +140,13 @@ export function getFunctionCode(functionName: string): string {
|
||||
|
||||
/** Find a shared CORS module under supabase/functions/_shared/ (or similar _-prefixed dir). */
|
||||
export function findSharedCorsFile(): string | null {
|
||||
if (!existsSync(functionsDir)) return null;
|
||||
const sharedDirs = readdirSync(functionsDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(),
|
||||
const fnDir = getFunctionsDir();
|
||||
if (!existsSync(fnDir)) return null;
|
||||
const sharedDirs = readdirSync(fnDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(),
|
||||
);
|
||||
for (const dir of sharedDirs) {
|
||||
const dirPath = join(functionsDir, dir);
|
||||
const dirPath = join(fnDir, dir);
|
||||
const files = readdirSync(dirPath).filter((f) => f.includes("cors"));
|
||||
if (files.length > 0) return join(dirPath, files[0]);
|
||||
}
|
||||
@@ -75,13 +155,14 @@ export function findSharedCorsFile(): string | null {
|
||||
|
||||
/** Read and concatenate all .ts/.tsx files from _-prefixed shared directories. */
|
||||
export function getSharedCode(): string {
|
||||
if (!existsSync(functionsDir)) return "";
|
||||
const sharedDirs = readdirSync(functionsDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(),
|
||||
const fnDir = getFunctionsDir();
|
||||
if (!existsSync(fnDir)) return "";
|
||||
const sharedDirs = readdirSync(fnDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(),
|
||||
);
|
||||
const parts: string[] = [];
|
||||
for (const dir of sharedDirs) {
|
||||
const dirPath = join(functionsDir, dir);
|
||||
const dirPath = join(fnDir, dir);
|
||||
const files = readdirSync(dirPath).filter(
|
||||
(f) => f.endsWith(".ts") || f.endsWith(".tsx"),
|
||||
);
|
||||
|
||||
100
packages/evals/evals/extension-wrong-schema/EVAL.ts
Normal file
100
packages/evals/evals/extension-wrong-schema/EVAL.ts
Normal file
@@ -0,0 +1,100 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-schema-extensions.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "extension installed in extensions schema",
|
||||
check: () =>
|
||||
/create\s+extension[\s\S]*?with\s+schema\s+extensions/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "IF NOT EXISTS on extension creation",
|
||||
check: () =>
|
||||
/create\s+extension\s+if\s+not\s+exists/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "vector column with correct dimensions",
|
||||
check: () =>
|
||||
/(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "HNSW index used instead of IVFFlat",
|
||||
check: () => /using\s+hnsw/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on documents table",
|
||||
check: () =>
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent table creation (IF NOT EXISTS)",
|
||||
check: () =>
|
||||
/create\s+table\s+if\s+not\s+exists/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates pgvector best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/create\s+extension[\s\S]*?with\s+schema\s+extensions/.test(sql),
|
||||
/create\s+extension\s+if\s+not\s+exists/.test(sql),
|
||||
/(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test(sql),
|
||||
/using\s+hnsw/.test(sql),
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 6;
|
||||
},
|
||||
},
|
||||
];
|
||||
11
packages/evals/evals/extension-wrong-schema/PROMPT.md
Normal file
11
packages/evals/evals/extension-wrong-schema/PROMPT.md
Normal file
@@ -0,0 +1,11 @@
|
||||
I'm building a semantic search feature for my app. I need to store document embeddings generated by OpenAI's ada-002 model (1536 dimensions) and let users search their own documents.
|
||||
|
||||
Create a migration in `supabase/migrations/` that:
|
||||
|
||||
1. Enables the pgvector extension
|
||||
2. Creates a `documents` table with:
|
||||
- An `embedding` column (1536 dimensions)
|
||||
- A `content` text column
|
||||
- A `user_id` column linked to the authenticated user
|
||||
3. Adds a vector similarity search index
|
||||
4. Ensures users can only see and manage their own documents
|
||||
5
packages/evals/evals/extension-wrong-schema/package.json
Normal file
5
packages/evals/evals/extension-wrong-schema/package.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "extension-wrong-schema",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
111
packages/evals/evals/extension-wrong-schema/supabase/config.toml
Normal file
111
packages/evals/evals/extension-wrong-schema/supabase/config.toml
Normal file
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "extension-wrong-schema"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
114
packages/evals/evals/postgrest-schema-cache/EVAL.ts
Normal file
114
packages/evals/evals/postgrest-schema-cache/EVAL.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-views.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-performance.md",
|
||||
"db-schema-timestamps.md",
|
||||
];
|
||||
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
const migrationsDir = join(process.cwd(), "supabase", "migrations");
|
||||
const STARTER_MIGRATION = "20240101000000_create_products.sql";
|
||||
|
||||
function findAgentMigrationFiles(): string[] {
|
||||
if (!existsSync(migrationsDir)) return [];
|
||||
return readdirSync(migrationsDir)
|
||||
.filter((f) => f.endsWith(".sql") && f !== STARTER_MIGRATION)
|
||||
.map((f) => join(migrationsDir, f));
|
||||
}
|
||||
|
||||
function getAgentMigrationSQL(): string {
|
||||
const files = findAgentMigrationFiles();
|
||||
if (files.length === 0)
|
||||
throw new Error(
|
||||
"No agent-created migration file found in supabase/migrations/",
|
||||
);
|
||||
return files.map((f) => readFileSync(f, "utf-8")).join("\n");
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "new migration file exists",
|
||||
check: () => findAgentMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "ADD COLUMN IF NOT EXISTS for description",
|
||||
check: () =>
|
||||
/add\s+column\s+if\s+not\s+exists\s+description/.test(
|
||||
getAgentMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "ADD COLUMN IF NOT EXISTS for published_at",
|
||||
check: () =>
|
||||
/add\s+column\s+if\s+not\s+exists\s+published_at/.test(
|
||||
getAgentMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "published_at uses timestamptz not plain timestamp",
|
||||
check: () => {
|
||||
const sql = getAgentMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test(
|
||||
sql,
|
||||
) &&
|
||||
!/published_at\s+timestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
|
||||
sql,
|
||||
)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "view public_products is created",
|
||||
check: () =>
|
||||
/create\s+(or\s+replace\s+)?view\s+public_products/.test(
|
||||
getAgentMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "view uses security_invoker = true",
|
||||
check: () =>
|
||||
/security_invoker\s*=\s*true/.test(getAgentMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "SELECT policy on products for authenticated role",
|
||||
check: () => {
|
||||
const sql = getAgentMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return policyBlocks.some(
|
||||
(p) =>
|
||||
p.includes("select") &&
|
||||
p.includes("products") &&
|
||||
/to\s+authenticated/.test(p),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "NOTIFY pgrst reload schema is present",
|
||||
check: () => /notify\s+pgrst/.test(getAgentMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates PostgREST and schema best practices",
|
||||
check: () => {
|
||||
const sql = getAgentMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/add\s+column\s+if\s+not\s+exists/.test(sql),
|
||||
/published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test(
|
||||
sql,
|
||||
),
|
||||
/create\s+(or\s+replace\s+)?view\s+public_products/.test(sql),
|
||||
/security_invoker\s*=\s*true/.test(sql),
|
||||
policyBlocks.some(
|
||||
(p) => p.includes("select") && /to\s+authenticated/.test(p),
|
||||
),
|
||||
/notify\s+pgrst/.test(sql),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
9
packages/evals/evals/postgrest-schema-cache/PROMPT.md
Normal file
9
packages/evals/evals/postgrest-schema-cache/PROMPT.md
Normal file
@@ -0,0 +1,9 @@
|
||||
I'm building a product catalog with Supabase. We already have a `products` table (see the existing migration in `supabase/migrations/`), but we need to expand it.
|
||||
|
||||
Please create a new migration file in `supabase/migrations/` that:
|
||||
|
||||
1. Adds two new columns to the `products` table: `description` (text) and `published_at` (timestamp)
|
||||
2. Creates a view called `public_products` that shows only products where `published_at` is not null
|
||||
3. Adds a policy so any authenticated user can view published products
|
||||
|
||||
Make sure the migration is safe to run multiple times.
|
||||
5
packages/evals/evals/postgrest-schema-cache/package.json
Normal file
5
packages/evals/evals/postgrest-schema-cache/package.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "postgrest-schema-cache",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
111
packages/evals/evals/postgrest-schema-cache/supabase/config.toml
Normal file
111
packages/evals/evals/postgrest-schema-cache/supabase/config.toml
Normal file
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "postgrest-schema-cache"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
@@ -0,0 +1,8 @@
|
||||
-- Initial products table
|
||||
create table if not exists products (
|
||||
id bigint primary key generated always as identity,
|
||||
name text not null,
|
||||
price numeric(10, 2) not null default 0
|
||||
);
|
||||
|
||||
alter table products enable row level security;
|
||||
122
packages/evals/evals/rls-update-needs-select/EVAL.ts
Normal file
122
packages/evals/evals/rls-update-needs-select/EVAL.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-performance.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-schema-timestamps.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates orders table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /orders/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "enables RLS on orders table",
|
||||
check: () =>
|
||||
/alter\s+table.*orders.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "has SELECT policy on orders",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return policyBlocks.some((p) => p.includes("for select"));
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "has UPDATE policy with WITH CHECK on orders",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const updatePolicy = policyBlocks.find((p) => p.includes("for update"));
|
||||
return updatePolicy !== undefined && /with\s+check/.test(updatePolicy);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "all policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses (select auth.uid()) not bare auth.uid() in policies",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz not plain timestamp for created_at",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (sql.includes("created_at")) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/alter\s+table.*orders.*enable\s+row\s+level\s+security/.test(sql),
|
||||
policyBlocks.some((p) => p.includes("for select")),
|
||||
policyBlocks.some(
|
||||
(p) => p.includes("for update") && /with\s+check/.test(p),
|
||||
),
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
!/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
|
||||
sql.replace(/--[^\n]*/g, ""),
|
||||
),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
7
packages/evals/evals/rls-update-needs-select/PROMPT.md
Normal file
7
packages/evals/evals/rls-update-needs-select/PROMPT.md
Normal file
@@ -0,0 +1,7 @@
|
||||
I'm building an e-commerce app and need a migration for an `orders` table. Each order has a `status` (text), `total` (numeric), and `created_at` timestamp. Orders belong to users — each order should have a `user_id` that links to the authenticated user who placed it.
|
||||
|
||||
Users need to be able to:
|
||||
- View their own orders
|
||||
- Update the status of their own orders
|
||||
|
||||
Please create the migration in `supabase/migrations/`.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "rls-update-needs-select",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "rls-update-needs-select"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
123
packages/evals/evals/rls-user-metadata-role-check/EVAL.ts
Normal file
123
packages/evals/evals/rls-user-metadata-role-check/EVAL.ts
Normal file
@@ -0,0 +1,123 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-performance.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-schema-auth-fk.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists in supabase/migrations/",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates documents table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /documents/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on documents table",
|
||||
check: () =>
|
||||
/alter\s+table.*documents.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "uses app_metadata not user_metadata for role check",
|
||||
check: () => /app_metadata/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "user_metadata does not appear in policy USING clauses",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return policyBlocks.every((p) => !p.includes("user_metadata"));
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "has at least two SELECT policies (owner and admin)",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const hasOwnerPolicy = policyBlocks.some(
|
||||
(p) =>
|
||||
(p.includes("select") || !p.includes("insert")) &&
|
||||
(p.includes("user_id") ||
|
||||
p.includes("owner") ||
|
||||
p.includes("auth.uid")),
|
||||
);
|
||||
const hasAdminPolicy = policyBlocks.some((p) =>
|
||||
p.includes("app_metadata"),
|
||||
);
|
||||
return hasOwnerPolicy && hasAdminPolicy;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses (select auth.uid()) subselect form in policies",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/alter\s+table.*documents.*enable\s+row\s+level\s+security/.test(sql),
|
||||
/app_metadata/.test(sql),
|
||||
policyBlocks.every((p) => !p.includes("user_metadata")),
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
policyBlocks.some(
|
||||
(p) =>
|
||||
p.includes("user_id") ||
|
||||
p.includes("owner") ||
|
||||
p.includes("auth.uid"),
|
||||
) && policyBlocks.some((p) => p.includes("app_metadata")),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
@@ -0,0 +1,7 @@
|
||||
I'm building a document management app on Supabase. I need a migration for a `documents` table. Each document has a `title` (text), `content` (text), and belongs to a user (the owner).
|
||||
|
||||
The access rules are:
|
||||
- Regular users can only read their own documents.
|
||||
- Admin users — identified by a role field stored in their JWT — should be able to read all documents.
|
||||
|
||||
Please create the migration in `supabase/migrations/`. The Supabase project is already initialized.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "rls-user-metadata-role-check",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "rls-user-metadata-role-check"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
102
packages/evals/evals/service-role-edge-function/EVAL.ts
Normal file
102
packages/evals/evals/service-role-edge-function/EVAL.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-security-service-role.md",
|
||||
"edge-fun-quickstart.md",
|
||||
"edge-db-supabase-client.md",
|
||||
"edge-pat-cors.md",
|
||||
"edge-pat-error-handling.md",
|
||||
];
|
||||
|
||||
import { existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import {
|
||||
findFunctionFile,
|
||||
getFunctionCode,
|
||||
getSharedCode,
|
||||
getSupabaseDir,
|
||||
} from "../eval-utils.ts";
|
||||
|
||||
const FUNCTION_NAME = "admin-reports";
|
||||
|
||||
function getAllCode(): string {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
return `${code}\n${getSharedCode()}`;
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "supabase project initialized (config.toml exists)",
|
||||
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
|
||||
},
|
||||
{
|
||||
name: "edge function file exists",
|
||||
check: () => findFunctionFile(FUNCTION_NAME) !== null,
|
||||
},
|
||||
{
|
||||
name: "uses Deno.env.get for service role key",
|
||||
check: () =>
|
||||
/Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test(
|
||||
getAllCode(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "no hardcoded service role key",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
const lines = allCode.split("\n");
|
||||
const nonCommentLines = lines.filter(
|
||||
(line) => !line.trimStart().startsWith("//"),
|
||||
);
|
||||
return !nonCommentLines.some((line) =>
|
||||
/(['"`])eyJ[A-Za-z0-9_-]+\.\1?|(['"`])eyJ[A-Za-z0-9_-]+/.test(line),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "createClient called with service role env var as second argument",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return (
|
||||
/createClient/i.test(allCode) &&
|
||||
/Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test(
|
||||
allCode,
|
||||
)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "service role key env var name does not use NEXT_PUBLIC_ prefix",
|
||||
check: () => !/NEXT_PUBLIC_[^'"]*service[_-]?role/i.test(getAllCode()),
|
||||
},
|
||||
{
|
||||
name: "CORS headers present",
|
||||
check: () => /Access-Control-Allow-Origin/.test(getAllCode()),
|
||||
},
|
||||
{
|
||||
name: "returns JSON response",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return (
|
||||
/content-type['"]\s*:\s*['"]application\/json/i.test(allCode) ||
|
||||
/Response\.json/i.test(allCode) ||
|
||||
/JSON\.stringify/i.test(allCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates service role Edge Function best practices",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
const signals: RegExp[] = [
|
||||
/Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i,
|
||||
/Access-Control-Allow-Origin/,
|
||||
/createClient/i,
|
||||
/\btry\s*\{/,
|
||||
/Response\.json|JSON\.stringify/,
|
||||
/Deno\.serve/,
|
||||
];
|
||||
return signals.filter((r) => r.test(allCode)).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
@@ -0,0 +1,9 @@
|
||||
I'm building an internal admin dashboard for my app. I need a Supabase Edge Function called `admin-reports` that returns all rows from the `reports` table — this is an admin-only endpoint so it needs to bypass Row Level Security.
|
||||
|
||||
Create the function at `supabase/functions/admin-reports/index.ts`. Use environment variables for any Supabase keys — do not hardcode them in the source code.
|
||||
|
||||
The function should:
|
||||
|
||||
1. Return all rows from the `reports` table as a JSON response
|
||||
2. Work when called from a browser (handle CORS)
|
||||
3. Handle errors gracefully
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "service-role-edge-function",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "service-role-edge-function"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
@@ -0,0 +1,10 @@
|
||||
-- Create the reports table
|
||||
create table if not exists public.reports (
|
||||
id uuid primary key default gen_random_uuid(),
|
||||
title text not null,
|
||||
content text,
|
||||
created_at timestamptz not null default now()
|
||||
);
|
||||
|
||||
-- Enable Row Level Security (browser clients use anon key and are restricted by default)
|
||||
alter table public.reports enable row level security;
|
||||
@@ -1,263 +1,253 @@
|
||||
import { expect, test } from "vitest";
|
||||
export const expectedReferenceFiles = [
|
||||
"storage-access-control.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-performance.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-perf-indexes.md",
|
||||
"db-migrations-idempotent.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
test("migration file exists", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates avatars bucket", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should insert into storage.buckets with id 'avatars' and public = true
|
||||
expect(sql).toMatch(/storage\.buckets/);
|
||||
expect(sql).toMatch(/avatars/);
|
||||
expect(sql).toMatch(/public/);
|
||||
// Verify it's marked as a public bucket (true)
|
||||
const avatarsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
|
||||
);
|
||||
expect(avatarsBlock).not.toBeNull();
|
||||
if (avatarsBlock) {
|
||||
expect(avatarsBlock[0]).toMatch(/true/);
|
||||
}
|
||||
});
|
||||
|
||||
test("creates documents bucket", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should insert into storage.buckets with id 'documents' and public = false
|
||||
expect(sql).toMatch(/documents/);
|
||||
const documentsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
|
||||
);
|
||||
expect(documentsBlock).not.toBeNull();
|
||||
if (documentsBlock) {
|
||||
expect(documentsBlock[0]).toMatch(/false/);
|
||||
}
|
||||
});
|
||||
|
||||
test("avatars bucket has mime type restriction", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should have allowed_mime_types with image types
|
||||
expect(sql).toMatch(/allowed_mime_types/);
|
||||
// Check for image MIME types (jpeg, png, webp)
|
||||
expect(sql).toMatch(/image\/jpeg/);
|
||||
expect(sql).toMatch(/image\/png/);
|
||||
expect(sql).toMatch(/image\/webp/);
|
||||
});
|
||||
|
||||
test("avatars bucket has file size limit", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should have file_size_limit set to approximately 2MB (2097152 bytes or 2MB string)
|
||||
expect(sql).toMatch(/file_size_limit/);
|
||||
// Accept either numeric bytes (2097152) or string form (2MB, 2MiB, 2 * 1024 * 1024)
|
||||
const hasNumericLimit = /2097152/.test(sql);
|
||||
const hasStringLimit = /2\s*m/i.test(sql);
|
||||
const hasCalcLimit = /2\s*\*\s*1024\s*\*\s*1024/.test(sql);
|
||||
expect(hasNumericLimit || hasStringLimit || hasCalcLimit).toBe(true);
|
||||
});
|
||||
|
||||
test("storage policy uses foldername or path for user isolation", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should use storage.foldername(name) with auth.uid()::text for folder isolation
|
||||
const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
|
||||
// Also accept direct path matching patterns like (name ~ '^user-id/')
|
||||
const usesPathMatch =
|
||||
/\(\s*storage\.foldername\s*\(/.test(sql) ||
|
||||
/\bname\b.*auth\.uid\(\)/.test(sql);
|
||||
expect(usesFoldername || usesPathMatch).toBe(true);
|
||||
// Should cast auth.uid() to text for comparison with folder name
|
||||
expect(sql).toMatch(/auth\.uid\(\)\s*::\s*text/);
|
||||
});
|
||||
|
||||
test("storage policy uses TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Storage upload/delete/update policies should target authenticated users.
|
||||
// Accepted forms:
|
||||
// 1. Explicit TO authenticated
|
||||
// 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated)
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const storagePolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("storage.objects"),
|
||||
);
|
||||
// At least one storage policy should restrict to authenticated users
|
||||
const hasAuthenticatedPolicy = storagePolicies.some(
|
||||
(p) =>
|
||||
/to\s+(authenticated|public)/.test(p.toLowerCase()) ||
|
||||
/auth\.uid\(\)/.test(p.toLowerCase()),
|
||||
);
|
||||
expect(hasAuthenticatedPolicy).toBe(true);
|
||||
// Insert policies must restrict to authenticated users (explicit TO or auth.uid() check)
|
||||
const insertPolicies = storagePolicies.filter((p) =>
|
||||
/for\s+insert/.test(p.toLowerCase()),
|
||||
);
|
||||
for (const policy of insertPolicies) {
|
||||
const hasExplicitTo = /to\s+authenticated/.test(policy.toLowerCase());
|
||||
const hasAuthUidCheck = /auth\.uid\(\)/.test(policy.toLowerCase());
|
||||
expect(hasExplicitTo || hasAuthUidCheck).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
test("public read policy for avatars", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// A SELECT policy on storage.objects for avatars bucket should allow public/anon access.
|
||||
// Accepted forms:
|
||||
// 1. Explicit TO public / TO anon
|
||||
// 2. No TO clause (defaults to public role, granting all access)
|
||||
// 3. No auth.uid() restriction in USING (open to everyone)
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const avatarSelectPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
/for\s+select/.test(p.toLowerCase()) &&
|
||||
p.toLowerCase().includes("avatars"),
|
||||
);
|
||||
expect(avatarSelectPolicies.length).toBeGreaterThan(0);
|
||||
// Should allow public access: explicit TO public/anon, or no TO clause without auth.uid() restriction
|
||||
const hasPublicAccess = avatarSelectPolicies.some((p) => {
|
||||
const lower = p.toLowerCase();
|
||||
const hasExplicitPublic =
|
||||
/to\s+public/.test(lower) || /to\s+anon/.test(lower);
|
||||
// No TO clause and no auth.uid() restriction means open to all
|
||||
const hasNoToClause = !/\bto\s+\w+/.test(lower);
|
||||
const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower);
|
||||
return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction);
|
||||
});
|
||||
expect(hasPublicAccess).toBe(true);
|
||||
});
|
||||
|
||||
test("documents bucket is fully private", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// All policies for documents bucket should restrict to authenticated owner.
|
||||
// Accepted forms:
|
||||
// 1. Explicit TO authenticated
|
||||
// 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated)
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const documentPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
p.toLowerCase().includes("documents"),
|
||||
);
|
||||
expect(documentPolicies.length).toBeGreaterThan(0);
|
||||
// None should allow public/anon access
|
||||
for (const policy of documentPolicies) {
|
||||
expect(policy).not.toMatch(/to\s+public/);
|
||||
expect(policy).not.toMatch(/to\s+anon/);
|
||||
}
|
||||
// All should be scoped to authenticated (explicit TO or auth.uid() check)
|
||||
for (const policy of documentPolicies) {
|
||||
const hasExplicitTo = /to\s+authenticated/.test(policy);
|
||||
const hasAuthUidCheck = /auth\.uid\(\)/.test(policy);
|
||||
expect(hasExplicitTo || hasAuthUidCheck).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
test("creates file_metadata table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table/);
|
||||
expect(sql).toMatch(/file_metadata/);
|
||||
});
|
||||
|
||||
test("file_metadata has FK to auth.users with CASCADE", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Find the file_metadata CREATE TABLE block or the surrounding context
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("RLS enabled on file_metadata", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/,
|
||||
);
|
||||
});
|
||||
|
||||
test("file_metadata policies use (select auth.uid())", () => {
|
||||
const sql = getMigrationSQL();
|
||||
// Find policies that reference file_metadata
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const metadataPolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("file_metadata"),
|
||||
);
|
||||
// Each policy that uses auth.uid() should use the subselect form
|
||||
for (const policy of metadataPolicies) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("uses timestamptz for time columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Only check if the migration defines time-related columns
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("uploaded_at")
|
||||
) {
|
||||
// Check column definitions for plain "timestamp" (not timestamptz / timestamp with time zone).
|
||||
// Only match timestamp as a column type — look for column_name followed by timestamp.
|
||||
// Exclude matches inside trigger/function bodies and RETURNS TRIGGER.
|
||||
const columnDefs = sql.match(
|
||||
/(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g,
|
||||
);
|
||||
if (columnDefs) {
|
||||
for (const def of columnDefs) {
|
||||
// Each match should use timestamptz or "timestamp with time zone"
|
||||
expect(def).toMatch(/timestamptz|timestamp\s+with\s+time\s+zone/);
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates avatars bucket",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (
|
||||
!/storage\.buckets/.test(sql) ||
|
||||
!/avatars/.test(sql) ||
|
||||
!/public/.test(sql)
|
||||
)
|
||||
return false;
|
||||
const avatarsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
|
||||
);
|
||||
return avatarsBlock !== null && /true/.test(avatarsBlock[0]);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates documents bucket",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/documents/.test(sql)) return false;
|
||||
const documentsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
|
||||
);
|
||||
return documentsBlock !== null && /false/.test(documentsBlock[0]);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "avatars bucket has mime type restriction",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/allowed_mime_types/.test(sql) &&
|
||||
/image\/jpeg/.test(sql) &&
|
||||
/image\/png/.test(sql) &&
|
||||
/image\/webp/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "avatars bucket has file size limit",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/file_size_limit/.test(sql)) return false;
|
||||
return (
|
||||
/2097152/.test(sql) ||
|
||||
/2\s*m/i.test(sql) ||
|
||||
/2\s*\*\s*1024\s*\*\s*1024/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "storage policy uses foldername or path for user isolation",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
|
||||
const usesPathMatch =
|
||||
/\(\s*storage\.foldername\s*\(/.test(sql) ||
|
||||
/\bname\b.*auth\.uid\(\)/.test(sql);
|
||||
return (
|
||||
(usesFoldername || usesPathMatch) &&
|
||||
/auth\.uid\(\)\s*::\s*text/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "storage policy uses TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const storagePolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("storage.objects"),
|
||||
);
|
||||
const hasAuthenticatedPolicy = storagePolicies.some(
|
||||
(p) =>
|
||||
/to\s+(authenticated|public)/.test(p.toLowerCase()) ||
|
||||
/auth\.uid\(\)/.test(p.toLowerCase()),
|
||||
);
|
||||
if (!hasAuthenticatedPolicy) return false;
|
||||
const insertPolicies = storagePolicies.filter((p) =>
|
||||
/for\s+insert/.test(p.toLowerCase()),
|
||||
);
|
||||
return insertPolicies.every(
|
||||
(p) =>
|
||||
/to\s+authenticated/.test(p.toLowerCase()) ||
|
||||
/auth\.uid\(\)/.test(p.toLowerCase()),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "public read policy for avatars",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const avatarSelectPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
/for\s+select/.test(p.toLowerCase()) &&
|
||||
p.toLowerCase().includes("avatars"),
|
||||
);
|
||||
if (avatarSelectPolicies.length === 0) return false;
|
||||
return avatarSelectPolicies.some((p) => {
|
||||
const lower = p.toLowerCase();
|
||||
const hasExplicitPublic =
|
||||
/to\s+public/.test(lower) || /to\s+anon/.test(lower);
|
||||
const hasNoToClause = !/\bto\s+\w+/.test(lower);
|
||||
const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower);
|
||||
return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction);
|
||||
});
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "documents bucket is fully private",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const documentPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
p.toLowerCase().includes("documents"),
|
||||
);
|
||||
if (documentPolicies.length === 0) return false;
|
||||
return documentPolicies.every(
|
||||
(p) =>
|
||||
!/to\s+public/.test(p) &&
|
||||
!/to\s+anon/.test(p) &&
|
||||
(/to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p)),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates file_metadata table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /file_metadata/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "file_metadata has FK to auth.users with CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on file_metadata",
|
||||
check: () =>
|
||||
/alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "file_metadata policies use (select auth.uid())",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const metadataPolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("file_metadata"),
|
||||
);
|
||||
for (const policy of metadataPolicies) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("index on file_metadata user_id", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
// Should index user_id on file_metadata
|
||||
expect(sql).toMatch(/file_metadata/);
|
||||
expect(sql).toMatch(/user_id/);
|
||||
});
|
||||
|
||||
test("idempotent DDL", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("overall quality score", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// A high-quality migration should contain most of these best-practice signals
|
||||
const signals = [
|
||||
// 1. Avatars bucket is public
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
|
||||
// 2. Documents bucket exists
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents/,
|
||||
// 3. MIME type restriction
|
||||
/allowed_mime_types/,
|
||||
// 4. File size limit
|
||||
/file_size_limit/,
|
||||
// 5. Storage foldername helper
|
||||
/storage\.foldername/,
|
||||
// 6. auth.uid()::text cast
|
||||
/auth\.uid\(\)\s*::\s*text/,
|
||||
// 7. TO authenticated on policies
|
||||
/to\s+authenticated/,
|
||||
// 8. Public read for avatars
|
||||
/to\s+(public|anon)/,
|
||||
// 9. RLS on file_metadata
|
||||
/enable\s+row\s+level\s+security/,
|
||||
// 10. FK to auth.users with cascade
|
||||
/on\s+delete\s+cascade/,
|
||||
// 11. (select auth.uid()) subselect form
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
// 12. Index on user_id
|
||||
/create\s+index/,
|
||||
// 13. timestamptz usage
|
||||
/timestamptz/,
|
||||
// 14. IF NOT EXISTS for idempotency
|
||||
/if\s+not\s+exists/,
|
||||
// 15. file_metadata table
|
||||
/create\s+table[\s\S]*?file_metadata/,
|
||||
];
|
||||
const matches = signals.filter((r) => r.test(sql));
|
||||
// Require at least 11 of 15 best-practice signals
|
||||
expect(matches.length).toBeGreaterThanOrEqual(11);
|
||||
});
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz for time columns",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (
|
||||
!sql.includes("created_at") &&
|
||||
!sql.includes("updated_at") &&
|
||||
!sql.includes("uploaded_at")
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
const columnDefs = sql.match(
|
||||
/(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g,
|
||||
);
|
||||
if (!columnDefs) return true;
|
||||
return columnDefs.every((def) =>
|
||||
/timestamptz|timestamp\s+with\s+time\s+zone/.test(def),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "index on file_metadata user_id",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/create\s+index/.test(sql) &&
|
||||
/file_metadata/.test(sql) &&
|
||||
/user_id/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent DDL",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "overall quality score",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const signals = [
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents/,
|
||||
/allowed_mime_types/,
|
||||
/file_size_limit/,
|
||||
/storage\.foldername/,
|
||||
/auth\.uid\(\)\s*::\s*text/,
|
||||
/to\s+authenticated/,
|
||||
/to\s+(public|anon)/,
|
||||
/enable\s+row\s+level\s+security/,
|
||||
/on\s+delete\s+cascade/,
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
/create\s+index/,
|
||||
/timestamptz/,
|
||||
/if\s+not\s+exists/,
|
||||
/create\s+table[\s\S]*?file_metadata/,
|
||||
];
|
||||
return signals.filter((r) => r.test(sql)).length >= 11;
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
@@ -1,182 +1,216 @@
|
||||
import { expect, test } from "vitest";
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-performance.md",
|
||||
"db-security-functions.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-perf-indexes.md",
|
||||
"db-migrations-idempotent.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
test("migration file exists", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates organizations table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?organizations/);
|
||||
});
|
||||
|
||||
test("creates memberships table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?memberships/);
|
||||
});
|
||||
|
||||
test("creates projects table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?projects/);
|
||||
});
|
||||
|
||||
test("enables RLS on all tables", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
});
|
||||
|
||||
test("FK to auth.users with ON DELETE CASCADE", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// memberships should reference auth.users with cascade delete
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("org_id FK on projects", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// projects should have a foreign key referencing organizations
|
||||
expect(sql).toMatch(
|
||||
/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/,
|
||||
);
|
||||
});
|
||||
|
||||
test("private schema created", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
|
||||
});
|
||||
|
||||
test("security_definer helper function", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Function should be in the private schema with SECURITY DEFINER and search_path = ''
|
||||
expect(sql).toMatch(/private\./);
|
||||
expect(sql).toMatch(/security\s+definer/);
|
||||
expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
|
||||
});
|
||||
|
||||
test("policies use (select auth.uid())", () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
// The subselect form: (select auth.uid())
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("policies use TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
expect(policy).toMatch(/to\s+authenticated/);
|
||||
}
|
||||
});
|
||||
|
||||
test("index on membership lookup columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
// Should index user_id and/or org_id on memberships for policy lookups
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
const indexesUserOrOrg = indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.includes("user_id") ||
|
||||
idx.includes("org_id") ||
|
||||
idx.includes("organization_id"),
|
||||
);
|
||||
expect(indexesUserOrOrg.length).toBeGreaterThanOrEqual(1);
|
||||
});
|
||||
|
||||
test("uses timestamptz", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
// Only fail if the migration defines time columns with plain timestamp
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||
}
|
||||
});
|
||||
|
||||
test("idempotent DDL", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("delete policy restricted to owner role", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Look for a delete policy on projects that checks for owner (or admin) role
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const deletePolicy = policyBlocks.find(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("delete") && p.toLowerCase().includes("project"),
|
||||
);
|
||||
expect(deletePolicy).toBeDefined();
|
||||
// The delete policy should check for an owner/admin role
|
||||
expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/);
|
||||
});
|
||||
|
||||
test("overall quality score", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// A high-quality migration should contain most of these best-practice signals
|
||||
const signals = [
|
||||
// 1. RLS enabled on all three tables
|
||||
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates organizations table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?organizations/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
// 2. FK to auth.users with cascade
|
||||
/references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
|
||||
// 3. Private schema created
|
||||
/create\s+schema[\s\S]*?private/.test(sql),
|
||||
// 4. security_definer with search_path
|
||||
/security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql),
|
||||
// 5. Subselect auth.uid()
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
// 6. TO authenticated on policies
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
// 7. Indexes on lookup columns
|
||||
/create\s+index/.test(sql),
|
||||
// 8. timestamptz (no plain timestamp)
|
||||
!/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql),
|
||||
// 9. Idempotent DDL
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
// 10. Delete policy checks owner role
|
||||
policyBlocks.some(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("delete") &&
|
||||
p.toLowerCase().includes("project") &&
|
||||
/owner|admin/.test(p.toLowerCase()),
|
||||
),
|
||||
// 11. org_id FK on projects
|
||||
/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
|
||||
// 12. Multiple policies (at least one per table)
|
||||
policyBlocks.length >= 3,
|
||||
// 13. Membership role column exists
|
||||
/role/.test(sql),
|
||||
// 14. Private schema function referenced in policies
|
||||
/private\./.test(sql),
|
||||
];
|
||||
const passed = signals.filter(Boolean).length;
|
||||
expect(passed).toBeGreaterThanOrEqual(10);
|
||||
});
|
||||
},
|
||||
{
|
||||
name: "creates memberships table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?memberships/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "creates projects table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?projects/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "enables RLS on all tables",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "org_id FK on projects",
|
||||
check: () =>
|
||||
/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "private schema created",
|
||||
check: () =>
|
||||
/create\s+schema[\s\S]*?private/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "security_definer helper function",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/private\./.test(sql) &&
|
||||
/security\s+definer/.test(sql) &&
|
||||
/set\s+search_path\s*=\s*''/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use (select auth.uid())",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
if (policyBlocks.length === 0) return false;
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "index on membership lookup columns",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/create\s+index/.test(sql)) return false;
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.includes("user_id") ||
|
||||
idx.includes("org_id") ||
|
||||
idx.includes("organization_id"),
|
||||
).length >= 1
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent DDL",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "stable or immutable on helper function",
|
||||
check: () =>
|
||||
/\bstable\b|\bimmutable\b/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "delete policy restricted to owner role",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const deletePolicy = policyBlocks.find(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("delete") &&
|
||||
p.toLowerCase().includes("project"),
|
||||
);
|
||||
if (!deletePolicy) return false;
|
||||
return /owner|admin/.test(deletePolicy.toLowerCase());
|
||||
},
|
||||
},
|
||||
{
  // Aggregate quality gate: evaluates 15 independent best-practice signals
  // against the migration SQL and passes when at least 11 hold. Individual
  // assertions above test these signals separately; this one summarizes.
  name: "overall quality score",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    // All CREATE POLICY statements up to their terminating semicolons.
    const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
    const signals = [
      // 1. RLS enabled on all three tables (single combined signal).
      /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ) &&
        /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ) &&
        /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ),
      // 2. FK to auth.users with cascade delete.
      /references\s+auth\.users/.test(sql) &&
        /on\s+delete\s+cascade/.test(sql),
      // 3. A private schema is created for helper functions.
      /create\s+schema[\s\S]*?private/.test(sql),
      // 4. SECURITY DEFINER paired with an empty search_path.
      /security\s+definer/.test(sql) &&
        /set\s+search_path\s*=\s*''/.test(sql),
      // 5. auth.uid() wrapped in a subselect for per-statement caching.
      /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
      // 6. Every policy is scoped TO authenticated.
      policyBlocks.length > 0 &&
        policyBlocks.every((p) => /to\s+authenticated/.test(p)),
      // 7. At least one index exists.
      /create\s+index/.test(sql),
      // 8. No plain `timestamp` (comments stripped before testing).
      !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
        sql.replace(/--[^\n]*/g, ""),
      ),
      // 9. Idempotent DDL via IF NOT EXISTS.
      /if\s+not\s+exists/.test(sql),
      // 10. Projects DELETE policy restricted to owner/admin.
      policyBlocks.some(
        (p) =>
          p.toLowerCase().includes("delete") &&
          p.toLowerCase().includes("project") &&
          /owner|admin/.test(p.toLowerCase()),
      ),
      // 11. org_id / organization_id FK back to organizations.
      /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
      // 12. At least three policies defined overall.
      policyBlocks.length >= 3,
      // 13. A role column/concept is present.
      /role/.test(sql),
      // 14. The private schema is actually referenced.
      /private\./.test(sql),
      // 15. Helper function declared STABLE or IMMUTABLE.
      /\bstable\b|\bimmutable\b/.test(sql),
    ];
    // Pass when at least 11 of the 15 signals hold.
    return signals.filter(Boolean).length >= 11;
  },
},
|
||||
];
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.10.0",
|
||||
"tsx": "^4.7.0",
|
||||
"typescript": "^5.3.0",
|
||||
"vitest": "^3.1.0"
|
||||
"typescript": "^5.3.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,5 +6,12 @@
|
||||
| 2 | [team-rls-security-definer](team-rls-security-definer.md) | Team-based RLS with security definer helper in a private schema |
|
||||
| 3 | [storage-rls-user-folders](storage-rls-user-folders.md) | Storage buckets with RLS policies for user-isolated folders |
|
||||
| 4 | [edge-function-hello-world](edge-function-hello-world.md) | Hello-world Edge Function with CORS and shared utilities |
|
||||
| 5 | edge-function-stripe-webhook | Stripe webhook Edge Function with signature verification and orders migration |
|
||||
| 6 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization |
|
||||
| 5 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization |
|
||||
| 6 | [auth-fk-cascade-delete](auth-fk-cascade-delete.md) | Profiles table with auth.users FK cascade and auto-create trigger |
|
||||
| 7 | [rls-update-needs-select](rls-update-needs-select.md) | Orders table where UPDATE silently fails without a matching SELECT policy |
|
||||
| 8 | [extension-wrong-schema](extension-wrong-schema.md) | pgvector extension setup with correct schema placement, HNSW index, and user-scoped RLS |
|
||||
| 9 | [connection-pooling-prisma](connection-pooling-prisma.md) | Fix Prisma schema to use Supabase transaction-mode pooler (port 6543, pgbouncer=true, directUrl) for serverless deployments |
|
||||
| 10 | [cli-hallucinated-commands](cli-hallucinated-commands.md) | CLI cheat-sheet that must use only real Supabase CLI commands, avoiding hallucinated `supabase functions log` and `supabase db query` |
|
||||
| 11 | [postgrest-schema-cache](postgrest-schema-cache.md) | Add columns and a view to an existing table, with NOTIFY pgrst to reload the PostgREST schema cache |
|
||||
| 12 | [rls-user-metadata-role-check](rls-user-metadata-role-check.md) | Documents table with owner and admin RLS — must use app_metadata not user_metadata for role authorization |
|
||||
| 13 | [service-role-edge-function](service-role-edge-function.md) | Admin Edge Function that bypasses RLS using the service role key via env vars, never hardcoded |
|
||||
|
||||
84
packages/evals/scenarios/auth-fk-cascade-delete.md
Normal file
84
packages/evals/scenarios/auth-fk-cascade-delete.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# Scenario: auth-fk-cascade-delete
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a `profiles` table that references `auth.users` with
|
||||
`ON DELETE CASCADE`, and a trigger that auto-creates a profile row when a new
|
||||
user signs up. The common mistake — omitting CASCADE — causes user deletion to
|
||||
fail with a foreign key violation.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Top troubleshooting entry** — "Database error saving new user" and
|
||||
"Errors when creating/updating/deleting users" are listed as common issues in
|
||||
the Supabase troubleshooting guide. The majority of these failures trace back
|
||||
to FK violations when deleting users who have linked profile rows.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Auth trigger pattern ubiquity** — The `handle_new_user` trigger on
|
||||
`auth.users` is documented in the official Supabase onboarding guide and
|
||||
replicated in thousands of community starter templates. Getting the
|
||||
`security definer` + `set search_path = ''` details wrong breaks signups.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/cascade-deletes
|
||||
3. **Community-reported cascade omission** — Multiple GitHub issues report
|
||||
unexpected FK violation errors when calling `auth.admin.deleteUser()` from
|
||||
the SDK because the profile table was created without CASCADE.
|
||||
- Source: https://github.com/supabase/supabase/issues (multiple community reports of FK violations on `auth.admin.deleteUser()`)
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-schema-auth-fk.md` | ON DELETE CASCADE requirement for auth.users FKs | `REFERENCES auth.users(id) ON DELETE CASCADE` |
|
||||
| `references/db-security-functions.md` | security definer + set search_path = '' for trigger functions | Correct trigger function definition |
|
||||
| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS enabled on profiles |
|
||||
| `references/db-rls-common-mistakes.md` | TO clause and subselect auth.uid() | Correct policy scoping |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Set up a `profiles` table for my Supabase app. Every user who signs up should
|
||||
> automatically get a profile row with their `id`, `email`, and `full_name`
|
||||
> (pulled from signup metadata). The profiles table should go in
|
||||
> `supabase/migrations/` as a SQL migration. Users should only be able to read
|
||||
> and update their own profile.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | creates profiles table | SQL contains `CREATE TABLE` and `profiles` | correctness |
|
||||
| 3 | FK references auth.users | `REFERENCES auth.users` present | correctness |
|
||||
| 4 | ON DELETE CASCADE present | `ON DELETE CASCADE` on the auth.users FK | correctness |
|
||||
| 5 | RLS enabled on profiles | `ALTER TABLE profiles ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 6 | trigger function uses security definer | `SECURITY DEFINER` in the trigger function definition | security |
|
||||
| 7 | trigger function sets search_path | `SET search_path = ''` or `set search_path` in trigger function | security |
|
||||
| 8 | trigger created on auth.users | `CREATE TRIGGER ... ON auth.users` | correctness |
|
||||
| 9 | policies scoped to authenticated | `TO authenticated` in policy definitions | security |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Without the skill, an agent creates the FK
|
||||
without CASCADE and omits `set search_path = ''` on the trigger function —
|
||||
two independently dangerous omissions.
|
||||
2. **Skill value:** `db-schema-auth-fk.md` is explicitly about this exact
|
||||
scenario; `db-security-functions.md` covers the trigger security requirements.
|
||||
3. **Testability:** CASCADE and search_path are simple string patterns. Trigger
|
||||
creation on `auth.users` is a unique structural signal.
|
||||
4. **Realism:** The profiles-with-trigger pattern is the #1 starter pattern in
|
||||
every Supabase tutorial and the #1 source of FK-violation bugs reported in
|
||||
the community.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~35% of assertions expected to pass (table and FK likely, but
|
||||
no CASCADE, no search_path, weak policies)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
@@ -85,8 +85,9 @@ specific quality signal:
|
||||
| 8 | TO authenticated | Policies scoped to authenticated role | security |
|
||||
| 9 | timestamptz | No plain `timestamp` for time columns | correctness |
|
||||
| 10 | index on user_id | `CREATE INDEX` on the FK column | performance |
|
||||
| 11 | IF NOT EXISTS | Idempotent migration | idempotency |
|
||||
| 12 | overall quality | At least 4/5 best-practice signals present | overall |
|
||||
| 11 | no SERIAL/BIGSERIAL | PK does not use error-prone serial type | correctness |
|
||||
| 12 | IF NOT EXISTS | Idempotent migration | idempotency |
|
||||
| 13 | overall quality | At least 4/5 best-practice signals present | overall |
|
||||
|
||||
## Reasoning
|
||||
|
||||
@@ -121,4 +122,5 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
**Rating:** EASY
|
||||
|
||||
- Without skill: ~50-65% of assertions expected to pass
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- **pass_threshold:** 10
|
||||
120
packages/evals/scenarios/cli-hallucinated-commands.md
Normal file
120
packages/evals/scenarios/cli-hallucinated-commands.md
Normal file
@@ -0,0 +1,120 @@
|
||||
# Scenario: cli-hallucinated-commands
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a Supabase CLI reference cheat-sheet (`CLI_REFERENCE.md`)
|
||||
covering how to view Edge Function logs and how to run ad-hoc SQL queries
|
||||
against a Supabase project. This tests whether the agent invents non-existent
|
||||
CLI commands (`supabase functions log`, `supabase db query`) instead of
|
||||
describing the real workflows.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **`supabase functions log` is a persistent hallucination** — LLMs frequently
|
||||
suggest `supabase functions log` (singular) or `supabase functions logs` as
|
||||
CLI commands to stream deployed function logs. Neither command exists in the
|
||||
Supabase CLI. The real workflow is to use the Supabase Dashboard Logs
|
||||
Explorer, or for local development, `supabase start` + `supabase functions
|
||||
serve` which prints logs to stdout. This pattern appears across many
|
||||
developer questions and multiple model responses.
|
||||
- Source: https://supabase.com/docs/reference/cli/supabase-functions
|
||||
|
||||
2. **`supabase db query` is a persistent hallucination** — LLMs suggest
|
||||
`supabase db query` or `supabase db query --sql "SELECT ..."` as a way to
|
||||
run ad-hoc SQL via the CLI. This command does not exist. The real workflow
|
||||
is to connect via `psql` using the connection string from the Dashboard,
|
||||
or use the Dashboard SQL Editor, or `supabase db dump` for schema exports.
|
||||
- Source: https://supabase.com/docs/reference/cli/supabase-db
|
||||
|
||||
3. **Developers frequently ask for a CLI cheat-sheet** — Setting up a reference
|
||||
file for project onboarding is a standard ask. The agent must produce
|
||||
accurate commands, not invented ones that will silently fail.
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
Which reference files the agent should consult and what each teaches:
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/dev-getting-started.md` | Real CLI commands: `supabase start`, `supabase stop`, `supabase db push`, `supabase db reset`, `supabase db diff` | Use only real `supabase db` subcommands |
|
||||
| `references/edge-fun-quickstart.md` | Real Edge Function CLI: `supabase functions new`, `supabase functions serve`, `supabase functions deploy` | Use real function commands, not invented log commands |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
What the workspace starts with before the agent runs:
|
||||
|
||||
- A pre-initialized `supabase/config.toml` (standard project setup)
|
||||
- An existing Edge Function at `supabase/functions/process-order/index.ts`
|
||||
- The agent is expected to create `CLI_REFERENCE.md` in the project root
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
The prompt to give the agent. Written as a developer would ask it — no hints
|
||||
about what the tests check:
|
||||
|
||||
> I'm onboarding a new developer to my Supabase project. Create a
|
||||
> `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet
|
||||
> of Supabase CLI commands we use day-to-day. It should cover:
|
||||
>
|
||||
> 1. Starting and stopping the local dev stack
|
||||
> 2. Managing database migrations (push, reset, diff)
|
||||
> 3. Working with the `process-order` Edge Function (local dev and deploy)
|
||||
> 4. How to view Edge Function logs (both local dev and production)
|
||||
> 5. How to run ad-hoc SQL queries against the database (local and remote)
|
||||
>
|
||||
> Include the actual commands with brief explanations.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
What the eval assertions should check in the agent's output. Each assertion tests a
|
||||
specific quality signal:
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | CLI_REFERENCE.md exists | `CLI_REFERENCE.md` file exists in project root | structure |
|
||||
| 2 | no hallucinated functions log command | File does NOT contain `supabase functions log` (without 's' as a complete command) | correctness |
|
||||
| 3 | no hallucinated db query command | File does NOT contain `supabase db query` | correctness |
|
||||
| 4 | mentions supabase functions serve for local | File contains `supabase functions serve` | correctness |
|
||||
| 5 | mentions supabase functions deploy | File contains `supabase functions deploy` | correctness |
|
||||
| 6 | mentions psql or connection string for SQL | File contains `psql` or `connection string` or `SQL Editor` or `db dump` | correctness |
|
||||
| 7 | mentions supabase db push or reset | File contains `supabase db push` or `supabase db reset` | correctness |
|
||||
| 8 | mentions supabase start | File contains `supabase start` | correctness |
|
||||
| 9 | mentions Dashboard for production logs | File mentions `Dashboard` or `Logs Explorer` for production log viewing | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
Step-by-step reasoning for why this scenario is well-designed:
|
||||
|
||||
1. **Baseline differentiator:** An agent without the skill is very likely to
|
||||
hallucinate both `supabase functions log` and `supabase db query` since
|
||||
these are plausible-sounding commands that follow the CLI's pattern.
|
||||
Multiple real-world LLM responses have included these exact commands. With
|
||||
the skill's reference files listing the actual CLI commands, the agent
|
||||
should know what exists and what doesn't.
|
||||
|
||||
2. **Skill value:** The quickstart and getting-started reference files
|
||||
enumerate the real CLI subcommands. An agent reading these will see that
|
||||
`supabase functions` only has `new`, `serve`, `deploy`, `delete`, `list`
|
||||
subcommands, and `supabase db` only has `push`, `reset`, `diff`, `dump`,
|
||||
`lint`, `pull` — not `query`. This directly prevents the hallucination.
|
||||
|
||||
3. **Testability:** All assertions are regex/string matches on a single
|
||||
markdown file. No runtime execution or migration parsing needed. Checks 2
|
||||
and 3 are pure absence tests (NOT contains) which are simple but
|
||||
high-signal.
|
||||
|
||||
4. **Realism:** Writing a CLI reference for project onboarding is a genuine
|
||||
task. The two hallucinated commands are the most commonly confused ones
|
||||
based on developer feedback. Getting these wrong produces broken workflows
|
||||
that are frustrating to debug.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** EASY
|
||||
|
||||
- Without skill: ~30-50% of assertions expected to pass (likely fails checks
|
||||
2 and/or 3 due to hallucination, may also miss Dashboard mention for logs)
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- **pass_threshold:** 9
|
||||
@@ -154,3 +154,4 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
|
||||
- Without skill: ~25-40% of assertions expected to pass
|
||||
- With skill: ~80-90% of assertions expected to pass
|
||||
- **pass_threshold:** 17
|
||||
|
||||
80
packages/evals/scenarios/connection-pooling-prisma.md
Normal file
80
packages/evals/scenarios/connection-pooling-prisma.md
Normal file
@@ -0,0 +1,80 @@
|
||||
# Scenario: connection-pooling-prisma
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must produce a `DATABASE_URL` configuration and Prisma schema setup
|
||||
that correctly uses Supabase's transaction-mode pooler (port 6543) with the
|
||||
`?pgbouncer=true` parameter to disable prepared statements. Without this, Prisma
|
||||
throws "prepared statement already exists" errors in serverless environments.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Top troubleshooting entry** — "Error: prepared statement XXX already exists"
|
||||
is listed in the Supabase troubleshooting guide under Database Issues as a
|
||||
direct consequence of using transaction-mode pooling without disabling
|
||||
prepared statements.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Serverless deployment reality** — Vercel and other serverless platforms
|
||||
are the most popular Supabase deployment targets. Each function invocation
|
||||
creates a new connection, making transaction-mode pooling mandatory. The
|
||||
Prisma + Supabase combination is the most-searched configuration pairing.
|
||||
- Source: https://supabase.com/docs/guides/database/connecting-to-postgres
|
||||
3. **Connection exhaustion** — Using session mode (port 5432) in serverless
|
||||
leads to "remaining connection slots are reserved" errors — another top
|
||||
troubleshooting entry. The fix requires switching to port 6543.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-conn-pooling.md` | Transaction mode port 6543, pgbouncer=true for Prisma | Correct DATABASE_URL with port 6543 and ?pgbouncer=true |
|
||||
| `references/db-migrations-idempotent.md` | Migration file conventions and naming | Migration file in supabase/migrations/ |
|
||||
| `references/db-schema-auth-fk.md` | Schema best practices for user-linked tables | Proper FK patterns if schema is involved |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- A workspace with `supabase/config.toml` already initialized
|
||||
- A `prisma/schema.prisma` starter file with a placeholder `DATABASE_URL` using
|
||||
direct connection (port 5432, no pgbouncer flag)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> I'm deploying my Supabase app on Vercel using Prisma. I keep getting
|
||||
> "prepared statement already exists" errors in production. My current
|
||||
> `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string.
|
||||
> Fix the Prisma configuration so it works correctly with Supabase's connection
|
||||
> pooler.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | prisma schema references pooler port | `DATABASE_URL` or connection hint references port `6543` | correctness |
|
||||
| 2 | pgbouncer=true param present | `?pgbouncer=true` or `pgbouncer=true` in the connection URL or env comment | correctness |
|
||||
| 3 | DIRECT_URL provided for migrations | A separate `directUrl` or `DIRECT_URL` variable defined for Prisma migrations | correctness |
|
||||
| 4 | connection limit set to 1 | `connection_limit=1` in the pooler URL or Prisma datasource | performance |
|
||||
| 5 | explanation distinguishes port 6543 vs 5432 | Output or comments distinguish transaction mode (6543) from direct (5432) | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** An agent without the skill typically updates
|
||||
the port or adds pgbouncer but forgets `DIRECT_URL` for migrations, or sets
|
||||
`max` connections too high, or uses session mode instead of transaction mode.
|
||||
2. **Skill value:** `db-conn-pooling.md` provides the exact pattern: port 6543,
|
||||
`?pgbouncer=true`, `max: 1` per serverless instance.
|
||||
3. **Testability:** Port numbers and query parameters are directly readable as
|
||||
string patterns in the output files.
|
||||
4. **Realism:** "Prisma prepared statement already exists on Supabase" is one
|
||||
of the most-searched Supabase error messages on Stack Overflow and GitHub.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~30% of assertions expected to pass (agent may change port but
|
||||
likely misses pgbouncer param and DIRECT_URL)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 7
|
||||
@@ -127,3 +127,4 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
|
||||
- Without skill: ~45-60% of assertions expected to pass
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- **pass_threshold:** 13
|
||||
|
||||
89
packages/evals/scenarios/extension-wrong-schema.md
Normal file
89
packages/evals/scenarios/extension-wrong-schema.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# Scenario: extension-wrong-schema
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a migration that enables the `pgvector` extension and
|
||||
creates an `embeddings` table with a vector column and an HNSW index. The trap
|
||||
is installing the extension in the `public` schema (the default) instead of
|
||||
the `extensions` schema, and using IVFFlat without a `lists` parameter.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Known schema pollution issue** — Installing extensions in `public` exposes
|
||||
extension functions and types through the PostgREST API, which can reveal
|
||||
internal details and cause "42501: permission denied" errors. The Supabase
|
||||
troubleshooting guide covers permission errors as a category.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **IVFFlat without lists = error** — The Supabase troubleshooting guide
|
||||
contains a dedicated entry: "Increase vector lookup speeds by applying an
|
||||
HNSW index" which warns against IVFFlat and notes its required `lists`
|
||||
parameter. Missing this causes a CREATE INDEX error.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
3. **pgvector adoption** — Vector/AI embeddings are the fastest-growing
|
||||
Supabase use case. Nearly every AI-powered Supabase project starts with
|
||||
the pgvector extension setup. Getting the schema right from the start
|
||||
prevents later schema drift.
|
||||
- Source: https://supabase.com/docs/guides/database/extensions/pgvector
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-schema-extensions.md` | Install extensions in `extensions` schema, not `public`; HNSW over IVFFlat; IVFFlat needs `lists` | `CREATE EXTENSION ... WITH SCHEMA extensions`; HNSW index |
|
||||
| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS on embeddings table |
|
||||
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for extensions and tables | `CREATE EXTENSION IF NOT EXISTS` |
|
||||
| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | User-linked embeddings |
|
||||
| `references/db-rls-common-mistakes.md` | TO authenticated, subselect auth.uid() | Policy correctness |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> I'm building a semantic search feature. Create a migration that:
|
||||
> 1. Enables the pgvector extension
|
||||
> 2. Creates a `documents` table with an `embedding` column (1536 dimensions
|
||||
> for OpenAI ada-002), a `content` text column, and a `user_id`
|
||||
> 3. Adds a vector similarity search index
|
||||
> 4. Users should only see their own documents
|
||||
> Put the migration in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | extension in extensions schema | `WITH SCHEMA extensions` in the CREATE EXTENSION statement | correctness |
|
||||
| 3 | IF NOT EXISTS on extension | `CREATE EXTENSION IF NOT EXISTS` | idempotency |
|
||||
| 4 | vector column with correct dimensions | `vector(1536)` or `extensions.vector(1536)` in table | correctness |
|
||||
| 5 | HNSW index used not IVFFlat | `USING hnsw` present in CREATE INDEX | correctness |
|
||||
| 6 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 7 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
|
||||
| 8 | policies TO authenticated | `TO authenticated` in policy definitions | security |
|
||||
| 9 | idempotent table creation | `CREATE TABLE IF NOT EXISTS` | idempotency |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill write `CREATE
|
||||
EXTENSION vector;` (wrong schema), use IVFFlat (wrong index type for most
|
||||
cases), and skip the `lists` parameter requirement.
|
||||
2. **Skill value:** `db-schema-extensions.md` explicitly shows the `WITH
|
||||
SCHEMA extensions` pattern and recommends HNSW over IVFFlat with the
|
||||
specific note about `lists` being required for IVFFlat.
|
||||
3. **Testability:** Schema placement in the extension creation line and index
|
||||
type are directly checkable with regex.
|
||||
4. **Realism:** pgvector + OpenAI embeddings is the top "AI + Supabase"
|
||||
tutorial path, and extension schema mistakes are a documented source of
|
||||
permission errors.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~35% of assertions expected to pass (extension enabled but
|
||||
wrong schema, wrong index type, weak policies)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
89
packages/evals/scenarios/postgrest-schema-cache.md
Normal file
89
packages/evals/scenarios/postgrest-schema-cache.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# Scenario: postgrest-schema-cache
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a migration that adds new columns to an existing table
|
||||
and create a view that uses those columns, including the correct `NOTIFY
|
||||
pgrst, 'reload schema'` call to force PostgREST to pick up the schema changes.
|
||||
Without this, the API returns 400 errors for the new columns even after
|
||||
migration.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Direct troubleshooting entry** — "PostgREST not recognizing new columns,
|
||||
tables, views or functions" and "Reload/refresh postgrest schema" (400
|
||||
bad_request error) are explicitly listed in the Supabase troubleshooting
|
||||
guide. This is among the most confusing errors for new Supabase developers —
|
||||
the migration ran successfully but the API still returns errors.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Schema cache invalidation** — PostgREST caches the database schema at
|
||||
startup and reloads it only when notified. Migrations that add new objects
|
||||
must explicitly call `NOTIFY pgrst, 'reload schema'` at the end of the
|
||||
migration file for the changes to be reflected immediately in local
|
||||
development.
|
||||
- Source: https://supabase.com/docs/guides/api/rest/generating-types
|
||||
3. **Views and RLS** — Creating a view over a user-owned table requires
|
||||
understanding that RLS applies to the underlying tables, and the view itself
|
||||
should use `security_invoker = true` to preserve RLS context.
|
||||
- Source: https://supabase.com/docs/guides/database/views
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-rls-views.md` | Views need security_invoker to respect RLS | `WITH (security_invoker = true)` on view |
|
||||
| `references/db-migrations-idempotent.md` | ADD COLUMN IF NOT EXISTS; IF NOT EXISTS patterns | Idempotent column additions |
|
||||
| `references/db-rls-mandatory.md` | RLS on base tables | RLS enabled on base table |
|
||||
| `references/db-rls-performance.md` | (select auth.uid()) subselect | Subselect form in policies |
|
||||
| `references/db-schema-timestamps.md` | timestamptz for new columns | timestamptz on added columns |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- A workspace with `supabase/config.toml` and a starter migration that creates
|
||||
a basic `products` table (id, name, price) with RLS enabled but no policies.
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Our `products` table needs two new columns: `description` (text) and
|
||||
> `published_at` (timestamp). Also create a view called `public_products` that
|
||||
> shows only products where `published_at` is not null. Add a policy so any
|
||||
> authenticated user can view published products. Put changes in a new
|
||||
> migration file in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | new migration file exists | A second `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | ADD COLUMN IF NOT EXISTS for description | `ADD COLUMN IF NOT EXISTS description` | idempotency |
|
||||
| 3 | ADD COLUMN IF NOT EXISTS for published_at | `ADD COLUMN IF NOT EXISTS published_at` | idempotency |
|
||||
| 4 | published_at uses timestamptz | `published_at timestamptz` not plain `timestamp` | correctness |
|
||||
| 5 | view created | `CREATE OR REPLACE VIEW public_products` or similar | correctness |
|
||||
| 6 | view uses security_invoker | `security_invoker = true` on the view | security |
|
||||
| 7 | SELECT policy on products for authenticated | A FOR SELECT policy on products with TO authenticated | security |
|
||||
| 8 | NOTIFY pgrst reload present | `NOTIFY pgrst` in the migration | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill add columns correctly
|
||||
but miss `IF NOT EXISTS`, use plain `timestamp`, forget `security_invoker`
|
||||
on the view, and almost never include the `NOTIFY pgrst` call.
|
||||
2. **Skill value:** The NOTIFY pattern and security_invoker requirement are
|
||||
non-obvious details that the reference files teach explicitly.
|
||||
3. **Testability:** `NOTIFY pgrst` is a unique string that either appears or
|
||||
doesn't; `security_invoker` is similarly specific.
|
||||
4. **Realism:** Iterative schema evolution (adding columns to existing tables)
|
||||
is the most common database task after initial setup, and the PostgREST
|
||||
cache invalidation issue is a universal source of confusion.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~40% of assertions expected to pass (columns added and view
|
||||
created, but no IF NOT EXISTS, wrong timestamp type, no NOTIFY, no
|
||||
security_invoker)
|
||||
- With skill: ~88% of assertions expected to pass
|
||||
- **pass_threshold:** 7
|
||||
85
packages/evals/scenarios/rls-update-needs-select.md
Normal file
85
packages/evals/scenarios/rls-update-needs-select.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# Scenario: rls-update-needs-select
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must write a migration for an `orders` table where users can view and
|
||||
update only their own orders. The classic trap is writing an UPDATE policy
|
||||
without a matching SELECT policy — causing UPDATE to silently affect zero rows
|
||||
because RLS cannot find any rows to update.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **"Why is my UPDATE returning empty data?"** — The Supabase troubleshooting
|
||||
guide lists "Why is my select returning an empty data array and I have data
|
||||
in the table?" which is the same root symptom. UPDATE with no SELECT policy
|
||||
silently returns `{data: [], count: 0}` with no error, making it extremely
|
||||
hard to diagnose.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Documented RLS behavior** — The official RLS docs state that UPDATE
|
||||
requires a SELECT policy to identify which rows are accessible for
|
||||
modification. This is non-obvious and contradicts most developers'
|
||||
expectations from SQL semantics.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
|
||||
3. **WITH CHECK requirement** — An UPDATE policy also needs a `WITH CHECK`
|
||||
clause to prevent users from updating a row to a state that would no longer
|
||||
be visible to them (e.g., changing their own `user_id`). Missing this allows
|
||||
data ownership hijacking.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-rls-common-mistakes.md` | UPDATE needs a SELECT policy; WITH CHECK clause | Separate SELECT and UPDATE policies, WITH CHECK |
|
||||
| `references/db-rls-policy-types.md` | USING vs WITH CHECK semantics | WITH CHECK on UPDATE policy |
|
||||
| `references/db-rls-performance.md` | (select auth.uid()) subquery caching | Subselect form in all USING/WITH CHECK |
|
||||
| `references/db-rls-mandatory.md` | Enable RLS, TO authenticated | Full mandatory boilerplate |
|
||||
| `references/db-schema-timestamps.md` | timestamptz for time columns | timestamptz not timestamp |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Create a migration for an `orders` table. Each order has a `status` (text),
|
||||
> `total` (numeric), and `created_at`. Orders belong to users. Users should be
|
||||
> able to view their own orders and update the status of their own orders.
|
||||
> Put the migration in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | creates orders table | SQL contains `CREATE TABLE` and `orders` | correctness |
|
||||
| 3 | RLS enabled | `ALTER TABLE orders ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 4 | has SELECT policy | A `FOR SELECT` policy exists on orders | correctness |
|
||||
| 5 | has UPDATE policy with WITH CHECK | A `FOR UPDATE` policy with `WITH CHECK` clause exists | correctness |
|
||||
| 6 | all policies TO authenticated | Every `CREATE POLICY` has `TO authenticated` | security |
|
||||
| 7 | uses (select auth.uid()) | Subselect form in policy USING clauses | performance |
|
||||
| 8 | uses timestamptz not timestamp | `created_at timestamptz` not plain `timestamp` | correctness |
|
||||
| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Without the skill, agents write only an UPDATE
|
||||
policy (or a single ALL policy), skip WITH CHECK, and use bare `auth.uid()`
|
||||
calls. The result is a migration that looks complete but breaks silently.
|
||||
2. **Skill value:** `db-rls-common-mistakes.md` explicitly covers this
|
||||
UPDATE-needs-SELECT behavior with working examples.
|
||||
3. **Testability:** The presence of both `FOR SELECT` and `FOR UPDATE` with
|
||||
`WITH CHECK` is directly detectable via regex on the SQL.
|
||||
4. **Realism:** "My UPDATE isn't working, returns empty" is among the most
|
||||
common questions from developers new to RLS in the Supabase community.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~40% of assertions expected to pass (table and RLS likely,
|
||||
but wrong policy structure)
|
||||
- With skill: ~92% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
85
packages/evals/scenarios/rls-user-metadata-role-check.md
Normal file
85
packages/evals/scenarios/rls-user-metadata-role-check.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# Scenario: rls-user-metadata-role-check
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must write a migration for a `documents` table where admin users can
|
||||
read all documents and regular users can only read their own. The dangerous
|
||||
trap is checking `user_metadata` for the admin role — users can write to their
|
||||
own `user_metadata`, so this check is bypassable. The correct pattern uses
|
||||
`app_metadata`.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Explicit troubleshooting + security entry** — The Supabase troubleshooting
|
||||
guide covers "Database API 42501 errors" related to auth claims and RLS.
|
||||
Using user_metadata for authorization is one of the most dangerous patterns,
|
||||
documented as a common mistake in the Supabase RLS guides.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Privilege escalation vulnerability** — Any authenticated user can call
|
||||
`supabase.auth.updateUser({ data: { role: 'admin' } })` to set their own
|
||||
`user_metadata`. An RLS policy checking `user_metadata->>'role' = 'admin'`
|
||||
gives every user admin access to all documents.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
|
||||
3. **app_metadata is server-only** — `app_metadata` can only be set via the
|
||||
Admin API or auth hooks, making it safe for authorization. This distinction
|
||||
is taught in the skill but frequently missed by developers.
|
||||
- Source: https://supabase.com/docs/guides/auth/managing-user-data
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-rls-common-mistakes.md` | app_metadata not user_metadata for authorization | `auth.jwt() -> 'app_metadata' ->> 'role'` |
|
||||
| `references/db-rls-policy-types.md` | PERMISSIVE policies combine with OR; multiple policies for different roles | Separate owner and admin policies |
|
||||
| `references/db-rls-performance.md` | (select auth.uid()) subquery; (select auth.jwt()) caching | Subselect form for JWT lookups |
|
||||
| `references/db-rls-mandatory.md` | RLS enabled, TO authenticated | Full boilerplate |
|
||||
| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | Correct user linkage |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Create a migration for a `documents` table. Each document has a `title`
|
||||
> (text), `content` (text), and an owner. Regular users can only see their own
|
||||
> documents. Admin users (identified by a role field in their JWT) should be
|
||||
> able to see all documents. Put the migration in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | creates documents table | SQL contains `CREATE TABLE` and `documents` | correctness |
|
||||
| 3 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 4 | uses app_metadata not user_metadata | JWT role check uses `app_metadata` not `user_metadata` | security |
|
||||
| 5 | no user_metadata role check | `user_metadata` does not appear in policy USING clauses | security |
|
||||
| 6 | two separate policies or one covering both | At least one SELECT policy for owner and one for admin role | correctness |
|
||||
| 7 | TO authenticated on all policies | `TO authenticated` in every policy | security |
|
||||
| 8 | (select auth.uid()) subselect form | Subselect form used not bare auth.uid() | performance |
|
||||
| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill almost universally
|
||||
reach for `user_metadata` when asked about "a role field in their JWT" —
|
||||
it is the more discoverable but dangerous field. Only the skill explicitly
|
||||
flags this as an authorization anti-pattern.
|
||||
2. **Skill value:** `db-rls-common-mistakes.md` section 2 directly addresses
|
||||
this with the exact `app_metadata` pattern.
|
||||
3. **Testability:** Checking for `app_metadata` presence and `user_metadata`
|
||||
absence in policy USING clauses is a precise regex assertion.
|
||||
4. **Realism:** Role-based access in a multi-tenant app is one of the most
|
||||
common RLS patterns requested, and the metadata confusion is universal.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~30% of assertions expected to pass (table and RLS likely,
|
||||
but user_metadata used, subselect missing)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
86
packages/evals/scenarios/service-role-edge-function.md
Normal file
86
packages/evals/scenarios/service-role-edge-function.md
Normal file
@@ -0,0 +1,86 @@
|
||||
# Scenario: service-role-edge-function
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a simple Edge Function that performs an admin operation
|
||||
(listing all users' records) using the service role key server-side, while
|
||||
a companion migration shows the table uses the anon key for browser access.
|
||||
The trap is accidentally exposing the service role key or using it in
|
||||
client-facing code.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Dedicated troubleshooting entry** — The Supabase troubleshooting guide
|
||||
contains "Why is my service role key client getting RLS errors or not
|
||||
returning data?" — developers incorrectly use the service role key in
|
||||
contexts where it should not be used, or use the anon key where service role
|
||||
is needed.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Most dangerous Supabase mistake** — Exposing the service role key in
|
||||
browser JavaScript bypasses all RLS and gives every visitor full database
|
||||
access. This appears in multiple Supabase blog posts and community warnings.
|
||||
- Source: https://supabase.com/docs/guides/api/api-keys
|
||||
3. **Environment variable leakage** — The troubleshooting guide warns about
|
||||
"Inspecting edge function environment variables" as a debugging topic.
|
||||
Developers must use `Deno.env.get()` not hardcoded keys, and never use
|
||||
`NEXT_PUBLIC_` prefix for the service role key.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-security-service-role.md` | Never expose service role key in browser, use env vars | `Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')` in edge function |
|
||||
| `references/edge-fun-quickstart.md` | Edge function file structure and exports | Correct `index.ts` in `supabase/functions/` |
|
||||
| `references/edge-db-supabase-client.md` | Creating supabase client in edge functions | `createClient` with service role for admin ops |
|
||||
| `references/edge-pat-cors.md` | CORS headers for browser requests | CORS on the response |
|
||||
| `references/edge-pat-error-handling.md` | Error responses | Proper error handling |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml`
|
||||
- A migration creating a `reports` table already exists in `supabase/migrations/`
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Create an Edge Function called `admin-reports` that returns all rows from
|
||||
> the `reports` table, bypassing RLS (it's an admin-only endpoint). The
|
||||
> function should be in `supabase/functions/admin-reports/index.ts`. Use
|
||||
> environment variables for any keys — do not hardcode them.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | edge function file exists | `supabase/functions/admin-reports/index.ts` exists | structure |
|
||||
| 2 | uses Deno.env.get for service key | `Deno.env.get` used to retrieve the service role key | security |
|
||||
| 3 | no hardcoded service role key | No JWT-like string literal (`eyJ`) as the service role value | security |
|
||||
| 4 | createClient called with service role | `createClient` receives the service role env var as second arg | correctness |
|
||||
| 5 | service role key not NEXT_PUBLIC prefixed | No `NEXT_PUBLIC_` prefix on service role variable name | security |
|
||||
| 6 | CORS headers present | `Access-Control-Allow-Origin` in response headers | correctness |
|
||||
| 7 | returns JSON response | `Response` with JSON body and content-type | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill sometimes hardcode a
|
||||
placeholder key string, forget CORS, or use the wrong env variable name
|
||||
pattern.
|
||||
2. **Skill value:** `db-security-service-role.md` is explicit about env var
|
||||
naming rules and the `NEXT_PUBLIC_` anti-pattern. `edge-fun-quickstart.md`
|
||||
teaches the Deno.env.get pattern.
|
||||
3. **Testability:** Checking for `eyJ` hardcoded strings and `NEXT_PUBLIC_`
|
||||
prefixes are reliable negative assertions. `Deno.env.get` is a positive
|
||||
string check.
|
||||
4. **Realism:** Admin Edge Functions that bypass RLS are an extremely common
|
||||
pattern for dashboards and data exports.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** EASY
|
||||
|
||||
- Without skill: ~50% of assertions expected to pass (file exists, createClient
|
||||
present, but key handling likely wrong)
|
||||
- With skill: ~93% of assertions expected to pass
|
||||
- **pass_threshold:** 6
|
||||
@@ -141,4 +141,5 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~30-45% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- **pass_threshold:** 14
|
||||
@@ -100,8 +100,9 @@ specific quality signal:
|
||||
| 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance |
|
||||
| 13 | uses timestamptz | No plain `timestamp` for time columns | correctness |
|
||||
| 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency |
|
||||
| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
|
||||
| 16 | overall quality score | At least 10/14 best-practice signals present | overall |
|
||||
| 15 | stable or immutable on helper function | Helper function marked STABLE or IMMUTABLE for performance | performance |
|
||||
| 16 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
|
||||
| 17 | overall quality score | At least 11/15 best-practice signals present | overall |
|
||||
|
||||
## Reasoning
|
||||
|
||||
@@ -136,4 +137,5 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~35-50% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- **pass_threshold:** 13
|
||||
21
packages/evals/src/eval-types.ts
Normal file
21
packages/evals/src/eval-types.ts
Normal file
@@ -0,0 +1,21 @@
|
||||
/**
 * A single assertion to run against the agent's workspace output.
 *
 * Used by EVAL.ts files to declare what the agent's work should produce.
 * The runner executes these in-process (no test framework required).
 */
export interface EvalAssertion {
  /** Human-readable name shown in Braintrust and local output */
  name: string;
  /** Return true = pass; returning false or throwing = fail. May be async. */
  check: () => boolean | Promise<boolean>;
  /** Timeout in ms for async checks (default: no timeout) */
  timeout?: number;
}
|
||||
|
||||
/** Result of running a single EvalAssertion */
export interface AssertionResult {
  /** Name copied verbatim from the assertion that produced this result */
  name: string;
  /** True when the check returned a truthy value without throwing or timing out */
  passed: boolean;
  /** Stringified error when the check threw or timed out; absent on success */
  error?: string;
}
|
||||
@@ -1,11 +1,8 @@
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join, resolve } from "node:path";
|
||||
import type { AssertionResult, EvalAssertion } from "./eval-types.js";
|
||||
import { runAgent } from "./runner/agent.js";
|
||||
import {
|
||||
initBraintrustLogger,
|
||||
logScenarioToLogger,
|
||||
uploadToBraintrust,
|
||||
} from "./runner/braintrust.js";
|
||||
import { uploadToBraintrust } from "./runner/braintrust.js";
|
||||
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
||||
import { preflight } from "./runner/preflight.js";
|
||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||
@@ -22,7 +19,6 @@ import {
|
||||
startSupabase,
|
||||
stopSupabase,
|
||||
} from "./runner/supabase-setup.js";
|
||||
import { runTests } from "./runner/test.js";
|
||||
import {
|
||||
buildTranscriptSummary,
|
||||
type TranscriptSummary,
|
||||
@@ -92,6 +88,40 @@ function getPassThreshold(scenarioId: string): number | null {
|
||||
return match ? Number.parseInt(match[1], 10) : null;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// In-process assertion runner (replaces vitest subprocess)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async function runAssertions(
|
||||
assertions: EvalAssertion[],
|
||||
): Promise<AssertionResult[]> {
|
||||
return Promise.all(
|
||||
assertions.map(async (a) => {
|
||||
try {
|
||||
let result: boolean;
|
||||
if (a.timeout) {
|
||||
const timeoutPromise = new Promise<never>((_, reject) =>
|
||||
setTimeout(
|
||||
() =>
|
||||
reject(new Error(`Assertion timed out after ${a.timeout}ms`)),
|
||||
a.timeout,
|
||||
),
|
||||
);
|
||||
result = await Promise.race([
|
||||
Promise.resolve(a.check()),
|
||||
timeoutPromise,
|
||||
]);
|
||||
} else {
|
||||
result = await Promise.resolve(a.check());
|
||||
}
|
||||
return { name: a.name, passed: Boolean(result) };
|
||||
} catch (e) {
|
||||
return { name: a.name, passed: false, error: String(e) };
|
||||
}
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Run a single eval
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -106,18 +136,28 @@ async function runEval(
|
||||
|
||||
console.log(`\n--- ${scenario.id} (${variant}) ---`);
|
||||
|
||||
// Load assertions and expected reference files from EVAL.ts
|
||||
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
|
||||
? join(evalDir, "EVAL.tsx")
|
||||
: join(evalDir, "EVAL.ts");
|
||||
|
||||
const {
|
||||
assertions = [] as EvalAssertion[],
|
||||
expectedReferenceFiles = [] as string[],
|
||||
} = await import(evalFilePath).catch(() => ({
|
||||
assertions: [] as EvalAssertion[],
|
||||
expectedReferenceFiles: [] as string[],
|
||||
}));
|
||||
|
||||
const passThreshold = getPassThreshold(scenario.id);
|
||||
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
|
||||
|
||||
// 1. Create isolated workspace
|
||||
const { workspacePath, cleanup } = createWorkspace({
|
||||
evalDir,
|
||||
skillEnabled,
|
||||
});
|
||||
const { workspacePath, cleanup } = createWorkspace({ evalDir, skillEnabled });
|
||||
console.log(` Workspace: ${workspacePath}`);
|
||||
|
||||
try {
|
||||
// 2. Read the prompt
|
||||
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
|
||||
|
||||
// 3. Run the agent
|
||||
// 2. Run the agent
|
||||
console.log(` Running agent (${model})...`);
|
||||
const startedAt = Date.now();
|
||||
const agentResult = await runAgent({
|
||||
@@ -132,54 +172,48 @@ async function runEval(
|
||||
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
|
||||
);
|
||||
|
||||
// 4. Run the hidden tests
|
||||
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
|
||||
? join(evalDir, "EVAL.tsx")
|
||||
: join(evalDir, "EVAL.ts");
|
||||
|
||||
const passThreshold = getPassThreshold(scenario.id);
|
||||
|
||||
console.log(" Running tests...");
|
||||
const testResult = await runTests({
|
||||
workspacePath,
|
||||
evalFilePath,
|
||||
passThreshold: passThreshold ?? undefined,
|
||||
// 3. Run assertions in-process from the workspace directory so that
|
||||
// eval-utils.ts helpers resolve paths relative to the workspace.
|
||||
console.log(" Running assertions...");
|
||||
const prevCwd = process.cwd();
|
||||
process.chdir(workspacePath);
|
||||
const assertionResults = await runAssertions(assertions).finally(() => {
|
||||
process.chdir(prevCwd);
|
||||
});
|
||||
const passedCount = assertionResults.filter((a) => a.passed).length;
|
||||
const totalCount = assertionResults.length;
|
||||
|
||||
const passed = passThreshold
|
||||
? totalCount > 0 && passedCount >= passThreshold
|
||||
: totalCount > 0 && passedCount === totalCount;
|
||||
|
||||
const pct =
|
||||
testResult.totalCount > 0
|
||||
? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1)
|
||||
: "0.0";
|
||||
totalCount > 0 ? ((passedCount / totalCount) * 100).toFixed(1) : "0.0";
|
||||
const thresholdInfo = passThreshold
|
||||
? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%`
|
||||
? `, threshold: ${((passThreshold / totalCount) * 100).toFixed(0)}%`
|
||||
: "";
|
||||
console.log(
|
||||
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`,
|
||||
` Assertions: ${passedCount}/${totalCount} passed (${pct}%${thresholdInfo})`,
|
||||
);
|
||||
|
||||
// 5. Collect modified files
|
||||
// 4. Collect modified files
|
||||
const filesModified = listModifiedFiles(workspacePath, evalDir);
|
||||
|
||||
// 6. Build transcript summary
|
||||
// 5. Build transcript summary
|
||||
const summary = buildTranscriptSummary(agentResult.events);
|
||||
|
||||
// 7. Load expectedReferenceFiles from EVAL.ts (if declared)
|
||||
const { expectedReferenceFiles = [] } = await import(evalFilePath).catch(
|
||||
() => ({ expectedReferenceFiles: [] as string[] }),
|
||||
);
|
||||
|
||||
// 8. Run scorers
|
||||
// 6. Run scorers
|
||||
const skillScore = skillUsageScorer(summary, skillName);
|
||||
const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles);
|
||||
const assertScore = assertionsPassedScorer({
|
||||
testsPassed: testResult.passedCount,
|
||||
testsTotal: testResult.totalCount,
|
||||
status: testResult.passed ? "passed" : "failed",
|
||||
testsPassed: passedCount,
|
||||
testsTotal: totalCount,
|
||||
status: passed ? "passed" : "failed",
|
||||
} as EvalRunResult);
|
||||
const finalScore = finalResultScorer({
|
||||
status: testResult.passed ? "passed" : "failed",
|
||||
testsPassed: testResult.passedCount,
|
||||
testsTotal: testResult.totalCount,
|
||||
status: passed ? "passed" : "failed",
|
||||
testsPassed: passedCount,
|
||||
testsTotal: totalCount,
|
||||
passThreshold: passThreshold ?? undefined,
|
||||
} as EvalRunResult);
|
||||
|
||||
@@ -188,18 +222,17 @@ async function runEval(
|
||||
agent: "claude-code",
|
||||
model,
|
||||
skillEnabled,
|
||||
status: testResult.passed ? "passed" : "failed",
|
||||
status: passed ? "passed" : "failed",
|
||||
duration: agentResult.duration,
|
||||
testOutput: testResult.output,
|
||||
agentOutput: agentResult.output,
|
||||
testsPassed: testResult.passedCount,
|
||||
testsTotal: testResult.totalCount,
|
||||
testsPassed: passedCount,
|
||||
testsTotal: totalCount,
|
||||
passThreshold: passThreshold ?? undefined,
|
||||
assertionResults,
|
||||
filesModified,
|
||||
toolCallCount: summary.toolCalls.length,
|
||||
costUsd: summary.totalCostUsd ?? undefined,
|
||||
prompt,
|
||||
individualTests: testResult.individualTests,
|
||||
startedAt,
|
||||
durationApiMs: summary.totalDurationApiMs,
|
||||
totalInputTokens: summary.totalInputTokens,
|
||||
@@ -225,7 +258,7 @@ async function runEval(
|
||||
saveRunArtifacts({
|
||||
resultDir,
|
||||
rawTranscript: agentResult.rawTranscript,
|
||||
testOutput: testResult.output,
|
||||
assertionResults,
|
||||
result,
|
||||
transcriptSummary: summary,
|
||||
});
|
||||
@@ -241,7 +274,6 @@ async function runEval(
|
||||
skillEnabled,
|
||||
status: "error",
|
||||
duration: 0,
|
||||
testOutput: "",
|
||||
agentOutput: "",
|
||||
testsPassed: 0,
|
||||
testsTotal: 0,
|
||||
@@ -281,7 +313,7 @@ async function main() {
|
||||
startSupabase();
|
||||
const keys = getKeys();
|
||||
|
||||
// Inject keys into process.env so EVAL.ts tests can connect to the real DB.
|
||||
// Inject keys into process.env so assertions can connect to the real DB.
|
||||
process.env.SUPABASE_URL = keys.apiUrl;
|
||||
process.env.SUPABASE_ANON_KEY = keys.anonKey;
|
||||
process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey;
|
||||
@@ -291,7 +323,6 @@ async function main() {
|
||||
const transcripts = new Map<string, TranscriptSummary>();
|
||||
|
||||
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
|
||||
const logger = braintrustUpload ? initBraintrustLogger() : undefined;
|
||||
|
||||
try {
|
||||
for (const scenario of scenarios) {
|
||||
@@ -304,15 +335,9 @@ async function main() {
|
||||
if (transcript) {
|
||||
transcripts.set(result.scenario, transcript);
|
||||
}
|
||||
|
||||
// Log immediately after each scenario for real-time visibility.
|
||||
if (logger) {
|
||||
logScenarioToLogger(logger, result, transcript);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
stopSupabase();
|
||||
await logger?.flush();
|
||||
}
|
||||
|
||||
// Use the results dir from the first result (all share the same timestamp)
|
||||
|
||||
@@ -70,7 +70,7 @@ export function logScenarioToLogger(
|
||||
status: r.status,
|
||||
agentOutput: r.agentOutput,
|
||||
filesModified: r.filesModified,
|
||||
testOutput: r.testOutput,
|
||||
assertionResults: r.assertionResults,
|
||||
},
|
||||
expected: { testsTotal: r.testsTotal },
|
||||
scores,
|
||||
@@ -106,7 +106,7 @@ export function logScenarioToLogger(
|
||||
status: r.status,
|
||||
agentOutput: r.agentOutput,
|
||||
filesModified: r.filesModified,
|
||||
testOutput: r.testOutput,
|
||||
assertionResults: r.assertionResults,
|
||||
},
|
||||
expected: { testsTotal: r.testsTotal },
|
||||
scores,
|
||||
@@ -121,7 +121,7 @@ export function logScenarioToLogger(
|
||||
*
|
||||
* Each EvalRunResult becomes a row in the experiment with:
|
||||
* - input: scenario ID, prompt content, skillEnabled flag
|
||||
* - output: status, agent output, files modified, test output
|
||||
* - output: status, agent output, files modified, assertion results
|
||||
* - expected: total tests, pass threshold
|
||||
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
|
||||
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
|
||||
@@ -172,7 +172,7 @@ export async function uploadToBraintrust(
|
||||
status: r.status,
|
||||
agentOutput: r.agentOutput,
|
||||
filesModified: r.filesModified,
|
||||
testOutput: r.testOutput,
|
||||
assertionResults: r.assertionResults,
|
||||
};
|
||||
|
||||
const expected = {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { mkdirSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import type { AssertionResult } from "../eval-types.js";
|
||||
import type { EvalRunResult } from "../types.js";
|
||||
import type { TranscriptSummary } from "./transcript.js";
|
||||
|
||||
@@ -32,7 +33,7 @@ export function createResultDir(
|
||||
export function saveRunArtifacts(opts: {
|
||||
resultDir: string;
|
||||
rawTranscript: string;
|
||||
testOutput: string;
|
||||
assertionResults: AssertionResult[];
|
||||
result: EvalRunResult;
|
||||
transcriptSummary: TranscriptSummary;
|
||||
}): void {
|
||||
@@ -43,8 +44,8 @@ export function saveRunArtifacts(opts: {
|
||||
);
|
||||
|
||||
writeFileSync(
|
||||
join(opts.resultDir, "test-output.txt"),
|
||||
opts.testOutput,
|
||||
join(opts.resultDir, "assertions.json"),
|
||||
JSON.stringify(opts.assertionResults, null, 2),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ export function referenceFilesUsageScorer(
|
||||
}
|
||||
|
||||
/**
|
||||
* assertionsPassedScorer — ratio of vitest assertions passed vs total.
|
||||
* assertionsPassedScorer — ratio of assertions passed vs total.
|
||||
*/
|
||||
export function assertionsPassedScorer(result: EvalRunResult): ScoreResult {
|
||||
const score =
|
||||
|
||||
@@ -1,143 +0,0 @@
|
||||
import { execFile } from "node:child_process";
|
||||
import { copyFileSync, existsSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { promisify } from "node:util";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
const exec = promisify(execFile);
|
||||
|
||||
export interface TestResult {
|
||||
passed: boolean;
|
||||
output: string;
|
||||
/** Number of tests that passed */
|
||||
passedCount: number;
|
||||
/** Total number of tests */
|
||||
totalCount: number;
|
||||
/** Per-test pass/fail extracted from vitest verbose output */
|
||||
individualTests: Record<string, boolean>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the hidden EVAL.ts tests against the agent's workspace.
|
||||
*
|
||||
* 1. Copy EVAL.ts into the workspace (agent is done, safe to expose)
|
||||
* 2. Run vitest against it
|
||||
* 3. Parse the output for pass/fail
|
||||
*/
|
||||
export async function runTests(opts: {
|
||||
workspacePath: string;
|
||||
evalFilePath: string;
|
||||
passThreshold?: number;
|
||||
}): Promise<TestResult> {
|
||||
// Copy the hidden test file into the workspace
|
||||
const evalFileName = opts.evalFilePath.endsWith(".tsx")
|
||||
? "EVAL.tsx"
|
||||
: "EVAL.ts";
|
||||
const destPath = join(opts.workspacePath, evalFileName);
|
||||
copyFileSync(opts.evalFilePath, destPath);
|
||||
|
||||
// Copy shared eval-utils.ts if it exists alongside the eval scenarios
|
||||
const evalUtilsSrc = join(
|
||||
dirname(dirname(opts.evalFilePath)),
|
||||
"eval-utils.ts",
|
||||
);
|
||||
if (existsSync(evalUtilsSrc)) {
|
||||
copyFileSync(evalUtilsSrc, join(opts.workspacePath, "eval-utils.ts"));
|
||||
}
|
||||
|
||||
// Write a minimal vitest config that overrides the default include pattern
|
||||
// so EVAL.ts (without .test. or .spec.) is picked up.
|
||||
const vitestConfigPath = join(opts.workspacePath, "vitest.config.mjs");
|
||||
if (!existsSync(vitestConfigPath)) {
|
||||
// Alias ../eval-utils.ts → ./eval-utils.ts so the import resolves in
|
||||
// the flat workspace (source tree has EVAL.ts one level deeper).
|
||||
const evalUtilsDest = join(opts.workspacePath, "eval-utils.ts");
|
||||
const aliasBlock = existsSync(evalUtilsDest)
|
||||
? `resolve: { alias: { "../eval-utils.ts": "./eval-utils.ts" } },`
|
||||
: "";
|
||||
writeFileSync(
|
||||
vitestConfigPath,
|
||||
`export default { ${aliasBlock} test: { include: ["EVAL.{ts,tsx}"] } };\n`,
|
||||
);
|
||||
}
|
||||
|
||||
// Use the vitest binary from the evals package (always available)
|
||||
const evalsVitest = join(
|
||||
__dirname,
|
||||
"..",
|
||||
"..",
|
||||
"node_modules",
|
||||
".bin",
|
||||
"vitest",
|
||||
);
|
||||
const vitestBin = join(opts.workspacePath, "node_modules", ".bin", "vitest");
|
||||
const cmd = existsSync(vitestBin) ? vitestBin : evalsVitest;
|
||||
const args = ["run", evalFileName, "--reporter=verbose", "--no-color"];
|
||||
|
||||
try {
|
||||
const { stdout, stderr } = await exec(cmd, args, {
|
||||
cwd: opts.workspacePath,
|
||||
timeout: 60_000,
|
||||
env: { ...process.env },
|
||||
maxBuffer: 5 * 1024 * 1024,
|
||||
});
|
||||
|
||||
const output = `${stdout}\n${stderr}`;
|
||||
return parseTestOutput(output, opts.passThreshold);
|
||||
} catch (error) {
|
||||
const err = error as Error & { stdout?: string; stderr?: string };
|
||||
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
|
||||
return parseTestOutput(output, opts.passThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract per-test pass/fail from vitest verbose output.
|
||||
*
|
||||
* Vitest verbose format:
|
||||
* ✓ EVAL.ts > test name here 0ms → passed
|
||||
* × EVAL.ts > test name here 2ms → failed
|
||||
*/
|
||||
function parseIndividualTests(output: string): Record<string, boolean> {
|
||||
const results: Record<string, boolean> = {};
|
||||
const re = /[✓×]\s+EVAL\.tsx?\s+>\s+(.+?)\s+\d+ms/g;
|
||||
for (const match of output.matchAll(re)) {
|
||||
const testName = match[1].trim();
|
||||
const didPass = output[match.index] === "✓";
|
||||
results[testName] = didPass;
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
function parseTestOutput(output: string, passThreshold?: number): TestResult {
|
||||
// Parse vitest output for pass/fail counts
|
||||
// Vitest formats:
|
||||
// All passing: "Tests N passed (N)"
|
||||
// Mixed: "Tests N failed | M passed (T)"
|
||||
// All failing: "Tests N failed (N)"
|
||||
const mixedOrPassing = output.match(
|
||||
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
|
||||
);
|
||||
const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/);
|
||||
|
||||
let passedCount = 0;
|
||||
let totalCount = 0;
|
||||
|
||||
if (mixedOrPassing) {
|
||||
passedCount = Number.parseInt(mixedOrPassing[2], 10);
|
||||
totalCount = Number.parseInt(mixedOrPassing[3], 10);
|
||||
} else if (allFailing) {
|
||||
passedCount = 0;
|
||||
totalCount = Number.parseInt(allFailing[2], 10);
|
||||
}
|
||||
|
||||
const passed = passThreshold
|
||||
? totalCount > 0 && passedCount >= passThreshold
|
||||
: totalCount > 0 && passedCount === totalCount;
|
||||
const individualTests = parseIndividualTests(output);
|
||||
|
||||
return { passed, output, passedCount, totalCount, individualTests };
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
import type { AssertionResult } from "./eval-types.js";
|
||||
|
||||
export interface EvalScenario {
|
||||
/** Directory name under evals/ */
|
||||
id: string;
|
||||
@@ -23,14 +25,17 @@ export interface EvalRunResult {
|
||||
skillEnabled: boolean;
|
||||
status: "passed" | "failed" | "error";
|
||||
duration: number;
|
||||
testOutput: string;
|
||||
/** Raw test runner output (for debugging) */
|
||||
testOutput?: string;
|
||||
agentOutput: string;
|
||||
/** Number of vitest tests that passed */
|
||||
/** Number of assertions that passed */
|
||||
testsPassed: number;
|
||||
/** Total number of vitest tests */
|
||||
/** Total number of assertions */
|
||||
testsTotal: number;
|
||||
/** Minimum tests required to pass (from scenario config) */
|
||||
passThreshold?: number;
|
||||
/** Per-assertion pass/fail results */
|
||||
assertionResults?: AssertionResult[];
|
||||
/** Files the agent created or modified in the workspace */
|
||||
filesModified: string[];
|
||||
error?: string;
|
||||
@@ -42,8 +47,6 @@ export interface EvalRunResult {
|
||||
costUsd?: number;
|
||||
/** The PROMPT.md content sent to the agent */
|
||||
prompt?: string;
|
||||
/** Per-test pass/fail results from vitest */
|
||||
individualTests?: Record<string, boolean>;
|
||||
/** Epoch ms when the agent run started (for Braintrust span timing) */
|
||||
startedAt?: number;
|
||||
/** API-only latency in ms (excludes local processing overhead) */
|
||||
|
||||
Reference in New Issue
Block a user