replace vitest with braintrust assertions

This commit is contained in:
Pedro Rodrigues
2026-02-25 19:50:54 +00:00
parent e65642b752
commit 34e807a3f6
66 changed files with 3940 additions and 1234 deletions

View File

@@ -0,0 +1,85 @@
// Docs-corpus files the agent is expected to consult for this scenario.
// NOTE(review): consumed by the eval harness, not by this module — confirm
// these names match actual files in the reference-docs directory.
export const expectedReferenceFiles = [
  "db-schema-auth-fk.md",
  "db-security-functions.md",
  "db-rls-mandatory.md",
  "db-rls-common-mistakes.md",
];
import type { EvalAssertion } from "../../src/eval-types.js";
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
// Lower-cased migration SQL, read fresh on every call so each check sees
// the latest migration the agent has written.
const migrationSqlLower = (): string => getMigrationSQL().toLowerCase();

// All `CREATE POLICY ...;` statements in the given SQL.
const policyStatements = (sql: string): string[] =>
  sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];

// RLS enabled on the profiles table.
// NOTE(review): fixed — the previous pattern used `.*`, which does not match
// newlines, so a multi-line `ALTER TABLE public.profiles\n  ENABLE ROW LEVEL
// SECURITY;` statement went undetected. `[\s\S]*?` spans lines and matches
// the form used by the sibling eval files in this commit.
const rlsEnabledOnProfiles = (sql: string): boolean =>
  /alter\s+table[\s\S]*?profiles[\s\S]*?enable\s+row\s+level\s+security/.test(
    sql,
  );

// At least one policy exists and every policy is scoped `TO authenticated`.
const allPoliciesAuthenticated = (sql: string): boolean => {
  const policies = policyStatements(sql);
  return (
    policies.length > 0 && policies.every((p) => /to\s+authenticated/.test(p))
  );
};

/**
 * Regex-probe assertions over the agent-written migration SQL for the
 * "profiles table + auth FK + signup trigger" scenario. The shared helpers
 * above are reused by the aggregate "overall quality" check so the two can
 * never drift apart.
 */
export const assertions: EvalAssertion[] = [
  {
    name: "migration file exists",
    check: () => findMigrationFiles().length > 0,
  },
  {
    name: "creates profiles table",
    check: () => {
      const sql = migrationSqlLower();
      return /create\s+table/.test(sql) && /profiles/.test(sql);
    },
  },
  {
    name: "FK references auth.users",
    check: () => /references\s+auth\.users/.test(migrationSqlLower()),
  },
  {
    name: "ON DELETE CASCADE present",
    check: () => /on\s+delete\s+cascade/.test(migrationSqlLower()),
  },
  {
    name: "RLS enabled on profiles",
    check: () => rlsEnabledOnProfiles(migrationSqlLower()),
  },
  {
    name: "trigger function uses SECURITY DEFINER",
    check: () => /security\s+definer/.test(migrationSqlLower()),
  },
  {
    // Empty search_path (`set search_path = ''`) on the trigger function.
    name: "trigger function sets search_path",
    check: () => /set\s+search_path\s*=\s*''/.test(migrationSqlLower()),
  },
  {
    name: "trigger created on auth.users",
    check: () =>
      /create\s+trigger[\s\S]*?on\s+auth\.users/.test(migrationSqlLower()),
  },
  {
    name: "policies scoped to authenticated",
    check: () => allPoliciesAuthenticated(migrationSqlLower()),
  },
  {
    // Passes when at least 5 of the 6 best-practice signals are present.
    name: "overall quality: demonstrates Supabase best practices",
    check: () => {
      const sql = migrationSqlLower();
      const signals = [
        /references\s+auth\.users/.test(sql) &&
          /on\s+delete\s+cascade/.test(sql),
        rlsEnabledOnProfiles(sql),
        /security\s+definer/.test(sql),
        /set\s+search_path\s*=\s*''/.test(sql),
        /create\s+trigger[\s\S]*?on\s+auth\.users/.test(sql),
        allPoliciesAuthenticated(sql),
      ];
      return signals.filter(Boolean).length >= 5;
    },
  },
];

View File

@@ -0,0 +1,7 @@
I'm building a Supabase app and need to set up a `profiles` table. Every user who signs up should automatically get a profile row containing their `id`, `email`, and `full_name` (pulled from signup metadata).
Please create a SQL migration in `supabase/migrations/` that:
1. Creates the `profiles` table linked to Supabase Auth users
2. Sets up a trigger so a profile row is created automatically whenever a new user signs up
3. Enables Row Level Security so users can only read and update their own profile

View File

@@ -0,0 +1,5 @@
{
"name": "auth-fk-cascade-delete",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "auth-fk-cascade-delete"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -1,97 +1,150 @@
export const expectedReferenceFiles = [
"dev-getting-started.md",
"db-rls-mandatory.md",
"db-rls-policy-types.md",
"db-rls-common-mistakes.md",
"db-schema-auth-fk.md",
"db-schema-timestamps.md",
"db-migrations-idempotent.md",
];
import { existsSync } from "node:fs"; import { existsSync } from "node:fs";
import { join } from "node:path"; import { join } from "node:path";
import { expect, test } from "vitest"; import type { EvalAssertion } from "../../src/eval-types.js";
import { import {
anonSeeesNoRows,
findMigrationFiles, findMigrationFiles,
getMigrationSQL, getMigrationSQL,
supabaseDir, getSupabaseDir,
queryTable,
tableExists,
} from "../eval-utils.ts"; } from "../eval-utils.ts";
test("supabase project initialized (config.toml exists)", () => { export const assertions: EvalAssertion[] = [
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true); {
}); name: "supabase project initialized (config.toml exists)",
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
test("migration file exists in supabase/migrations/", () => { },
expect(findMigrationFiles().length).toBeGreaterThan(0); {
}); name: "migration file exists in supabase/migrations/",
check: () => findMigrationFiles().length > 0,
test("creates tasks table", () => { },
const sql = getMigrationSQL().toLowerCase(); {
expect(sql).toMatch(/create\s+table/); name: "creates tasks table",
expect(sql).toMatch(/tasks/); check: () => {
}); const sql = getMigrationSQL().toLowerCase();
return /create\s+table/.test(sql) && /tasks/.test(sql);
test("enables RLS on tasks table", () => { },
const sql = getMigrationSQL().toLowerCase(); },
expect(sql).toMatch(/alter\s+table.*tasks.*enable\s+row\s+level\s+security/); {
}); name: "enables RLS on tasks table",
check: () =>
test("has foreign key to auth.users", () => { /alter\s+table.*tasks.*enable\s+row\s+level\s+security/.test(
const sql = getMigrationSQL().toLowerCase(); getMigrationSQL().toLowerCase(),
expect(sql).toMatch(/references\s+auth\.users/); ),
}); },
{
test("uses ON DELETE CASCADE for auth FK", () => { name: "has foreign key to auth.users",
const sql = getMigrationSQL().toLowerCase(); check: () =>
expect(sql).toMatch(/on\s+delete\s+cascade/); /references\s+auth\.users/.test(getMigrationSQL().toLowerCase()),
}); },
{
test("uses (select auth.uid()) not bare auth.uid() in policies", () => { name: "uses ON DELETE CASCADE for auth FK",
const sql = getMigrationSQL(); check: () => /on\s+delete\s+cascade/.test(getMigrationSQL().toLowerCase()),
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; },
for (const policy of policyBlocks) { {
if (policy.includes("auth.uid()")) { name: "uses (select auth.uid()) not bare auth.uid() in policies",
// The subselect form: (select auth.uid()) check: () => {
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i); const sql = getMigrationSQL();
} const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
} for (const policy of policyBlocks) {
}); if (
policy.includes("auth.uid()") &&
test("policies use TO authenticated", () => { !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
const sql = getMigrationSQL().toLowerCase(); ) {
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; return false;
expect(policyBlocks.length).toBeGreaterThan(0); }
for (const policy of policyBlocks) { }
expect(policy).toMatch(/to\s+authenticated/); return true;
} },
}); },
{
test("uses timestamptz not plain timestamp for time columns", () => { name: "policies use TO authenticated",
const sql = getMigrationSQL().toLowerCase(); check: () => {
// Match "timestamp" that is NOT followed by "tz" or "with time zone" const sql = getMigrationSQL().toLowerCase();
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
// Only fail if the migration defines time columns with plain timestamp return (
if ( policyBlocks.length > 0 &&
sql.includes("created_at") || policyBlocks.every((p) => /to\s+authenticated/.test(p))
sql.includes("updated_at") || );
sql.includes("due_date") },
) { },
expect(sql).not.toMatch(hasPlainTimestamp); {
} name: "uses timestamptz not plain timestamp for time columns",
}); check: () => {
const rawSql = getMigrationSQL().toLowerCase();
test("creates index on user_id column", () => { const sql = rawSql.replace(/--[^\n]*/g, "");
const sql = getMigrationSQL().toLowerCase(); const hasPlainTimestamp =
expect(sql).toMatch(/create\s+index/); /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
expect(sql).toMatch(/user_id/); if (
}); sql.includes("created_at") ||
sql.includes("updated_at") ||
test("migration is idempotent (uses IF NOT EXISTS)", () => { sql.includes("due_date")
const sql = getMigrationSQL().toLowerCase(); ) {
expect(sql).toMatch(/if\s+not\s+exists/); return !hasPlainTimestamp.test(sql);
}); }
return true;
test("overall quality: demonstrates Supabase best practices", () => { },
const sql = getMigrationSQL().toLowerCase(); },
// A high-quality migration should contain most of these patterns {
const signals = [ name: "creates index on user_id column",
/enable\s+row\s+level\s+security/, check: () => {
/\(select\s+auth\.uid\(\)\)/, const sql = getMigrationSQL().toLowerCase();
/to\s+authenticated/, return /create\s+index/.test(sql) && /user_id/.test(sql);
/on\s+delete\s+cascade/, },
/create\s+index/, },
]; {
const matches = signals.filter((r) => r.test(sql)); name: "does not use SERIAL or BIGSERIAL for primary key",
expect(matches.length).toBeGreaterThanOrEqual(4); check: () => {
}); const sql = getMigrationSQL().toLowerCase();
return !/\bserial\b/.test(sql) && !/\bbigserial\b/.test(sql);
},
},
{
name: "migration is idempotent (uses IF NOT EXISTS)",
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
},
{
name: "overall quality: demonstrates Supabase best practices",
check: () => {
const sql = getMigrationSQL().toLowerCase();
const signals = [
/enable\s+row\s+level\s+security/,
/\(select\s+auth\.uid\(\)\)/,
/to\s+authenticated/,
/on\s+delete\s+cascade/,
/create\s+index/,
];
return signals.filter((r) => r.test(sql)).length >= 4;
},
},
{
name: "tasks table exists in the database after migration",
check: () => tableExists("tasks"),
timeout: 10_000,
},
{
name: "tasks table is queryable with service role",
check: async () => {
const { error } = await queryTable("tasks", "service_role");
return error === null;
},
timeout: 10_000,
},
{
name: "tasks table returns no rows for anon (RLS is active)",
check: () => anonSeeesNoRows("tasks"),
timeout: 10_000,
},
];

View File

@@ -1,16 +1,15 @@
I'm starting a new Supabase project from scratch for a task management app. Users should sign up with email/password, and each user should only see their own tasks. I'm building a task management app. Users sign up with email/password and should only see their own tasks.
Set up the project: A Supabase project is already initialized and running locally. The `supabase/` directory and `config.toml` are already set up — do not run `supabase init` or `supabase start`.
1. Initialize the Supabase project with the CLI (`npx supabase init`) Create a SQL migration for a tasks table:
2. Start the local Supabase stack (`npx supabase start`)
3. Create a SQL migration for a tasks table with columns: title (text), description (text), status (text), and due_date
The migration must: 1. Create a new migration file with `npx supabase migration new`
2. Write the migration SQL with:
- Create the tasks table with proper column types - A `tasks` table with columns: title (text), description (text), status (text), due_date (timestamptz)
- Link tasks to authenticated users - Link tasks to authenticated users (foreign key to `auth.users`)
- Enable Row Level Security - Enable Row Level Security
- Create policies so users can only CRUD their own tasks - RLS policies so users can only CRUD their own tasks
- Add appropriate indexes - Appropriate indexes
- Be idempotent (safe to run multiple times) - Idempotent (safe to run multiple times)
3. Apply the migration with `npx supabase db push`

View File

@@ -0,0 +1,128 @@
// Docs-corpus files the agent is expected to consult for this scenario.
// NOTE(review): consumed by the eval harness, not by this module — confirm
// these names match actual files in the reference-docs directory.
export const expectedReferenceFiles = [
  "dev-getting-started.md",
  "edge-fun-quickstart.md",
];
import { readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import type { EvalAssertion } from "../../src/eval-types.js";
// Project root where the agent is expected to have written the cheat-sheet.
const cwd = process.cwd();

// Accepted spellings of the reference file, compared case-insensitively.
const REFERENCE_FILE_NAMES = [
  "cli_reference.md",
  "cli-reference.md",
  "clireference.md",
];

/**
 * Locate the CLI reference file in the project root.
 *
 * @returns the absolute path of the first directory entry whose lower-cased
 *          name is an accepted spelling, or null when none is present.
 */
function findReferenceFile(): string | null {
  const entry = readdirSync(cwd).find((name) =>
    REFERENCE_FILE_NAMES.includes(name.toLowerCase()),
  );
  return entry === undefined ? null : join(cwd, entry);
}
/**
 * Read the CLI reference file's full text as UTF-8.
 *
 * @throws Error when no accepted spelling of the file exists in the root.
 */
function getReferenceContent(): string {
  const path = findReferenceFile();
  if (path === null) {
    throw new Error("CLI_REFERENCE.md not found in project root");
  }
  return readFileSync(path, "utf-8");
}
// Command forms this eval flags as hallucinated: the generated cheat-sheet
// must not present `supabase functions log` or `supabase db query` as real
// CLI commands, whether in backticks or as shell lines (optionally `npx`-
// prefixed). Patterns assume lower-cased content (see callers).
const FUNCTIONS_LOG_PATTERNS: RegExp[] = [
  /`supabase\s+functions\s+log`/,
  /^\s*npx\s+supabase\s+functions\s+log\b/m,
  /^\s*supabase\s+functions\s+log\b/m,
];
const DB_QUERY_PATTERNS: RegExp[] = [
  /`supabase\s+db\s+query`/,
  /^\s*npx\s+supabase\s+db\s+query\b/m,
  /^\s*supabase\s+db\s+query\b/m,
];

// True when any pattern in the list matches the text.
const containsAny = (patterns: RegExp[], text: string): boolean =>
  patterns.some((r) => r.test(text));

/**
 * Assertions over the agent-written CLI_REFERENCE.md cheat-sheet: it must
 * exist, mention the real day-to-day CLI commands, and never mention the
 * hallucinated ones.
 */
export const assertions: EvalAssertion[] = [
  {
    name: "CLI_REFERENCE.md exists in project root",
    check: () => findReferenceFile() !== null,
  },
  {
    // NOTE(review): fixed — content is now lower-cased before matching (it
    // previously was not), so a capitalized "Supabase functions log" no
    // longer slips past the lowercase-only patterns. Every other check in
    // this file already lower-cased first.
    name: "no hallucinated functions log command",
    check: () =>
      !containsAny(FUNCTIONS_LOG_PATTERNS, getReferenceContent().toLowerCase()),
  },
  {
    // Same lower-casing fix as the functions-log check above.
    name: "no hallucinated db query command",
    check: () =>
      !containsAny(DB_QUERY_PATTERNS, getReferenceContent().toLowerCase()),
  },
  {
    name: "mentions supabase functions serve for local development",
    check: () =>
      /supabase\s+functions\s+serve/.test(getReferenceContent().toLowerCase()),
  },
  {
    name: "mentions supabase functions deploy",
    check: () =>
      /supabase\s+functions\s+deploy/.test(getReferenceContent().toLowerCase()),
  },
  {
    name: "mentions psql or SQL Editor or connection string for ad-hoc SQL",
    check: () => {
      const content = getReferenceContent().toLowerCase();
      return (
        /\bpsql\b/.test(content) ||
        /sql\s+editor/.test(content) ||
        /connection\s+string/.test(content) ||
        /supabase\s+db\s+dump/.test(content)
      );
    },
  },
  {
    name: "mentions supabase db push or supabase db reset for migrations",
    check: () => {
      const content = getReferenceContent().toLowerCase();
      return (
        /supabase\s+db\s+push/.test(content) ||
        /supabase\s+db\s+reset/.test(content)
      );
    },
  },
  {
    name: "mentions supabase start for local stack",
    check: () => /supabase\s+start/.test(getReferenceContent().toLowerCase()),
  },
  {
    name: "mentions Dashboard or Logs Explorer for production log viewing",
    check: () => {
      const content = getReferenceContent().toLowerCase();
      return /\bdashboard\b/.test(content) || /logs\s+explorer/.test(content);
    },
  },
  {
    // Passes when at least 5 real-command signals appear AND no hallucinated
    // command does. Reuses the shared pattern lists above so this aggregate
    // can never drift from the individual hallucination checks.
    name: "overall quality: uses real CLI commands throughout",
    check: () => {
      const content = getReferenceContent().toLowerCase();
      const signals = [
        /supabase\s+start/,
        /supabase\s+stop/,
        /supabase\s+functions\s+serve/,
        /supabase\s+functions\s+deploy/,
        /supabase\s+db\s+(push|reset|diff)/,
        /\bpsql\b|\bsql\s+editor\b|\bconnection\s+string\b/,
        /\bdashboard\b|\blogs\s+explorer\b/,
      ];
      const positiveMatches = signals.filter((r) => r.test(content)).length;
      const hallucinated =
        containsAny(FUNCTIONS_LOG_PATTERNS, content) ||
        containsAny(DB_QUERY_PATTERNS, content);
      return positiveMatches >= 5 && !hallucinated;
    },
  },
];

View File

@@ -0,0 +1,9 @@
I'm onboarding a new developer to my Supabase project. Create a `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet of Supabase CLI commands we use day-to-day. It should cover:
1. Starting and stopping the local dev stack
2. Managing database migrations (push, reset, diff)
3. Working with the `process-order` Edge Function (local dev and deploy)
4. How to view Edge Function logs (both local dev and production)
5. How to run ad-hoc SQL queries against the database (local and remote)
Include the actual commands with brief explanations.

View File

@@ -0,0 +1,5 @@
{
"name": "cli-hallucinated-commands",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,64 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "cli-hallucinated-commands"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false

View File

@@ -0,0 +1,29 @@
import { createClient } from "jsr:@supabase/supabase-js@2";

// Edge function: look up a single order by the id in the request body and
// return it as JSON. Errors (bad JSON, missing row, query failure) are
// funneled through the catch block as a 500 with the error text.
Deno.serve(async (req) => {
  const jsonHeaders = { "Content-Type": "application/json" };
  try {
    const { orderId } = await req.json();
    // NOTE(review): built with the anon key and no caller Authorization
    // header — under RLS this client only sees rows anon policies allow;
    // confirm that is the intent for this fixture.
    const supabase = createClient(
      Deno.env.get("SUPABASE_URL") ?? "",
      Deno.env.get("SUPABASE_ANON_KEY") ?? "",
    );
    const result = await supabase
      .from("orders")
      .select("*")
      .eq("id", orderId)
      .single();
    if (result.error) throw result.error;
    const body = JSON.stringify({ order: result.data });
    return new Response(body, { headers: jsonHeaders });
  } catch (err) {
    const body = JSON.stringify({ error: String(err) });
    return new Response(body, { status: 500, headers: jsonHeaders });
  }
});

View File

@@ -1,333 +1,354 @@
import { expect, test } from "vitest"; export const expectedReferenceFiles = [
"db-rls-mandatory.md",
"db-rls-common-mistakes.md",
"db-rls-performance.md",
"db-security-functions.md",
"db-schema-auth-fk.md",
"db-schema-timestamps.md",
"db-schema-realtime.md",
"db-perf-indexes.md",
"db-migrations-idempotent.md",
"realtime-setup-auth.md",
"realtime-broadcast-database.md",
"realtime-setup-channels.md",
];
import type { EvalAssertion } from "../../src/eval-types.js";
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
test("migration file exists", () => { export const assertions: EvalAssertion[] = [
expect(findMigrationFiles().length).toBeGreaterThan(0); {
}); name: "migration file exists",
check: () => findMigrationFiles().length > 0,
test("creates rooms table", () => { },
const sql = getMigrationSQL().toLowerCase(); {
expect(sql).toMatch(/create\s+table[\s\S]*?rooms/); name: "creates rooms table",
}); check: () =>
/create\s+table[\s\S]*?rooms/.test(getMigrationSQL().toLowerCase()),
test("creates room_members table", () => { },
const sql = getMigrationSQL().toLowerCase(); {
// Accept room_members, members, memberships, room_users, etc. name: "creates room_members table",
const hasMembership = check: () => {
/create\s+table[\s\S]*?room_members/.test(sql) || const sql = getMigrationSQL().toLowerCase();
/create\s+table[\s\S]*?room_users/.test(sql) ||
/create\s+table[\s\S]*?memberships/.test(sql);
expect(hasMembership).toBe(true);
});
test("creates content table", () => {
const sql = getMigrationSQL().toLowerCase();
// Accept content, contents, items, room_content, room_items, documents, etc.
const hasContent =
/create\s+table[\s\S]*?content/.test(sql) ||
/create\s+table[\s\S]*?items/.test(sql) ||
/create\s+table[\s\S]*?documents/.test(sql) ||
/create\s+table[\s\S]*?posts/.test(sql) ||
/create\s+table[\s\S]*?messages/.test(sql);
expect(hasContent).toBe(true);
});
test("room_members has role column with owner/editor/viewer", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/role/);
// Should define the three roles somewhere (enum, check constraint, or comment)
expect(sql).toMatch(/owner/);
expect(sql).toMatch(/editor/);
expect(sql).toMatch(/viewer/);
});
test("enables RLS on all application tables", () => {
const sql = getMigrationSQL().toLowerCase();
// Must enable RLS on rooms
expect(sql).toMatch(
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/,
);
// Must enable RLS on membership table
const hasMembershipRls =
/alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) ||
/alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) ||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
);
expect(hasMembershipRls).toBe(true);
// Must enable RLS on content table (accept various names)
const hasContentRls =
/alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) ||
/alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) ||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) ||
/alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) ||
/alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
);
expect(hasContentRls).toBe(true);
});
test("FK to auth.users with ON DELETE CASCADE", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/references\s+auth\.users/);
expect(sql).toMatch(/on\s+delete\s+cascade/);
});
test("content has room_id FK referencing rooms", () => {
const sql = getMigrationSQL().toLowerCase();
// Content table should have a foreign key to rooms
expect(sql).toMatch(/room_id[\s\S]*?references[\s\S]*?rooms/);
});
test("policies use (select auth.uid())", () => {
const sql = getMigrationSQL();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
expect(policyBlocks.length).toBeGreaterThan(0);
for (const policy of policyBlocks) {
if (policy.includes("auth.uid()")) {
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
}
}
});
test("policies use TO authenticated", () => {
const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
// Filter to only application table policies (not realtime.messages which may use different roles)
const appPolicies = policyBlocks.filter(
(p) => !p.includes("realtime.messages"),
);
expect(appPolicies.length).toBeGreaterThan(0);
for (const policy of appPolicies) {
expect(policy).toMatch(/to\s+authenticated/);
}
});
test("private schema with security_definer helper function", () => {
const sql = getMigrationSQL().toLowerCase();
// Private schema should be created
expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
// A function in the private schema with SECURITY DEFINER
expect(sql).toMatch(/private\./);
expect(sql).toMatch(/security\s+definer/);
expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
});
test("role-based write policies: content INSERT/UPDATE restricted to owner or editor", () => {
const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
// Find INSERT or UPDATE policies on the content table
const writePolicies = policyBlocks.filter(
(p) =>
(/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) &&
(p.includes("content") ||
p.includes("items") ||
p.includes("documents") ||
p.includes("posts") ||
p.includes("messages")),
);
// At least one write policy should check for owner or editor role
const checksRole = writePolicies.some(
(p) => p.includes("owner") || p.includes("editor"),
);
expect(checksRole).toBe(true);
});
test("viewer role is read-only (no write access to content)", () => {
const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
// Find content write policies (INSERT, UPDATE, DELETE)
const contentWritePolicies = policyBlocks.filter(
(p) =>
/for\s+(insert|update|delete)/.test(p) &&
(p.includes("content") ||
p.includes("items") ||
p.includes("documents") ||
p.includes("posts") ||
p.includes("messages")),
);
// None of the write policies should grant access to viewer role
// They should either explicitly check for owner/editor OR exclude viewer
if (contentWritePolicies.length > 0) {
const anyGrantsViewer = contentWritePolicies.some((p) => {
// If the policy doesn't mention any role, it's too permissive
const mentionsRole =
p.includes("owner") || p.includes("editor") || p.includes("viewer");
if (!mentionsRole) return true; // no role check = viewer could write
// If it specifically includes viewer in a write context, that's wrong
return ( return (
p.includes("viewer") && !p.includes("owner") && !p.includes("editor") /create\s+table[\s\S]*?room_members/.test(sql) ||
/create\s+table[\s\S]*?room_users/.test(sql) ||
/create\s+table[\s\S]*?memberships/.test(sql)
); );
}); },
expect(anyGrantsViewer).toBe(false); },
} {
}); name: "creates content table",
check: () => {
test("indexes on membership lookup columns", () => { const sql = getMigrationSQL().toLowerCase();
const sql = getMigrationSQL().toLowerCase(); return (
expect(sql).toMatch(/create\s+index/); /create\s+table[\s\S]*?content/.test(sql) ||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? []; /create\s+table[\s\S]*?items/.test(sql) ||
// Should index user_id and/or room_id on the membership table /create\s+table[\s\S]*?documents/.test(sql) ||
const membershipIndexes = indexBlocks.filter( /create\s+table[\s\S]*?posts/.test(sql) ||
(idx) => /create\s+table[\s\S]*?messages/.test(sql)
idx.toLowerCase().includes("user_id") || );
idx.toLowerCase().includes("room_id"), },
); },
expect(membershipIndexes.length).toBeGreaterThanOrEqual(1); {
}); name: "room_members has role column with owner/editor/viewer",
check: () => {
test("uses timestamptz not plain timestamp", () => { const sql = getMigrationSQL().toLowerCase();
const sql = getMigrationSQL().toLowerCase(); return (
// Match "timestamp" that is NOT followed by "tz" or "with time zone" /role/.test(sql) &&
const hasPlainTimestamp = /owner/.test(sql) &&
/(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/; /editor/.test(sql) &&
// Only fail if the migration defines time columns with plain timestamp /viewer/.test(sql)
if ( );
sql.includes("created_at") || },
sql.includes("updated_at") || },
sql.includes("_at ") {
) { name: "enables RLS on all application tables",
expect(sql).not.toMatch(hasPlainTimestamp); check: () => {
} const sql = getMigrationSQL().toLowerCase();
}); const roomsRls =
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
test("idempotent DDL", () => { sql,
const sql = getMigrationSQL().toLowerCase(); );
expect(sql).toMatch(/if\s+not\s+exists/); const membershipRls =
}); /alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
test("realtime publication enabled for content table", () => { ) ||
const sql = getMigrationSQL().toLowerCase(); /alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test(
// Should add the content table to supabase_realtime publication sql,
expect(sql).toMatch(/alter\s+publication\s+supabase_realtime\s+add\s+table/); ) ||
}); /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
test("broadcast trigger for content changes", () => { );
const sql = getMigrationSQL().toLowerCase(); const contentRls =
// Should use realtime.broadcast_changes() or realtime.send() in a trigger /alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test(
const usesBroadcastChanges = /realtime\.broadcast_changes/.test(sql); sql,
const usesRealtimeSend = /realtime\.send/.test(sql); ) ||
expect(usesBroadcastChanges || usesRealtimeSend).toBe(true); /alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test(
// Should create a trigger on the content table sql,
expect(sql).toMatch(/create\s+trigger/); ) ||
}); /alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
test("broadcast trigger function uses security definer", () => { ) ||
const sql = getMigrationSQL().toLowerCase(); /alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test(
// Find function definitions that reference realtime.broadcast_changes or realtime.send sql,
const functionBlocks = ) ||
sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? []; /alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test(
const realtimeFunctions = functionBlocks.filter( sql,
(f) => );
f.toLowerCase().includes("realtime.broadcast_changes") || return roomsRls && membershipRls && contentRls;
f.toLowerCase().includes("realtime.send"), },
); },
expect(realtimeFunctions.length).toBeGreaterThan(0); {
// The trigger function should have security definer and search_path name: "FK to auth.users with ON DELETE CASCADE",
const hasSecurityDefiner = realtimeFunctions.some( check: () => {
(f) => const sql = getMigrationSQL().toLowerCase();
/security\s+definer/.test(f.toLowerCase()) && return (
/set\s+search_path\s*=\s*''/.test(f.toLowerCase()), /references\s+auth\.users/.test(sql) &&
); /on\s+delete\s+cascade/.test(sql)
expect(hasSecurityDefiner).toBe(true); );
}); },
},
test("RLS policies on realtime.messages", () => { {
const sql = getMigrationSQL().toLowerCase(); name: "content has room_id FK referencing rooms",
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; check: () =>
const realtimePolicies = policyBlocks.filter((p) => /room_id[\s\S]*?references[\s\S]*?rooms/.test(
p.includes("realtime.messages"), getMigrationSQL().toLowerCase(),
); ),
expect(realtimePolicies.length).toBeGreaterThan(0); },
// At least one policy should target authenticated users {
const hasAuthPolicy = realtimePolicies.some( name: "policies use (select auth.uid())",
(p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p), check: () => {
); const sql = getMigrationSQL();
expect(hasAuthPolicy).toBe(true); const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
}); if (policyBlocks.length === 0) return false;
for (const policy of policyBlocks) {
test("realtime policy checks extension column", () => { if (
const sql = getMigrationSQL().toLowerCase(); policy.includes("auth.uid()") &&
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
const realtimePolicies = policyBlocks.filter((p) => ) {
p.includes("realtime.messages"), return false;
); }
// At least one realtime policy should reference the extension column }
const checksExtension = realtimePolicies.some( return true;
(p) => },
p.includes("extension") && },
(p.includes("broadcast") || p.includes("presence")), {
); name: "policies use TO authenticated",
expect(checksExtension).toBe(true); check: () => {
}); const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
test("overall quality score", () => { const appPolicies = policyBlocks.filter(
const sql = getMigrationSQL().toLowerCase(); (p) => !p.includes("realtime.messages"),
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; );
return (
const signals = [ appPolicies.length > 0 &&
// 1. RLS enabled on rooms appPolicies.every((p) => /to\s+authenticated/.test(p))
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test( );
sql, },
), },
// 2. RLS enabled on membership table {
/alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test( name: "private schema with security_definer helper function",
sql, check: () => {
), const sql = getMigrationSQL().toLowerCase();
// 3. RLS enabled on content table return (
/alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test( /create\s+schema[\s\S]*?private/.test(sql) &&
sql, /private\./.test(sql) &&
), /security\s+definer/.test(sql) &&
// 4. FK to auth.users with cascade /set\s+search_path\s*=\s*''/.test(sql)
/references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql), );
// 5. Private schema created },
/create\s+schema[\s\S]*?private/.test(sql), },
// 6. security_definer with search_path {
/security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql), name: "role-based write policies: content INSERT/UPDATE restricted to owner or editor",
// 7. Subselect auth.uid() check: () => {
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), const sql = getMigrationSQL().toLowerCase();
// 8. TO authenticated on policies const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
policyBlocks.length > 0 && const writePolicies = policyBlocks.filter(
policyBlocks.filter((p) => !p.includes("realtime.messages")).length > 0 && (p) =>
policyBlocks (/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) &&
.filter((p) => !p.includes("realtime.messages")) (p.includes("content") ||
.every((p) => /to\s+authenticated/.test(p)), p.includes("items") ||
// 9. Indexes on lookup columns p.includes("documents") ||
/create\s+index/.test(sql), p.includes("posts") ||
// 10. timestamptz usage (accepts both timestamptz and timestamp with time zone) p.includes("messages")),
/timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql), );
// 11. IF NOT EXISTS for idempotency return writePolicies.some(
/if\s+not\s+exists/.test(sql), (p) => p.includes("owner") || p.includes("editor"),
// 12. Role-based policies (owner/editor/viewer) );
sql.includes("owner") && sql.includes("editor") && sql.includes("viewer"), },
// 13. Realtime publication },
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql), {
// 14. Broadcast trigger (broadcast_changes or realtime.send) name: "viewer role is read-only (no write access to content)",
/realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql), check: () => {
// 15. Trigger creation const sql = getMigrationSQL().toLowerCase();
/create\s+trigger/.test(sql), const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
// 16. RLS on realtime.messages const contentWritePolicies = policyBlocks.filter(
policyBlocks.some((p) => p.includes("realtime.messages")), (p) =>
// 17. Extension check in realtime policy /for\s+(insert|update|delete)/.test(p) &&
policyBlocks (p.includes("content") ||
.filter((p) => p.includes("realtime.messages")) p.includes("items") ||
.some((p) => p.includes("extension")), p.includes("documents") ||
// 18. room_id FK on content p.includes("posts") ||
/room_id[\s\S]*?references[\s\S]*?rooms/.test(sql), p.includes("messages")),
]; );
const passed = signals.filter(Boolean).length; if (contentWritePolicies.length === 0) return true;
expect(passed).toBeGreaterThanOrEqual(13); return !contentWritePolicies.some((p) => {
}); const mentionsRole =
p.includes("owner") || p.includes("editor") || p.includes("viewer");
if (!mentionsRole) return true;
return (
p.includes("viewer") && !p.includes("owner") && !p.includes("editor")
);
});
},
},
{
name: "indexes on membership lookup columns",
check: () => {
const sql = getMigrationSQL().toLowerCase();
if (!/create\s+index/.test(sql)) return false;
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
return (
indexBlocks.filter(
(idx) =>
idx.toLowerCase().includes("user_id") ||
idx.toLowerCase().includes("room_id"),
).length >= 1
);
},
},
{
name: "uses timestamptz not plain timestamp",
check: () => {
const rawSql = getMigrationSQL().toLowerCase();
const sql = rawSql.replace(/--[^\n]*/g, "");
const hasPlainTimestamp =
/(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/;
if (
sql.includes("created_at") ||
sql.includes("updated_at") ||
sql.includes("_at ")
) {
return !hasPlainTimestamp.test(sql);
}
return true;
},
},
{
name: "idempotent DDL",
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
},
{
name: "realtime publication enabled for content table",
check: () =>
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(
getMigrationSQL().toLowerCase(),
),
},
{
name: "broadcast trigger for content changes",
check: () => {
const sql = getMigrationSQL().toLowerCase();
return (
(/realtime\.broadcast_changes/.test(sql) ||
/realtime\.send/.test(sql)) &&
/create\s+trigger/.test(sql)
);
},
},
{
name: "broadcast trigger function uses security definer",
check: () => {
const sql = getMigrationSQL().toLowerCase();
const functionBlocks =
sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? [];
const realtimeFunctions = functionBlocks.filter(
(f) =>
f.toLowerCase().includes("realtime.broadcast_changes") ||
f.toLowerCase().includes("realtime.send"),
);
if (realtimeFunctions.length === 0) return false;
return realtimeFunctions.some(
(f) =>
/security\s+definer/.test(f.toLowerCase()) &&
/set\s+search_path\s*=\s*''/.test(f.toLowerCase()),
);
},
},
{
name: "RLS policies on realtime.messages",
check: () => {
const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
const realtimePolicies = policyBlocks.filter((p) =>
p.includes("realtime.messages"),
);
if (realtimePolicies.length === 0) return false;
return realtimePolicies.some(
(p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p),
);
},
},
{
name: "realtime policy checks extension column",
check: () => {
const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
const realtimePolicies = policyBlocks.filter((p) =>
p.includes("realtime.messages"),
);
return realtimePolicies.some(
(p) =>
p.includes("extension") &&
(p.includes("broadcast") || p.includes("presence")),
);
},
},
  {
    name: "overall quality score",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      const signals = [
        // 1. RLS enabled on rooms
        /alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ),
        // 2. RLS enabled on membership table
        /alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ),
        // 3. RLS enabled on content table
        /alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ),
        // 4. FK to auth.users with cascade
        /references\s+auth\.users/.test(sql) &&
          /on\s+delete\s+cascade/.test(sql),
        // 5. Private schema created
        /create\s+schema[\s\S]*?private/.test(sql),
        // 6. security definer with empty search_path
        /security\s+definer/.test(sql) &&
          /set\s+search_path\s*=\s*''/.test(sql),
        // 7. Subselect form (select auth.uid()) for initplan caching
        /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
        // 8. All application policies scoped TO authenticated
        policyBlocks.length > 0 &&
          policyBlocks.filter((p) => !p.includes("realtime.messages")).length >
            0 &&
          policyBlocks
            .filter((p) => !p.includes("realtime.messages"))
            .every((p) => /to\s+authenticated/.test(p)),
        // 9. Indexes on lookup columns
        /create\s+index/.test(sql),
        // 10. timestamptz usage (timestamptz or timestamp with time zone)
        /timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql),
        // 11. IF NOT EXISTS for idempotency
        /if\s+not\s+exists/.test(sql),
        // 12. Role-based policies (owner/editor/viewer)
        sql.includes("owner") &&
          sql.includes("editor") &&
          sql.includes("viewer"),
        // 13. Realtime publication
        /alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql),
        // 14. Broadcast trigger (broadcast_changes or realtime.send)
        /realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql),
        // 15. Trigger creation
        /create\s+trigger/.test(sql),
        // 16. RLS on realtime.messages
        policyBlocks.some((p) => p.includes("realtime.messages")),
        // 17. Extension check in realtime policy
        policyBlocks
          .filter((p) => p.includes("realtime.messages"))
          .some((p) => p.includes("extension")),
        // 18. room_id FK on content
        /room_id[\s\S]*?references[\s\S]*?rooms/.test(sql),
      ];
      // Pass when at least 13 of the 18 best-practice signals are present.
      return signals.filter(Boolean).length >= 13;
    },
  },
];

View File

@@ -0,0 +1,3 @@
# Direct connection to the database — used for migrations
# Replace with your Supabase project's direct connection string
DATABASE_URL="postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres"

View File

@@ -0,0 +1,134 @@
// Reference docs the agent is expected to consult for this eval scenario.
export const expectedReferenceFiles = [
  "db-conn-pooling.md",
  "db-migrations-idempotent.md",
  "db-schema-auth-fk.md",
];
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import type { EvalAssertion } from "../../src/eval-types.js";
const cwd = process.cwd();
/**
 * Locate the agent-generated Prisma schema file.
 *
 * Reads process.cwd() at call time (not at module load) so that a runner
 * which does process.chdir(workspace) before running assertions resolves
 * against the correct tree — the same convention documented for the path
 * helpers in eval-utils. The previous version captured cwd at import time.
 *
 * @returns absolute path to the schema file, or null if none was created.
 */
function findPrismaSchema(): string | null {
  const root = process.cwd();
  // Conventional locations first.
  const candidates = [
    join(root, "prisma", "schema.prisma"),
    join(root, "schema.prisma"),
  ];
  for (const candidate of candidates) {
    if (existsSync(candidate)) return candidate;
  }
  // Fall back to any .prisma file inside prisma/.
  const prismaDir = join(root, "prisma");
  if (existsSync(prismaDir)) {
    const files = readdirSync(prismaDir).filter((f) => f.endsWith(".prisma"));
    if (files.length > 0) return join(prismaDir, files[0]);
  }
  return null;
}
/**
 * Read the Prisma schema, throwing when the agent never produced one.
 */
function getPrismaSchema(): string {
  const schemaPath = findPrismaSchema();
  if (schemaPath === null) {
    throw new Error("No .prisma schema file found");
  }
  return readFileSync(schemaPath, "utf-8");
}
/**
 * Collect every .env-style file present in the current workspace.
 *
 * Uses process.cwd() at call time so the runner's chdir into the workspace
 * is respected; the previous version read a module-level cwd constant that
 * was captured before the chdir happened.
 *
 * @returns absolute paths of the env files found, in a fixed search order.
 */
function findEnvFiles(): string[] {
  const root = process.cwd();
  const names = [
    ".env",
    ".env.example",
    ".env.local",
    ".env.production",
    ".env.development",
  ];
  const found: string[] = [];
  for (const name of names) {
    const candidate = join(root, name);
    if (existsSync(candidate)) found.push(candidate);
  }
  return found;
}
/** Concatenate the contents of every discovered env file. */
function getAllEnvContent(): string {
  const chunks: string[] = [];
  for (const file of findEnvFiles()) {
    chunks.push(readFileSync(file, "utf-8"));
  }
  return chunks.join("\n");
}
/**
 * Gather all agent output that may contain connection-string details:
 * the Prisma schema, every env file, and any top-level markdown notes.
 *
 * Reads process.cwd() at call time so the runner's chdir into the workspace
 * is respected; the previous version listed a module-level cwd constant
 * captured before the chdir happened.
 */
function getAllOutputContent(): string {
  const root = process.cwd();
  const parts: string[] = [];
  const schema = findPrismaSchema();
  if (schema) parts.push(readFileSync(schema, "utf-8"));
  parts.push(getAllEnvContent());
  // Markdown files at the workspace root often hold the agent's explanation.
  const mdFiles = readdirSync(root).filter((f) => f.endsWith(".md"));
  for (const f of mdFiles) {
    parts.push(readFileSync(join(root, f), "utf-8"));
  }
  return parts.join("\n");
}
// Assertions that grade the agent's Prisma + Supabase pooler configuration.
export const assertions: EvalAssertion[] = [
  {
    name: "prisma schema file exists",
    check: () => findPrismaSchema() !== null,
  },
  {
    // Supabase's transaction-mode pooler listens on 6543 (direct is 5432).
    name: "prisma schema references pooler port 6543",
    check: () => /6543/.test(getAllOutputContent()),
  },
  {
    // Required so Prisma disables prepared statements behind PgBouncer.
    name: "pgbouncer=true param present",
    check: () =>
      /pgbouncer\s*=\s*true/.test(getAllOutputContent().toLowerCase()),
  },
  {
    // Migrations must bypass the pooler over a direct connection.
    name: "DIRECT_URL provided for migrations",
    check: () => {
      const allContent = `${getPrismaSchema().toLowerCase()}\n${getAllEnvContent().toLowerCase()}`;
      return /directurl/.test(allContent) || /direct_url/.test(allContent);
    },
  },
  {
    name: "datasource block references directUrl or DIRECT_URL env var",
    check: () => {
      const schema = getPrismaSchema().toLowerCase();
      // Grab just the `datasource <name> { ... }` block from the schema.
      const datasourceBlock =
        schema.match(/datasource\s+\w+\s*\{[\s\S]*?\}/)?.[0] ?? "";
      return (
        /directurl/.test(datasourceBlock) || /direct_url/.test(datasourceBlock)
      );
    },
  },
  {
    // Serverless functions should hold at most one connection each.
    name: "connection limit set to 1 for serverless",
    check: () => {
      const content = getAllOutputContent().toLowerCase();
      return (
        /connection_limit\s*=\s*1/.test(content) ||
        /connection_limit:\s*1/.test(content) ||
        /connectionlimit\s*=\s*1/.test(content)
      );
    },
  },
  {
    // The output should mention both the pooler port and the direct port.
    name: "explanation distinguishes port 6543 vs 5432",
    check: () => {
      const content = getAllOutputContent();
      return /6543/.test(content) && /5432/.test(content);
    },
  },
  {
    name: "overall quality: demonstrates correct Prisma + Supabase pooler setup",
    check: () => {
      const schema = getPrismaSchema().toLowerCase();
      const envContent = getAllEnvContent().toLowerCase();
      const allContent = `${schema}\n${envContent}`;
      const signals = [
        /6543/, // pooler port
        /pgbouncer\s*=\s*true/, // prepared statements disabled
        /directurl|direct_url/, // direct URL for migrations
        /connection_limit\s*=\s*1|connection_limit:\s*1/, // serverless limit
        /5432/, // direct port mentioned
      ];
      // Pass when at least 4 of the 5 pooler-setup signals are present.
      return signals.filter((r) => r.test(allContent)).length >= 4;
    },
  },
];

View File

@@ -0,0 +1,3 @@
I'm deploying my Supabase app on Vercel using Prisma. I keep getting "prepared statement already exists" errors in production. My current `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string on port 5432 with no pooler settings.
Fix the Prisma configuration so it works correctly with Supabase's connection pooler for serverless deployments. Make any changes needed to `prisma/schema.prisma` and update the `.env.example` file with the correct connection string format.

View File

@@ -0,0 +1,5 @@
{
"name": "connection-pooling-prisma",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,29 @@
// This is your Prisma schema file,
// learn more about it in the docs: https://pris.ly/d/prisma-schema
generator client {
provider = "prisma-client-js"
}
datasource db {
provider = "postgresql"
url = env("DATABASE_URL")
}
model User {
id String @id @default(cuid())
email String @unique
name String?
createdAt DateTime @default(now())
posts Post[]
}
model Post {
id String @id @default(cuid())
title String
content String?
published Boolean @default(false)
author User @relation(fields: [authorId], references: [id])
authorId String
createdAt DateTime @default(now())
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "connection-pooling-prisma"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -1,26 +1,31 @@
export const expectedReferenceFiles = [
"edge-fun-quickstart.md",
"edge-fun-project-structure.md",
"edge-pat-cors.md",
"edge-pat-error-handling.md",
"dev-getting-started.md",
];
import { existsSync, readdirSync } from "node:fs"; import { existsSync, readdirSync } from "node:fs";
import { join } from "node:path"; import { join } from "node:path";
import { expect, test } from "vitest"; import type { EvalAssertion } from "../../src/eval-types.js";
import { import {
findFunctionFile, findFunctionFile,
findSharedCorsFile, findSharedCorsFile,
functionsDir,
getFunctionCode, getFunctionCode,
getFunctionsDir,
getSharedCode, getSharedCode,
supabaseDir, getSupabaseDir,
} from "../eval-utils.ts"; } from "../eval-utils.ts";
const FUNCTION_NAME = "hello-world"; const FUNCTION_NAME = "hello-world";
const helloWorldDir = join(functionsDir, FUNCTION_NAME);
/** Read function code + all shared modules combined. */
function getAllCode(): string { function getAllCode(): string {
const code = getFunctionCode(FUNCTION_NAME); const code = getFunctionCode(FUNCTION_NAME);
return `${code}\n${getSharedCode()}`; return `${code}\n${getSharedCode()}`;
} }
/** Extract the code after the first `catch` keyword to the end of the function. */
function getCatchBlockCode(): string { function getCatchBlockCode(): string {
const code = getFunctionCode(FUNCTION_NAME); const code = getFunctionCode(FUNCTION_NAME);
const catchIndex = code.search(/\bcatch\b/); const catchIndex = code.search(/\bcatch\b/);
@@ -28,121 +33,123 @@ function getCatchBlockCode(): string {
return code.slice(catchIndex); return code.slice(catchIndex);
} }
test("supabase project initialized", () => { export const assertions: EvalAssertion[] = [
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true); {
}); name: "supabase project initialized",
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
test("function directory exists", () => { },
expect(existsSync(helloWorldDir)).toBe(true); {
}); name: "function directory exists",
check: () => existsSync(join(getFunctionsDir(), FUNCTION_NAME)),
test("function index file exists", () => { },
expect(findFunctionFile(FUNCTION_NAME)).not.toBeNull(); {
}); name: "function index file exists",
check: () => findFunctionFile(FUNCTION_NAME) !== null,
test("uses Deno.serve", () => { },
const code = getFunctionCode(FUNCTION_NAME); {
expect(code).toMatch(/Deno\.serve/); name: "uses Deno.serve",
}); check: () => /Deno\.serve/.test(getFunctionCode(FUNCTION_NAME)),
},
test("returns JSON response", () => { {
// Check both the function file and shared modules for JSON response patterns name: "returns JSON response",
const allCode = getAllCode(); check: () => {
const hasContentTypeHeader = const allCode = getAllCode();
/content-type['"]\s*:\s*['"]application\/json/i.test(allCode); return (
const hasResponseJson = /Response\.json/i.test(allCode); /content-type['"]\s*:\s*['"]application\/json/i.test(allCode) ||
const hasJsonStringify = /JSON\.stringify/i.test(allCode); /Response\.json/i.test(allCode) ||
expect(hasContentTypeHeader || hasResponseJson || hasJsonStringify).toBe( /JSON\.stringify/i.test(allCode)
true, );
); },
}); },
{
test("handles OPTIONS preflight", () => { name: "handles OPTIONS preflight",
// OPTIONS handling may be in the function itself or in a shared CORS helper check: () => {
const allCode = getAllCode(); const allCode = getAllCode();
expect(allCode).toMatch(/['"]OPTIONS['"]/); return /['"]OPTIONS['"]/.test(allCode) && /\.method/.test(allCode);
expect(allCode).toMatch(/\.method/); },
}); },
{
test("defines CORS headers", () => { name: "defines CORS headers",
const allCode = getAllCode(); check: () => /Access-Control-Allow-Origin/.test(getAllCode()),
expect(allCode).toMatch(/Access-Control-Allow-Origin/); },
}); {
name: "CORS allows required headers",
test("CORS allows required headers", () => { check: () => {
const allCode = getAllCode().toLowerCase(); const allCode = getAllCode().toLowerCase();
// Must include authorization and apikey in allowed headers return (
expect(allCode).toMatch(/access-control-allow-headers/); /access-control-allow-headers/.test(allCode) &&
expect(allCode).toMatch(/authorization/); /authorization/.test(allCode) &&
expect(allCode).toMatch(/apikey/); /apikey/.test(allCode)
}); );
},
test("error response has CORS headers", () => { },
const catchCode = getCatchBlockCode(); {
expect(catchCode.length).toBeGreaterThan(0); name: "error response has CORS headers",
// The catch block should either directly reference CORS headers, or call check: () => {
// a shared helper that includes them (e.g. errorResponse, corsHeaders). const catchCode = getCatchBlockCode();
const sharedCode = getSharedCode(); if (catchCode.length === 0) return false;
// Direct CORS reference in catch block const sharedCode = getSharedCode();
const directCors = const directCors =
/corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode); /corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode);
// Calls a shared helper that itself includes CORS headers const callsSharedHelper =
const callsSharedHelper = /errorResponse|jsonResponse|json_response|error_response/i.test(
/errorResponse|jsonResponse|json_response|error_response/i.test( catchCode,
catchCode, ) && /Access-Control-Allow-Origin/i.test(sharedCode);
) && /Access-Control-Allow-Origin/i.test(sharedCode); return directCors || callsSharedHelper;
expect(directCors || callsSharedHelper).toBe(true); },
}); },
{
test("has try-catch for error handling", () => { name: "has try-catch for error handling",
const code = getFunctionCode(FUNCTION_NAME); check: () => {
expect(code).toMatch(/\btry\s*\{/); const code = getFunctionCode(FUNCTION_NAME);
expect(code).toMatch(/\bcatch\b/); return /\btry\s*\{/.test(code) && /\bcatch\b/.test(code);
}); },
},
test("returns proper error status code", () => { {
const catchCode = getCatchBlockCode(); name: "returns proper error status code",
expect(catchCode.length).toBeGreaterThan(0); check: () => {
// Error response should use status 400 or 500 (not default 200). const catchCode = getCatchBlockCode();
// Match object-style { status: 500 } or function-call-style fn('msg', 500) if (catchCode.length === 0) return false;
const hasObjectStatus = /status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode); return (
const hasFnArgStatus = /[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test( /status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode) ||
catchCode, /[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test(catchCode)
); );
expect(hasObjectStatus || hasFnArgStatus).toBe(true); },
}); },
{
test("shared CORS module exists", () => { name: "shared CORS module exists",
expect(findSharedCorsFile()).not.toBeNull(); check: () => findSharedCorsFile() !== null,
}); },
{
test("function imports from shared", () => { name: "function imports from shared",
const code = getFunctionCode(FUNCTION_NAME); check: () =>
// Should import from ../_shared/ relative path /from\s+['"]\.\.\/(_shared|_utils)/.test(getFunctionCode(FUNCTION_NAME)),
expect(code).toMatch(/from\s+['"]\.\.\/(_shared|_utils)/); },
}); {
name: "function uses hyphenated name",
test("function uses hyphenated name", () => { check: () => {
// The function directory should use hyphens, not underscores const dirs = existsSync(getFunctionsDir()) ? readdirSync(getFunctionsDir()) : [];
const dirs = existsSync(functionsDir) ? readdirSync(functionsDir) : []; const helloDir = dirs.find(
const helloDir = dirs.find((d) => d.includes("hello") && d.includes("world")); (d) => d.includes("hello") && d.includes("world"),
expect(helloDir).toBeDefined(); );
expect(helloDir).toMatch(/^hello-world$/); return helloDir !== undefined && /^hello-world$/.test(helloDir);
}); },
},
test("overall quality: demonstrates Edge Function best practices", () => { {
const allCode = getAllCode().toLowerCase(); name: "overall quality: demonstrates Edge Function best practices",
// A high-quality Edge Function should contain most of these patterns check: () => {
const signals = [ const allCode = getAllCode().toLowerCase();
/deno\.serve/, // Modern Deno.serve API const signals = [
/['"]options['"]/, // OPTIONS preflight handling /deno\.serve/,
/access-control-allow-origin/, // CORS headers defined /['"]options['"]/,
/\btry\s*\{/, // Error handling with try-catch /access-control-allow-origin/,
/status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/, // Proper error status codes /\btry\s*\{/,
/from\s+['"]\.\.\/(_shared|_utils)/, // Imports from shared directory /status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/,
/authorization/, // Allows authorization header in CORS /from\s+['"]\.\.\/(_shared|_utils)/,
/apikey/, // Allows apikey header in CORS /authorization/,
]; /apikey/,
const matches = signals.filter((r) => r.test(allCode)); ];
expect(matches.length).toBeGreaterThanOrEqual(6); return signals.filter((r) => r.test(allCode)).length >= 6;
}); },
},
];

View File

@@ -2,12 +2,90 @@ import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
import { join } from "node:path"; import { join } from "node:path";
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Common paths // Runtime DB helpers (use only in async tests)
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
export const supabaseDir = join(process.cwd(), "supabase"); const SUPABASE_URL = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321";
export const migrationsDir = join(supabaseDir, "migrations"); const SERVICE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY ?? "";
export const functionsDir = join(supabaseDir, "functions"); const ANON_KEY = process.env.SUPABASE_ANON_KEY ?? "";
/**
 * Query a table through Supabase's PostgREST REST API
 * (GET /rest/v1/&lt;table&gt;?select=...). Despite being a generic helper, it
 * performs only read-only selects — it does not execute raw SQL or /rpc.
 *
 * @param table - table (or view) name as exposed by PostgREST
 * @param options.select - PostgREST select list; defaults to "*"
 * @param options.role - "anon" uses the anon key (RLS enforced); anything
 *   else falls back to the service-role key (RLS bypassed)
 * @returns rows on success, or an "HTTP <status>: <body>" error string
 *
 * NOTE(review): `table` and `select` are interpolated into the URL without
 * encoding — assumes simple identifiers; confirm callers never pass
 * user-controlled values.
 */
async function pgRest(
  table: string,
  options: { select?: string; role?: "service_role" | "anon" } = {},
): Promise<{ data: Record<string, unknown>[]; error: string | null }> {
  const key = options.role === "anon" ? ANON_KEY : SERVICE_KEY;
  const select = options.select ?? "*";
  const res = await fetch(`${SUPABASE_URL}/rest/v1/${table}?select=${select}`, {
    headers: {
      apikey: key,
      Authorization: `Bearer ${key}`,
      "Content-Type": "application/json",
    },
  });
  if (!res.ok) {
    // Surface HTTP failures as a string so callers can pattern-match codes.
    const body = await res.text();
    return { data: [], error: `HTTP ${res.status}: ${body}` };
  }
  const data = (await res.json()) as Record<string, unknown>[];
  return { data, error: null };
}
/**
 * Check whether a table is visible through the PostgREST API.
 * Uses the service role key (bypasses RLS).
 *
 * A 404 (or PostgREST's PGRST116 code appearing in the error body) means
 * the table/view is missing from the schema cache. Any other error (auth,
 * network) is inconclusive and reported as "exists", matching the previous
 * behavior.
 */
export async function tableExists(tableName: string): Promise<boolean> {
  const { error } = await pgRest(tableName);
  if (error === null) return true;
  // Fix: the original checked only "404" even though its comment also
  // promised PGRST116 handling.
  return !(error.includes("404") || error.includes("PGRST116"));
}
/**
 * Query rows from a table.
 * @param tableName - table to query
 * @param role - "service_role" bypasses RLS; "anon" respects RLS policies
 */
export async function queryTable(
  tableName: string,
  role: "service_role" | "anon" = "service_role",
): Promise<{ data: Record<string, unknown>[]; error: string | null }> {
  const result = await pgRest(tableName, { role });
  return result;
}
/**
 * Return true if the table exists AND is empty when queried as anon
 * (i.e., RLS is blocking access as expected for an unauthenticated user).
 */
export async function anonSeeesNoRows(tableName: string): Promise<boolean> {
  const { data, error } = await pgRest(tableName, { role: "anon" });
  return error === null && data.length === 0;
}

/**
 * Correctly-spelled alias for {@link anonSeeesNoRows}. The original
 * (typo'd) export is kept so existing eval files keep working.
 */
export const anonSeesNoRows = anonSeeesNoRows;
// ---------------------------------------------------------------------------
// Common paths
//
// These are FUNCTIONS, not constants, so they re-evaluate process.cwd() on
// every call. The runner does `process.chdir(workspacePath)` before running
// assertions, so all path helpers resolve relative to the correct workspace.
// ---------------------------------------------------------------------------
/** Returns the supabase/ directory under the current working directory. */
export function getSupabaseDir(): string {
  const cwd = process.cwd();
  return join(cwd, "supabase");
}
/** Returns the supabase/migrations/ directory. */
export function getMigrationsDir(): string {
  const base = getSupabaseDir();
  return join(base, "migrations");
}
/** Returns the supabase/functions/ directory. */
export function getFunctionsDir(): string {
  const base = getSupabaseDir();
  return join(base, "functions");
}
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Migration helpers // Migration helpers
@@ -15,10 +93,11 @@ export const functionsDir = join(supabaseDir, "functions");
/** Find all .sql migration files (agent may create one or more). */ /** Find all .sql migration files (agent may create one or more). */
export function findMigrationFiles(): string[] { export function findMigrationFiles(): string[] {
if (!existsSync(migrationsDir)) return []; const dir = getMigrationsDir();
return readdirSync(migrationsDir) if (!existsSync(dir)) return [];
return readdirSync(dir)
.filter((f) => f.endsWith(".sql")) .filter((f) => f.endsWith(".sql"))
.map((f) => join(migrationsDir, f)); .map((f) => join(dir, f));
} }
/** Read and concatenate all migration SQL files. */ /** Read and concatenate all migration SQL files. */
@@ -39,7 +118,7 @@ export function getMigrationSQL(): string {
* @param functionName - directory name under supabase/functions/ (e.g. "hello-world") * @param functionName - directory name under supabase/functions/ (e.g. "hello-world")
*/ */
export function findFunctionFile(functionName: string): string | null { export function findFunctionFile(functionName: string): string | null {
const fnDir = join(functionsDir, functionName); const fnDir = join(getFunctionsDir(), functionName);
if (!existsSync(fnDir)) return null; if (!existsSync(fnDir)) return null;
const files = readdirSync(fnDir).filter( const files = readdirSync(fnDir).filter(
(f) => f.startsWith("index.") && (f.endsWith(".ts") || f.endsWith(".tsx")), (f) => f.startsWith("index.") && (f.endsWith(".ts") || f.endsWith(".tsx")),
@@ -61,12 +140,13 @@ export function getFunctionCode(functionName: string): string {
/** Find a shared CORS module under supabase/functions/_shared/ (or similar _-prefixed dir). */ /** Find a shared CORS module under supabase/functions/_shared/ (or similar _-prefixed dir). */
export function findSharedCorsFile(): string | null { export function findSharedCorsFile(): string | null {
if (!existsSync(functionsDir)) return null; const fnDir = getFunctionsDir();
const sharedDirs = readdirSync(functionsDir).filter( if (!existsSync(fnDir)) return null;
(d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(), const sharedDirs = readdirSync(fnDir).filter(
(d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(),
); );
for (const dir of sharedDirs) { for (const dir of sharedDirs) {
const dirPath = join(functionsDir, dir); const dirPath = join(fnDir, dir);
const files = readdirSync(dirPath).filter((f) => f.includes("cors")); const files = readdirSync(dirPath).filter((f) => f.includes("cors"));
if (files.length > 0) return join(dirPath, files[0]); if (files.length > 0) return join(dirPath, files[0]);
} }
@@ -75,13 +155,14 @@ export function findSharedCorsFile(): string | null {
/** Read and concatenate all .ts/.tsx files from _-prefixed shared directories. */ /** Read and concatenate all .ts/.tsx files from _-prefixed shared directories. */
export function getSharedCode(): string { export function getSharedCode(): string {
if (!existsSync(functionsDir)) return ""; const fnDir = getFunctionsDir();
const sharedDirs = readdirSync(functionsDir).filter( if (!existsSync(fnDir)) return "";
(d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(), const sharedDirs = readdirSync(fnDir).filter(
(d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(),
); );
const parts: string[] = []; const parts: string[] = [];
for (const dir of sharedDirs) { for (const dir of sharedDirs) {
const dirPath = join(functionsDir, dir); const dirPath = join(fnDir, dir);
const files = readdirSync(dirPath).filter( const files = readdirSync(dirPath).filter(
(f) => f.endsWith(".ts") || f.endsWith(".tsx"), (f) => f.endsWith(".ts") || f.endsWith(".tsx"),
); );

View File

@@ -0,0 +1,100 @@
// Reference docs relevant to this scenario (pgvector extension placement,
// RLS, idempotent migrations, auth FKs). Presumably compared against the
// files the agent actually consulted — confirm against the eval runner.
export const expectedReferenceFiles = [
  "db-schema-extensions.md",
  "db-rls-mandatory.md",
  "db-migrations-idempotent.md",
  "db-schema-auth-fk.md",
  "db-rls-common-mistakes.md",
];
import type { EvalAssertion } from "../../src/eval-types.js";
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
// Assertions for the pgvector/semantic-search scenario. Every check greps the
// concatenated, lowercased migration SQL for a best-practice signal; none of
// them execute the SQL against a database.
export const assertions: EvalAssertion[] = [
  {
    name: "migration file exists",
    check: () => findMigrationFiles().length > 0,
  },
  {
    // Supabase convention: extensions belong in the `extensions` schema.
    name: "extension installed in extensions schema",
    check: () =>
      /create\s+extension[\s\S]*?with\s+schema\s+extensions/.test(
        getMigrationSQL().toLowerCase(),
      ),
  },
  {
    name: "IF NOT EXISTS on extension creation",
    check: () =>
      /create\s+extension\s+if\s+not\s+exists/.test(
        getMigrationSQL().toLowerCase(),
      ),
  },
  {
    // 1536 = dimensionality of OpenAI ada-002 embeddings (per the prompt).
    // Schema qualifier is optional so both `vector(...)` and
    // `extensions.vector(...)` pass.
    name: "vector column with correct dimensions",
    check: () =>
      /(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test(
        getMigrationSQL().toLowerCase(),
      ),
  },
  {
    name: "HNSW index used instead of IVFFlat",
    check: () => /using\s+hnsw/.test(getMigrationSQL().toLowerCase()),
  },
  {
    name: "RLS enabled on documents table",
    check: () =>
      /alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
        getMigrationSQL().toLowerCase(),
      ),
  },
  {
    name: "FK to auth.users with ON DELETE CASCADE",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      // The two patterns are matched independently; they are not required to
      // belong to the same column definition.
      return (
        /references\s+auth\.users/.test(sql) &&
        /on\s+delete\s+cascade/.test(sql)
      );
    },
  },
  {
    // Each CREATE POLICY statement (non-greedy up to the first `;`) must be
    // scoped to the `authenticated` role. The `i` flag is redundant here
    // since `sql` is already lowercased, but harmless.
    name: "policies use TO authenticated",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      return (
        policyBlocks.length > 0 &&
        policyBlocks.every((p) => /to\s+authenticated/.test(p))
      );
    },
  },
  {
    name: "idempotent table creation (IF NOT EXISTS)",
    check: () =>
      /create\s+table\s+if\s+not\s+exists/.test(
        getMigrationSQL().toLowerCase(),
      ),
  },
  {
    // Aggregate score: re-runs the individual signals above and passes when
    // at least 6 of the 8 hold, tolerating partial misses.
    name: "overall quality: demonstrates pgvector best practices",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      const signals = [
        /create\s+extension[\s\S]*?with\s+schema\s+extensions/.test(sql),
        /create\s+extension\s+if\s+not\s+exists/.test(sql),
        /(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test(sql),
        /using\s+hnsw/.test(sql),
        /alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ),
        /references\s+auth\.users/.test(sql) &&
          /on\s+delete\s+cascade/.test(sql),
        policyBlocks.length > 0 &&
          policyBlocks.every((p) => /to\s+authenticated/.test(p)),
        /if\s+not\s+exists/.test(sql),
      ];
      return signals.filter(Boolean).length >= 6;
    },
  },
];

View File

@@ -0,0 +1,11 @@
I'm building a semantic search feature for my app. I need to store document embeddings generated by OpenAI's ada-002 model (1536 dimensions) and let users search their own documents.
Create a migration in `supabase/migrations/` that:
1. Enables the pgvector extension
2. Creates a `documents` table with:
- An `embedding` column (1536 dimensions)
- A `content` text column
- A `user_id` column linked to the authenticated user
3. Adds a vector similarity search index
4. Ensures users can only see and manage their own documents

View File

@@ -0,0 +1,5 @@
{
"name": "extension-wrong-schema",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "extension-wrong-schema"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -0,0 +1,114 @@
// Reference docs relevant to this scenario (security-invoker views, RLS,
// idempotent migrations, timestamptz usage). Presumably compared against the
// files the agent actually consulted — confirm against the eval runner.
export const expectedReferenceFiles = [
  "db-rls-views.md",
  "db-migrations-idempotent.md",
  "db-rls-mandatory.md",
  "db-rls-performance.md",
  "db-schema-timestamps.md",
];
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import type { EvalAssertion } from "../../src/eval-types.js";
// NOTE(review): this path is captured once at module load. The shared
// eval-utils path helpers deliberately re-evaluate process.cwd() per call
// because the runner chdirs into the workspace before running assertions —
// confirm this module is imported after the chdir, otherwise the constant
// points at the wrong workspace.
const migrationsDir = join(process.cwd(), "supabase", "migrations");
// Pre-existing fixture migration; excluded when looking for agent output.
const STARTER_MIGRATION = "20240101000000_create_products.sql";
/**
 * Find migration files created by the agent: every .sql file under
 * supabase/migrations/ except the starter fixture.
 *
 * Re-evaluates process.cwd() on every call. The eval runner chdirs into the
 * workspace before running assertions, so a module-load-time path constant
 * could be stale (this mirrors the path helpers in eval-utils).
 */
function findAgentMigrationFiles(): string[] {
  const dir = join(process.cwd(), "supabase", "migrations");
  if (!existsSync(dir)) return [];
  return readdirSync(dir)
    .filter((f) => f.endsWith(".sql") && f !== STARTER_MIGRATION)
    .map((f) => join(dir, f));
}
/**
 * Read every agent-created migration and concatenate the SQL with newlines.
 * @throws Error when no agent-created migration file is present.
 */
function getAgentMigrationSQL(): string {
  const files = findAgentMigrationFiles();
  if (files.length === 0) {
    throw new Error(
      "No agent-created migration file found in supabase/migrations/",
    );
  }
  const chunks: string[] = [];
  for (const file of files) {
    chunks.push(readFileSync(file, "utf-8"));
  }
  return chunks.join("\n");
}
// Assertions for the PostgREST schema-cache scenario. Checks grep the
// agent-created migration SQL (lowercased) for best-practice signals;
// nothing is executed against a database.
export const assertions: EvalAssertion[] = [
  {
    name: "new migration file exists",
    check: () => findAgentMigrationFiles().length > 0,
  },
  {
    name: "ADD COLUMN IF NOT EXISTS for description",
    check: () =>
      /add\s+column\s+if\s+not\s+exists\s+description/.test(
        getAgentMigrationSQL().toLowerCase(),
      ),
  },
  {
    name: "ADD COLUMN IF NOT EXISTS for published_at",
    check: () =>
      /add\s+column\s+if\s+not\s+exists\s+published_at/.test(
        getAgentMigrationSQL().toLowerCase(),
      ),
  },
  {
    // Requires the timestamptz form AND the absence of a bare `timestamp`
    // for published_at; the lookaheads exempt "timestamptz" and
    // "timestamp with time zone".
    name: "published_at uses timestamptz not plain timestamp",
    check: () => {
      const sql = getAgentMigrationSQL().toLowerCase();
      return (
        /published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test(
          sql,
        ) &&
        !/published_at\s+timestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
          sql,
        )
      );
    },
  },
  {
    name: "view public_products is created",
    check: () =>
      /create\s+(or\s+replace\s+)?view\s+public_products/.test(
        getAgentMigrationSQL().toLowerCase(),
      ),
  },
  {
    // security_invoker makes the view respect the querying user's RLS.
    name: "view uses security_invoker = true",
    check: () =>
      /security_invoker\s*=\s*true/.test(getAgentMigrationSQL().toLowerCase()),
  },
  {
    // At least one CREATE POLICY statement must mention SELECT, the
    // products table, and the authenticated role.
    name: "SELECT policy on products for authenticated role",
    check: () => {
      const sql = getAgentMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      return policyBlocks.some(
        (p) =>
          p.includes("select") &&
          p.includes("products") &&
          /to\s+authenticated/.test(p),
      );
    },
  },
  {
    // NOTIFY pgrst tells PostgREST to reload its schema cache so the new
    // view/columns become visible without a restart.
    name: "NOTIFY pgrst reload schema is present",
    check: () => /notify\s+pgrst/.test(getAgentMigrationSQL().toLowerCase()),
  },
  {
    // Aggregate score: passes when at least 5 of the 6 signals hold.
    name: "overall quality: demonstrates PostgREST and schema best practices",
    check: () => {
      const sql = getAgentMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      const signals = [
        /add\s+column\s+if\s+not\s+exists/.test(sql),
        /published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test(
          sql,
        ),
        /create\s+(or\s+replace\s+)?view\s+public_products/.test(sql),
        /security_invoker\s*=\s*true/.test(sql),
        policyBlocks.some(
          (p) => p.includes("select") && /to\s+authenticated/.test(p),
        ),
        /notify\s+pgrst/.test(sql),
      ];
      return signals.filter(Boolean).length >= 5;
    },
  },
];

View File

@@ -0,0 +1,9 @@
I'm building a product catalog with Supabase. We already have a `products` table (see the existing migration in `supabase/migrations/`), but we need to expand it.
Please create a new migration file in `supabase/migrations/` that:
1. Adds two new columns to the `products` table: `description` (text) and `published_at` (timestamp)
2. Creates a view called `public_products` that shows only products where `published_at` is not null
3. Adds a policy so any authenticated user can view published products
Make sure the migration is safe to run multiple times.

View File

@@ -0,0 +1,5 @@
{
"name": "postgrest-schema-cache",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "postgrest-schema-cache"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -0,0 +1,8 @@
-- Initial products table
-- Fixture migration shipped with the scenario; presumably the
-- STARTER_MIGRATION the eval excludes when scanning agent output — confirm
-- the filename matches 20240101000000_create_products.sql.
create table if not exists products (
  id bigint primary key generated always as identity,
  name text not null,
  price numeric(10, 2) not null default 0
);
-- RLS is enabled with no policies yet: all non-service-role access is
-- denied until the agent's migration adds policies.
alter table products enable row level security;

View File

@@ -0,0 +1,122 @@
// Reference docs relevant to this scenario (RLS policy types/performance/
// pitfalls, timestamptz usage). Presumably compared against the files the
// agent actually consulted — confirm against the eval runner.
export const expectedReferenceFiles = [
  "db-rls-common-mistakes.md",
  "db-rls-policy-types.md",
  "db-rls-performance.md",
  "db-rls-mandatory.md",
  "db-schema-timestamps.md",
];
import type { EvalAssertion } from "../../src/eval-types.js";
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
// Assertions for the orders/RLS scenario. Checks grep the concatenated
// migration SQL for best-practice signals; nothing is executed.
export const assertions: EvalAssertion[] = [
  {
    name: "migration file exists",
    check: () => findMigrationFiles().length > 0,
  },
  {
    name: "creates orders table",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      return /create\s+table/.test(sql) && /orders/.test(sql);
    },
  },
  {
    name: "enables RLS on orders table",
    check: () =>
      /alter\s+table.*orders.*enable\s+row\s+level\s+security/.test(
        getMigrationSQL().toLowerCase(),
      ),
  },
  {
    // Each policyBlock is one CREATE POLICY statement (non-greedy up to the
    // first `;`).
    name: "has SELECT policy on orders",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      return policyBlocks.some((p) => p.includes("for select"));
    },
  },
  {
    // UPDATE policies need WITH CHECK so rows can't be rewritten to values
    // that would escape the policy.
    name: "has UPDATE policy with WITH CHECK on orders",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      const updatePolicy = policyBlocks.find((p) => p.includes("for update"));
      return updatePolicy !== undefined && /with\s+check/.test(updatePolicy);
    },
  },
  {
    name: "all policies use TO authenticated",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      return (
        policyBlocks.length > 0 &&
        policyBlocks.every((p) => /to\s+authenticated/.test(p))
      );
    },
  },
  {
    // The `(select auth.uid())` subselect form lets the planner cache the
    // result per statement instead of evaluating auth.uid() per row. Note
    // this check runs on the raw (non-lowercased) SQL, relying on the `i`
    // regex flag instead.
    name: "uses (select auth.uid()) not bare auth.uid() in policies",
    check: () => {
      const sql = getMigrationSQL();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      for (const policy of policyBlocks) {
        if (
          policy.includes("auth.uid()") &&
          !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
        ) {
          return false;
        }
      }
      return true;
    },
  },
  {
    // Comments are stripped first so a `-- timestamp` remark can't trigger a
    // false failure; lookaheads exempt "timestamptz" / "with time zone".
    name: "uses timestamptz not plain timestamp for created_at",
    check: () => {
      const rawSql = getMigrationSQL().toLowerCase();
      const sql = rawSql.replace(/--[^\n]*/g, "");
      const hasPlainTimestamp =
        /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
      if (sql.includes("created_at")) {
        return !hasPlainTimestamp.test(sql);
      }
      return true;
    },
  },
  {
    name: "FK to auth.users with ON DELETE CASCADE",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      // Matched independently; not required to be on the same column.
      return (
        /references\s+auth\.users/.test(sql) &&
        /on\s+delete\s+cascade/.test(sql)
      );
    },
  },
  {
    // Aggregate score: passes when at least 5 of the 7 signals hold.
    name: "overall quality: demonstrates Supabase best practices",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      const signals = [
        /alter\s+table.*orders.*enable\s+row\s+level\s+security/.test(sql),
        policyBlocks.some((p) => p.includes("for select")),
        policyBlocks.some(
          (p) => p.includes("for update") && /with\s+check/.test(p),
        ),
        /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
        policyBlocks.length > 0 &&
          policyBlocks.every((p) => /to\s+authenticated/.test(p)),
        /references\s+auth\.users/.test(sql) &&
          /on\s+delete\s+cascade/.test(sql),
        !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
          sql.replace(/--[^\n]*/g, ""),
        ),
      ];
      return signals.filter(Boolean).length >= 5;
    },
  },
];

View File

@@ -0,0 +1,7 @@
I'm building an e-commerce app and need a migration for an `orders` table. Each order has a `status` (text), `total` (numeric), and `created_at` timestamp. Orders belong to users — each order should have a `user_id` that links to the authenticated user who placed it.
Users need to be able to:
- View their own orders
- Update the status of their own orders
Please create the migration in `supabase/migrations/`.

View File

@@ -0,0 +1,5 @@
{
"name": "rls-update-needs-select",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "rls-update-needs-select"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -0,0 +1,123 @@
// Reference docs relevant to this scenario (RLS role checks, policy types,
// performance, auth FKs). Presumably compared against the files the agent
// actually consulted — confirm against the eval runner.
export const expectedReferenceFiles = [
  "db-rls-common-mistakes.md",
  "db-rls-policy-types.md",
  "db-rls-performance.md",
  "db-rls-mandatory.md",
  "db-schema-auth-fk.md",
];
import type { EvalAssertion } from "../../src/eval-types.js";
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
// Assertions for the admin-role RLS scenario (documents table). The key
// behavior under test: role checks must read app_metadata (server-controlled)
// rather than user_metadata (user-editable). Checks grep the migration SQL;
// nothing is executed.
export const assertions: EvalAssertion[] = [
  {
    name: "migration file exists in supabase/migrations/",
    check: () => findMigrationFiles().length > 0,
  },
  {
    name: "creates documents table",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      return /create\s+table/.test(sql) && /documents/.test(sql);
    },
  },
  {
    name: "RLS enabled on documents table",
    check: () =>
      /alter\s+table.*documents.*enable\s+row\s+level\s+security/.test(
        getMigrationSQL().toLowerCase(),
      ),
  },
  {
    name: "uses app_metadata not user_metadata for role check",
    check: () => /app_metadata/.test(getMigrationSQL().toLowerCase()),
  },
  {
    // user_metadata is user-editable, so trusting it for authorization would
    // let users grant themselves admin; policies must not reference it.
    name: "user_metadata does not appear in policy USING clauses",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      return policyBlocks.every((p) => !p.includes("user_metadata"));
    },
  },
  {
    // Heuristic: an "owner" policy mentions user_id/owner/auth.uid (the
    // `!p.includes("insert")` alternative also admits non-INSERT policies
    // that omit an explicit SELECT keyword); an "admin" policy mentions
    // app_metadata.
    name: "has at least two SELECT policies (owner and admin)",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      const hasOwnerPolicy = policyBlocks.some(
        (p) =>
          (p.includes("select") || !p.includes("insert")) &&
          (p.includes("user_id") ||
            p.includes("owner") ||
            p.includes("auth.uid")),
      );
      const hasAdminPolicy = policyBlocks.some((p) =>
        p.includes("app_metadata"),
      );
      return hasOwnerPolicy && hasAdminPolicy;
    },
  },
  {
    name: "policies use TO authenticated",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      return (
        policyBlocks.length > 0 &&
        policyBlocks.every((p) => /to\s+authenticated/.test(p))
      );
    },
  },
  {
    // `(select auth.uid())` is evaluated once per statement rather than per
    // row. Runs on the raw SQL, relying on the `i` regex flag for casing.
    name: "uses (select auth.uid()) subselect form in policies",
    check: () => {
      const sql = getMigrationSQL();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      for (const policy of policyBlocks) {
        if (
          policy.includes("auth.uid()") &&
          !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
        ) {
          return false;
        }
      }
      return true;
    },
  },
  {
    name: "FK to auth.users with ON DELETE CASCADE",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      // Matched independently; not required to be on the same column.
      return (
        /references\s+auth\.users/.test(sql) &&
        /on\s+delete\s+cascade/.test(sql)
      );
    },
  },
  {
    // Aggregate score: passes when at least 5 of the 7 signals hold.
    name: "overall quality: demonstrates Supabase best practices",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
      const signals = [
        /alter\s+table.*documents.*enable\s+row\s+level\s+security/.test(sql),
        /app_metadata/.test(sql),
        policyBlocks.every((p) => !p.includes("user_metadata")),
        /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
        policyBlocks.length > 0 &&
          policyBlocks.every((p) => /to\s+authenticated/.test(p)),
        /references\s+auth\.users/.test(sql) &&
          /on\s+delete\s+cascade/.test(sql),
        policyBlocks.some(
          (p) =>
            p.includes("user_id") ||
            p.includes("owner") ||
            p.includes("auth.uid"),
        ) && policyBlocks.some((p) => p.includes("app_metadata")),
      ];
      return signals.filter(Boolean).length >= 5;
    },
  },
];

View File

@@ -0,0 +1,7 @@
I'm building a document management app on Supabase. I need a migration for a `documents` table. Each document has a `title` (text), `content` (text), and belongs to a user (the owner).
The access rules are:
- Regular users can only read their own documents.
- Admin users — identified by a role field stored in their JWT — should be able to read all documents.
Please create the migration in `supabase/migrations/`. The Supabase project is already initialized.

View File

@@ -0,0 +1,5 @@
{
"name": "rls-user-metadata-role-check",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "rls-user-metadata-role-check"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -0,0 +1,102 @@
// Reference docs the agent is expected to consult while solving this
// scenario; the eval harness compares these against the files actually read.
export const expectedReferenceFiles = [
  "db-security-service-role.md",
  "edge-fun-quickstart.md",
  "edge-db-supabase-client.md",
  "edge-pat-cors.md",
  "edge-pat-error-handling.md",
];
import { existsSync } from "node:fs";
import { join } from "node:path";
import type { EvalAssertion } from "../../src/eval-types.js";
import {
findFunctionFile,
getFunctionCode,
getSharedCode,
getSupabaseDir,
} from "../eval-utils.ts";
// Name of the Edge Function under test.
const FUNCTION_NAME = "admin-reports";

/** Returns the function's own source joined with any shared module code. */
function getAllCode(): string {
  return [getFunctionCode(FUNCTION_NAME), getSharedCode()].join("\n");
}
// Eval assertions for the admin-reports service-role Edge Function scenario.
// Each check inspects the generated function source (plus shared modules).
export const assertions: EvalAssertion[] = [
  {
    name: "supabase project initialized (config.toml exists)",
    check: () => existsSync(join(getSupabaseDir(), "config.toml")),
  },
  {
    name: "edge function file exists",
    check: () => findFunctionFile(FUNCTION_NAME) !== null,
  },
  {
    // The service role key must be read from the environment at runtime.
    name: "uses Deno.env.get for service role key",
    check: () =>
      /Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test(
        getAllCode(),
      ),
  },
  {
    // JWTs are base64url-encoded and always start with "eyJ" (encoded '{"'),
    // so a quoted eyJ… literal in non-comment code is a hardcoded key.
    name: "no hardcoded service role key",
    check: () => {
      const allCode = getAllCode();
      const nonCommentLines = allCode
        .split("\n")
        .filter((line) => !line.trimStart().startsWith("//"));
      // Single pattern: the previous regex's first alternative
      // (quote + eyJ… + ".") was a strict subset of the second, so the
      // alternation was dead weight; this is boolean-equivalent.
      return !nonCommentLines.some((line) =>
        /['"`]eyJ[A-Za-z0-9_-]+/.test(line),
      );
    },
  },
  {
    name: "createClient called with service role env var as second argument",
    check: () => {
      const allCode = getAllCode();
      return (
        /createClient/i.test(allCode) &&
        /Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test(
          allCode,
        )
      );
    },
  },
  {
    // A NEXT_PUBLIC_ prefix would ship the key to the browser bundle.
    name: "service role key env var name does not use NEXT_PUBLIC_ prefix",
    check: () => !/NEXT_PUBLIC_[^'"]*service[_-]?role/i.test(getAllCode()),
  },
  {
    name: "CORS headers present",
    check: () => /Access-Control-Allow-Origin/.test(getAllCode()),
  },
  {
    // Accept any of the common ways to emit JSON from an Edge Function.
    name: "returns JSON response",
    check: () => {
      const allCode = getAllCode();
      return (
        /content-type['"]\s*:\s*['"]application\/json/i.test(allCode) ||
        /Response\.json/i.test(allCode) ||
        /JSON\.stringify/i.test(allCode)
      );
    },
  },
  {
    name: "overall quality: demonstrates service role Edge Function best practices",
    check: () => {
      const allCode = getAllCode();
      // At least 5 of these 6 best-practice signals must appear.
      const signals: RegExp[] = [
        /Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i,
        /Access-Control-Allow-Origin/,
        /createClient/i,
        /\btry\s*\{/,
        /Response\.json|JSON\.stringify/,
        /Deno\.serve/,
      ];
      return signals.filter((r) => r.test(allCode)).length >= 5;
    },
  },
];

View File

@@ -0,0 +1,9 @@
I'm building an internal admin dashboard for my app. I need a Supabase Edge Function called `admin-reports` that returns all rows from the `reports` table — this is an admin-only endpoint so it needs to bypass Row Level Security.
Create the function at `supabase/functions/admin-reports/index.ts`. Use environment variables for any Supabase keys — do not hardcode them in the source code.
The function should:
1. Return all rows from the `reports` table as a JSON response
2. Work when called from a browser (handle CORS)
3. Handle errors gracefully

View File

@@ -0,0 +1,5 @@
{
"name": "service-role-edge-function",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,64 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "service-role-edge-function"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false

View File

@@ -0,0 +1,10 @@
-- Seed schema: a minimal reports table for the admin-reports Edge Function eval.
CREATE TABLE IF NOT EXISTS public.reports (
  id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
  title text NOT NULL,
  content text,
  created_at timestamptz NOT NULL DEFAULT now()
);

-- Lock the table down: with RLS enabled and no policies defined, anon-key
-- browser clients are denied by default; only the service role can read rows.
ALTER TABLE public.reports ENABLE ROW LEVEL SECURITY;

View File

@@ -1,263 +1,253 @@
import { expect, test } from "vitest"; export const expectedReferenceFiles = [
"storage-access-control.md",
"db-rls-mandatory.md",
"db-rls-common-mistakes.md",
"db-rls-performance.md",
"db-schema-auth-fk.md",
"db-schema-timestamps.md",
"db-perf-indexes.md",
"db-migrations-idempotent.md",
];
import type { EvalAssertion } from "../../src/eval-types.js";
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
test("migration file exists", () => { export const assertions: EvalAssertion[] = [
expect(findMigrationFiles().length).toBeGreaterThan(0); {
}); name: "migration file exists",
check: () => findMigrationFiles().length > 0,
test("creates avatars bucket", () => { },
const sql = getMigrationSQL().toLowerCase(); {
// Should insert into storage.buckets with id 'avatars' and public = true name: "creates avatars bucket",
expect(sql).toMatch(/storage\.buckets/); check: () => {
expect(sql).toMatch(/avatars/); const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/public/); if (
// Verify it's marked as a public bucket (true) !/storage\.buckets/.test(sql) ||
const avatarsBlock = sql.match( !/avatars/.test(sql) ||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/, !/public/.test(sql)
); )
expect(avatarsBlock).not.toBeNull(); return false;
if (avatarsBlock) { const avatarsBlock = sql.match(
expect(avatarsBlock[0]).toMatch(/true/); /insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
} );
}); return avatarsBlock !== null && /true/.test(avatarsBlock[0]);
},
test("creates documents bucket", () => { },
const sql = getMigrationSQL().toLowerCase(); {
// Should insert into storage.buckets with id 'documents' and public = false name: "creates documents bucket",
expect(sql).toMatch(/documents/); check: () => {
const documentsBlock = sql.match( const sql = getMigrationSQL().toLowerCase();
/insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/, if (!/documents/.test(sql)) return false;
); const documentsBlock = sql.match(
expect(documentsBlock).not.toBeNull(); /insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
if (documentsBlock) { );
expect(documentsBlock[0]).toMatch(/false/); return documentsBlock !== null && /false/.test(documentsBlock[0]);
} },
}); },
{
test("avatars bucket has mime type restriction", () => { name: "avatars bucket has mime type restriction",
const sql = getMigrationSQL().toLowerCase(); check: () => {
// Should have allowed_mime_types with image types const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/allowed_mime_types/); return (
// Check for image MIME types (jpeg, png, webp) /allowed_mime_types/.test(sql) &&
expect(sql).toMatch(/image\/jpeg/); /image\/jpeg/.test(sql) &&
expect(sql).toMatch(/image\/png/); /image\/png/.test(sql) &&
expect(sql).toMatch(/image\/webp/); /image\/webp/.test(sql)
}); );
},
test("avatars bucket has file size limit", () => { },
const sql = getMigrationSQL().toLowerCase(); {
// Should have file_size_limit set to approximately 2MB (2097152 bytes or 2MB string) name: "avatars bucket has file size limit",
expect(sql).toMatch(/file_size_limit/); check: () => {
// Accept either numeric bytes (2097152) or string form (2MB, 2MiB, 2 * 1024 * 1024) const sql = getMigrationSQL().toLowerCase();
const hasNumericLimit = /2097152/.test(sql); if (!/file_size_limit/.test(sql)) return false;
const hasStringLimit = /2\s*m/i.test(sql); return (
const hasCalcLimit = /2\s*\*\s*1024\s*\*\s*1024/.test(sql); /2097152/.test(sql) ||
expect(hasNumericLimit || hasStringLimit || hasCalcLimit).toBe(true); /2\s*m/i.test(sql) ||
}); /2\s*\*\s*1024\s*\*\s*1024/.test(sql)
);
test("storage policy uses foldername or path for user isolation", () => { },
const sql = getMigrationSQL().toLowerCase(); },
// Should use storage.foldername(name) with auth.uid()::text for folder isolation {
const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql); name: "storage policy uses foldername or path for user isolation",
// Also accept direct path matching patterns like (name ~ '^user-id/') check: () => {
const usesPathMatch = const sql = getMigrationSQL().toLowerCase();
/\(\s*storage\.foldername\s*\(/.test(sql) || const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
/\bname\b.*auth\.uid\(\)/.test(sql); const usesPathMatch =
expect(usesFoldername || usesPathMatch).toBe(true); /\(\s*storage\.foldername\s*\(/.test(sql) ||
// Should cast auth.uid() to text for comparison with folder name /\bname\b.*auth\.uid\(\)/.test(sql);
expect(sql).toMatch(/auth\.uid\(\)\s*::\s*text/); return (
}); (usesFoldername || usesPathMatch) &&
/auth\.uid\(\)\s*::\s*text/.test(sql)
test("storage policy uses TO authenticated", () => { );
const sql = getMigrationSQL().toLowerCase(); },
// Storage upload/delete/update policies should target authenticated users. },
// Accepted forms: {
// 1. Explicit TO authenticated name: "storage policy uses TO authenticated",
// 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated) check: () => {
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; const sql = getMigrationSQL().toLowerCase();
const storagePolicies = policyBlocks.filter((p) => const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
p.toLowerCase().includes("storage.objects"), const storagePolicies = policyBlocks.filter((p) =>
); p.toLowerCase().includes("storage.objects"),
// At least one storage policy should restrict to authenticated users );
const hasAuthenticatedPolicy = storagePolicies.some( const hasAuthenticatedPolicy = storagePolicies.some(
(p) => (p) =>
/to\s+(authenticated|public)/.test(p.toLowerCase()) || /to\s+(authenticated|public)/.test(p.toLowerCase()) ||
/auth\.uid\(\)/.test(p.toLowerCase()), /auth\.uid\(\)/.test(p.toLowerCase()),
); );
expect(hasAuthenticatedPolicy).toBe(true); if (!hasAuthenticatedPolicy) return false;
// Insert policies must restrict to authenticated users (explicit TO or auth.uid() check) const insertPolicies = storagePolicies.filter((p) =>
const insertPolicies = storagePolicies.filter((p) => /for\s+insert/.test(p.toLowerCase()),
/for\s+insert/.test(p.toLowerCase()), );
); return insertPolicies.every(
for (const policy of insertPolicies) { (p) =>
const hasExplicitTo = /to\s+authenticated/.test(policy.toLowerCase()); /to\s+authenticated/.test(p.toLowerCase()) ||
const hasAuthUidCheck = /auth\.uid\(\)/.test(policy.toLowerCase()); /auth\.uid\(\)/.test(p.toLowerCase()),
expect(hasExplicitTo || hasAuthUidCheck).toBe(true); );
} },
}); },
{
test("public read policy for avatars", () => { name: "public read policy for avatars",
const sql = getMigrationSQL().toLowerCase(); check: () => {
// A SELECT policy on storage.objects for avatars bucket should allow public/anon access. const sql = getMigrationSQL().toLowerCase();
// Accepted forms: const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
// 1. Explicit TO public / TO anon const avatarSelectPolicies = policyBlocks.filter(
// 2. No TO clause (defaults to public role, granting all access) (p) =>
// 3. No auth.uid() restriction in USING (open to everyone) p.toLowerCase().includes("storage.objects") &&
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; /for\s+select/.test(p.toLowerCase()) &&
const avatarSelectPolicies = policyBlocks.filter( p.toLowerCase().includes("avatars"),
(p) => );
p.toLowerCase().includes("storage.objects") && if (avatarSelectPolicies.length === 0) return false;
/for\s+select/.test(p.toLowerCase()) && return avatarSelectPolicies.some((p) => {
p.toLowerCase().includes("avatars"), const lower = p.toLowerCase();
); const hasExplicitPublic =
expect(avatarSelectPolicies.length).toBeGreaterThan(0); /to\s+public/.test(lower) || /to\s+anon/.test(lower);
// Should allow public access: explicit TO public/anon, or no TO clause without auth.uid() restriction const hasNoToClause = !/\bto\s+\w+/.test(lower);
const hasPublicAccess = avatarSelectPolicies.some((p) => { const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower);
const lower = p.toLowerCase(); return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction);
const hasExplicitPublic = });
/to\s+public/.test(lower) || /to\s+anon/.test(lower); },
// No TO clause and no auth.uid() restriction means open to all },
const hasNoToClause = !/\bto\s+\w+/.test(lower); {
const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower); name: "documents bucket is fully private",
return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction); check: () => {
}); const sql = getMigrationSQL().toLowerCase();
expect(hasPublicAccess).toBe(true); const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
}); const documentPolicies = policyBlocks.filter(
(p) =>
test("documents bucket is fully private", () => { p.toLowerCase().includes("storage.objects") &&
const sql = getMigrationSQL().toLowerCase(); p.toLowerCase().includes("documents"),
// All policies for documents bucket should restrict to authenticated owner. );
// Accepted forms: if (documentPolicies.length === 0) return false;
// 1. Explicit TO authenticated return documentPolicies.every(
// 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated) (p) =>
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; !/to\s+public/.test(p) &&
const documentPolicies = policyBlocks.filter( !/to\s+anon/.test(p) &&
(p) => (/to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p)),
p.toLowerCase().includes("storage.objects") && );
p.toLowerCase().includes("documents"), },
); },
expect(documentPolicies.length).toBeGreaterThan(0); {
// None should allow public/anon access name: "creates file_metadata table",
for (const policy of documentPolicies) { check: () => {
expect(policy).not.toMatch(/to\s+public/); const sql = getMigrationSQL().toLowerCase();
expect(policy).not.toMatch(/to\s+anon/); return /create\s+table/.test(sql) && /file_metadata/.test(sql);
} },
// All should be scoped to authenticated (explicit TO or auth.uid() check) },
for (const policy of documentPolicies) { {
const hasExplicitTo = /to\s+authenticated/.test(policy); name: "file_metadata has FK to auth.users with CASCADE",
const hasAuthUidCheck = /auth\.uid\(\)/.test(policy); check: () => {
expect(hasExplicitTo || hasAuthUidCheck).toBe(true); const sql = getMigrationSQL().toLowerCase();
} return (
}); /references\s+auth\.users/.test(sql) &&
/on\s+delete\s+cascade/.test(sql)
test("creates file_metadata table", () => { );
const sql = getMigrationSQL().toLowerCase(); },
expect(sql).toMatch(/create\s+table/); },
expect(sql).toMatch(/file_metadata/); {
}); name: "RLS enabled on file_metadata",
check: () =>
test("file_metadata has FK to auth.users with CASCADE", () => { /alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/.test(
const sql = getMigrationSQL().toLowerCase(); getMigrationSQL().toLowerCase(),
// Find the file_metadata CREATE TABLE block or the surrounding context ),
expect(sql).toMatch(/references\s+auth\.users/); },
expect(sql).toMatch(/on\s+delete\s+cascade/); {
}); name: "file_metadata policies use (select auth.uid())",
check: () => {
test("RLS enabled on file_metadata", () => { const sql = getMigrationSQL();
const sql = getMigrationSQL().toLowerCase(); const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
expect(sql).toMatch( const metadataPolicies = policyBlocks.filter((p) =>
/alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/, p.toLowerCase().includes("file_metadata"),
); );
}); for (const policy of metadataPolicies) {
if (
test("file_metadata policies use (select auth.uid())", () => { policy.includes("auth.uid()") &&
const sql = getMigrationSQL(); !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
// Find policies that reference file_metadata ) {
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; return false;
const metadataPolicies = policyBlocks.filter((p) => }
p.toLowerCase().includes("file_metadata"),
);
// Each policy that uses auth.uid() should use the subselect form
for (const policy of metadataPolicies) {
if (policy.includes("auth.uid()")) {
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
}
}
});
test("uses timestamptz for time columns", () => {
const sql = getMigrationSQL().toLowerCase();
// Only check if the migration defines time-related columns
if (
sql.includes("created_at") ||
sql.includes("updated_at") ||
sql.includes("uploaded_at")
) {
// Check column definitions for plain "timestamp" (not timestamptz / timestamp with time zone).
// Only match timestamp as a column type — look for column_name followed by timestamp.
// Exclude matches inside trigger/function bodies and RETURNS TRIGGER.
const columnDefs = sql.match(
/(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g,
);
if (columnDefs) {
for (const def of columnDefs) {
// Each match should use timestamptz or "timestamp with time zone"
expect(def).toMatch(/timestamptz|timestamp\s+with\s+time\s+zone/);
} }
} return true;
} },
}); },
{
test("index on file_metadata user_id", () => { name: "uses timestamptz for time columns",
const sql = getMigrationSQL().toLowerCase(); check: () => {
expect(sql).toMatch(/create\s+index/); const sql = getMigrationSQL().toLowerCase();
// Should index user_id on file_metadata if (
expect(sql).toMatch(/file_metadata/); !sql.includes("created_at") &&
expect(sql).toMatch(/user_id/); !sql.includes("updated_at") &&
}); !sql.includes("uploaded_at")
) {
test("idempotent DDL", () => { return true;
const sql = getMigrationSQL().toLowerCase(); }
expect(sql).toMatch(/if\s+not\s+exists/); const columnDefs = sql.match(
}); /(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g,
);
test("overall quality score", () => { if (!columnDefs) return true;
const sql = getMigrationSQL().toLowerCase(); return columnDefs.every((def) =>
// A high-quality migration should contain most of these best-practice signals /timestamptz|timestamp\s+with\s+time\s+zone/.test(def),
const signals = [ );
// 1. Avatars bucket is public },
/insert\s+into\s+storage\.buckets[\s\S]*?avatars/, },
// 2. Documents bucket exists {
/insert\s+into\s+storage\.buckets[\s\S]*?documents/, name: "index on file_metadata user_id",
// 3. MIME type restriction check: () => {
/allowed_mime_types/, const sql = getMigrationSQL().toLowerCase();
// 4. File size limit return (
/file_size_limit/, /create\s+index/.test(sql) &&
// 5. Storage foldername helper /file_metadata/.test(sql) &&
/storage\.foldername/, /user_id/.test(sql)
// 6. auth.uid()::text cast );
/auth\.uid\(\)\s*::\s*text/, },
// 7. TO authenticated on policies },
/to\s+authenticated/, {
// 8. Public read for avatars name: "idempotent DDL",
/to\s+(public|anon)/, check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
// 9. RLS on file_metadata },
/enable\s+row\s+level\s+security/, {
// 10. FK to auth.users with cascade name: "overall quality score",
/on\s+delete\s+cascade/, check: () => {
// 11. (select auth.uid()) subselect form const sql = getMigrationSQL().toLowerCase();
/\(select\s+auth\.uid\(\)\)/, const signals = [
// 12. Index on user_id /insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
/create\s+index/, /insert\s+into\s+storage\.buckets[\s\S]*?documents/,
// 13. timestamptz usage /allowed_mime_types/,
/timestamptz/, /file_size_limit/,
// 14. IF NOT EXISTS for idempotency /storage\.foldername/,
/if\s+not\s+exists/, /auth\.uid\(\)\s*::\s*text/,
// 15. file_metadata table /to\s+authenticated/,
/create\s+table[\s\S]*?file_metadata/, /to\s+(public|anon)/,
]; /enable\s+row\s+level\s+security/,
const matches = signals.filter((r) => r.test(sql)); /on\s+delete\s+cascade/,
// Require at least 11 of 15 best-practice signals /\(select\s+auth\.uid\(\)\)/,
expect(matches.length).toBeGreaterThanOrEqual(11); /create\s+index/,
}); /timestamptz/,
/if\s+not\s+exists/,
/create\s+table[\s\S]*?file_metadata/,
];
return signals.filter((r) => r.test(sql)).length >= 11;
},
},
];

View File

@@ -1,182 +1,216 @@
import { expect, test } from "vitest"; export const expectedReferenceFiles = [
"db-rls-mandatory.md",
"db-rls-policy-types.md",
"db-rls-common-mistakes.md",
"db-rls-performance.md",
"db-security-functions.md",
"db-schema-auth-fk.md",
"db-schema-timestamps.md",
"db-perf-indexes.md",
"db-migrations-idempotent.md",
];
import type { EvalAssertion } from "../../src/eval-types.js";
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
test("migration file exists", () => { export const assertions: EvalAssertion[] = [
expect(findMigrationFiles().length).toBeGreaterThan(0); {
}); name: "migration file exists",
check: () => findMigrationFiles().length > 0,
test("creates organizations table", () => { },
const sql = getMigrationSQL().toLowerCase(); {
expect(sql).toMatch(/create\s+table[\s\S]*?organizations/); name: "creates organizations table",
}); check: () =>
/create\s+table[\s\S]*?organizations/.test(
test("creates memberships table", () => { getMigrationSQL().toLowerCase(),
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?memberships/);
});
test("creates projects table", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?projects/);
});
test("enables RLS on all tables", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/,
);
expect(sql).toMatch(
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/,
);
expect(sql).toMatch(
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/,
);
});
test("FK to auth.users with ON DELETE CASCADE", () => {
const sql = getMigrationSQL().toLowerCase();
// memberships should reference auth.users with cascade delete
expect(sql).toMatch(/references\s+auth\.users/);
expect(sql).toMatch(/on\s+delete\s+cascade/);
});
test("org_id FK on projects", () => {
const sql = getMigrationSQL().toLowerCase();
// projects should have a foreign key referencing organizations
expect(sql).toMatch(
/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/,
);
});
test("private schema created", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
});
test("security_definer helper function", () => {
const sql = getMigrationSQL().toLowerCase();
// Function should be in the private schema with SECURITY DEFINER and search_path = ''
expect(sql).toMatch(/private\./);
expect(sql).toMatch(/security\s+definer/);
expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
});
test("policies use (select auth.uid())", () => {
const sql = getMigrationSQL();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
expect(policyBlocks.length).toBeGreaterThan(0);
for (const policy of policyBlocks) {
if (policy.includes("auth.uid()")) {
// The subselect form: (select auth.uid())
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
}
}
});
test("policies use TO authenticated", () => {
const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
expect(policyBlocks.length).toBeGreaterThan(0);
for (const policy of policyBlocks) {
expect(policy).toMatch(/to\s+authenticated/);
}
});
test("index on membership lookup columns", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+index/);
// Should index user_id and/or org_id on memberships for policy lookups
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
const indexesUserOrOrg = indexBlocks.filter(
(idx) =>
idx.includes("user_id") ||
idx.includes("org_id") ||
idx.includes("organization_id"),
);
expect(indexesUserOrOrg.length).toBeGreaterThanOrEqual(1);
});
test("uses timestamptz", () => {
const sql = getMigrationSQL().toLowerCase();
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
// Only fail if the migration defines time columns with plain timestamp
if (
sql.includes("created_at") ||
sql.includes("updated_at") ||
sql.includes("_at ")
) {
expect(sql).not.toMatch(hasPlainTimestamp);
}
});
test("idempotent DDL", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/if\s+not\s+exists/);
});
test("delete policy restricted to owner role", () => {
const sql = getMigrationSQL().toLowerCase();
// Look for a delete policy on projects that checks for owner (or admin) role
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
const deletePolicy = policyBlocks.find(
(p) =>
p.toLowerCase().includes("delete") && p.toLowerCase().includes("project"),
);
expect(deletePolicy).toBeDefined();
// The delete policy should check for an owner/admin role
expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/);
});
test("overall quality score", () => {
const sql = getMigrationSQL().toLowerCase();
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
// A high-quality migration should contain most of these best-practice signals
const signals = [
// 1. RLS enabled on all three tables
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) &&
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
) &&
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
sql,
      ),
      // 2. FK to auth.users with cascade
      /references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
      // 3. Private schema created
      /create\s+schema[\s\S]*?private/.test(sql),
      // 4. security_definer with search_path
      /security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql),
      // 5. Subselect auth.uid()
      /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
      // 6. TO authenticated on policies
      policyBlocks.length > 0 &&
        policyBlocks.every((p) => /to\s+authenticated/.test(p)),
      // 7. Indexes on lookup columns
      /create\s+index/.test(sql),
      // 8. timestamptz (no plain timestamp)
      !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql),
      // 9. Idempotent DDL
      /if\s+not\s+exists/.test(sql),
      // 10. Delete policy checks owner role
      policyBlocks.some(
        (p) =>
          p.toLowerCase().includes("delete") &&
          p.toLowerCase().includes("project") &&
          /owner|admin/.test(p.toLowerCase()),
      ),
      // 11. org_id FK on projects
      /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
      // 12. Multiple policies (at least one per table)
      policyBlocks.length >= 3,
      // 13. Membership role column exists
      /role/.test(sql),
      // 14. Private schema function referenced in policies
      /private\./.test(sql),
    ];
    const passed = signals.filter(Boolean).length;
    expect(passed).toBeGreaterThanOrEqual(10);
});

      ),
  },
  {
    name: "creates memberships table",
    check: () =>
      /create\s+table[\s\S]*?memberships/.test(getMigrationSQL().toLowerCase()),
  },
  {
    name: "creates projects table",
    check: () =>
      /create\s+table[\s\S]*?projects/.test(getMigrationSQL().toLowerCase()),
  },
  {
    name: "enables RLS on all tables",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      return (
        /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ) &&
        /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ) &&
        /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        )
      );
    },
  },
  {
    name: "FK to auth.users with ON DELETE CASCADE",
    check: () => {
      const sql = getMigrationSQL().toLowerCase();
      return (
        /references\s+auth\.users/.test(sql) &&
        /on\s+delete\s+cascade/.test(sql)
      );
    },
  },
{
  // Projects must be linked to their organization through a foreign key.
  name: "org_id FK on projects",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    return /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql);
  },
},
{
  // Helper functions should live in a dedicated private schema.
  name: "private schema created",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    return /create\s+schema[\s\S]*?private/.test(sql);
  },
},
{
  // The membership-lookup helper must be SECURITY DEFINER with a pinned
  // (empty) search_path, and live under the private schema.
  name: "security_definer helper function",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    const usesPrivateSchema = /private\./.test(sql);
    const isSecurityDefiner = /security\s+definer/.test(sql);
    const pinsSearchPath = /set\s+search_path\s*=\s*''/.test(sql);
    return usesPrivateSchema && isSecurityDefiner && pinsSearchPath;
  },
},
{
  // Any policy that references auth.uid() must wrap it in a subselect,
  // i.e. (select auth.uid()), so Postgres caches the result once per
  // statement instead of evaluating it per row (RLS performance).
  name: "policies use (select auth.uid())",
  check: () => {
    // Lowercase once so the checks are case-insensitive. The previous
    // version ran `includes("auth.uid()")` on the raw SQL, which let a
    // policy written as AUTH.UID() bypass the check entirely.
    const sql = getMigrationSQL().toLowerCase();
    const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/g) ?? [];
    if (policyBlocks.length === 0) return false;
    return policyBlocks.every(
      (policy) =>
        !policy.includes("auth.uid()") ||
        /\(\s*select\s+auth\.uid\(\)\s*\)/.test(policy),
    );
  },
},
{
  // Every policy must be explicitly scoped to the authenticated role.
  name: "policies use TO authenticated",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    const policies = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
    if (policies.length === 0) return false;
    return policies.every((policy) => /to\s+authenticated/.test(policy));
  },
},
{
  // Membership lookups should be backed by an index on at least one of
  // the FK columns (user_id / org_id / organization_id).
  name: "index on membership lookup columns",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    const indexes = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
    return indexes.some(
      (idx) =>
        idx.includes("user_id") ||
        idx.includes("org_id") ||
        idx.includes("organization_id"),
    );
  },
},
{
  // Time columns must use timestamptz, never plain `timestamp`.
  name: "uses timestamptz",
  check: () => {
    // Strip line comments first so a mention of "timestamp" in a comment
    // does not produce a false positive.
    const sql = getMigrationSQL().toLowerCase().replace(/--[^\n]*/g, "");
    const hasTimeColumns =
      sql.includes("created_at") ||
      sql.includes("updated_at") ||
      sql.includes("_at ");
    // Nothing to check when the schema has no time columns at all.
    if (!hasTimeColumns) return true;
    return !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql);
  },
},
{
  // Migrations should be safely re-runnable (IF NOT EXISTS guards).
  name: "idempotent DDL",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    return /if\s+not\s+exists/.test(sql);
  },
},
{
  // The helper function should declare STABLE (or IMMUTABLE) volatility
  // so the planner can cache its result within a statement.
  name: "stable or immutable on helper function",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    return /\bstable\b/.test(sql) || /\bimmutable\b/.test(sql);
  },
},
{
  // DELETE on projects must be limited to privileged (owner/admin) members.
  name: "delete policy restricted to owner role",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    // `sql` is already lowercased, so the extracted policy blocks are too;
    // the per-block .toLowerCase() calls in the old version were redundant.
    const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
    const deletePolicy = policyBlocks.find(
      (p) => p.includes("delete") && p.includes("project"),
    );
    if (!deletePolicy) return false;
    return /owner|admin/.test(deletePolicy);
  },
},
{
  // Aggregate best-practice score: at least 11 of the 15 signals below
  // must hold for the migration to count as high quality.
  name: "overall quality score",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
    const signals = [
      // 1. RLS enabled on all three tables
      /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ) &&
        /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ) &&
        /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ),
      // 2. FK to auth.users with cascade
      /references\s+auth\.users/.test(sql) &&
        /on\s+delete\s+cascade/.test(sql),
      // 3. Private schema created
      /create\s+schema[\s\S]*?private/.test(sql),
      // 4. SECURITY DEFINER with pinned search_path
      /security\s+definer/.test(sql) &&
        /set\s+search_path\s*=\s*''/.test(sql),
      // 5. Subselect form (select auth.uid())
      /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
      // 6. TO authenticated on every policy
      policyBlocks.length > 0 &&
        policyBlocks.every((p) => /to\s+authenticated/.test(p)),
      // 7. Indexes on lookup columns
      /create\s+index/.test(sql),
      // 8. timestamptz only (line comments stripped to avoid false positives)
      !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
        sql.replace(/--[^\n]*/g, ""),
      ),
      // 9. Idempotent DDL
      /if\s+not\s+exists/.test(sql),
      // 10. Delete policy checks owner/admin role
      //     (sql is already lowercase — no per-block re-lowercasing needed)
      policyBlocks.some(
        (p) =>
          p.includes("delete") &&
          p.includes("project") &&
          /owner|admin/.test(p),
      ),
      // 11. org_id FK on projects
      /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
      // 12. Multiple policies (at least one per table)
      policyBlocks.length >= 3,
      // 13. Membership role column exists
      /role/.test(sql),
      // 14. Private schema referenced (helper used in policies)
      /private\./.test(sql),
      // 15. STABLE/IMMUTABLE volatility on the helper
      /\bstable\b|\bimmutable\b/.test(sql),
    ];
    return signals.filter(Boolean).length >= 11;
  },
},
];

View File

@@ -17,7 +17,6 @@
"devDependencies": { "devDependencies": {
"@types/node": "^20.10.0", "@types/node": "^20.10.0",
"tsx": "^4.7.0", "tsx": "^4.7.0",
"typescript": "^5.3.0", "typescript": "^5.3.0"
"vitest": "^3.1.0"
} }
} }

View File

@@ -6,5 +6,12 @@
| 2 | [team-rls-security-definer](team-rls-security-definer.md) | Team-based RLS with security definer helper in a private schema | | 2 | [team-rls-security-definer](team-rls-security-definer.md) | Team-based RLS with security definer helper in a private schema |
| 3 | [storage-rls-user-folders](storage-rls-user-folders.md) | Storage buckets with RLS policies for user-isolated folders | | 3 | [storage-rls-user-folders](storage-rls-user-folders.md) | Storage buckets with RLS policies for user-isolated folders |
| 4 | [edge-function-hello-world](edge-function-hello-world.md) | Hello-world Edge Function with CORS and shared utilities | | 4 | [edge-function-hello-world](edge-function-hello-world.md) | Hello-world Edge Function with CORS and shared utilities |
| 5 | edge-function-stripe-webhook | Stripe webhook Edge Function with signature verification and orders migration | | 5 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization |
| 6 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization | | 6 | [auth-fk-cascade-delete](auth-fk-cascade-delete.md) | Profiles table with auth.users FK cascade and auto-create trigger |
| 7 | [rls-update-needs-select](rls-update-needs-select.md) | Orders table where UPDATE silently fails without a matching SELECT policy |
| 8 | [extension-wrong-schema](extension-wrong-schema.md) | pgvector extension setup with correct schema placement, HNSW index, and user-scoped RLS |
| 9 | [connection-pooling-prisma](connection-pooling-prisma.md) | Fix Prisma schema to use Supabase transaction-mode pooler (port 6543, pgbouncer=true, directUrl) for serverless deployments |
| 10 | [cli-hallucinated-commands](cli-hallucinated-commands.md) | CLI cheat-sheet that must use only real Supabase CLI commands, avoiding hallucinated `supabase functions log` and `supabase db query` |
| 11 | [postgrest-schema-cache](postgrest-schema-cache.md) | Add columns and a view to an existing table, with NOTIFY pgrst to reload the PostgREST schema cache |
| 12 | [rls-user-metadata-role-check](rls-user-metadata-role-check.md) | Documents table with owner and admin RLS — must use app_metadata not user_metadata for role authorization |
| 13 | [service-role-edge-function](service-role-edge-function.md) | Admin Edge Function that bypasses RLS using the service role key via env vars, never hardcoded |

View File

@@ -0,0 +1,84 @@
# Scenario: auth-fk-cascade-delete
## Summary
The agent must create a `profiles` table that references `auth.users` with
`ON DELETE CASCADE`, and a trigger that auto-creates a profile row when a new
user signs up. The common mistake — omitting CASCADE — causes user deletion to
fail with a foreign key violation.
## Real-World Justification
Why this is a common and important workflow:
1. **Top troubleshooting entry** — "Database error saving new user" and
"Errors when creating/updating/deleting users" are listed as common issues in
the Supabase troubleshooting guide. The majority of these failures trace back
to FK violations when deleting users who have linked profile rows.
- Source: https://supabase.com/docs/guides/troubleshooting
2. **Auth trigger pattern ubiquity** — The `handle_new_user` trigger on
`auth.users` is documented in the official Supabase onboarding guide and
replicated in thousands of community starter templates. Getting the
`security definer` + `set search_path = ''` details wrong breaks signups.
- Source: https://supabase.com/docs/guides/database/postgres/cascade-deletes
3. **Community-reported cascade omission** — Multiple GitHub issues report
unexpected FK violation errors when calling `auth.admin.deleteUser()` from
the SDK because the profile table was created without CASCADE.
- Source: https://github.com/supabase/supabase/issues/
## Skill References Exercised
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-schema-auth-fk.md` | ON DELETE CASCADE requirement for auth.users FKs | `REFERENCES auth.users(id) ON DELETE CASCADE` |
| `references/db-security-functions.md` | security definer + set search_path = '' for trigger functions | Correct trigger function definition |
| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS enabled on profiles |
| `references/db-rls-common-mistakes.md` | TO clause and subselect auth.uid() | Correct policy scoping |
## Workspace Setup
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
## Agent Task (PROMPT.md draft)
> Set up a `profiles` table for my Supabase app. Every user who signs up should
> automatically get a profile row with their `id`, `email`, and `full_name`
> (pulled from signup metadata). The profiles table should go in
> `supabase/migrations/` as a SQL migration. Users should only be able to read
> and update their own profile.
## Evaluation Criteria
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
| 2 | creates profiles table | SQL contains `CREATE TABLE` and `profiles` | correctness |
| 3 | FK references auth.users | `REFERENCES auth.users` present | correctness |
| 4 | ON DELETE CASCADE present | `ON DELETE CASCADE` on the auth.users FK | correctness |
| 5 | RLS enabled on profiles | `ALTER TABLE profiles ENABLE ROW LEVEL SECURITY` | security |
| 6 | trigger function uses security definer | `SECURITY DEFINER` in the trigger function definition | security |
| 7 | trigger function sets search_path | `SET search_path = ''` or `set search_path` in trigger function | security |
| 8 | trigger created on auth.users | `CREATE TRIGGER ... ON auth.users` | correctness |
| 9 | policies scoped to authenticated | `TO authenticated` in policy definitions | security |
## Reasoning
1. **Baseline differentiator:** Without the skill, an agent creates the FK
without CASCADE and omits `set search_path = ''` on the trigger function —
two independently dangerous omissions.
2. **Skill value:** `db-schema-auth-fk.md` is explicitly about this exact
scenario; `db-security-functions.md` covers the trigger security requirements.
3. **Testability:** CASCADE and search_path are simple string patterns. Trigger
creation on `auth.users` is a unique structural signal.
4. **Realism:** The profiles-with-trigger pattern is the #1 starter pattern in
every Supabase tutorial and the #1 source of FK-violation bugs reported in
the community.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~35% of assertions expected to pass (table and FK likely, but
no CASCADE, no search_path, weak policies)
- With skill: ~90% of assertions expected to pass
- **pass_threshold:** 8

View File

@@ -85,8 +85,9 @@ specific quality signal:
| 8 | TO authenticated | Policies scoped to authenticated role | security | | 8 | TO authenticated | Policies scoped to authenticated role | security |
| 9 | timestamptz | No plain `timestamp` for time columns | correctness | | 9 | timestamptz | No plain `timestamp` for time columns | correctness |
| 10 | index on user_id | `CREATE INDEX` on the FK column | performance | | 10 | index on user_id | `CREATE INDEX` on the FK column | performance |
| 11 | IF NOT EXISTS | Idempotent migration | idempotency | | 11 | no SERIAL/BIGSERIAL | PK does not use error-prone serial type | correctness |
| 12 | overall quality | At least 4/5 best-practice signals present | overall | | 12 | IF NOT EXISTS | Idempotent migration | idempotency |
| 13 | overall quality | At least 4/5 best-practice signals present | overall |
## Reasoning ## Reasoning
@@ -121,4 +122,5 @@ Step-by-step reasoning for why this scenario is well-designed:
**Rating:** EASY **Rating:** EASY
- Without skill: ~50-65% of assertions expected to pass - Without skill: ~50-65% of assertions expected to pass
- With skill: ~90-100% of assertions expected to pass - With skill: ~90-100% of assertions expected to pass
- **pass_threshold:** 10

View File

@@ -0,0 +1,120 @@
# Scenario: cli-hallucinated-commands
## Summary
The agent must create a Supabase CLI reference cheat-sheet (`CLI_REFERENCE.md`)
covering how to view Edge Function logs and how to run ad-hoc SQL queries
against a Supabase project. This tests whether the agent invents non-existent
CLI commands (`supabase functions log`, `supabase db query`) instead of
describing the real workflows.
## Real-World Justification
Why this is a common and important workflow:
1. **`supabase functions log` is a persistent hallucination** — LLMs frequently
suggest `supabase functions log` (singular) or `supabase functions logs` as
CLI commands to stream deployed function logs. Neither command exists in the
Supabase CLI. The real workflow is to use the Supabase Dashboard Logs
Explorer, or for local development, `supabase start` + `supabase functions
serve` which prints logs to stdout. This pattern appears across many
developer questions and multiple model responses.
- Source: https://supabase.com/docs/reference/cli/supabase-functions
2. **`supabase db query` is a persistent hallucination** — LLMs suggest
`supabase db query` or `supabase db query --sql "SELECT ..."` as a way to
run ad-hoc SQL via the CLI. This command does not exist. The real workflow
is to connect via `psql` using the connection string from the Dashboard,
or use the Dashboard SQL Editor, or `supabase db dump` for schema exports.
- Source: https://supabase.com/docs/reference/cli/supabase-db
3. **Developers frequently ask for a CLI cheat-sheet** — Setting up a reference
file for project onboarding is a standard ask. The agent must produce
accurate commands, not invented ones that will silently fail.
## Skill References Exercised
Which reference files the agent should consult and what each teaches:
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/dev-getting-started.md` | Real CLI commands: `supabase start`, `supabase stop`, `supabase db push`, `supabase db reset`, `supabase db diff` | Use only real `supabase db` subcommands |
| `references/edge-fun-quickstart.md` | Real Edge Function CLI: `supabase functions new`, `supabase functions serve`, `supabase functions deploy` | Use real function commands, not invented log commands |
## Workspace Setup
What the workspace starts with before the agent runs:
- A pre-initialized `supabase/config.toml` (standard project setup)
- An existing Edge Function at `supabase/functions/process-order/index.ts`
- The agent is expected to create `CLI_REFERENCE.md` in the project root
## Agent Task (PROMPT.md draft)
The prompt to give the agent. Written as a developer would ask it — no hints
about what the tests check:
> I'm onboarding a new developer to my Supabase project. Create a
> `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet
> of Supabase CLI commands we use day-to-day. It should cover:
>
> 1. Starting and stopping the local dev stack
> 2. Managing database migrations (push, reset, diff)
> 3. Working with the `process-order` Edge Function (local dev and deploy)
> 4. How to view Edge Function logs (both local dev and production)
> 5. How to run ad-hoc SQL queries against the database (local and remote)
>
> Include the actual commands with brief explanations.
## Evaluation Criteria
What the eval assertions should check on the agent's output. Each assertion tests a
specific quality signal:
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | CLI_REFERENCE.md exists | `CLI_REFERENCE.md` file exists in project root | structure |
| 2 | no hallucinated functions log command | File does NOT contain `supabase functions log` (without 's' as a complete command) | correctness |
| 3 | no hallucinated db query command | File does NOT contain `supabase db query` | correctness |
| 4 | mentions supabase functions serve for local | File contains `supabase functions serve` | correctness |
| 5 | mentions supabase functions deploy | File contains `supabase functions deploy` | correctness |
| 6 | mentions psql or connection string for SQL | File contains `psql` or `connection string` or `SQL Editor` or `db dump` | correctness |
| 7 | mentions supabase db push or reset | File contains `supabase db push` or `supabase db reset` | correctness |
| 8 | mentions supabase start | File contains `supabase start` | correctness |
| 9 | mentions Dashboard for production logs | File mentions `Dashboard` or `Logs Explorer` for production log viewing | correctness |
## Reasoning
Step-by-step reasoning for why this scenario is well-designed:
1. **Baseline differentiator:** An agent without the skill is very likely to
hallucinate both `supabase functions log` and `supabase db query` since
these are plausible-sounding commands that follow the CLI's pattern.
Multiple real-world LLM responses have included these exact commands. With
the skill's reference files listing the actual CLI commands, the agent
should know what exists and what doesn't.
2. **Skill value:** The quickstart and getting-started reference files
enumerate the real CLI subcommands. An agent reading these will see that
`supabase functions` only has `new`, `serve`, `deploy`, `delete`, `list`
subcommands, and `supabase db` only has `push`, `reset`, `diff`, `dump`,
`lint`, `pull` — not `query`. This directly prevents the hallucination.
3. **Testability:** All assertions are regex/string matches on a single
markdown file. No runtime execution or migration parsing needed. Checks 2
and 3 are pure absence tests (NOT contains) which are simple but
high-signal.
4. **Realism:** Writing a CLI reference for project onboarding is a genuine
task. The two hallucinated commands are the most commonly confused ones
based on developer feedback. Getting these wrong produces broken workflows
that are frustrating to debug.
## Difficulty
**Rating:** EASY
- Without skill: ~30-50% of assertions expected to pass (likely fails checks
2 and/or 3 due to hallucination, may also miss Dashboard mention for logs)
- With skill: ~90-100% of assertions expected to pass
- **pass_threshold:** 9

View File

@@ -154,3 +154,4 @@ Step-by-step reasoning for why this scenario is well-designed:
- Without skill: ~25-40% of assertions expected to pass - Without skill: ~25-40% of assertions expected to pass
- With skill: ~80-90% of assertions expected to pass - With skill: ~80-90% of assertions expected to pass
- **pass_threshold:** 17

View File

@@ -0,0 +1,80 @@
# Scenario: connection-pooling-prisma
## Summary
The agent must produce a `DATABASE_URL` configuration and Prisma schema setup
that correctly uses Supabase's transaction-mode pooler (port 6543) with the
`?pgbouncer=true` parameter to disable prepared statements. Without this, Prisma
throws "prepared statement already exists" errors in serverless environments.
## Real-World Justification
Why this is a common and important workflow:
1. **Top troubleshooting entry** — "Error: prepared statement XXX already exists"
is listed in the Supabase troubleshooting guide under Database Issues as a
direct consequence of using transaction-mode pooling without disabling
prepared statements.
- Source: https://supabase.com/docs/guides/troubleshooting
2. **Serverless deployment reality** — Vercel and other serverless platforms
are the most popular Supabase deployment targets. Each function invocation
creates a new connection, making transaction-mode pooling mandatory. The
Prisma + Supabase combination is the most-searched configuration pairing.
- Source: https://supabase.com/docs/guides/database/connecting-to-postgres
3. **Connection exhaustion** — Using session mode (port 5432) in serverless
leads to "remaining connection slots are reserved" errors — another top
troubleshooting entry. The fix requires switching to port 6543.
- Source: https://supabase.com/docs/guides/troubleshooting
## Skill References Exercised
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-conn-pooling.md` | Transaction mode port 6543, pgbouncer=true for Prisma | Correct DATABASE_URL with port 6543 and ?pgbouncer=true |
| `references/db-migrations-idempotent.md` | Migration file conventions and naming | Migration file in supabase/migrations/ |
| `references/db-schema-auth-fk.md` | Schema best practices for user-linked tables | Proper FK patterns if schema is involved |
## Workspace Setup
- A workspace with `supabase/config.toml` already initialized
- A `prisma/schema.prisma` starter file with a placeholder `DATABASE_URL` using
direct connection (port 5432, no pgbouncer flag)
## Agent Task (PROMPT.md draft)
> I'm deploying my Supabase app on Vercel using Prisma. I keep getting
> "prepared statement already exists" errors in production. My current
> `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string.
> Fix the Prisma configuration so it works correctly with Supabase's connection
> pooler.
## Evaluation Criteria
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | prisma schema references pooler port | `DATABASE_URL` or connection hint references port `6543` | correctness |
| 2 | pgbouncer=true param present | `?pgbouncer=true` or `pgbouncer=true` in the connection URL or env comment | correctness |
| 3 | DIRECT_URL provided for migrations | A separate `directUrl` or `DIRECT_URL` variable defined for Prisma migrations | correctness |
| 4 | connection limit set to 1 | `connection_limit=1` in the pooler URL or Prisma datasource | performance |
| 5 | explanation distinguishes port 6543 vs 5432 | Output or comments distinguish transaction mode (6543) from direct (5432) | correctness |
## Reasoning
1. **Baseline differentiator:** An agent without the skill typically updates
the port or adds pgbouncer but forgets `DIRECT_URL` for migrations, or sets
`max` connections too high, or uses session mode instead of transaction mode.
2. **Skill value:** `db-conn-pooling.md` provides the exact pattern: port 6543,
`?pgbouncer=true`, `max: 1` per serverless instance.
3. **Testability:** Port numbers and query parameters are directly readable as
string patterns in the output files.
4. **Realism:** "Prisma prepared statement already exists on Supabase" is one
of the most-searched Supabase error messages on Stack Overflow and GitHub.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~30% of assertions expected to pass (agent may change port but
likely misses pgbouncer param and DIRECT_URL)
- With skill: ~90% of assertions expected to pass
- **pass_threshold:** 7

View File

@@ -127,3 +127,4 @@ Step-by-step reasoning for why this scenario is well-designed:
- Without skill: ~45-60% of assertions expected to pass - Without skill: ~45-60% of assertions expected to pass
- With skill: ~90-100% of assertions expected to pass - With skill: ~90-100% of assertions expected to pass
- **pass_threshold:** 13

View File

@@ -0,0 +1,89 @@
# Scenario: extension-wrong-schema
## Summary
The agent must create a migration that enables the `pgvector` extension and
creates an `embeddings` table with a vector column and an HNSW index. The trap
is installing the extension in the `public` schema (the default) instead of
the `extensions` schema, and using IVFFlat without a `lists` parameter.
## Real-World Justification
Why this is a common and important workflow:
1. **Known schema pollution issue** — Installing extensions in `public` exposes
extension functions and types through the PostgREST API, which can reveal
internal details and cause "42501: permission denied" errors. The Supabase
troubleshooting guide covers permission errors as a category.
- Source: https://supabase.com/docs/guides/troubleshooting
2. **IVFFlat without lists = error** — The Supabase troubleshooting guide
contains a dedicated entry: "Increase vector lookup speeds by applying an
HNSW index" which warns against IVFFlat and notes its required `lists`
parameter. Missing this causes a CREATE INDEX error.
- Source: https://supabase.com/docs/guides/troubleshooting
3. **pgvector adoption** — Vector/AI embeddings are the fastest-growing
Supabase use case. Nearly every AI-powered Supabase project starts with
the pgvector extension setup. Getting the schema right from the start
prevents later schema drift.
- Source: https://supabase.com/docs/guides/database/extensions/pgvector
## Skill References Exercised
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-schema-extensions.md` | Install extensions in `extensions` schema, not `public`; HNSW over IVFFlat; IVFFlat needs `lists` | `CREATE EXTENSION ... WITH SCHEMA extensions`; HNSW index |
| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS on embeddings table |
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for extensions and tables | `CREATE EXTENSION IF NOT EXISTS` |
| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | User-linked embeddings |
| `references/db-rls-common-mistakes.md` | TO authenticated, subselect auth.uid() | Policy correctness |
## Workspace Setup
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
## Agent Task (PROMPT.md draft)
> I'm building a semantic search feature. Create a migration that:
> 1. Enables the pgvector extension
> 2. Creates a `documents` table with an `embedding` column (1536 dimensions
> for OpenAI ada-002), a `content` text column, and a `user_id`
> 3. Adds a vector similarity search index
> 4. Users should only see their own documents
> Put the migration in `supabase/migrations/`.
## Evaluation Criteria
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
| 2 | extension in extensions schema | `WITH SCHEMA extensions` in the CREATE EXTENSION statement | correctness |
| 3 | IF NOT EXISTS on extension | `CREATE EXTENSION IF NOT EXISTS` | idempotency |
| 4 | vector column with correct dimensions | `vector(1536)` or `extensions.vector(1536)` in table | correctness |
| 5 | HNSW index used not IVFFlat | `USING hnsw` present in CREATE INDEX | correctness |
| 6 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security |
| 7 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
| 8 | policies TO authenticated | `TO authenticated` in policy definitions | security |
| 9 | idempotent table creation | `CREATE TABLE IF NOT EXISTS` | idempotency |
## Reasoning
1. **Baseline differentiator:** Agents without the skill write `CREATE
EXTENSION vector;` (wrong schema), use IVFFlat (wrong index type for most
cases), and skip the `lists` parameter requirement.
2. **Skill value:** `db-schema-extensions.md` explicitly shows the `WITH
SCHEMA extensions` pattern and recommends HNSW over IVFFlat with the
specific note about `lists` being required for IVFFlat.
3. **Testability:** Schema placement in the extension creation line and index
type are directly checkable with regex.
4. **Realism:** pgvector + OpenAI embeddings is the top "AI + Supabase"
tutorial path, and extension schema mistakes are a documented source of
permission errors.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~35% of assertions expected to pass (extension enabled but
wrong schema, wrong index type, weak policies)
- With skill: ~90% of assertions expected to pass
- **pass_threshold:** 8

View File

@@ -0,0 +1,89 @@
# Scenario: postgrest-schema-cache
## Summary
The agent must create a migration that adds new columns to an existing table
and create a view that uses those columns, including the correct `NOTIFY
pgrst, 'reload schema'` call to force PostgREST to pick up the schema changes.
Without this, the API returns 400 errors for the new columns even after
migration.
## Real-World Justification
Why this is a common and important workflow:
1. **Direct troubleshooting entry** — "PostgREST not recognizing new columns,
tables, views or functions" and "Reload/refresh postgrest schema" (400
bad_request error) are explicitly listed in the Supabase troubleshooting
guide. This is among the most confusing errors for new Supabase developers —
the migration ran successfully but the API still returns errors.
- Source: https://supabase.com/docs/guides/troubleshooting
2. **Schema cache invalidation** — PostgREST caches the database schema at
startup and reloads it only when notified. Migrations that add new objects
must explicitly call `NOTIFY pgrst, 'reload schema'` at the end of the
migration file for the changes to be reflected immediately in local
development.
- Source: https://supabase.com/docs/guides/api/rest/generating-types
3. **Views and RLS** — Creating a view over a user-owned table requires
understanding that RLS applies to the underlying tables, and the view itself
should use `security_invoker = true` to preserve RLS context.
- Source: https://supabase.com/docs/guides/database/views
## Skill References Exercised
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-rls-views.md` | Views need security_invoker to respect RLS | `WITH (security_invoker = true)` on view |
| `references/db-migrations-idempotent.md` | ADD COLUMN IF NOT EXISTS; IF NOT EXISTS patterns | Idempotent column additions |
| `references/db-rls-mandatory.md` | RLS on base tables | RLS enabled on base table |
| `references/db-rls-performance.md` | (select auth.uid()) subselect | Subselect form in policies |
| `references/db-schema-timestamps.md` | timestamptz for new columns | timestamptz on added columns |
## Workspace Setup
- A workspace with `supabase/config.toml` and a starter migration that creates
a basic `products` table (id, name, price) with RLS enabled but no policies.
## Agent Task (PROMPT.md draft)
> Our `products` table needs two new columns: `description` (text) and
> `published_at` (timestamp). Also create a view called `public_products` that
> shows only products where `published_at` is not null. Add a policy so any
> authenticated user can view published products. Put changes in a new
> migration file in `supabase/migrations/`.
## Evaluation Criteria
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | new migration file exists | A second `.sql` file in `supabase/migrations/` | structure |
| 2 | ADD COLUMN IF NOT EXISTS for description | `ADD COLUMN IF NOT EXISTS description` | idempotency |
| 3 | ADD COLUMN IF NOT EXISTS for published_at | `ADD COLUMN IF NOT EXISTS published_at` | idempotency |
| 4 | published_at uses timestamptz | `published_at timestamptz` not plain `timestamp` | correctness |
| 5 | view created | `CREATE OR REPLACE VIEW public_products` or similar | correctness |
| 6 | view uses security_invoker | `security_invoker = true` on the view | security |
| 7 | SELECT policy on products for authenticated | A FOR SELECT policy on products with TO authenticated | security |
| 8 | NOTIFY pgrst reload present | `NOTIFY pgrst` in the migration | correctness |
## Reasoning
1. **Baseline differentiator:** Agents without the skill add columns correctly
but miss `IF NOT EXISTS`, use plain `timestamp`, forget `security_invoker`
on the view, and almost never include the `NOTIFY pgrst` call.
2. **Skill value:** The NOTIFY pattern and security_invoker requirement are
non-obvious details that the reference files teach explicitly.
3. **Testability:** `NOTIFY pgrst` is a unique string that either appears or
doesn't; `security_invoker` is similarly specific.
4. **Realism:** Iterative schema evolution (adding columns to existing tables)
is the most common database task after initial setup, and the PostgREST
cache invalidation issue is a universal source of confusion.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~40% of assertions expected to pass (columns added and view
created, but no IF NOT EXISTS, wrong timestamp type, no NOTIFY, no
security_invoker)
- With skill: ~88% of assertions expected to pass
- **pass_threshold:** 7

View File

@@ -0,0 +1,85 @@
# Scenario: rls-update-needs-select
## Summary
The agent must write a migration for an `orders` table where users can view and
update only their own orders. The classic trap is writing an UPDATE policy
without a matching SELECT policy — causing UPDATE to silently affect zero rows
because RLS cannot find any rows to update.
## Real-World Justification
Why this is a common and important workflow:
1. **"Why is my UPDATE returning empty data?"** — The Supabase troubleshooting
guide lists "Why is my select returning an empty data array and I have data
in the table?" which is the same root symptom. UPDATE with no SELECT policy
silently returns `{data: [], count: 0}` with no error, making it extremely
hard to diagnose.
- Source: https://supabase.com/docs/guides/troubleshooting
2. **Documented RLS behavior** — The official RLS docs state that UPDATE
requires a SELECT policy to identify which rows are accessible for
modification. This is non-obvious and contradicts most developers'
expectations from SQL semantics.
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
3. **WITH CHECK requirement** — An UPDATE policy also needs a `WITH CHECK`
clause to prevent users from updating a row to a state that would no longer
be visible to them (e.g., changing their own `user_id`). Missing this allows
data ownership hijacking.
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
## Skill References Exercised
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-rls-common-mistakes.md` | UPDATE needs a SELECT policy; WITH CHECK clause | Separate SELECT and UPDATE policies, WITH CHECK |
| `references/db-rls-policy-types.md` | USING vs WITH CHECK semantics | WITH CHECK on UPDATE policy |
| `references/db-rls-performance.md` | (select auth.uid()) subquery caching | Subselect form in all USING/WITH CHECK |
| `references/db-rls-mandatory.md` | Enable RLS, TO authenticated | Full mandatory boilerplate |
| `references/db-schema-timestamps.md` | timestamptz for time columns | timestamptz not timestamp |
## Workspace Setup
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
## Agent Task (PROMPT.md draft)
> Create a migration for an `orders` table. Each order has a `status` (text),
> `total` (numeric), and `created_at`. Orders belong to users. Users should be
> able to view their own orders and update the status of their own orders.
> Put the migration in `supabase/migrations/`.
## Evaluation Criteria
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
| 2 | creates orders table | SQL contains `CREATE TABLE` and `orders` | correctness |
| 3 | RLS enabled | `ALTER TABLE orders ENABLE ROW LEVEL SECURITY` | security |
| 4 | has SELECT policy | A `FOR SELECT` policy exists on orders | correctness |
| 5 | has UPDATE policy with WITH CHECK | A `FOR UPDATE` policy with `WITH CHECK` clause exists | correctness |
| 6 | all policies TO authenticated | Every `CREATE POLICY` has `TO authenticated` | security |
| 7 | uses (select auth.uid()) | Subselect form in policy USING clauses | performance |
| 8 | uses timestamptz not timestamp | `created_at timestamptz` not plain `timestamp` | correctness |
| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
## Reasoning
1. **Baseline differentiator:** Without the skill, agents write only an UPDATE
policy (or a single ALL policy), skip WITH CHECK, and use bare `auth.uid()`
calls. The result is a migration that looks complete but breaks silently.
2. **Skill value:** `db-rls-common-mistakes.md` explicitly covers this
UPDATE-needs-SELECT behavior with working examples.
3. **Testability:** The presence of both `FOR SELECT` and `FOR UPDATE` with
`WITH CHECK` is directly detectable via regex on the SQL.
4. **Realism:** "My UPDATE isn't working, returns empty" is among the most
common questions from developers new to RLS in the Supabase community.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~40% of assertions expected to pass (table and RLS likely,
but wrong policy structure)
- With skill: ~92% of assertions expected to pass
- **pass_threshold:** 8

View File

@@ -0,0 +1,85 @@
# Scenario: rls-user-metadata-role-check
## Summary
The agent must write a migration for a `documents` table where admin users can
read all documents and regular users can only read their own. The dangerous
trap is checking `user_metadata` for the admin role — users can write to their
own `user_metadata`, so this check is bypassable. The correct pattern uses
`app_metadata`.
## Real-World Justification
Why this is a common and important workflow:
1. **Explicit troubleshooting + security entry** — The Supabase troubleshooting
guide covers "Database API 42501 errors" related to auth claims and RLS.
Using user_metadata for authorization is one of the most dangerous patterns,
documented as a common mistake in the Supabase RLS guides.
- Source: https://supabase.com/docs/guides/troubleshooting
2. **Privilege escalation vulnerability** — Any authenticated user can call
`supabase.auth.updateUser({ data: { role: 'admin' } })` to set their own
`user_metadata`. An RLS policy checking `user_metadata->>'role' = 'admin'`
gives every user admin access to all documents.
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
3. **app_metadata is server-only** — `app_metadata` can only be set via the
Admin API or auth hooks, making it safe for authorization. This distinction
is taught in the skill but frequently missed by developers.
- Source: https://supabase.com/docs/guides/auth/managing-user-data
## Skill References Exercised
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-rls-common-mistakes.md` | app_metadata not user_metadata for authorization | `auth.jwt() -> 'app_metadata' ->> 'role'` |
| `references/db-rls-policy-types.md` | PERMISSIVE policies combine with OR; multiple policies for different roles | Separate owner and admin policies |
| `references/db-rls-performance.md` | (select auth.uid()) subquery; (select auth.jwt()) caching | Subselect form for JWT lookups |
| `references/db-rls-mandatory.md` | RLS enabled, TO authenticated | Full boilerplate |
| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | Correct user linkage |
## Workspace Setup
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
## Agent Task (PROMPT.md draft)
> Create a migration for a `documents` table. Each document has a `title`
> (text), `content` (text), and an owner. Regular users can only see their own
> documents. Admin users (identified by a role field in their JWT) should be
> able to see all documents. Put the migration in `supabase/migrations/`.
## Evaluation Criteria
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
| 2 | creates documents table | SQL contains `CREATE TABLE` and `documents` | correctness |
| 3 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security |
| 4 | uses app_metadata not user_metadata | JWT role check uses `app_metadata` not `user_metadata` | security |
| 5 | no user_metadata role check | `user_metadata` does not appear in policy USING clauses | security |
| 6 | two separate policies or one covering both | At least one SELECT policy for owner and one for admin role | correctness |
| 7 | TO authenticated on all policies | `TO authenticated` in every policy | security |
| 8 | (select auth.uid()) subselect form | Subselect form used not bare auth.uid() | performance |
| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
## Reasoning
1. **Baseline differentiator:** Agents without the skill almost universally
reach for `user_metadata` when asked about "a role field in their JWT" —
it is the more discoverable but dangerous field. Only the skill explicitly
flags this as an authorization anti-pattern.
2. **Skill value:** `db-rls-common-mistakes.md` section 2 directly addresses
this with the exact `app_metadata` pattern.
3. **Testability:** Checking for `app_metadata` presence and `user_metadata`
absence in policy USING clauses is a precise regex assertion.
4. **Realism:** Role-based access in a multi-tenant app is one of the most
common RLS patterns requested, and the metadata confusion is universal.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~30% of assertions expected to pass (table and RLS likely,
but user_metadata used, subselect missing)
- With skill: ~90% of assertions expected to pass
- **pass_threshold:** 8

View File

@@ -0,0 +1,86 @@
# Scenario: service-role-edge-function
## Summary
The agent must create a simple Edge Function that performs an admin operation
(listing all users' records) using the service role key server-side, while
a companion migration shows the table uses the anon key for browser access.
The trap is accidentally exposing the service role key or using it in
client-facing code.
## Real-World Justification
Why this is a common and important workflow:
1. **Dedicated troubleshooting entry** — The Supabase troubleshooting guide
contains "Why is my service role key client getting RLS errors or not
returning data?" — developers incorrectly use the service role key in
contexts where it should not be used, or use the anon key where service role
is needed.
- Source: https://supabase.com/docs/guides/troubleshooting
2. **Most dangerous Supabase mistake** — Exposing the service role key in
browser JavaScript bypasses all RLS and gives every visitor full database
access. This appears in multiple Supabase blog posts and community warnings.
- Source: https://supabase.com/docs/guides/api/api-keys
3. **Environment variable leakage** — The troubleshooting guide warns about
"Inspecting edge function environment variables" as a debugging topic.
Developers must use `Deno.env.get()` not hardcoded keys, and never use
`NEXT_PUBLIC_` prefix for the service role key.
- Source: https://supabase.com/docs/guides/troubleshooting
## Skill References Exercised
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-security-service-role.md` | Never expose service role key in browser, use env vars | `Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')` in edge function |
| `references/edge-fun-quickstart.md` | Edge function file structure and exports | Correct `index.ts` in `supabase/functions/` |
| `references/edge-db-supabase-client.md` | Creating supabase client in edge functions | `createClient` with service role for admin ops |
| `references/edge-pat-cors.md` | CORS headers for browser requests | CORS on the response |
| `references/edge-pat-error-handling.md` | Error responses | Proper error handling |
## Workspace Setup
- Empty workspace with a pre-initialized `supabase/config.toml`
- A migration creating a `reports` table already exists in `supabase/migrations/`
## Agent Task (PROMPT.md draft)
> Create an Edge Function called `admin-reports` that returns all rows from
> the `reports` table, bypassing RLS (it's an admin-only endpoint). The
> function should be in `supabase/functions/admin-reports/index.ts`. Use
> environment variables for any keys — do not hardcode them.
## Evaluation Criteria
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | edge function file exists | `supabase/functions/admin-reports/index.ts` exists | structure |
| 2 | uses Deno.env.get for service key | `Deno.env.get` used to retrieve the service role key | security |
| 3 | no hardcoded service role key | No JWT-like string literal (`eyJ`) as the service role value | security |
| 4 | createClient called with service role | `createClient` receives the service role env var as second arg | correctness |
| 5 | service role key not NEXT_PUBLIC prefixed | No `NEXT_PUBLIC_` prefix on service role variable name | security |
| 6 | CORS headers present | `Access-Control-Allow-Origin` in response headers | correctness |
| 7 | returns JSON response | `Response` with JSON body and content-type | correctness |
## Reasoning
1. **Baseline differentiator:** Agents without the skill sometimes hardcode a
placeholder key string, forget CORS, or use the wrong env variable name
pattern.
2. **Skill value:** `db-security-service-role.md` is explicit about env var
naming rules and the `NEXT_PUBLIC_` anti-pattern. `edge-fun-quickstart.md`
teaches the Deno.env.get pattern.
3. **Testability:** Checking for `eyJ` hardcoded strings and `NEXT_PUBLIC_`
prefixes are reliable negative assertions. `Deno.env.get` is a positive
string check.
4. **Realism:** Admin Edge Functions that bypass RLS are an extremely common
pattern for dashboards and data exports.
## Difficulty
**Rating:** EASY
- Without skill: ~50% of assertions expected to pass (file exists, createClient
present, but key handling likely wrong)
- With skill: ~93% of assertions expected to pass
- **pass_threshold:** 6

View File

@@ -141,4 +141,5 @@ Step-by-step reasoning for why this scenario is well-designed:
**Rating:** MEDIUM **Rating:** MEDIUM
- Without skill: ~30-45% of assertions expected to pass - Without skill: ~30-45% of assertions expected to pass
- With skill: ~85-95% of assertions expected to pass - With skill: ~85-95% of assertions expected to pass
- **pass_threshold:** 14

View File

@@ -100,8 +100,9 @@ specific quality signal:
| 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance | | 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance |
| 13 | uses timestamptz | No plain `timestamp` for time columns | correctness | | 13 | uses timestamptz | No plain `timestamp` for time columns | correctness |
| 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency | | 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency |
| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security | | 15 | stable or immutable on helper function | Helper function marked STABLE or IMMUTABLE for performance | performance |
| 16 | overall quality score | At least 10/14 best-practice signals present | overall | | 16 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
| 17 | overall quality score | At least 11/15 best-practice signals present | overall |
## Reasoning ## Reasoning
@@ -136,4 +137,5 @@ Step-by-step reasoning for why this scenario is well-designed:
**Rating:** MEDIUM **Rating:** MEDIUM
- Without skill: ~35-50% of assertions expected to pass - Without skill: ~35-50% of assertions expected to pass
- With skill: ~85-95% of assertions expected to pass - With skill: ~85-95% of assertions expected to pass
- **pass_threshold:** 13

View File

@@ -0,0 +1,21 @@
/**
* A single assertion to run against the agent's workspace output.
*
* Used by EVAL.ts files to declare what the agent's work should produce.
* The runner executes these in-process (no test framework required).
*/
export interface EvalAssertion {
/** Human-readable name shown in Braintrust and local output */
name: string;
/** Return true = pass, false/throw = fail */
check: () => boolean | Promise<boolean>;
/** Timeout in ms for async checks (default: no timeout) */
timeout?: number;
}
/** Result of running a single EvalAssertion */
export interface AssertionResult {
name: string;
passed: boolean;
error?: string;
}

View File

@@ -1,11 +1,8 @@
import { existsSync, readdirSync, readFileSync } from "node:fs"; import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join, resolve } from "node:path"; import { join, resolve } from "node:path";
import type { AssertionResult, EvalAssertion } from "./eval-types.js";
import { runAgent } from "./runner/agent.js"; import { runAgent } from "./runner/agent.js";
import { import { uploadToBraintrust } from "./runner/braintrust.js";
initBraintrustLogger,
logScenarioToLogger,
uploadToBraintrust,
} from "./runner/braintrust.js";
import { createResultDir, saveRunArtifacts } from "./runner/persist.js"; import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js"; import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js"; import { listModifiedFiles, printSummary } from "./runner/results.js";
@@ -22,7 +19,6 @@ import {
startSupabase, startSupabase,
stopSupabase, stopSupabase,
} from "./runner/supabase-setup.js"; } from "./runner/supabase-setup.js";
import { runTests } from "./runner/test.js";
import { import {
buildTranscriptSummary, buildTranscriptSummary,
type TranscriptSummary, type TranscriptSummary,
@@ -92,6 +88,40 @@ function getPassThreshold(scenarioId: string): number | null {
return match ? Number.parseInt(match[1], 10) : null; return match ? Number.parseInt(match[1], 10) : null;
} }
// ---------------------------------------------------------------------------
// In-process assertion runner (replaces vitest subprocess)
// ---------------------------------------------------------------------------
async function runAssertions(
assertions: EvalAssertion[],
): Promise<AssertionResult[]> {
return Promise.all(
assertions.map(async (a) => {
try {
let result: boolean;
if (a.timeout) {
const timeoutPromise = new Promise<never>((_, reject) =>
setTimeout(
() =>
reject(new Error(`Assertion timed out after ${a.timeout}ms`)),
a.timeout,
),
);
result = await Promise.race([
Promise.resolve(a.check()),
timeoutPromise,
]);
} else {
result = await Promise.resolve(a.check());
}
return { name: a.name, passed: Boolean(result) };
} catch (e) {
return { name: a.name, passed: false, error: String(e) };
}
}),
);
}
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Run a single eval // Run a single eval
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@@ -106,18 +136,28 @@ async function runEval(
console.log(`\n--- ${scenario.id} (${variant}) ---`); console.log(`\n--- ${scenario.id} (${variant}) ---`);
// Load assertions and expected reference files from EVAL.ts
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
? join(evalDir, "EVAL.tsx")
: join(evalDir, "EVAL.ts");
const {
assertions = [] as EvalAssertion[],
expectedReferenceFiles = [] as string[],
} = await import(evalFilePath).catch(() => ({
assertions: [] as EvalAssertion[],
expectedReferenceFiles: [] as string[],
}));
const passThreshold = getPassThreshold(scenario.id);
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
// 1. Create isolated workspace // 1. Create isolated workspace
const { workspacePath, cleanup } = createWorkspace({ const { workspacePath, cleanup } = createWorkspace({ evalDir, skillEnabled });
evalDir,
skillEnabled,
});
console.log(` Workspace: ${workspacePath}`); console.log(` Workspace: ${workspacePath}`);
try { try {
// 2. Read the prompt // 2. Run the agent
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
// 3. Run the agent
console.log(` Running agent (${model})...`); console.log(` Running agent (${model})...`);
const startedAt = Date.now(); const startedAt = Date.now();
const agentResult = await runAgent({ const agentResult = await runAgent({
@@ -132,54 +172,48 @@ async function runEval(
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`, ` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
); );
// 4. Run the hidden tests // 3. Run assertions in-process from the workspace directory so that
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx")) // eval-utils.ts helpers resolve paths relative to the workspace.
? join(evalDir, "EVAL.tsx") console.log(" Running assertions...");
: join(evalDir, "EVAL.ts"); const prevCwd = process.cwd();
process.chdir(workspacePath);
const passThreshold = getPassThreshold(scenario.id); const assertionResults = await runAssertions(assertions).finally(() => {
process.chdir(prevCwd);
console.log(" Running tests...");
const testResult = await runTests({
workspacePath,
evalFilePath,
passThreshold: passThreshold ?? undefined,
}); });
const passedCount = assertionResults.filter((a) => a.passed).length;
const totalCount = assertionResults.length;
const passed = passThreshold
? totalCount > 0 && passedCount >= passThreshold
: totalCount > 0 && passedCount === totalCount;
const pct = const pct =
testResult.totalCount > 0 totalCount > 0 ? ((passedCount / totalCount) * 100).toFixed(1) : "0.0";
? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1)
: "0.0";
const thresholdInfo = passThreshold const thresholdInfo = passThreshold
? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%` ? `, threshold: ${((passThreshold / totalCount) * 100).toFixed(0)}%`
: ""; : "";
console.log( console.log(
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`, ` Assertions: ${passedCount}/${totalCount} passed (${pct}%${thresholdInfo})`,
); );
// 5. Collect modified files // 4. Collect modified files
const filesModified = listModifiedFiles(workspacePath, evalDir); const filesModified = listModifiedFiles(workspacePath, evalDir);
// 6. Build transcript summary // 5. Build transcript summary
const summary = buildTranscriptSummary(agentResult.events); const summary = buildTranscriptSummary(agentResult.events);
// 7. Load expectedReferenceFiles from EVAL.ts (if declared) // 6. Run scorers
const { expectedReferenceFiles = [] } = await import(evalFilePath).catch(
() => ({ expectedReferenceFiles: [] as string[] }),
);
// 8. Run scorers
const skillScore = skillUsageScorer(summary, skillName); const skillScore = skillUsageScorer(summary, skillName);
const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles); const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles);
const assertScore = assertionsPassedScorer({ const assertScore = assertionsPassedScorer({
testsPassed: testResult.passedCount, testsPassed: passedCount,
testsTotal: testResult.totalCount, testsTotal: totalCount,
status: testResult.passed ? "passed" : "failed", status: passed ? "passed" : "failed",
} as EvalRunResult); } as EvalRunResult);
const finalScore = finalResultScorer({ const finalScore = finalResultScorer({
status: testResult.passed ? "passed" : "failed", status: passed ? "passed" : "failed",
testsPassed: testResult.passedCount, testsPassed: passedCount,
testsTotal: testResult.totalCount, testsTotal: totalCount,
passThreshold: passThreshold ?? undefined, passThreshold: passThreshold ?? undefined,
} as EvalRunResult); } as EvalRunResult);
@@ -188,18 +222,17 @@ async function runEval(
agent: "claude-code", agent: "claude-code",
model, model,
skillEnabled, skillEnabled,
status: testResult.passed ? "passed" : "failed", status: passed ? "passed" : "failed",
duration: agentResult.duration, duration: agentResult.duration,
testOutput: testResult.output,
agentOutput: agentResult.output, agentOutput: agentResult.output,
testsPassed: testResult.passedCount, testsPassed: passedCount,
testsTotal: testResult.totalCount, testsTotal: totalCount,
passThreshold: passThreshold ?? undefined, passThreshold: passThreshold ?? undefined,
assertionResults,
filesModified, filesModified,
toolCallCount: summary.toolCalls.length, toolCallCount: summary.toolCalls.length,
costUsd: summary.totalCostUsd ?? undefined, costUsd: summary.totalCostUsd ?? undefined,
prompt, prompt,
individualTests: testResult.individualTests,
startedAt, startedAt,
durationApiMs: summary.totalDurationApiMs, durationApiMs: summary.totalDurationApiMs,
totalInputTokens: summary.totalInputTokens, totalInputTokens: summary.totalInputTokens,
@@ -225,7 +258,7 @@ async function runEval(
saveRunArtifacts({ saveRunArtifacts({
resultDir, resultDir,
rawTranscript: agentResult.rawTranscript, rawTranscript: agentResult.rawTranscript,
testOutput: testResult.output, assertionResults,
result, result,
transcriptSummary: summary, transcriptSummary: summary,
}); });
@@ -241,7 +274,6 @@ async function runEval(
skillEnabled, skillEnabled,
status: "error", status: "error",
duration: 0, duration: 0,
testOutput: "",
agentOutput: "", agentOutput: "",
testsPassed: 0, testsPassed: 0,
testsTotal: 0, testsTotal: 0,
@@ -281,7 +313,7 @@ async function main() {
startSupabase(); startSupabase();
const keys = getKeys(); const keys = getKeys();
// Inject keys into process.env so EVAL.ts tests can connect to the real DB. // Inject keys into process.env so assertions can connect to the real DB.
process.env.SUPABASE_URL = keys.apiUrl; process.env.SUPABASE_URL = keys.apiUrl;
process.env.SUPABASE_ANON_KEY = keys.anonKey; process.env.SUPABASE_ANON_KEY = keys.anonKey;
process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey; process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey;
@@ -291,7 +323,6 @@ async function main() {
const transcripts = new Map<string, TranscriptSummary>(); const transcripts = new Map<string, TranscriptSummary>();
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true"; const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
const logger = braintrustUpload ? initBraintrustLogger() : undefined;
try { try {
for (const scenario of scenarios) { for (const scenario of scenarios) {
@@ -304,15 +335,9 @@ async function main() {
if (transcript) { if (transcript) {
transcripts.set(result.scenario, transcript); transcripts.set(result.scenario, transcript);
} }
// Log immediately after each scenario for real-time visibility.
if (logger) {
logScenarioToLogger(logger, result, transcript);
}
} }
} finally { } finally {
stopSupabase(); stopSupabase();
await logger?.flush();
} }
// Use the results dir from the first result (all share the same timestamp) // Use the results dir from the first result (all share the same timestamp)

View File

@@ -70,7 +70,7 @@ export function logScenarioToLogger(
status: r.status, status: r.status,
agentOutput: r.agentOutput, agentOutput: r.agentOutput,
filesModified: r.filesModified, filesModified: r.filesModified,
testOutput: r.testOutput, assertionResults: r.assertionResults,
}, },
expected: { testsTotal: r.testsTotal }, expected: { testsTotal: r.testsTotal },
scores, scores,
@@ -106,7 +106,7 @@ export function logScenarioToLogger(
status: r.status, status: r.status,
agentOutput: r.agentOutput, agentOutput: r.agentOutput,
filesModified: r.filesModified, filesModified: r.filesModified,
testOutput: r.testOutput, assertionResults: r.assertionResults,
}, },
expected: { testsTotal: r.testsTotal }, expected: { testsTotal: r.testsTotal },
scores, scores,
@@ -121,7 +121,7 @@ export function logScenarioToLogger(
* *
* Each EvalRunResult becomes a row in the experiment with: * Each EvalRunResult becomes a row in the experiment with:
* - input: scenario ID, prompt content, skillEnabled flag * - input: scenario ID, prompt content, skillEnabled flag
* - output: status, agent output, files modified, test output * - output: status, agent output, files modified, assertion results
* - expected: total tests, pass threshold * - expected: total tests, pass threshold
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result * - scores: skill_usage, reference_files_usage, assertions_passed, final_result
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost * - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
@@ -172,7 +172,7 @@ export async function uploadToBraintrust(
status: r.status, status: r.status,
agentOutput: r.agentOutput, agentOutput: r.agentOutput,
filesModified: r.filesModified, filesModified: r.filesModified,
testOutput: r.testOutput, assertionResults: r.assertionResults,
}; };
const expected = { const expected = {

View File

@@ -1,6 +1,7 @@
import { mkdirSync, writeFileSync } from "node:fs"; import { mkdirSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path"; import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url"; import { fileURLToPath } from "node:url";
import type { AssertionResult } from "../eval-types.js";
import type { EvalRunResult } from "../types.js"; import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js"; import type { TranscriptSummary } from "./transcript.js";
@@ -32,7 +33,7 @@ export function createResultDir(
export function saveRunArtifacts(opts: { export function saveRunArtifacts(opts: {
resultDir: string; resultDir: string;
rawTranscript: string; rawTranscript: string;
testOutput: string; assertionResults: AssertionResult[];
result: EvalRunResult; result: EvalRunResult;
transcriptSummary: TranscriptSummary; transcriptSummary: TranscriptSummary;
}): void { }): void {
@@ -43,8 +44,8 @@ export function saveRunArtifacts(opts: {
); );
writeFileSync( writeFileSync(
join(opts.resultDir, "test-output.txt"), join(opts.resultDir, "assertions.json"),
opts.testOutput, JSON.stringify(opts.assertionResults, null, 2),
"utf-8", "utf-8",
); );

View File

@@ -63,7 +63,7 @@ export function referenceFilesUsageScorer(
} }
/** /**
* assertionsPassedScorer — ratio of vitest assertions passed vs total. * assertionsPassedScorer — ratio of assertions passed vs total.
*/ */
export function assertionsPassedScorer(result: EvalRunResult): ScoreResult { export function assertionsPassedScorer(result: EvalRunResult): ScoreResult {
const score = const score =

View File

@@ -1,143 +0,0 @@
import { execFile } from "node:child_process";
import { copyFileSync, existsSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { promisify } from "node:util";
// Reconstruct __dirname for this module (ESM has no __dirname built-in).
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Promise-returning execFile: lets us `await` the vitest subprocess and
// receive its captured stdout/stderr.
const exec = promisify(execFile);
/**
 * Aggregated outcome of one vitest run against a workspace's hidden EVAL file.
 *
 * Produced by {@link runTests} from the parsed vitest verbose output.
 */
export interface TestResult {
  // Whether the run counted as passing. NOTE(review): the exact criterion
  // (all tests vs. meeting passThreshold) is decided in parseTestOutput —
  // confirm there.
  passed: boolean;
  /** Raw combined stdout + stderr captured from the vitest subprocess. */
  output: string;
  /** Number of tests that passed */
  passedCount: number;
  /** Total number of tests */
  totalCount: number;
  /** Per-test pass/fail extracted from vitest verbose output */
  individualTests: Record<string, boolean>;
}
/**
 * Run the hidden EVAL.ts tests against the agent's workspace.
 *
 * 1. Copy EVAL.ts into the workspace (agent is done, safe to expose)
 * 2. Run vitest against it
 * 3. Parse the output for pass/fail
 *
 * @param opts.workspacePath - Root of the agent's isolated workspace; vitest
 *   runs with this as its cwd.
 * @param opts.evalFilePath - Absolute path to the hidden EVAL.ts/EVAL.tsx
 *   file to copy in and execute.
 * @param opts.passThreshold - Optional minimum passing-test count forwarded
 *   to parseTestOutput; when omitted, all tests presumably must pass.
 * @returns Parsed pass/fail counts and raw output (see TestResult).
 */
export async function runTests(opts: {
  workspacePath: string;
  evalFilePath: string;
  passThreshold?: number;
}): Promise<TestResult> {
  // Copy the hidden test file into the workspace
  const evalFileName = opts.evalFilePath.endsWith(".tsx")
    ? "EVAL.tsx"
    : "EVAL.ts";
  const destPath = join(opts.workspacePath, evalFileName);
  copyFileSync(opts.evalFilePath, destPath);
  // Copy shared eval-utils.ts if it exists alongside the eval scenarios
  // (the source tree keeps it one directory above the per-scenario EVAL file).
  const evalUtilsSrc = join(
    dirname(dirname(opts.evalFilePath)),
    "eval-utils.ts",
  );
  if (existsSync(evalUtilsSrc)) {
    copyFileSync(evalUtilsSrc, join(opts.workspacePath, "eval-utils.ts"));
  }
  // Write a minimal vitest config that overrides the default include pattern
  // so EVAL.ts (without .test. or .spec.) is picked up.
  // Only written when absent, so a workspace-provided config wins.
  const vitestConfigPath = join(opts.workspacePath, "vitest.config.mjs");
  if (!existsSync(vitestConfigPath)) {
    // Alias ../eval-utils.ts → ./eval-utils.ts so the import resolves in
    // the flat workspace (source tree has EVAL.ts one level deeper).
    const evalUtilsDest = join(opts.workspacePath, "eval-utils.ts");
    const aliasBlock = existsSync(evalUtilsDest)
      ? `resolve: { alias: { "../eval-utils.ts": "./eval-utils.ts" } },`
      : "";
    writeFileSync(
      vitestConfigPath,
      `export default { ${aliasBlock} test: { include: ["EVAL.{ts,tsx}"] } };\n`,
    );
  }
  // Use the vitest binary from the evals package (always available);
  // __dirname is this module's directory, so ../../node_modules/.bin
  // resolves to the package's own install.
  const evalsVitest = join(
    __dirname,
    "..",
    "..",
    "node_modules",
    ".bin",
    "vitest",
  );
  // Prefer a workspace-local vitest if the agent installed one.
  const vitestBin = join(opts.workspacePath, "node_modules", ".bin", "vitest");
  const cmd = existsSync(vitestBin) ? vitestBin : evalsVitest;
  // --reporter=verbose prints one ✓/× line per test, which
  // parseIndividualTests depends on; --no-color keeps the output parseable.
  const args = ["run", evalFileName, "--reporter=verbose", "--no-color"];
  try {
    const { stdout, stderr } = await exec(cmd, args, {
      cwd: opts.workspacePath,
      timeout: 60_000, // hard cap: kill a hung test run after 60s
      env: { ...process.env },
      maxBuffer: 5 * 1024 * 1024, // 5 MiB — verbose output can be large
    });
    const output = `${stdout}\n${stderr}`;
    return parseTestOutput(output, opts.passThreshold);
  } catch (error) {
    // vitest exits non-zero when any test fails, which execFile surfaces as
    // a throw — still parse the captured output instead of propagating.
    const err = error as Error & { stdout?: string; stderr?: string };
    const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
    return parseTestOutput(output, opts.passThreshold);
  }
}
/**
 * Extract per-test pass/fail from vitest verbose output.
 *
 * Vitest verbose format:
 *   ✓ EVAL.ts > test name here 0ms  → passed
 *   × EVAL.ts > test name here 2ms  → failed
 *
 * @param output - Combined stdout/stderr of a vitest run.
 * @returns Map of test name → whether it passed. Duplicate names clobber.
 */
function parseIndividualTests(output: string): Record<string, boolean> {
  const results: Record<string, boolean> = {};
  // Capture the status marker in a group instead of re-reading the source
  // string at match.index — the old `output[match.index]` trick silently
  // breaks if the pattern ever gains a prefix before the marker, and
  // `index` is optionally typed on RegExpMatchArray.
  const re = /([✓×])\s+EVAL\.tsx?\s+>\s+(.+?)\s+\d+ms/g;
  for (const match of output.matchAll(re)) {
    const testName = match[2].trim();
    results[testName] = match[1] === "✓";
  }
  return results;
}
/**
 * Parse vitest output into pass/fail counts and an overall verdict.
 *
 * Summary-line formats handled:
 *   All passing: "Tests  N passed (N)"
 *   Mixed:       "Tests  N failed | M passed (T)"
 *   All failing: "Tests  N failed (N)"
 * NOTE(review): summaries that include a "skipped" count are not matched —
 * confirm eval scenarios never produce skipped tests.
 *
 * @param output        - Combined stdout/stderr of the vitest run.
 * @param passThreshold - Minimum passing tests to count the run as passed;
 *                        when undefined, every test must pass.
 */
function parseTestOutput(output: string, passThreshold?: number): TestResult {
  const mixedOrPassing = output.match(
    /Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
  );
  const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/);
  let passedCount = 0;
  let totalCount = 0;
  if (mixedOrPassing) {
    passedCount = Number.parseInt(mixedOrPassing[2], 10);
    totalCount = Number.parseInt(mixedOrPassing[3], 10);
  } else if (allFailing) {
    totalCount = Number.parseInt(allFailing[2], 10);
  }
  // Explicit undefined check: the previous truthiness test (`passThreshold ?`)
  // treated a threshold of 0 as "no threshold" and then required a perfect
  // score — a threshold of 0 should be trivially satisfied instead.
  // A run with zero detected tests never passes (output was unparseable).
  const passed =
    passThreshold !== undefined
      ? totalCount > 0 && passedCount >= passThreshold
      : totalCount > 0 && passedCount === totalCount;
  const individualTests = parseIndividualTests(output);
  return { passed, output, passedCount, totalCount, individualTests };
}

View File

@@ -1,3 +1,5 @@
import type { AssertionResult } from "./eval-types.js";
export interface EvalScenario { export interface EvalScenario {
/** Directory name under evals/ */ /** Directory name under evals/ */
id: string; id: string;
@@ -23,14 +25,17 @@ export interface EvalRunResult {
skillEnabled: boolean; skillEnabled: boolean;
status: "passed" | "failed" | "error"; status: "passed" | "failed" | "error";
duration: number; duration: number;
testOutput: string; /** Raw test runner output (for debugging) */
testOutput?: string;
agentOutput: string; agentOutput: string;
/** Number of vitest tests that passed */ /** Number of assertions that passed */
testsPassed: number; testsPassed: number;
/** Total number of vitest tests */ /** Total number of assertions */
testsTotal: number; testsTotal: number;
/** Minimum tests required to pass (from scenario config) */ /** Minimum tests required to pass (from scenario config) */
passThreshold?: number; passThreshold?: number;
/** Per-assertion pass/fail results */
assertionResults?: AssertionResult[];
/** Files the agent created or modified in the workspace */ /** Files the agent created or modified in the workspace */
filesModified: string[]; filesModified: string[];
error?: string; error?: string;
@@ -42,8 +47,6 @@ export interface EvalRunResult {
costUsd?: number; costUsd?: number;
/** The PROMPT.md content sent to the agent */ /** The PROMPT.md content sent to the agent */
prompt?: string; prompt?: string;
/** Per-test pass/fail results from vitest */
individualTests?: Record<string, boolean>;
/** Epoch ms when the agent run started (for Braintrust span timing) */ /** Epoch ms when the agent run started (for Braintrust span timing) */
startedAt?: number; startedAt?: number;
/** API-only latency in ms (excludes local processing overhead) */ /** API-only latency in ms (excludes local processing overhead) */