two more scenarios; Claude Code CLI is now a dependency

2026-03-27 10:09:26 +08:00 · 2026-02-20 15:02:59 +00:00
parent 9a23c6b021
commit e03bc99ebb
24 changed files with 1766 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,6 @@ dist/
 # Generated skills in any dot directory
 .*/skills/
 .claude/
 # Eval results (local debugging artifacts)
 packages/evals/results/
--- a/packages/evals/AGENTS.md
+++ b/packages/evals/AGENTS.md
@@ -20,6 +20,12 @@ hidden tests check the result. Binary pass/fail.
 The agent is **Claude Code** invoked via `claude -p` (print mode). It operates
 on a real filesystem in a temp directory and can read/write files freely.
 **Important**: MCP servers are disabled via `--strict-mcp-config` with an empty
 config. This ensures the agent uses only local tools (Bash, Edit, Write, Read,
 Glob, Grep) and cannot access remote services like Supabase MCP or Neon. All
 work must happen on the local filesystem — e.g., creating migration files in
 `supabase/migrations/`, not applying them to a remote project.
 ## Eval Structure
 Each eval lives in `evals/{scenario-name}/`:
--- a/packages/evals/evals/storage-rls-user-folders/EVAL.ts
+++ b/packages/evals/evals/storage-rls-user-folders/EVAL.ts
@@ -0,0 +1,252 @@
 import { existsSync, readdirSync, readFileSync } from "node:fs";
 import { join } from "node:path";
 import { expect, test } from "vitest";
 /** Supabase project directory inside the current working directory. */
 const supabaseDir = join(process.cwd(), "supabase");
 /** Directory that is scanned for the agent's SQL migration files. */
 const migrationsDir = join(supabaseDir, "migrations");
 /**
  * Find all .sql migration files (agent may create one or more).
  *
  * File names are sorted before being turned into paths: the order returned by
  * `readdirSync` depends on the platform/filesystem, and callers concatenate
  * the files, so an unsorted listing would make the combined SQL (and any
  * regex spanning several statements) non-deterministic.
  *
  * @returns Paths of every `*.sql` entry in the migrations directory, in
  *   ascending file-name order; an empty array when the directory is missing.
  */
 function findMigrationFiles(): string[] {
 	if (!existsSync(migrationsDir)) return [];
 	return readdirSync(migrationsDir)
 		.filter((f) => f.endsWith(".sql"))
 		.sort()
 		.map((f) => join(migrationsDir, f));
 }
 /**
  * Load every migration file and merge the contents into one SQL string.
  *
  * @returns The UTF-8 contents of all migration files joined with newlines.
  * @throws Error when no .sql file is present in supabase/migrations/.
  */
 function getMigrationSQL(): string {
 	const migrationPaths = findMigrationFiles();
 	if (migrationPaths.length === 0) {
 		throw new Error("No migration file found in supabase/migrations/");
 	}
 	const contents: string[] = [];
 	for (const migrationPath of migrationPaths) {
 		contents.push(readFileSync(migrationPath, "utf-8"));
 	}
 	return contents.join("\n");
 }
 // The agent must have written at least one .sql file into supabase/migrations/.
 test("migration file exists", () => {
 	const migrationCount = findMigrationFiles().length;
 	expect(migrationCount).toBeGreaterThan(0);
 });
 test("creates avatars bucket", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// The migration has to touch storage.buckets and mention both the bucket id
 	// and the public flag somewhere.
 	for (const needle of [/storage\.buckets/, /avatars/, /public/]) {
 		expect(migration).toMatch(needle);
 	}
 	// The INSERT statement naming the avatars bucket must contain `true`
 	// (taken as the public flag being set).
 	const insertStatement = migration.match(
 		/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
 	);
 	expect(insertStatement).not.toBeNull();
 	if (insertStatement !== null) {
 		expect(insertStatement[0]).toMatch(/true/);
 	}
 });
 test("creates documents bucket", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	expect(migration).toMatch(/documents/);
 	// The INSERT statement naming the documents bucket must contain `false`
 	// (taken as the bucket being private).
 	const insertStatement = migration.match(
 		/insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
 	);
 	expect(insertStatement).not.toBeNull();
 	if (insertStatement !== null) {
 		expect(insertStatement[0]).toMatch(/false/);
 	}
 });
 test("avatars bucket has mime type restriction", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// The allowed_mime_types column plus each of the three image types
 	// (JPEG, PNG, WebP) has to appear in the migration.
 	const requiredFragments = [
 		/allowed_mime_types/,
 		/image\/jpeg/,
 		/image\/png/,
 		/image\/webp/,
 	];
 	for (const fragment of requiredFragments) {
 		expect(migration).toMatch(fragment);
 	}
 });
 test("avatars bucket has file size limit", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	expect(migration).toMatch(/file_size_limit/);
 	// Accepted spellings of the 2MB limit: raw byte count, a "2m…" string form
 	// (2MB, 2MiB, ...), or the arithmetic expression 2 * 1024 * 1024.
 	const acceptedSpellings = [
 		/2097152/,
 		/2\s*m/i,
 		/2\s*\*\s*1024\s*\*\s*1024/,
 	];
 	const limitFound = acceptedSpellings.some((spelling) =>
 		spelling.test(migration),
 	);
 	expect(limitFound).toBe(true);
 });
 test("storage policy uses foldername or path for user isolation", () => {
 	const sql = getMigrationSQL().toLowerCase();
 	// Preferred form: storage.foldername(name) compared against the caller's uid.
 	const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
 	// Fallbacks: a parenthesised storage.foldername(...) call with any argument,
 	// or `name` and auth.uid() appearing on the same line (direct path matching).
 	const usesPathMatch =
 		/\(\s*storage\.foldername\s*\(/.test(sql) ||
 		/\bname\b.*auth\.uid\(\)/.test(sql);
 	expect(usesFoldername || usesPathMatch).toBe(true);
 	// auth.uid() must be cast to text for comparison with the folder name.
 	// Both `auth.uid()::text` and the subselect form `(select auth.uid())::text`
 	// are accepted: the optional `\)` lets the cast follow the closing
 	// parenthesis of the subselect.
 	expect(sql).toMatch(/auth\.uid\(\)\s*\)?\s*::\s*text/);
 });
 test("storage policy uses TO authenticated", () => {
 	// Everything below works on the lower-cased SQL, so no further case
 	// normalisation is needed on the individual policy statements.
 	const sql = getMigrationSQL().toLowerCase();
 	// One entry per CREATE POLICY statement (up to its terminating semicolon).
 	const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/g) ?? [];
 	const storagePolicies = policyBlocks.filter((p) =>
 		p.includes("storage.objects"),
 	);
 	// At least one storage policy must be granted TO authenticated. A policy
 	// that is only granted TO public does not satisfy this check.
 	const hasAuthenticatedPolicy = storagePolicies.some((p) =>
 		/to\s+authenticated/.test(p),
 	);
 	expect(hasAuthenticatedPolicy).toBe(true);
 	// Specifically, upload/insert policies should be TO authenticated (not public)
 	const insertPolicies = storagePolicies.filter((p) => /for\s+insert/.test(p));
 	for (const policy of insertPolicies) {
 		expect(policy).toMatch(/to\s+authenticated/);
 	}
 });
 test("public read policy for avatars", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	const allPolicies = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
 	// Keep only SELECT policies on storage.objects that mention the avatars bucket.
 	const avatarReaders = allPolicies.filter((statement) => {
 		const text = statement.toLowerCase();
 		if (!text.includes("storage.objects")) return false;
 		if (!/for\s+select/.test(text)) return false;
 		return text.includes("avatars");
 	});
 	expect(avatarReaders.length).toBeGreaterThan(0);
 	// At least one of them has to be granted TO public or TO anon.
 	const openToEveryone = avatarReaders.some((statement) => {
 		const text = statement.toLowerCase();
 		return /to\s+public/.test(text) || /to\s+anon/.test(text);
 	});
 	expect(openToEveryone).toBe(true);
 });
 test("documents bucket is fully private", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	const allPolicies = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
 	// Policies on storage.objects that mention the documents bucket.
 	const documentPolicies = allPolicies.filter((statement) => {
 		const text = statement.toLowerCase();
 		return text.includes("storage.objects") && text.includes("documents");
 	});
 	expect(documentPolicies.length).toBeGreaterThan(0);
 	// First pass: no documents policy may be granted to public or anon.
 	documentPolicies.forEach((statement) => {
 		expect(statement).not.toMatch(/to\s+public/);
 		expect(statement).not.toMatch(/to\s+anon/);
 	});
 	// Second pass: every documents policy must be granted TO authenticated.
 	documentPolicies.forEach((statement) => {
 		expect(statement).toMatch(/to\s+authenticated/);
 	});
 });
 test("creates file_metadata table", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// Two independent checks: some CREATE TABLE exists, and the table name
 	// file_metadata appears somewhere in the migration.
 	for (const fragment of [/create\s+table/, /file_metadata/]) {
 		expect(migration).toMatch(fragment);
 	}
 });
 test("file_metadata has FK to auth.users with CASCADE", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// Checked anywhere in the migration, not only inside the file_metadata
 	// table definition: a reference to auth.users and an ON DELETE CASCADE.
 	const referencesAuthUsers = /references\s+auth\.users/;
 	const cascadesOnDelete = /on\s+delete\s+cascade/;
 	expect(migration).toMatch(referencesAuthUsers);
 	expect(migration).toMatch(cascadesOnDelete);
 });
 test("RLS enabled on file_metadata", () => {
 	const sql = getMigrationSQL().toLowerCase();
 	// `[^;]*?` (instead of `.*`) lets the statement span several lines, e.g.
 	//   alter table public.file_metadata
 	//     enable row level security;
 	// while still keeping the whole match inside a single ALTER TABLE statement.
 	expect(sql).toMatch(
 		/alter\s+table[^;]*?file_metadata[^;]*?enable\s+row\s+level\s+security/,
 	);
 });
 test("file_metadata policies use (select auth.uid())", () => {
 	// Original casing is kept here; the literal "auth.uid()" lookup below is
 	// therefore case-sensitive.
 	const rawSql = getMigrationSQL();
 	const statements = rawSql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
 	// Policies that mention file_metadata and call auth.uid() at all.
 	const policiesCallingUid = statements
 		.filter((statement) => statement.toLowerCase().includes("file_metadata"))
 		.filter((statement) => statement.includes("auth.uid()"));
 	// Each of them must contain the subselect form at least once.
 	for (const statement of policiesCallingUid) {
 		expect(statement).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
 	}
 });
 test("uses timestamptz for time columns", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// Nothing to verify unless one of the usual time columns is present.
 	const timeColumns = ["created_at", "updated_at", "uploaded_at"];
 	const definesTimeColumn = timeColumns.some((column) =>
 		migration.includes(column),
 	);
 	if (!definesTimeColumn) return;
 	// A bare "timestamp" word that is not followed by "tz" or "with time zone".
 	const plainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
 	expect(migration).not.toMatch(plainTimestamp);
 });
 test("index on file_metadata user_id", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// Three independent presence checks: a CREATE INDEX statement, the table
 	// name, and the user_id column.
 	const expectedFragments = [/create\s+index/, /file_metadata/, /user_id/];
 	for (const fragment of expectedFragments) {
 		expect(migration).toMatch(fragment);
 	}
 });
 test("idempotent DDL", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// At least one IF NOT EXISTS guard must appear in the migration.
 	const guardClause = /if\s+not\s+exists/;
 	expect(migration).toMatch(guardClause);
 });
 test("overall quality score", () => {
 	const sql = getMigrationSQL().toLowerCase();
 	// A high-quality migration should contain most of these best-practice
 	// signals. Each entry is a presence check against the whole lower-cased
 	// migration; the patterns for the auth.uid() signals are kept as tolerant
 	// as the dedicated tests in this file so both agree on the same input.
 	const signals = [
 		// 1. Avatars bucket is inserted (the public flag is not checked here)
 		/insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
 		// 2. Documents bucket exists
 		/insert\s+into\s+storage\.buckets[\s\S]*?documents/,
 		// 3. MIME type restriction
 		/allowed_mime_types/,
 		// 4. File size limit
 		/file_size_limit/,
 		// 5. Storage foldername helper
 		/storage\.foldername/,
 		// 6. auth.uid() cast to text, either auth.uid()::text or
 		//    (select auth.uid())::text
 		/auth\.uid\(\)\s*\)?\s*::\s*text/,
 		// 7. TO authenticated on policies
 		/to\s+authenticated/,
 		// 8. Public read for avatars
 		/to\s+(public|anon)/,
 		// 9. RLS on file_metadata
 		/enable\s+row\s+level\s+security/,
 		// 10. FK to auth.users with cascade
 		/on\s+delete\s+cascade/,
 		// 11. (select auth.uid()) subselect form, whitespace-tolerant
 		/\(\s*select\s+auth\.uid\(\)\s*\)/,
 		// 12. Index on user_id
 		/create\s+index/,
 		// 13. timestamptz usage
 		/timestamptz/,
 		// 14. IF NOT EXISTS for idempotency
 		/if\s+not\s+exists/,
 		// 15. file_metadata table
 		/create\s+table[\s\S]*?file_metadata/,
 	];
 	const matches = signals.filter((r) => r.test(sql));
 	// Require at least 11 of 15 best-practice signals
 	expect(matches.length).toBeGreaterThanOrEqual(11);
 });
--- a/packages/evals/evals/storage-rls-user-folders/PROMPT.md
+++ b/packages/evals/evals/storage-rls-user-folders/PROMPT.md
@@ -0,0 +1,12 @@
 I need to set up file storage for my app. There are two use cases:
 1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but only the owning user can upload or replace their own. Only allow image files (JPEG, PNG, WebP). Max 2MB.
 2. **Documents** -- Users upload private documents that only they can access. Max 50MB. No file type restriction.
 The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration that:
 - Configures both storage buckets
 - Adds RLS policies on `storage.objects` so each user can only access their own folder (folder name = user ID)
 - Creates a `file_metadata` table to track uploaded files (file name, bucket, size, user reference) with appropriate security
 Users are authenticated via Supabase Auth.
--- a/packages/evals/evals/storage-rls-user-folders/package.json
+++ b/packages/evals/evals/storage-rls-user-folders/package.json
@@ -0,0 +1,5 @@
 {
 	"name": "storage-rls-user-folders",
 	"private": true,
 	"type": "module"
 }
--- a/packages/evals/evals/storage-rls-user-folders/supabase/config.toml
+++ b/packages/evals/evals/storage-rls-user-folders/supabase/config.toml
@@ -0,0 +1,64 @@
 # For detailed configuration reference documentation, visit:
 # https://supabase.com/docs/guides/local-development/cli/config
 # A string used to distinguish different Supabase projects on the same host. Defaults to the
 # working directory name when running `supabase init`.
 project_id = "storage-rls-user-folders"
 [api]
 enabled = true
 # Port to use for the API URL.
 port = 54321
 # Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
 # endpoints. `public` and `graphql_public` schemas are included by default.
 schemas = ["public", "graphql_public"]
 # Extra schemas to add to the search_path of every request.
 extra_search_path = ["public", "extensions"]
 # The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
 # for accidental or malicious requests.
 max_rows = 1000
 [db]
 # Port to use for the local database URL.
 port = 54322
 # Port used by db diff command to initialize the shadow database.
 shadow_port = 54320
 # The database major version to use. This has to be the same as your remote database's. Run `SHOW
 # server_version;` on the remote database to check.
 major_version = 17
 [db.pooler]
 enabled = false
 # Port to use for the local connection pooler.
 port = 54329
 # Specifies when a server connection can be reused by other clients.
 # Configure one of the supported pooler modes: `transaction`, `session`.
 pool_mode = "transaction"
 # How many server connections to allow per user/database pair.
 default_pool_size = 20
 # Maximum number of client connections allowed.
 max_client_conn = 100
 [storage]
 enabled = true
 # The maximum file size allowed (e.g. "5MB", "500KB").
 file_size_limit = "50MiB"
 [auth]
 enabled = true
 # The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
 # in emails.
 site_url = "http://127.0.0.1:3000"
 # A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
 additional_redirect_urls = ["https://127.0.0.1:3000"]
 # How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
 jwt_expiry = 3600
 # Allow/disallow new user signups to your project.
 enable_signup = true
 # Allow/disallow anonymous sign-ins to your project.
 enable_anonymous_sign_ins = false
 [auth.email]
 # Allow/disallow new user signups via email to your project.
 enable_signup = true
 # If enabled, users need to confirm their email address before signing in.
 enable_confirmations = false
--- a/packages/evals/evals/storage-rls-user-folders/supabase/migrations/.gitkeep
+++ b/packages/evals/evals/storage-rls-user-folders/supabase/migrations/.gitkeep
--- a/packages/evals/evals/team-rls-security-definer/EVAL.ts
+++ b/packages/evals/evals/team-rls-security-definer/EVAL.ts
@@ -0,0 +1,201 @@
 import { existsSync, readdirSync, readFileSync } from "node:fs";
 import { join } from "node:path";
 import { expect, test } from "vitest";
 /** Supabase project directory inside the current working directory. */
 const supabaseDir = join(process.cwd(), "supabase");
 /** Directory that is scanned for the agent's SQL migration files. */
 const migrationsDir = join(supabaseDir, "migrations");
 /**
  * Find all .sql migration files (agent may create one or multiple).
  *
  * File names are sorted before being turned into paths: the order returned by
  * `readdirSync` depends on the platform/filesystem, and callers concatenate
  * the files, so an unsorted listing would make the combined SQL (and any
  * regex spanning several statements) non-deterministic.
  *
  * @returns Paths of every `*.sql` entry in the migrations directory, in
  *   ascending file-name order; an empty array when the directory is missing.
  */
 function findMigrationFiles(): string[] {
 	if (!existsSync(migrationsDir)) return [];
 	return readdirSync(migrationsDir)
 		.filter((f) => f.endsWith(".sql"))
 		.sort()
 		.map((f) => join(migrationsDir, f));
 }
 /**
  * Merge the contents of every migration file into one string so assertions
  * can run against the migration as a whole.
  *
  * @returns The UTF-8 contents of all migration files joined with newlines.
  * @throws Error when no .sql file is present in supabase/migrations/.
  */
 function getMigrationSQL(): string {
 	const migrationPaths = findMigrationFiles();
 	if (migrationPaths.length === 0) {
 		throw new Error("No migration file found in supabase/migrations/");
 	}
 	let combined = "";
 	migrationPaths.forEach((migrationPath, position) => {
 		if (position > 0) combined += "\n";
 		combined += readFileSync(migrationPath, "utf-8");
 	});
 	return combined;
 }
 // The agent must have written at least one .sql file into supabase/migrations/.
 test("migration file exists", () => {
 	const migrationCount = findMigrationFiles().length;
 	expect(migrationCount).toBeGreaterThan(0);
 });
 test("creates organizations table", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// A CREATE TABLE followed (anywhere later) by the table name.
 	const createsOrganizations = /create\s+table[\s\S]*?organizations/;
 	expect(migration).toMatch(createsOrganizations);
 });
 test("creates memberships table", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// A CREATE TABLE followed (anywhere later) by the table name.
 	const createsMemberships = /create\s+table[\s\S]*?memberships/;
 	expect(migration).toMatch(createsMemberships);
 });
 test("creates projects table", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// A CREATE TABLE followed (anywhere later) by the table name.
 	const createsProjects = /create\s+table[\s\S]*?projects/;
 	expect(migration).toMatch(createsProjects);
 });
 test("enables RLS on all tables", () => {
 	const sql = getMigrationSQL().toLowerCase();
 	// Each pattern is confined to one statement (`[^;]*?` cannot cross a
 	// semicolon but does cross newlines). Without that bound, an unrelated
 	// `alter table projects ... references organizations;` followed later by
 	// RLS being enabled on some other table would count as RLS on organizations.
 	expect(sql).toMatch(
 		/alter\s+table[^;]*?organizations[^;]*?enable\s+row\s+level\s+security/,
 	);
 	expect(sql).toMatch(
 		/alter\s+table[^;]*?memberships[^;]*?enable\s+row\s+level\s+security/,
 	);
 	expect(sql).toMatch(
 		/alter\s+table[^;]*?projects[^;]*?enable\s+row\s+level\s+security/,
 	);
 });
 test("FK to auth.users with ON DELETE CASCADE", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// Checked anywhere in the migration: a reference to auth.users and an
 	// ON DELETE CASCADE clause.
 	const referencesAuthUsers = /references\s+auth\.users/;
 	const cascadesOnDelete = /on\s+delete\s+cascade/;
 	expect(migration).toMatch(referencesAuthUsers);
 	expect(migration).toMatch(cascadesOnDelete);
 });
 test("org_id FK on projects", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// An org…id column name (org_id, organization_id, ...) followed later by a
 	// REFERENCES clause that names organizations.
 	const orgForeignKey =
 		/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/;
 	expect(migration).toMatch(orgForeignKey);
 });
 test("private schema created", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// A CREATE SCHEMA followed (anywhere later) by the word "private".
 	const createsPrivateSchema = /create\s+schema[\s\S]*?private/;
 	expect(migration).toMatch(createsPrivateSchema);
 });
 test("security_definer helper function", () => {
 	const sql = getMigrationSQL().toLowerCase();
 	// Function should be in the private schema with SECURITY DEFINER and an
 	// empty search_path.
 	expect(sql).toMatch(/private\./);
 	expect(sql).toMatch(/security\s+definer/);
 	// PostgreSQL accepts both `SET search_path = ''` and `SET search_path TO ''`
 	// in CREATE FUNCTION, so either spelling satisfies the check.
 	expect(sql).toMatch(/set\s+search_path(?:\s*=|\s+to)\s*''/);
 });
 test("policies use (select auth.uid())", () => {
 	// Original casing is kept here; the literal "auth.uid()" lookup below is
 	// therefore case-sensitive.
 	const rawSql = getMigrationSQL();
 	const statements = rawSql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
 	expect(statements.length).toBeGreaterThan(0);
 	// Every policy that calls auth.uid() must contain the subselect form.
 	const policiesCallingUid = statements.filter((statement) =>
 		statement.includes("auth.uid()"),
 	);
 	for (const statement of policiesCallingUid) {
 		expect(statement).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
 	}
 });
 test("policies use TO authenticated", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	const statements = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
 	expect(statements.length).toBeGreaterThan(0);
 	// Every single policy has to be granted TO authenticated.
 	statements.forEach((statement) => {
 		expect(statement).toMatch(/to\s+authenticated/);
 	});
 });
 test("index on membership lookup columns", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	expect(migration).toMatch(/create\s+index/);
 	// At least one CREATE INDEX statement has to mention one of the columns
 	// used for policy lookups.
 	const lookupColumns = ["user_id", "org_id", "organization_id"];
 	const indexStatements = migration.match(/create\s+index[\s\S]*?;/gi) ?? [];
 	const lookupIndexes = indexStatements.filter((statement) =>
 		lookupColumns.some((column) => statement.includes(column)),
 	);
 	expect(lookupIndexes.length).toBeGreaterThanOrEqual(1);
 });
 test("uses timestamptz", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// Nothing to verify unless the migration appears to define a time column.
 	const timeColumnHints = ["created_at", "updated_at", "_at "];
 	const definesTimeColumn = timeColumnHints.some((hint) =>
 		migration.includes(hint),
 	);
 	if (!definesTimeColumn) return;
 	// A bare "timestamp" word that is not followed by "tz" or "with time zone".
 	const plainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
 	expect(migration).not.toMatch(plainTimestamp);
 });
 test("idempotent DDL", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	// At least one IF NOT EXISTS guard must appear in the migration.
 	const guardClause = /if\s+not\s+exists/;
 	expect(migration).toMatch(guardClause);
 });
 test("delete policy restricted to owner role", () => {
 	const migration = getMigrationSQL().toLowerCase();
 	const statements = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
 	// First policy statement that mentions both "delete" and "project".
 	const projectDeletePolicy = statements.find((statement) => {
 		const text = statement.toLowerCase();
 		return text.includes("delete") && text.includes("project");
 	});
 	expect(projectDeletePolicy).toBeDefined();
 	// That policy has to reference an owner (or admin) role.
 	expect(projectDeletePolicy?.toLowerCase()).toMatch(/owner|admin/);
 });
 test("overall quality score", () => {
 	// All checks run against the lower-cased migration, so the individual
 	// policy statements need no further case normalisation.
 	const sql = getMigrationSQL().toLowerCase();
 	const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/g) ?? [];
 	// One pattern per table. `[^;]*?` keeps each match inside a single
 	// ALTER TABLE statement, so RLS enabled on one table cannot be credited
 	// to another table that merely appears in an earlier statement.
 	const rlsStatements = [
 		/alter\s+table[^;]*?organizations[^;]*?enable\s+row\s+level\s+security/,
 		/alter\s+table[^;]*?memberships[^;]*?enable\s+row\s+level\s+security/,
 		/alter\s+table[^;]*?projects[^;]*?enable\s+row\s+level\s+security/,
 	];
 	// A high-quality migration should contain most of these best-practice signals
 	const signals = [
 		// 1. RLS enabled on all three tables
 		rlsStatements.every((statement) => statement.test(sql)),
 		// 2. FK to auth.users with cascade
 		/references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
 		// 3. Private schema created
 		/create\s+schema[\s\S]*?private/.test(sql),
 		// 4. security_definer with an empty search_path (`= ''` or `to ''`)
 		/security\s+definer/.test(sql) &&
 			/set\s+search_path(?:\s*=|\s+to)\s*''/.test(sql),
 		// 5. Subselect auth.uid()
 		/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
 		// 6. TO authenticated on policies
 		policyBlocks.length > 0 &&
 			policyBlocks.every((p) => /to\s+authenticated/.test(p)),
 		// 7. Indexes on lookup columns
 		/create\s+index/.test(sql),
 		// 8. timestamptz (no plain timestamp)
 		!/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql),
 		// 9. Idempotent DDL
 		/if\s+not\s+exists/.test(sql),
 		// 10. Delete policy checks owner role
 		policyBlocks.some(
 			(p) =>
 				p.includes("delete") &&
 				p.includes("project") &&
 				/owner|admin/.test(p),
 		),
 		// 11. org_id FK on projects
 		/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
 		// 12. Multiple policies (at least one per table)
 		policyBlocks.length >= 3,
 		// 13. Membership role column exists
 		/role/.test(sql),
 		// 14. Private schema function referenced in policies
 		/private\./.test(sql),
 	];
 	// Require at least 10 of the 14 signals.
 	const passed = signals.filter(Boolean).length;
 	expect(passed).toBeGreaterThanOrEqual(10);
 });
--- a/packages/evals/evals/team-rls-security-definer/PROMPT.md
+++ b/packages/evals/evals/team-rls-security-definer/PROMPT.md
@@ -0,0 +1,14 @@
 I'm building a project management app where users can belong to multiple organizations. Each organization has projects that all members can view and edit.
 The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration with:
 1. An `organizations` table (name, slug)
 2. A `memberships` table linking users to organizations with a role column (owner, admin, member)
 3. A `projects` table (name, description, status) belonging to an organization
 Set up Row Level Security so:
 - Users can only see organizations they belong to
 - Users can only see and manage projects in their organizations
 - Only org owners can delete projects
 The migration should handle the case where a user is deleted from auth.
--- a/packages/evals/evals/team-rls-security-definer/package.json
+++ b/packages/evals/evals/team-rls-security-definer/package.json
@@ -0,0 +1,5 @@
 {
 	"name": "team-rls-security-definer",
 	"private": true,
 	"type": "module"
 }
--- a/packages/evals/evals/team-rls-security-definer/supabase/config.toml
+++ b/packages/evals/evals/team-rls-security-definer/supabase/config.toml
@@ -0,0 +1,111 @@
 # For detailed configuration reference documentation, visit:
 # https://supabase.com/docs/guides/local-development/cli/config
 # A string used to distinguish different Supabase projects on the same host. Defaults to the
 # working directory name when running `supabase init`.
 project_id = "team-rls-security-definer"
 [api]
 enabled = true
 # Port to use for the API URL.
 port = 54321
 # Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
 # endpoints. `public` and `graphql_public` schemas are included by default.
 schemas = ["public", "graphql_public"]
 # Extra schemas to add to the search_path of every request.
 extra_search_path = ["public", "extensions"]
 # The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
 # for accidental or malicious requests.
 max_rows = 1000
 [db]
 # Port to use for the local database URL.
 port = 54322
 # Port used by db diff command to initialize the shadow database.
 shadow_port = 54320
 # The database major version to use. This has to be the same as your remote database's. Run `SHOW
 # server_version;` on the remote database to check.
 major_version = 17
 [db.pooler]
 enabled = false
 # Port to use for the local connection pooler.
 port = 54329
 # Specifies when a server connection can be reused by other clients.
 # Configure one of the supported pooler modes: `transaction`, `session`.
 pool_mode = "transaction"
 # How many server connections to allow per user/database pair.
 default_pool_size = 20
 # Maximum number of client connections allowed.
 max_client_conn = 100
 [db.migrations]
 # If disabled, migrations will be skipped during a db push or reset.
 enabled = true
 schema_paths = []
 [db.seed]
 # If enabled, seeds the database after migrations during a db reset.
 enabled = true
 # Specifies an ordered list of seed files to load during db reset.
 sql_paths = ["./seed.sql"]
 [realtime]
 enabled = true
 [studio]
 enabled = true
 # Port to use for Supabase Studio.
 port = 54323
 # External URL of the API server that frontend connects to.
 api_url = "http://127.0.0.1"
 [inbucket]
 enabled = true
 # Port to use for the email testing server web interface.
 port = 54324
 [storage]
 enabled = true
 # The maximum file size allowed (e.g. "5MB", "500KB").
 file_size_limit = "50MiB"
 [auth]
 enabled = true
 # The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
 # in emails.
 site_url = "http://127.0.0.1:3000"
 # A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
 additional_redirect_urls = ["https://127.0.0.1:3000"]
 # How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
 jwt_expiry = 3600
 # If disabled, the refresh token will never expire.
 enable_refresh_token_rotation = true
 # Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
 # Requires enable_refresh_token_rotation = true.
 refresh_token_reuse_interval = 10
 # Allow/disallow new user signups to your project.
 enable_signup = true
 # Allow/disallow anonymous sign-ins to your project.
 enable_anonymous_sign_ins = false
 [auth.email]
 # Allow/disallow new user signups via email to your project.
 enable_signup = true
 # If enabled, a user will be required to confirm any email change on both the old, and new email
 # addresses. If disabled, only the new email is required to confirm.
 double_confirm_changes = true
 # If enabled, users need to confirm their email address before signing in.
 enable_confirmations = false
 [edge_runtime]
 enabled = true
 # Configure one of the supported request policies: `oneshot`, `per_worker`.
 policy = "per_worker"
 # Port to attach the Chrome inspector for debugging edge functions.
 inspector_port = 8083
 [analytics]
 enabled = true
 port = 54327
 # Configure one of the supported backends: `postgres`, `bigquery`.
 backend = "postgres"
--- a/packages/evals/package-lock.json
+++ b/packages/evals/package-lock.json
@@ -9,6 +9,7 @@
 			"version": "1.0.0",
 			"license": "MIT",
 			"dependencies": {
 				"@anthropic-ai/claude-code": "^2.1.49",
 				"braintrust": "^3.0.0"
 			},
 			"devDependencies": {
@@ -18,6 +19,29 @@
 				"vitest": "^3.1.0"
 			}
 		},
 		"node_modules/@anthropic-ai/claude-code": {
 			"version": "2.1.49",
 			"resolved": "https://registry.npmjs.org/@anthropic-ai/claude-code/-/claude-code-2.1.49.tgz",
 			"integrity": "sha512-PonEmTZlB5IZbBu9TmtOpGZnupU7OxOXTsJKcXE/4Ak5qp3ptN1wSBRdgKYnn6GDYhXijTXuVVwrCQU+NAgwPA==",
 			"license": "SEE LICENSE IN README.md",
 			"bin": {
 				"claude": "cli.js"
 			},
 			"engines": {
 				"node": ">=18.0.0"
 			},
 			"optionalDependencies": {
 				"@img/sharp-darwin-arm64": "^0.34.2",
 				"@img/sharp-darwin-x64": "^0.34.2",
 				"@img/sharp-linux-arm": "^0.34.2",
 				"@img/sharp-linux-arm64": "^0.34.2",
 				"@img/sharp-linux-x64": "^0.34.2",
 				"@img/sharp-linuxmusl-arm64": "^0.34.2",
 				"@img/sharp-linuxmusl-x64": "^0.34.2",
 				"@img/sharp-win32-arm64": "^0.34.2",
 				"@img/sharp-win32-x64": "^0.34.2"
 			}
 		},
 		"node_modules/@colors/colors": {
 			"version": "1.5.0",
 			"resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
@@ -444,6 +468,310 @@
 				"node": ">=18"
 			}
 		},
 		"node_modules/@img/sharp-darwin-arm64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz",
 			"integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==",
 			"cpu": [
 				"arm64"
 			],
 			"license": "Apache-2.0",
 			"optional": true,
 			"os": [
 				"darwin"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			},
 			"optionalDependencies": {
 				"@img/sharp-libvips-darwin-arm64": "1.2.4"
 			}
 		},
 		"node_modules/@img/sharp-darwin-x64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz",
 			"integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==",
 			"cpu": [
 				"x64"
 			],
 			"license": "Apache-2.0",
 			"optional": true,
 			"os": [
 				"darwin"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			},
 			"optionalDependencies": {
 				"@img/sharp-libvips-darwin-x64": "1.2.4"
 			}
 		},
 		"node_modules/@img/sharp-libvips-darwin-arm64": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz",
 			"integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==",
 			"cpu": [
 				"arm64"
 			],
 			"license": "LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"darwin"
 			],
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-libvips-darwin-x64": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz",
 			"integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==",
 			"cpu": [
 				"x64"
 			],
 			"license": "LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"darwin"
 			],
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-libvips-linux-arm": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz",
 			"integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==",
 			"cpu": [
 				"arm"
 			],
 			"license": "LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-libvips-linux-arm64": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz",
 			"integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==",
 			"cpu": [
 				"arm64"
 			],
 			"license": "LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-libvips-linux-x64": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz",
 			"integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==",
 			"cpu": [
 				"x64"
 			],
 			"license": "LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-libvips-linuxmusl-arm64": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz",
 			"integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==",
 			"cpu": [
 				"arm64"
 			],
 			"license": "LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-libvips-linuxmusl-x64": {
 			"version": "1.2.4",
 			"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz",
 			"integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==",
 			"cpu": [
 				"x64"
 			],
 			"license": "LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-linux-arm": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz",
 			"integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==",
 			"cpu": [
 				"arm"
 			],
 			"license": "Apache-2.0",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			},
 			"optionalDependencies": {
 				"@img/sharp-libvips-linux-arm": "1.2.4"
 			}
 		},
 		"node_modules/@img/sharp-linux-arm64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz",
 			"integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==",
 			"cpu": [
 				"arm64"
 			],
 			"license": "Apache-2.0",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			},
 			"optionalDependencies": {
 				"@img/sharp-libvips-linux-arm64": "1.2.4"
 			}
 		},
 		"node_modules/@img/sharp-linux-x64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz",
 			"integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==",
 			"cpu": [
 				"x64"
 			],
 			"license": "Apache-2.0",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			},
 			"optionalDependencies": {
 				"@img/sharp-libvips-linux-x64": "1.2.4"
 			}
 		},
 		"node_modules/@img/sharp-linuxmusl-arm64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz",
 			"integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==",
 			"cpu": [
 				"arm64"
 			],
 			"license": "Apache-2.0",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			},
 			"optionalDependencies": {
 				"@img/sharp-libvips-linuxmusl-arm64": "1.2.4"
 			}
 		},
 		"node_modules/@img/sharp-linuxmusl-x64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz",
 			"integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==",
 			"cpu": [
 				"x64"
 			],
 			"license": "Apache-2.0",
 			"optional": true,
 			"os": [
 				"linux"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			},
 			"optionalDependencies": {
 				"@img/sharp-libvips-linuxmusl-x64": "1.2.4"
 			}
 		},
 		"node_modules/@img/sharp-win32-arm64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz",
 			"integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==",
 			"cpu": [
 				"arm64"
 			],
 			"license": "Apache-2.0 AND LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"win32"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@img/sharp-win32-x64": {
 			"version": "0.34.5",
 			"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz",
 			"integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==",
 			"cpu": [
 				"x64"
 			],
 			"license": "Apache-2.0 AND LGPL-3.0-or-later",
 			"optional": true,
 			"os": [
 				"win32"
 			],
 			"engines": {
 				"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
 			},
 			"funding": {
 				"url": "https://opencollective.com/libvips"
 			}
 		},
 		"node_modules/@jridgewell/sourcemap-codec": {
 			"version": "1.5.5",
 			"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
--- a/packages/evals/package.json
+++ b/packages/evals/package.json
@@ -10,6 +10,7 @@
 		"eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts"
 	},
 	"dependencies": {
 		"@anthropic-ai/claude-code": "^2.1.49",
 		"braintrust": "^3.0.0"
 	},
 	"devDependencies": {
--- a/packages/evals/scenarios/SCENARIOS.md
+++ b/packages/evals/scenarios/SCENARIOS.md
@@ -49,3 +49,121 @@ The agent initializes a Supabase project and creates a migration file that:
 | index on user_id | `CREATE INDEX` on the FK column |
 | IF NOT EXISTS | Idempotent migration |
 | overall quality | At least 4/5 best-practice signals present |
 ## Scenario 2: team-rls-security-definer
 **Description:** Create a SQL migration for a team-based project management app
 where users belong to organizations via a membership table. The migration must
 define tables for organizations, memberships, and projects, then secure them
 with RLS policies that use a `security definer` helper function in a private
 schema to efficiently resolve team membership without per-row joins.
 **Setup:** The workspace starts with a pre-initialized Supabase project
 (`supabase/config.toml` exists, empty `supabase/migrations/` directory). The
 agent creates migration files within this structure.
 **Expected skill files read:**
 - `SKILL.md` (skill body with reference file index)
 - `references/db-rls-mandatory.md`
 - `references/db-rls-policy-types.md`
 - `references/db-rls-common-mistakes.md`
 - `references/db-rls-performance.md`
 - `references/db-security-functions.md`
 - `references/db-schema-auth-fk.md`
 - `references/db-schema-timestamps.md`
 - `references/db-perf-indexes.md`
 - `references/db-migrations-idempotent.md`
 **Expected result:**
 The agent creates a migration file that:
 - Creates organizations, memberships, and projects tables with `timestamptz` columns
 - Has `user_id` FK to `auth.users(id)` with `ON DELETE CASCADE` on memberships
 - Has `org_id` FK on projects referencing organizations
 - Enables RLS on all three tables
 - Creates a private schema with a `security definer` helper function (`SET search_path = ''`)
 - Creates RLS policies using `(select auth.uid())` with `TO authenticated`
 - Creates indexes on membership lookup columns (user_id, org_id)
 - Has a delete policy on projects restricted to owner role
 - Uses `IF NOT EXISTS` for idempotency
 **Scorer:** Binary pass/fail (16 vitest assertions)
 | Test | What it checks |
 | --- | --- |
 | migration file exists | A `.sql` file exists in `supabase/migrations/` |
 | creates organizations table | SQL contains `CREATE TABLE` for organizations |
 | creates memberships table | SQL contains `CREATE TABLE` for memberships |
 | creates projects table | SQL contains `CREATE TABLE` for projects |
 | enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables |
 | FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade |
 | org_id FK on projects | projects references organizations |
 | private schema created | `CREATE SCHEMA ... private` present |
 | security_definer helper function | Function in private schema with `SECURITY DEFINER` and `SET search_path = ''` |
 | policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() |
 | policies use TO authenticated | All policies scoped to authenticated role |
 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships |
 | uses timestamptz | No plain `timestamp` for time columns |
 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns |
 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role |
 | overall quality score | At least 10/14 best-practice signals present |
 ## Scenario 3: storage-rls-user-folders
 **Description:** Create a SQL migration that sets up Supabase Storage buckets
 with RLS policies for user content. The migration configures an avatars bucket
 (public reads, authenticated uploads restricted to user folders) and a
 documents bucket (fully private, user-isolated), with file type restrictions,
 storage helper functions in policies, and a `file_metadata` tracking table
 secured with RLS.
 **Setup:** Pre-initialized Supabase project (`supabase/config.toml` exists)
 with an empty `supabase/migrations/` directory. The agent creates migration
 files within this structure.
 **Expected skill files read:**
 - `SKILL.md` (skill body with reference file index)
 - `references/storage-access-control.md`
 - `references/db-rls-mandatory.md`
 - `references/db-rls-common-mistakes.md`
 - `references/db-rls-performance.md`
 - `references/db-schema-auth-fk.md`
 - `references/db-schema-timestamps.md`
 - `references/db-perf-indexes.md`
 - `references/db-migrations-idempotent.md`
 **Expected result:**
 The agent creates a migration file that:
 - Inserts avatars bucket into `storage.buckets` with `public = true`, MIME type restrictions, and file size limit
 - Inserts documents bucket with `public = false`
 - Creates RLS policies on `storage.objects` using `storage.foldername(name)` with `auth.uid()::text`
 - Scopes upload policies `TO authenticated` and avatars SELECT policy `TO public`
 - Creates `file_metadata` table with FK to `auth.users` with `ON DELETE CASCADE`
 - Enables RLS on `file_metadata` with policies using `(select auth.uid())`
 - Uses `timestamptz` for time columns, creates an index on `user_id`, and uses `IF NOT EXISTS` for idempotency
 **Scorer:** Binary pass/fail (17 vitest assertions)
 | Test | What it checks |
 | --- | --- |
 | migration file exists | A `.sql` file exists in `supabase/migrations/` |
 | creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` |
 | creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` |
 | avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) |
 | avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) |
 | storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` |
 | storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` |
 | public read policy for avatars | A SELECT policy on storage.objects for avatars allows public/anon access |
 | documents bucket is fully private | Policies for documents restrict all operations to authenticated owner |
 | creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata |
 | file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` |
 | RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` |
 | file_metadata policies use (select auth.uid()) | Subselect form in policies |
 | uses timestamptz for time columns | No plain `timestamp` in file_metadata |
 | index on file_metadata user_id | `CREATE INDEX` on user_id column |
 | idempotent DDL | Uses `IF NOT EXISTS` patterns |
 | overall quality score | At least 11/15 best-practice signals present |
--- a/packages/evals/scenarios/storage-rls-user-folders.md
+++ b/packages/evals/scenarios/storage-rls-user-folders.md
@@ -0,0 +1,144 @@
 # Scenario: storage-rls-user-folders
 ## Summary
 The agent must create a SQL migration that sets up Supabase Storage buckets
 with RLS policies for a user-content application. The migration must configure
 an avatars bucket (public reads, authenticated uploads restricted to user
 folders) and a documents bucket (fully private, user-isolated), with proper
 file type restrictions, storage helper functions in policies, and a
 file_metadata tracking table secured with RLS.
 ## Real-World Justification
 Why this is a common and important workflow:
 1. **Storage RLS is confusing and under-documented compared to table RLS** --
   Developers consistently struggle with the distinction between public/private
   buckets and the RLS policies needed on `storage.objects`. Multiple GitHub
   discussions show confusion about which SDK operations map to which SQL
   operations (INSERT, SELECT, UPDATE, DELETE).
   - Source: https://github.com/orgs/supabase/discussions/37611
   - Source: https://github.com/orgs/supabase/discussions/38700
 2. **User-folder isolation is the canonical storage security pattern** -- The
   official Supabase docs demonstrate folder-based isolation using
   `storage.foldername(name)` and `auth.uid()::text`, but developers frequently
   get the casting or array indexing wrong.
   - Source: https://supabase.com/docs/guides/storage/security/access-control
 3. **Missing file type restrictions lead to security vulnerabilities** --
   Without `allowed_mime_types` on the bucket or extension checks in RLS
   policies, users can upload executable files or oversized payloads. The
   Supabase security best practices guide calls this out as a common oversight.
   - Source: https://supaexplorer.com/guides/supabase-security-best-practices
   - Source: https://supabase.com/docs/guides/storage/buckets/fundamentals
 ## Skill References Exercised
 Which reference files the agent should consult and what each teaches:
 | Reference File | What It Teaches | What the Agent Should Apply |
 |---|---|---|
 | `references/storage-access-control.md` | Bucket visibility, RLS on storage.objects, storage helper functions, SDK-to-SQL operation mapping | User-folder policies using `storage.foldername()`, separate SELECT/INSERT policies |
 | `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on the file_metadata tracking table |
 | `references/db-rls-common-mistakes.md` | Missing TO clause, missing SELECT policy for UPDATE | Use `TO authenticated` (or `TO public` for public reads), include SELECT policy |
 | `references/db-rls-performance.md` | Wrap auth.uid() in SELECT subquery | Use `(select auth.uid())` in both storage and table policies |
 | `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | file_metadata.user_id references auth.users with cascade |
 | `references/db-schema-timestamps.md` | Use timestamptz not timestamp | Time columns on file_metadata use timestamptz |
 | `references/db-perf-indexes.md` | Index columns used in policy lookups | Index user_id on file_metadata |
 | `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout |
 ## Workspace Setup
 What the workspace starts with before the agent runs:
 - Pre-initialized Supabase project (`supabase/config.toml` exists)
 - Empty `supabase/migrations/` directory
 - The agent creates migration files within this structure
 ## Agent Task (PROMPT.md draft)
 The prompt to give the agent. Written as a developer would ask it:
 > I need to set up file storage for my app. There are two use cases:
 >
 > 1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but
 >    only the owning user can upload or replace their own. Only allow image
 >    files (JPEG, PNG, WebP). Max 2MB.
 >
 > 2. **Documents** -- Users upload private documents that only they can access.
 >    Max 50MB. No file type restriction.
 >
 > Create a SQL migration that:
 > - Configures both storage buckets
 > - Adds RLS policies on `storage.objects` so each user can only access their
 >   own folder (folder name = user ID)
 > - Creates a `file_metadata` table to track uploaded files (file name, bucket,
 >   size, user reference) with appropriate security
 >
 > Users are authenticated via Supabase Auth.
 ## Evaluation Criteria
 What vitest should assert on the agent's output. Each assertion tests a
 specific quality signal:
 | # | Test Name | What It Checks | Quality Dimension |
 |---|-----------|----------------|-------------------|
 | 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
 | 2 | creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` | correctness |
 | 3 | creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` | correctness |
 | 4 | avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) | security |
 | 5 | avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) | security |
 | 6 | storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` | security |
 | 7 | storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` | security |
 | 8 | public read policy for avatars | A SELECT policy on storage.objects for avatars bucket allows public/anon access | correctness |
 | 9 | documents bucket is fully private | Policies for documents bucket restrict all operations to authenticated owner | security |
 | 10 | creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata | correctness |
 | 11 | file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` | correctness |
 | 12 | RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` | security |
 | 13 | file_metadata policies use (select auth.uid()) | Subselect form in policies | performance |
 | 14 | uses timestamptz for time columns | No plain `timestamp` in file_metadata | correctness |
 | 15 | index on file_metadata user_id | `CREATE INDEX` on user_id column | performance |
 | 16 | idempotent DDL | Uses `IF NOT EXISTS` patterns | idempotency |
 | 17 | overall quality score | At least 11/15 best-practice signals present | overall |
 ## Reasoning
 Step-by-step reasoning for why this scenario is well-designed:
 1. **Baseline differentiator:** An agent without the skill would likely: (a)
   confuse public bucket visibility with unrestricted upload access, (b) write
   storage policies without using `storage.foldername()` or get the array
   indexing wrong, (c) forget to set `allowed_mime_types` on the bucket itself,
   (d) omit the `TO authenticated` clause on storage policies, (e) use bare
   `auth.uid()` instead of the subselect form, (f) skip the `::text` cast when
   comparing auth.uid() to folder names. These are all Supabase-specific
   patterns that require reading the skill references.
 2. **Skill value:** The storage-access-control reference explicitly documents:
   the public vs private bucket distinction, the `storage.foldername()` helper
   function pattern, the SDK-to-SQL operation mapping, and bucket configuration
   with mime types and size limits. Combined with the database security
   references (RLS mandatory, common mistakes, performance), this scenario
   exercises 8 reference files.
 3. **Testability:** Bucket configuration (INSERT INTO storage.buckets), storage
   helper function usage (storage.foldername), policy clauses (TO
   authenticated, TO public), mime types, file size limits, and all table-level
   patterns (RLS, FK, indexes, timestamptz) are reliably detectable via regex
   on SQL text.
 4. **Realism:** Nearly every Supabase application that handles user-generated
   content needs avatar uploads and document storage. This is a day-one task
   for any SaaS product. The GitHub discussions linked above show dozens of
   developers hitting exactly these issues when setting up storage for the
   first time.
 ## Difficulty
 **Rating:** MEDIUM
 - Without skill: ~30-45% of assertions expected to pass
 - With skill: ~85-95% of assertions expected to pass
--- a/packages/evals/scenarios/team-rls-security-definer.md
+++ b/packages/evals/scenarios/team-rls-security-definer.md
@@ -0,0 +1,139 @@
 # Scenario: team-rls-security-definer
 ## Summary
 The agent must create a SQL migration for a team-based project management app
 where users belong to organizations via a membership table. The migration must
 define tables for organizations, memberships, and projects, then secure them
 with RLS policies that use a `security definer` helper function in a private
 schema to efficiently resolve team membership without per-row joins.
 ## Real-World Justification
 Why this is a common and important workflow:
 1. **Multi-tenant team access is the most-asked RLS question on Supabase** --
   The official Supabase GitHub has multiple high-engagement discussions about
   how to write RLS policies that check team/org membership without causing
   performance issues or security holes.
   - Source: https://github.com/supabase/supabase/discussions/4509
   - Source: https://github.com/supabase/supabase/discussions/811
 2. **security_definer in public schema is a documented security anti-pattern** --
   Developers frequently place security_definer functions in the public schema,
   inadvertently exposing them via the PostgREST API. The Supabase docs and
   community discussions explicitly warn against this.
   - Source: https://github.com/supabase/supabase/discussions/3269
   - Source: https://supabase.com/docs/guides/database/postgres/row-level-security
 3. **RLS policy performance with joins is a top pain point** -- Naive policies
   that join against a memberships table execute per-row, causing severe
   performance degradation. The recommended pattern is a security_definer
   helper function whose call is wrapped in a subselect, so the result is
   evaluated once per statement and cached instead of being recomputed for
   every row.
   - Source: https://github.com/orgs/supabase/discussions/1148
   - Source: https://makerkit.dev/blog/tutorials/supabase-rls-best-practices
 ## Skill References Exercised
 Which reference files the agent should consult and what each teaches:
 | Reference File | What It Teaches | What the Agent Should Apply |
 |---|---|---|
 | `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on organizations, memberships, and projects |
 | `references/db-rls-policy-types.md` | PERMISSIVE vs RESTRICTIVE policies | Use PERMISSIVE policies for team OR owner access patterns |
 | `references/db-rls-common-mistakes.md` | Missing TO clause, user_metadata pitfalls | Always use `TO authenticated` on all policies |
 | `references/db-rls-performance.md` | Wrap auth.uid() in SELECT, use security_definer for joins | Use `(select auth.uid())` and a private-schema helper function |
 | `references/db-security-functions.md` | security_definer in private schema with search_path = '' | Create helper function in private schema, revoke default permissions |
 | `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | Reference auth.users with cascade on memberships |
 | `references/db-schema-timestamps.md` | Use timestamptz not timestamp | All time columns use timestamptz |
 | `references/db-perf-indexes.md` | Index columns used in RLS policies | Index user_id and org_id columns used in policy lookups |
 | `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout the migration |
 ## Workspace Setup
 What the workspace starts with before the agent runs:
 - Pre-initialized Supabase project (`supabase/config.toml` exists)
 - Empty `supabase/migrations/` directory
 - The agent creates migration files within this structure
 ## Agent Task (PROMPT.md draft)
 The prompt to give the agent. Written as a developer would ask it:
 > I'm building a project management app where users can belong to multiple
 > organizations. Each organization has projects that all members can view and
 > edit.
 >
 > Create a SQL migration with:
 >
 > 1. An `organizations` table (name, slug)
 > 2. A `memberships` table linking users to organizations with a role column
 >    (owner, admin, member)
 > 3. A `projects` table (name, description, status) belonging to an organization
 >
 > Set up Row Level Security so:
 > - Users can only see organizations they belong to
 > - Users can only see and manage projects in their organizations
 > - Only org owners can delete projects
 >
 > The migration should handle the case where a user is deleted from auth.
 ## Evaluation Criteria
 What vitest should assert on the agent's output. Each assertion tests a
 specific quality signal:
 | # | Test Name | What It Checks | Quality Dimension |
 |---|-----------|----------------|-------------------|
 | 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
 | 2 | creates organizations table | SQL contains `CREATE TABLE` for organizations | correctness |
 | 3 | creates memberships table | SQL contains `CREATE TABLE` for memberships | correctness |
 | 4 | creates projects table | SQL contains `CREATE TABLE` for projects | correctness |
 | 5 | enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables | security |
 | 6 | FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade | correctness |
 | 7 | org_id FK on projects | projects references organizations | correctness |
 | 8 | private schema created | `CREATE SCHEMA ... private` present | security |
 | 9 | security_definer helper function | A function in the private schema with `SECURITY DEFINER` and `SET search_path = ''` | security |
 | 10 | policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() | performance |
 | 11 | policies use TO authenticated | All policies scoped to authenticated role | security |
 | 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance |
 | 13 | uses timestamptz | No plain `timestamp` for time columns | correctness |
 | 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency |
 | 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
 | 16 | overall quality score | At least 10/14 best-practice signals present | overall |
 ## Reasoning
 Step-by-step reasoning for why this scenario is well-designed:
 1. **Baseline differentiator:** An agent without the skill would likely put the
   security_definer function in the public schema, omit `SET search_path = ''`,
   use bare `auth.uid()` instead of the subselect form, write inline joins in
   policies instead of using a helper function, and possibly forget `TO
   authenticated` on some policies. These are all patterns that require specific
   knowledge of Supabase conventions.
 2. **Skill value:** The skill explicitly teaches: (a) private schema for
   security_definer functions, (b) `SET search_path = ''` to prevent injection,
   (c) `(select auth.uid())` for per-statement caching, (d) using
   security_definer functions to avoid per-row joins in policies, (e) `TO
   authenticated` on every policy. This is a scenario where reading 5+ reference
   files materially improves the output.
 3. **Testability:** Every assertion checks for specific SQL patterns via regex.
   The private schema, security_definer, search_path, subselect auth.uid(), TO
   authenticated, indexes, and timestamptz are all reliably detectable in SQL
   text without runtime execution.
 4. **Realism:** Multi-tenant team-based access control is one of the most common
   Supabase use cases. The GitHub discussions linked above have hundreds of
   comments from developers working on exactly this pattern. Project management
   apps (Notion, Linear, Asana clones) are a canonical example.
 ## Difficulty
 **Rating:** MEDIUM
 - Without skill: ~35-50% of assertions expected to pass
 - With skill: ~85-95% of assertions expected to pass
--- a/packages/evals/src/runner.ts
+++ b/packages/evals/src/runner.ts
@@ -2,10 +2,12 @@ import { existsSync, readdirSync, readFileSync } from "node:fs";
 import { join, resolve } from "node:path";
 import { runAgent } from "./runner/agent.js";
 import { uploadToBraintrust } from "./runner/braintrust.js";
 import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
 import { preflight } from "./runner/preflight.js";
 import { listModifiedFiles, printSummary } from "./runner/results.js";
 import { createWorkspace } from "./runner/scaffold.js";
 import { runTests } from "./runner/test.js";
 import { buildTranscriptSummary } from "./runner/transcript.js";
 import type { EvalRunResult, EvalScenario } from "./types.js";
 // ---------------------------------------------------------------------------
@@ -19,6 +21,12 @@ const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
 const scenarioFilter = process.env.EVAL_SCENARIO;
 const runBaseline = process.env.EVAL_BASELINE === "true";
 // Run-level timestamp shared across all scenarios in a single invocation
 const runTimestamp = new Date()
 	.toISOString()
 	.replace(/[:.]/g, "-")
 	.replace("Z", "");
 // ---------------------------------------------------------------------------
 // Discover scenarios
 // ---------------------------------------------------------------------------
@@ -58,10 +66,9 @@ async function runEval(
 ): Promise<EvalRunResult> {
 	const evalsDir = findEvalsDir();
 	const evalDir = join(evalsDir, scenario.id);
 	const variant = skillEnabled ? "with-skill" : "baseline";
-	console.log(
+	console.log(`\n--- ${scenario.id} (${variant}) ---`);
 		`\n--- ${scenario.id} (${skillEnabled ? "with-skill" : "baseline"}) ---`,
 	);
 	// 1. Create isolated workspace
 	const { workspacePath, cleanup } = createWorkspace({
@@ -104,7 +111,10 @@ async function runEval(
 		// 5. Collect modified files
 		const filesModified = listModifiedFiles(workspacePath, evalDir);
-		return {
+		// 6. Build transcript summary
 		const summary = buildTranscriptSummary(agentResult.events);
 		const result: EvalRunResult = {
 			scenario: scenario.id,
 			agent: "claude-code",
 			model,
@@ -116,7 +126,22 @@ async function runEval(
 			testsPassed: testResult.passedCount,
 			testsTotal: testResult.totalCount,
 			filesModified,
 			toolCallCount: summary.toolCalls.length,
 			costUsd: summary.totalCostUsd ?? undefined,
 		};
 		// 7. Persist results
 		const resultDir = createResultDir(runTimestamp, scenario.id, variant);
 		result.resultsDir = resultDir;
 		saveRunArtifacts({
 			resultDir,
 			rawTranscript: agentResult.rawTranscript,
 			testOutput: testResult.output,
 			result,
 			transcriptSummary: summary,
 		});
 		return result;
 	} catch (error) {
 		const err = error as Error;
 		return {
@@ -175,7 +200,9 @@ async function main() {
 		}
 	}
-	printSummary(results);
+	// Use the results dir from the first result (all share the same timestamp)
 	const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
 	printSummary(results, resultsDir);
 	if (process.env.BRAINTRUST_UPLOAD === "true") {
 		console.log("\nUploading to Braintrust...");
--- a/packages/evals/src/runner/agent.ts
+++ b/packages/evals/src/runner/agent.ts
@@ -1,13 +1,27 @@
 import { spawn } from "node:child_process";
 import { resolveClaudeBin } from "./preflight.js";
 import {
 	extractFinalOutput,
 	parseStreamJsonOutput,
 	type TranscriptEvent,
 } from "./transcript.js";
 /** What a single Claude Code subprocess run resolves to. */
 export interface AgentRunResult {
 	/**
 	 * Extracted final text output (backward-compatible). Falls back to the raw
 	 * transcript when no final text can be extracted from the parsed events.
 	 */
 	output: string;
 	/** Elapsed time of the run, computed as `Date.now() - start` (presumably milliseconds — confirm where `start` is set). */
 	duration: number;
 	/** Raw NDJSON transcript string from stream-json. */
 	rawTranscript: string;
 	/** Parsed transcript events. */
 	events: TranscriptEvent[];
 }
 /**
 * Invoke Claude Code in print mode as a subprocess.
 *
 * Uses --output-format stream-json to capture structured NDJSON events
 * including tool calls, results, and reasoning steps.
 *
 * The agent operates in the workspace directory and can read/write files.
 * When the skill is installed (symlinked into workspace), Claude Code
 * discovers it automatically and uses it for guidance.
@@ -23,14 +37,22 @@ export async function runAgent(opts: {
 	const args = [
 		"-p", // Print mode (non-interactive)
 		"--verbose",
 		"--output-format",
-		"text",
+		"stream-json",
 		"--model",
 		opts.model,
 		"--no-session-persistence",
 		"--dangerously-skip-permissions",
 		"--tools",
 		"Edit,Write,Bash,Read,Glob,Grep",
 		// Disable all MCP servers so the agent uses only local filesystem tools.
 		// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
 		// leak in and the agent may apply migrations to a remote project
 		// instead of creating local files.
 		"--mcp-config",
 		'{"mcpServers":{}}',
 		"--strict-mcp-config",
 	];
 	// Disable skills for baseline runs so the agent relies on innate knowledge
@@ -46,8 +68,10 @@ export async function runAgent(opts: {
 		}
 	}
 	const claudeBin = resolveClaudeBin();
 	return new Promise<AgentRunResult>((resolve) => {
-		const child = spawn("claude", args, {
+		const child = spawn(claudeBin, args, {
 			cwd: opts.cwd,
 			env,
 			stdio: ["pipe", "pipe", "pipe"],
@@ -73,9 +97,15 @@ export async function runAgent(opts: {
 		child.on("close", () => {
 			clearTimeout(timer);
 			const rawTranscript = stdout || stderr;
 			const events = parseStreamJsonOutput(rawTranscript);
 			const output = extractFinalOutput(events) || rawTranscript;
 			resolve({
-				output: stdout || stderr,
+				output,
 				duration: Date.now() - start,
 				rawTranscript,
 				events,
 			});
 		});
 	});
--- a/packages/evals/src/runner/persist.ts
+++ b/packages/evals/src/runner/persist.ts
@@ -0,0 +1,56 @@
 import { mkdirSync, writeFileSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath } from "node:url";
 import type { EvalRunResult } from "../types.js";
 import type { TranscriptSummary } from "./transcript.js";
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
 /**
  * Path of the evals package root (packages/evals).
  *
  * This module sits in packages/evals/src/runner, so the root is two
  * directories above `__dirname`.
  */
 function evalsRoot(): string {
 	const twoLevelsUp = join("..", "..");
 	return join(__dirname, twoLevelsUp);
 }
 /**
  * Ensure `results/<runTimestamp>/<scenarioId>/<variant>` exists under the
  * evals package root (missing parents are created) and return its path.
  */
 export function createResultDir(
 	runTimestamp: string,
 	scenarioId: string,
 	variant: "with-skill" | "baseline",
 ): string {
 	const segments = ["results", runTimestamp, scenarioId, variant];
 	const target = join(evalsRoot(), ...segments);
 	mkdirSync(target, { recursive: true });
 	return target;
 }
 /**
  * Write the artifacts of one eval run into `opts.resultDir` as UTF-8 files:
  *
  * - `transcript.jsonl`: the raw transcript, verbatim
  * - `test-output.txt`: the test output, verbatim
  * - `result.json`: the run result plus the transcript summary under a
  *   `transcript` key, pretty-printed with 2-space indentation
  *
  * The directory itself is not created here.
  */
 export function saveRunArtifacts(opts: {
 	resultDir: string;
 	rawTranscript: string;
 	testOutput: string;
 	result: EvalRunResult;
 	transcriptSummary: TranscriptSummary;
 }): void {
 	const { resultDir, rawTranscript, testOutput, result, transcriptSummary } =
 		opts;
 	const writeArtifact = (fileName: string, contents: string): void => {
 		writeFileSync(join(resultDir, fileName), contents, "utf-8");
 	};
 	writeArtifact("transcript.jsonl", rawTranscript);
 	writeArtifact("test-output.txt", testOutput);
 	// Serialized only after the two plain-text files have been written.
 	const combined = { ...result, transcript: transcriptSummary };
 	writeArtifact("result.json", JSON.stringify(combined, null, 2));
 }
--- a/packages/evals/src/runner/preflight.ts
+++ b/packages/evals/src/runner/preflight.ts
@@ -1,10 +1,61 @@
 import { execFileSync } from "node:child_process";
 import { existsSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath } from "node:url";
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
 /**
  * Locate the `claude` executable to spawn.
  *
  * Resolution order:
  * 1. packages/evals/node_modules/.bin/claude (installed via
  *    @anthropic-ai/claude-code)
  * 2. A `claude` command reachable through PATH, probed with `claude --version`
  *
  * @returns The local binary path, or the bare command name "claude".
  * @throws Error with install instructions when neither lookup succeeds.
  */
 export function resolveClaudeBin(): string {
 	const packageLocal = join(
 		__dirname,
 		"..",
 		"..",
 		"node_modules",
 		".bin",
 		"claude",
 	);
 	if (existsSync(packageLocal)) return packageLocal;
 	// No local install: probe PATH. Any failure of the probe (spawn error,
 	// non-zero exit, 10s timeout) is treated as "not found".
 	let foundOnPath = true;
 	try {
 		execFileSync("claude", ["--version"], {
 			stdio: "ignore",
 			timeout: 10_000,
 		});
 	} catch {
 		foundOnPath = false;
 	}
 	if (foundOnPath) return "claude";
 	const helpLines = [
 		"claude CLI not found.",
 		"",
 		"Install it in one of these ways:",
 		"  npm install          (uses @anthropic-ai/claude-code from package.json)",
 		"  npm i -g @anthropic-ai/claude-code",
 		"",
 		"Ensure ANTHROPIC_API_KEY is set in the environment.",
 	];
 	throw new Error(helpLines.join("\n"));
 }
 /**
 * Verify the host environment has everything needed before spending
 * API credits on an eval run.
 *
- * Checks: Node >= 20, Docker running, claude CLI available.
+ * Checks: Node >= 20, Docker running, claude CLI available, API key set.
 */
 export function preflight(): void {
 	const errors: string[] = [];
@@ -24,12 +75,16 @@ export function preflight(): void {
 	// Claude CLI available
 	try {
-		execFileSync("claude", ["--version"], {
+		resolveClaudeBin();
-			stdio: "ignore",
+	} catch (err) {
-			timeout: 10_000,
+		errors.push((err as Error).message);
-		});
+	}
-	} catch {
+
-		errors.push("claude CLI not found on PATH");
+	// API key
 	if (!process.env.ANTHROPIC_API_KEY) {
 		errors.push(
 			"ANTHROPIC_API_KEY is not set. Claude Code requires this for authentication.",
 		);
 	}
 	if (errors.length > 0) {
--- a/packages/evals/src/runner/results.ts
+++ b/packages/evals/src/runner/results.ts
@@ -46,7 +46,10 @@ export function listModifiedFiles(
 }
 /** Print a summary table of eval results. */
-export function printSummary(results: EvalRunResult[]): void {
+export function printSummary(
 	results: EvalRunResult[],
 	resultsDir?: string,
 ): void {
 	console.log("\n=== Eval Results ===\n");
 	for (const r of results) {
@@ -65,4 +68,8 @@ export function printSummary(results: EvalRunResult[]): void {
 	const passed = results.filter((r) => r.status === "passed").length;
 	console.log(`\nTotal: ${passed}/${results.length} passed`);
 	if (resultsDir) {
 		console.log(`\nResults saved to: ${resultsDir}`);
 	}
 }
--- a/packages/evals/src/runner/test.ts
+++ b/packages/evals/src/runner/test.ts
@@ -78,17 +78,24 @@ export async function runTests(opts: {
 function parseTestOutput(output: string): TestResult {
 	// Parse vitest output for pass/fail counts
-	// Format: "Tests  N passed (M)" or "Tests  N failed | M passed (T)"
+	// Vitest formats:
-	const testsLine = output.match(
+	//   All passing:  "Tests  N passed (N)"
 	//   Mixed:        "Tests  N failed | M passed (T)"
 	//   All failing:  "Tests  N failed (N)"
 	const mixedOrPassing = output.match(
 		/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
 	);
 	const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/);
 	let passedCount = 0;
 	let totalCount = 0;
-	if (testsLine) {
+	if (mixedOrPassing) {
-		passedCount = Number.parseInt(testsLine[2], 10);
+		passedCount = Number.parseInt(mixedOrPassing[2], 10);
-		totalCount = Number.parseInt(testsLine[3], 10);
+		totalCount = Number.parseInt(mixedOrPassing[3], 10);
 	} else if (allFailing) {
 		passedCount = 0;
 		totalCount = Number.parseInt(allFailing[2], 10);
 	}
 	const passed = totalCount > 0 && passedCount === totalCount;
--- a/packages/evals/src/runner/transcript.ts
+++ b/packages/evals/src/runner/transcript.ts
@@ -0,0 +1,154 @@
 /**
  * One parsed line of the agent's NDJSON transcript.
  *
  * Only `type` is typed; every other field is `unknown` and must be narrowed
  * before use.
  */
 export interface TranscriptEvent {
 	/** Event discriminator; this file handles "system", "assistant", "user" and "result". */
 	type: string;
 	[key: string]: unknown;
 }
 /** Condensed record of one `tool_use` block found in an assistant message. */
 export interface ToolCallSummary {
 	/** Tool name taken from the block's `name`; "unknown" when absent. */
 	tool: string;
 	/** The block's `id`, used to pair the call with its `tool_result`; "" when absent. */
 	toolUseId: string;
 	/** Arguments passed to the tool, taken from the block's `input`. */
 	input: Record<string, unknown>;
 	/** First ~200 chars of output for quick scanning; "" until a matching result is seen. */
 	outputPreview: string;
 }
 /** Aggregate view of a transcript, as produced by `buildTranscriptSummary`. */
 export interface TranscriptSummary {
 	/** `num_turns` from the result event; 0 when not reported. */
 	totalTurns: number;
 	/** `duration_ms` from the result event; 0 when not reported. */
 	totalDurationMs: number;
 	/** `total_cost_usd` from the result event; null when not reported. */
 	totalCostUsd: number | null;
 	/** Model named by the system init event; null when not reported. */
 	model: string | null;
 	/** Tool calls in the order their `tool_use` blocks appear. */
 	toolCalls: ToolCallSummary[];
 	/** `result` text from the result event; "" when not reported. */
 	finalOutput: string;
 }
 /**
  * Parse a single NDJSON line into a transcript event.
  *
  * Returns null for blank lines, for lines that are not valid JSON, and for
  * JSON values that are not event-shaped (anything other than a non-array
  * object with a string `type`). Filtering here keeps stray output such as a
  * bare number or a quoted string from being handed on as an event.
  *
  * @param line - One line of stream-json output; surrounding whitespace is ignored.
  * @returns The parsed event, or null when the line does not contain one.
  */
 export function parseStreamJsonLine(line: string): TranscriptEvent | null {
 	const trimmed = line.trim();
 	if (!trimmed) return null;
 	let parsed: unknown;
 	try {
 		parsed = JSON.parse(trimmed);
 	} catch {
 		return null;
 	}
 	// JSON.parse accepts any JSON value; only plain objects can be events.
 	if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
 		return null;
 	}
 	if (typeof (parsed as { type?: unknown }).type !== "string") return null;
 	return parsed as TranscriptEvent;
 }
 /**
  * Split raw stream-json stdout on newlines and parse each line, keeping only
  * the lines that yield an event.
  */
 export function parseStreamJsonOutput(raw: string): TranscriptEvent[] {
 	return raw.split("\n").flatMap((line) => {
 		const parsed = parseStreamJsonLine(line);
 		return parsed ? [parsed] : [];
 	});
 }
 /**
  * Pull the agent's final text out of a parsed transcript (kept for backward
  * compatibility with the plain-text output).
  *
  * The first `result` event carrying a string `result` wins. Without one, the
  * events are scanned from the end for the most recent assistant message that
  * contains text blocks, and those blocks are joined with newlines. Returns ""
  * when neither source yields text.
  */
 export function extractFinalOutput(events: TranscriptEvent[]): string {
 	const resultEvent = events.find(
 		(ev) => ev.type === "result" && typeof ev.result === "string",
 	);
 	if (resultEvent) return resultEvent.result as string;
 	// No usable result event: walk backwards through the assistant messages.
 	let index = events.length;
 	while (index-- > 0) {
 		const ev = events[index];
 		if (ev.type !== "assistant") continue;
 		const message = ev.message as Record<string, unknown> | undefined;
 		const blocks = message?.content;
 		if (!Array.isArray(blocks)) continue;
 		const texts: string[] = [];
 		for (const block of blocks) {
 			if (block.type === "text" && typeof block.text === "string") {
 				texts.push(block.text);
 			}
 		}
 		if (texts.length > 0) return texts.join("\n");
 	}
 	return "";
 }
 /**
  * Walk parsed events to build a transcript summary.
  *
  * - `system` events with subtype `init` supply the model name.
  * - `assistant` messages contribute one tool call per `tool_use` block.
  * - `user` messages carry `tool_result` blocks; each is matched by id to the
  *   first recorded tool call and fills in that call's `outputPreview`.
  * - `result` events supply final output, duration, cost and turn count; when
  *   several are present the last one wins.
  *
  * Malformed content is tolerated rather than thrown on: non-object entries in
  * a message's `content` are skipped, and a `tool_result` without `content`
  * yields an empty preview.
  *
  * @param events - Parsed transcript events, in transcript order.
  * @returns Summary with defaults (0, null, "", []) for anything not reported.
  */
 export function buildTranscriptSummary(
 	events: TranscriptEvent[],
 ): TranscriptSummary {
 	const previewChars = 200;
 	const isRecord = (value: unknown): value is Record<string, unknown> =>
 		typeof value === "object" && value !== null;
 	// Object-shaped content blocks of an event's message; [] when there are none.
 	const contentBlocks = (
 		e: Record<string, unknown>,
 	): Record<string, unknown>[] => {
 		const content = isRecord(e.message) ? e.message.content : undefined;
 		return Array.isArray(content) ? content.filter(isRecord) : [];
 	};
 	const toolCalls: ToolCallSummary[] = [];
 	let finalOutput = "";
 	let totalDurationMs = 0;
 	let totalCostUsd: number | null = null;
 	let model: string | null = null;
 	let totalTurns = 0;
 	for (const event of events) {
 		const e = event as Record<string, unknown>;
 		// System init: extract model
 		if (e.type === "system" && e.subtype === "init") {
 			model = typeof e.model === "string" ? e.model : null;
 		}
 		// Assistant messages: extract tool_use blocks
 		if (e.type === "assistant") {
 			for (const block of contentBlocks(e)) {
 				if (block.type !== "tool_use") continue;
 				toolCalls.push({
 					tool: typeof block.name === "string" ? block.name : "unknown",
 					toolUseId: typeof block.id === "string" ? block.id : "",
 					input: isRecord(block.input) ? block.input : {},
 					outputPreview: "",
 				});
 			}
 		}
 		// User messages: extract tool_result blocks and match to tool calls
 		if (e.type === "user") {
 			for (const block of contentBlocks(e)) {
 				if (block.type !== "tool_result") continue;
 				const matching = toolCalls.find(
 					(tc) => tc.toolUseId === block.tool_use_id,
 				);
 				if (!matching) continue;
 				// JSON.stringify returns undefined for a missing `content`, so
 				// fall back to "" instead of calling slice on undefined.
 				const text =
 					typeof block.content === "string"
 						? block.content
 						: (JSON.stringify(block.content) ?? "");
 				matching.outputPreview = text.slice(0, previewChars);
 			}
 		}
 		// Result event: final output, cost, duration, turns
 		if (e.type === "result") {
 			finalOutput = typeof e.result === "string" ? e.result : "";
 			totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0;
 			totalCostUsd =
 				typeof e.total_cost_usd === "number" ? e.total_cost_usd : null;
 			totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0;
 		}
 	}
 	return {
 		totalTurns,
 		totalDurationMs,
 		totalCostUsd,
 		model,
 		toolCalls,
 		finalOutput,
 	};
 }
--- a/packages/evals/src/types.ts
+++ b/packages/evals/src/types.ts
@@ -32,4 +32,10 @@ export interface EvalRunResult {
 	/** Files the agent created or modified in the workspace */
 	filesModified: string[];
 	error?: string;
 	/** Path to the persisted results directory for this run */
 	resultsDir?: string;
 	/** Number of tool calls the agent made */
 	toolCallCount?: number;
 	/** Total cost in USD (from stream-json result event) */
 	costUsd?: number;
 }