two more scenarios, and the Claude Code CLI is now a dependency

This commit is contained in:
Pedro Rodrigues
2026-02-20 15:02:59 +00:00
parent 9a23c6b021
commit e03bc99ebb
24 changed files with 1766 additions and 21 deletions

View File

@@ -20,6 +20,12 @@ hidden tests check the result. Binary pass/fail.
The agent is **Claude Code** invoked via `claude -p` (print mode). It operates
on a real filesystem in a temp directory and can read/write files freely.
**Important**: MCP servers are disabled via `--strict-mcp-config` with an empty
config. This ensures the agent uses only local tools (Bash, Edit, Write, Read,
Glob, Grep) and cannot access remote services like Supabase MCP or Neon. All
work must happen on the local filesystem — e.g., creating migration files in
`supabase/migrations/`, not applying them to a remote project.
## Eval Structure
Each eval lives in `evals/{scenario-name}/`:

View File

@@ -0,0 +1,252 @@
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { expect, test } from "vitest";
const supabaseDir = join(process.cwd(), "supabase");
const migrationsDir = join(supabaseDir, "migrations");

/**
 * List absolute paths of every .sql file under supabase/migrations/.
 * Returns an empty array when the directory does not exist.
 */
function findMigrationFiles(): string[] {
  if (!existsSync(migrationsDir)) {
    return [];
  }
  const sqlFiles: string[] = [];
  for (const entry of readdirSync(migrationsDir)) {
    if (entry.endsWith(".sql")) {
      sqlFiles.push(join(migrationsDir, entry));
    }
  }
  return sqlFiles;
}

/**
 * Read every migration file and join their contents with newlines.
 * @throws Error when no migration file exists yet.
 */
function getMigrationSQL(): string {
  const files = findMigrationFiles();
  if (files.length === 0) {
    throw new Error("No migration file found in supabase/migrations/");
  }
  const contents: string[] = [];
  for (const file of files) {
    contents.push(readFileSync(file, "utf-8"));
  }
  return contents.join("\n");
}
test("migration file exists", () => {
expect(findMigrationFiles().length).toBeGreaterThan(0);
});
test("creates avatars bucket", () => {
const sql = getMigrationSQL().toLowerCase();
// Should insert into storage.buckets with id 'avatars' and public = true
expect(sql).toMatch(/storage\.buckets/);
expect(sql).toMatch(/avatars/);
expect(sql).toMatch(/public/);
// Verify it's marked as a public bucket (true)
const avatarsBlock = sql.match(
/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
);
expect(avatarsBlock).not.toBeNull();
if (avatarsBlock) {
expect(avatarsBlock[0]).toMatch(/true/);
}
});
// The documents bucket must exist and be private (public = false).
test("creates documents bucket", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/documents/);
  const documentsBlock = sql.match(
    /insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
  );
  expect(documentsBlock).not.toBeNull();
  if (documentsBlock !== null) {
    expect(documentsBlock[0]).toMatch(/false/);
  }
});

// Avatars must be restricted to the three allowed image MIME types.
test("avatars bucket has mime type restriction", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/allowed_mime_types/);
  for (const mimePattern of [/image\/jpeg/, /image\/png/, /image\/webp/]) {
    expect(sql).toMatch(mimePattern);
  }
});

// Avatars must carry a ~2MB size cap, in any common spelling.
test("avatars bucket has file size limit", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/file_size_limit/);
  // Accept raw bytes (2097152), a "2MB"-style string, or an arithmetic form.
  const numericForm = /2097152/.test(sql);
  const stringForm = /2\s*m/i.test(sql);
  const arithmeticForm = /2\s*\*\s*1024\s*\*\s*1024/.test(sql);
  expect(numericForm || stringForm || arithmeticForm).toBe(true);
});
// Folder-per-user isolation: policies should derive the folder segment via
// storage.foldername(name) and compare it against auth.uid() cast to text.
test("storage policy uses foldername or path for user isolation", () => {
  const sql = getMigrationSQL().toLowerCase();
  const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
  // Alternatively accept a direct path comparison against the user id.
  const usesPathMatch =
    /\(\s*storage\.foldername\s*\(/.test(sql) ||
    /\bname\b.*auth\.uid\(\)/.test(sql);
  expect(usesFoldername || usesPathMatch).toBe(true);
  // auth.uid() yields a uuid; comparing it to a folder name needs ::text.
  expect(sql).toMatch(/auth\.uid\(\)\s*::\s*text/);
});
// Storage policies must name their target role explicitly via TO.
test("storage policy uses TO authenticated", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const storagePolicies = policyBlocks.filter((p) =>
    p.toLowerCase().includes("storage.objects"),
  );
  // At least one storage policy must declare an explicit role target —
  // TO authenticated or TO public (the check deliberately accepts both,
  // since read policies may legitimately target public).
  const hasAuthenticatedPolicy = storagePolicies.some((p) =>
    /to\s+(authenticated|public)/.test(p.toLowerCase()),
  );
  expect(hasAuthenticatedPolicy).toBe(true);
  // Upload (INSERT) policies specifically must be TO authenticated,
  // never public/anon.
  const insertPolicies = storagePolicies.filter((p) =>
    /for\s+insert/.test(p.toLowerCase()),
  );
  for (const insertPolicy of insertPolicies) {
    expect(insertPolicy.toLowerCase()).toMatch(/to\s+authenticated/);
  }
});
// Avatars are world-readable: a SELECT policy on storage.objects for the
// avatars bucket must grant access TO public or TO anon.
test("public read policy for avatars", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const avatarSelectPolicies = policyBlocks.filter((p) => {
    const lower = p.toLowerCase();
    return (
      lower.includes("storage.objects") &&
      /for\s+select/.test(lower) &&
      lower.includes("avatars")
    );
  });
  expect(avatarSelectPolicies.length).toBeGreaterThan(0);
  const hasPublicAccess = avatarSelectPolicies.some((p) => {
    const lower = p.toLowerCase();
    return /to\s+public/.test(lower) || /to\s+anon/.test(lower);
  });
  expect(hasPublicAccess).toBe(true);
});

// Documents are strictly private: no documents policy may grant public or
// anon access, and every one must be scoped TO authenticated.
test("documents bucket is fully private", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const documentPolicies = policyBlocks.filter((p) => {
    const lower = p.toLowerCase();
    return lower.includes("storage.objects") && lower.includes("documents");
  });
  expect(documentPolicies.length).toBeGreaterThan(0);
  // First pass: reject any public/anon grant.
  for (const policy of documentPolicies) {
    expect(policy).not.toMatch(/to\s+public/);
    expect(policy).not.toMatch(/to\s+anon/);
  }
  // Second pass: require the authenticated scope on every policy.
  for (const policy of documentPolicies) {
    expect(policy).toMatch(/to\s+authenticated/);
  }
});
// The file_metadata tracking table must be created.
test("creates file_metadata table", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+table/);
  expect(sql).toMatch(/file_metadata/);
});

// Rows must be tied to auth.users and removed when the user is deleted.
test("file_metadata has FK to auth.users with CASCADE", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/references\s+auth\.users/);
  expect(sql).toMatch(/on\s+delete\s+cascade/);
});

// RLS must be switched on for file_metadata.
test("RLS enabled on file_metadata", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(
    /alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/,
  );
});

// Performance best practice: auth.uid() should appear as (select auth.uid())
// so Postgres evaluates it once per statement rather than once per row.
test("file_metadata policies use (select auth.uid())", () => {
  const rawSql = getMigrationSQL();
  const policyBlocks = rawSql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const metadataPolicies = policyBlocks.filter((p) =>
    p.toLowerCase().includes("file_metadata"),
  );
  for (const metadataPolicy of metadataPolicies) {
    if (metadataPolicy.includes("auth.uid()")) {
      expect(metadataPolicy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
    }
  }
});
// Time columns should use timestamptz, never a bare "timestamp".
test("uses timestamptz for time columns", () => {
  const sql = getMigrationSQL().toLowerCase();
  // A "timestamp" token not followed by "tz" or "with time zone" is bare.
  const plainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
  const definesTimeColumns =
    sql.includes("created_at") ||
    sql.includes("updated_at") ||
    sql.includes("uploaded_at");
  // Only enforce when the migration actually declares time columns.
  if (definesTimeColumns) {
    expect(sql).not.toMatch(plainTimestamp);
  }
});

// The user_id FK on file_metadata should be indexed for policy lookups.
test("index on file_metadata user_id", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+index/);
  expect(sql).toMatch(/file_metadata/);
  expect(sql).toMatch(/user_id/);
});

// Re-runnable DDL: the migration should guard with IF NOT EXISTS.
test("idempotent DDL", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/if\s+not\s+exists/);
});
// Aggregate rubric: the migration must exhibit at least 11 of 15
// best-practice signals to pass.
test("overall quality score", () => {
  const sql = getMigrationSQL().toLowerCase();
  const signals = [
    // 1. Avatars bucket row inserted into storage.buckets
    /insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
    // 2. Documents bucket row inserted into storage.buckets
    /insert\s+into\s+storage\.buckets[\s\S]*?documents/,
    // 3. MIME type restriction configured
    /allowed_mime_types/,
    // 4. File size limit configured
    /file_size_limit/,
    // 5. storage.foldername helper used for folder isolation
    /storage\.foldername/,
    // 6. auth.uid() cast to text for folder comparison
    /auth\.uid\(\)\s*::\s*text/,
    // 7. Policies scoped TO authenticated
    /to\s+authenticated/,
    // 8. Public read access granted (TO public or TO anon)
    /to\s+(public|anon)/,
    // 9. Row level security enabled somewhere (expected on file_metadata)
    /enable\s+row\s+level\s+security/,
    // 10. FK to auth.users with cascade delete
    /on\s+delete\s+cascade/,
    // 11. (select auth.uid()) subselect form for per-statement caching
    /\(select\s+auth\.uid\(\)\)/,
    // 12. Index created (expected on user_id)
    /create\s+index/,
    // 13. timestamptz used for time columns
    /timestamptz/,
    // 14. IF NOT EXISTS guards for idempotency
    /if\s+not\s+exists/,
    // 15. file_metadata table created
    /create\s+table[\s\S]*?file_metadata/,
  ];
  const matchCount = signals.filter((signal) => signal.test(sql)).length;
  expect(matchCount).toBeGreaterThanOrEqual(11);
});

View File

@@ -0,0 +1,12 @@
I need to set up file storage for my app. There are two use cases:
1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but only the owning user can upload or replace their own. Only allow image files (JPEG, PNG, WebP). Max 2MB.
2. **Documents** -- Users upload private documents that only they can access. Max 50MB. No file type restriction.
The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration that:
- Configures both storage buckets
- Adds RLS policies on `storage.objects` so each user can only access their own folder (folder name = user ID)
- Creates a `file_metadata` table to track uploaded files (file name, bucket, size, user reference) with appropriate security
Users are authenticated via Supabase Auth.

View File

@@ -0,0 +1,5 @@
{
"name": "storage-rls-user-folders",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,64 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "storage-rls-user-folders"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned by a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false

View File

@@ -0,0 +1,201 @@
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { expect, test } from "vitest";
const supabaseDir = join(process.cwd(), "supabase");
const migrationsDir = join(supabaseDir, "migrations");

/**
 * Collect absolute paths for all .sql files in supabase/migrations/.
 * An absent directory yields an empty array.
 */
function findMigrationFiles(): string[] {
  if (!existsSync(migrationsDir)) {
    return [];
  }
  const found: string[] = [];
  for (const name of readdirSync(migrationsDir)) {
    if (name.endsWith(".sql")) {
      found.push(join(migrationsDir, name));
    }
  }
  return found;
}

/**
 * Concatenate the contents of every migration file for assertions.
 * @throws Error when no migration file has been created.
 */
function getMigrationSQL(): string {
  const files = findMigrationFiles();
  if (files.length === 0) {
    throw new Error("No migration file found in supabase/migrations/");
  }
  const parts: string[] = [];
  for (const file of files) {
    parts.push(readFileSync(file, "utf-8"));
  }
  return parts.join("\n");
}

// The agent must have produced at least one migration file.
test("migration file exists", () => {
  const migrationFiles = findMigrationFiles();
  expect(migrationFiles.length).toBeGreaterThan(0);
});
test("creates organizations table", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?organizations/);
});
test("creates memberships table", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?memberships/);
});
test("creates projects table", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?projects/);
});
test("enables RLS on all tables", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/,
);
expect(sql).toMatch(
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/,
);
expect(sql).toMatch(
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/,
);
});
// memberships must reference auth.users and cascade on user deletion,
// satisfying "handle the case where a user is deleted from auth".
test("FK to auth.users with ON DELETE CASCADE", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/references\s+auth\.users/);
  expect(sql).toMatch(/on\s+delete\s+cascade/);
});

// projects must carry a foreign key back to organizations.
test("org_id FK on projects", () => {
  const sql = getMigrationSQL().toLowerCase();
  // Fixed: the previous pattern used a character class ([anization_]*),
  // which matched any scramble of those letters (e.g. "organoid...id").
  // Match the two real column spellings: org_id or organization_id.
  expect(sql).toMatch(
    /org(?:anization)?_id[\s\S]*?references[\s\S]*?organizations/,
  );
});
// A dedicated private schema should exist to host helper functions.
test("private schema created", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
});

// Membership lookups should go through a SECURITY DEFINER helper in the
// private schema, with an empty search_path to prevent hijacking.
test("security_definer helper function", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/private\./);
  expect(sql).toMatch(/security\s+definer/);
  expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
});

// Policies should wrap auth.uid() in a subselect so Postgres evaluates it
// once per statement instead of once per row.
test("policies use (select auth.uid())", () => {
  const rawSql = getMigrationSQL();
  const policyBlocks = rawSql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  expect(policyBlocks.length).toBeGreaterThan(0);
  for (const policy of policyBlocks) {
    if (policy.includes("auth.uid()")) {
      expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
    }
  }
});

// Every policy must name its role target explicitly.
test("policies use TO authenticated", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  expect(policyBlocks.length).toBeGreaterThan(0);
  for (const policy of policyBlocks) {
    expect(policy).toMatch(/to\s+authenticated/);
  }
});
// memberships.user_id / org_id must be indexed so policy subqueries are fast.
test("index on membership lookup columns", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+index/);
  const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
  const lookupIndexes = indexBlocks.filter(
    (idx) =>
      idx.includes("user_id") ||
      idx.includes("org_id") ||
      idx.includes("organization_id"),
  );
  expect(lookupIndexes.length).toBeGreaterThanOrEqual(1);
});

// Bare "timestamp" columns are rejected when time columns are declared.
test("uses timestamptz", () => {
  const sql = getMigrationSQL().toLowerCase();
  // A "timestamp" token not followed by "tz" or "with time zone" is bare.
  const plainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
  const hasTimeColumns =
    sql.includes("created_at") ||
    sql.includes("updated_at") ||
    sql.includes("_at ");
  if (hasTimeColumns) {
    expect(sql).not.toMatch(plainTimestamp);
  }
});

// DDL should be guarded with IF NOT EXISTS so the migration is re-runnable.
test("idempotent DDL", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/if\s+not\s+exists/);
});
// Only org owners may delete projects: the DELETE policy on projects has
// to gate on an owner (or admin) role from memberships.
test("delete policy restricted to owner role", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const deletePolicy = policyBlocks.find((p) => {
    const lower = p.toLowerCase();
    return lower.includes("delete") && lower.includes("project");
  });
  expect(deletePolicy).toBeDefined();
  expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/);
});
// Aggregate rubric: the migration must exhibit at least 10 of 14
// best-practice signals to pass.
test("overall quality score", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  // Each entry is a boolean signal; most should hold for a quality migration.
  const signals = [
    // 1. RLS enabled on all three tables
    /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
      sql,
    ) &&
      /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ) &&
      /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ),
    // 2. FK to auth.users with cascade delete
    /references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
    // 3. Private schema created
    /create\s+schema[\s\S]*?private/.test(sql),
    // 4. SECURITY DEFINER helper with pinned empty search_path
    /security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql),
    // 5. Subselect form of auth.uid() for per-statement caching
    /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
    // 6. Every policy scoped TO authenticated
    policyBlocks.length > 0 &&
      policyBlocks.every((p) => /to\s+authenticated/.test(p)),
    // 7. Indexes on lookup columns
    /create\s+index/.test(sql),
    // 8. timestamptz only (no bare "timestamp" columns)
    !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql),
    // 9. Idempotent DDL via IF NOT EXISTS
    /if\s+not\s+exists/.test(sql),
    // 10. DELETE policy on projects gated on owner/admin role
    policyBlocks.some(
      (p) =>
        p.toLowerCase().includes("delete") &&
        p.toLowerCase().includes("project") &&
        /owner|admin/.test(p.toLowerCase()),
    ),
    // 11. org FK on projects referencing organizations.
    // Fixed: the old pattern's character class ([anization_]*) matched
    // arbitrary scrambles of those letters; match the two real spellings.
    /org(?:anization)?_id[\s\S]*?references[\s\S]*?organizations/.test(sql),
    // 12. At least one policy per table
    policyBlocks.length >= 3,
    // 13. Membership role column present
    /role/.test(sql),
    // 14. Private-schema helper referenced (e.g. from policies)
    /private\./.test(sql),
  ];
  const passed = signals.filter(Boolean).length;
  expect(passed).toBeGreaterThanOrEqual(10);
});

View File

@@ -0,0 +1,14 @@
I'm building a project management app where users can belong to multiple organizations. Each organization has projects that all members can view and edit.
The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration with:
1. An `organizations` table (name, slug)
2. A `memberships` table linking users to organizations with a role column (owner, admin, member)
3. A `projects` table (name, description, status) belonging to an organization
Set up Row Level Security so:
- Users can only see organizations they belong to
- Users can only see and manage projects in their organizations
- Only org owners can delete projects
The migration should handle the case where a user is deleted from auth.

View File

@@ -0,0 +1,5 @@
{
"name": "team-rls-security-definer",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "team-rls-security-definer"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned by a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -9,6 +9,7 @@
"version": "1.0.0",
"license": "MIT",
"dependencies": {
"@anthropic-ai/claude-code": "^2.1.49",
"braintrust": "^3.0.0"
},
"devDependencies": {
@@ -18,6 +19,29 @@
"vitest": "^3.1.0"
}
},
"node_modules/@anthropic-ai/claude-code": {
"version": "2.1.49",
"resolved": "https://registry.npmjs.org/@anthropic-ai/claude-code/-/claude-code-2.1.49.tgz",
"integrity": "sha512-PonEmTZlB5IZbBu9TmtOpGZnupU7OxOXTsJKcXE/4Ak5qp3ptN1wSBRdgKYnn6GDYhXijTXuVVwrCQU+NAgwPA==",
"license": "SEE LICENSE IN README.md",
"bin": {
"claude": "cli.js"
},
"engines": {
"node": ">=18.0.0"
},
"optionalDependencies": {
"@img/sharp-darwin-arm64": "^0.34.2",
"@img/sharp-darwin-x64": "^0.34.2",
"@img/sharp-linux-arm": "^0.34.2",
"@img/sharp-linux-arm64": "^0.34.2",
"@img/sharp-linux-x64": "^0.34.2",
"@img/sharp-linuxmusl-arm64": "^0.34.2",
"@img/sharp-linuxmusl-x64": "^0.34.2",
"@img/sharp-win32-arm64": "^0.34.2",
"@img/sharp-win32-x64": "^0.34.2"
}
},
"node_modules/@colors/colors": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
@@ -444,6 +468,310 @@
"node": ">=18"
}
},
"node_modules/@img/sharp-darwin-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz",
"integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-darwin-arm64": "1.2.4"
}
},
"node_modules/@img/sharp-darwin-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz",
"integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-darwin-x64": "1.2.4"
}
},
"node_modules/@img/sharp-libvips-darwin-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz",
"integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"darwin"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-darwin-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz",
"integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==",
"cpu": [
"x64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"darwin"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-arm": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz",
"integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==",
"cpu": [
"arm"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz",
"integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz",
"integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==",
"cpu": [
"x64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linuxmusl-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz",
"integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linuxmusl-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz",
"integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==",
"cpu": [
"x64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-linux-arm": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz",
"integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==",
"cpu": [
"arm"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-arm": "1.2.4"
}
},
"node_modules/@img/sharp-linux-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz",
"integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-arm64": "1.2.4"
}
},
"node_modules/@img/sharp-linux-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz",
"integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-x64": "1.2.4"
}
},
"node_modules/@img/sharp-linuxmusl-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz",
"integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linuxmusl-arm64": "1.2.4"
}
},
"node_modules/@img/sharp-linuxmusl-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz",
"integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linuxmusl-x64": "1.2.4"
}
},
"node_modules/@img/sharp-win32-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz",
"integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==",
"cpu": [
"arm64"
],
"license": "Apache-2.0 AND LGPL-3.0-or-later",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-win32-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz",
"integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==",
"cpu": [
"x64"
],
"license": "Apache-2.0 AND LGPL-3.0-or-later",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@jridgewell/sourcemap-codec": {
"version": "1.5.5",
"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",

View File

@@ -10,6 +10,7 @@
"eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts"
},
"dependencies": {
"@anthropic-ai/claude-code": "^2.1.49",
"braintrust": "^3.0.0"
},
"devDependencies": {

View File

@@ -49,3 +49,121 @@ The agent initializes a Supabase project and creates a migration file that:
| index on user_id | `CREATE INDEX` on the FK column |
| IF NOT EXISTS | Idempotent migration |
| overall quality | At least 4/5 best-practice signals present |
## Scenario 2: team-rls-security-definer
**Description:** Create a SQL migration for a team-based project management app
where users belong to organizations via a membership table. The migration must
define tables for organizations, memberships, and projects, then secure them
with RLS policies that use a `security definer` helper function in a private
schema to efficiently resolve team membership without per-row joins.
**Setup:** The workspace starts with a pre-initialized Supabase project
(`supabase/config.toml` exists, empty `supabase/migrations/` directory). The
agent creates migration files within this structure.
**Expected skill files read:**
- `SKILL.md` (skill body with reference file index)
- `references/db-rls-mandatory.md`
- `references/db-rls-policy-types.md`
- `references/db-rls-common-mistakes.md`
- `references/db-rls-performance.md`
- `references/db-security-functions.md`
- `references/db-schema-auth-fk.md`
- `references/db-schema-timestamps.md`
- `references/db-perf-indexes.md`
- `references/db-migrations-idempotent.md`
**Expected result:**
The agent creates a migration file that:
- Creates organizations, memberships, and projects tables with `timestamptz` columns
- Has `user_id` FK to `auth.users(id)` with `ON DELETE CASCADE` on memberships
- Has `org_id` FK on projects referencing organizations
- Enables RLS on all three tables
- Creates a private schema with a `security definer` helper function (`SET search_path = ''`)
- Creates RLS policies using `(select auth.uid())` with `TO authenticated`
- Creates indexes on membership lookup columns (user_id, org_id)
- Has a delete policy on projects restricted to owner role
- Uses `IF NOT EXISTS` for idempotency
**Scorer:** Binary pass/fail (16 vitest assertions)
| Test | What it checks |
| --- | --- |
| migration file exists | A `.sql` file exists in `supabase/migrations/` |
| creates organizations table | SQL contains `CREATE TABLE` for organizations |
| creates memberships table | SQL contains `CREATE TABLE` for memberships |
| creates projects table | SQL contains `CREATE TABLE` for projects |
| enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables |
| FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade |
| org_id FK on projects | projects references organizations |
| private schema created | `CREATE SCHEMA ... private` present |
| security_definer helper function | Function in private schema with `SECURITY DEFINER` and `SET search_path = ''` |
| policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() |
| policies use TO authenticated | All policies scoped to authenticated role |
| index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships |
| uses timestamptz | No plain `timestamp` for time columns |
| idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns |
| delete policy restricted to owner role | A delete policy on projects checks for owner/admin role |
| overall quality score | At least 10/14 best-practice signals present |
## Scenario 3: storage-rls-user-folders
**Description:** Create a SQL migration that sets up Supabase Storage buckets
with RLS policies for user content: an avatars bucket (public reads,
authenticated uploads restricted to user folders) and a documents bucket (fully
private, user-isolated), with file type restrictions, storage helper functions
in policies, and a file_metadata tracking table secured with RLS.
**Setup:** Pre-initialized Supabase project (`supabase/config.toml` exists)
with an empty `supabase/migrations/` directory. The agent creates migration
files within this structure.
**Expected skill files read:**
- `SKILL.md` (skill body with reference file index)
- `references/storage-access-control.md`
- `references/db-rls-mandatory.md`
- `references/db-rls-common-mistakes.md`
- `references/db-rls-performance.md`
- `references/db-schema-auth-fk.md`
- `references/db-schema-timestamps.md`
- `references/db-perf-indexes.md`
- `references/db-migrations-idempotent.md`
**Expected result:**
The agent creates a migration file that:
- Inserts avatars bucket into `storage.buckets` with `public = true`, MIME type restrictions, and file size limit
- Inserts documents bucket with `public = false`
- Creates RLS policies on `storage.objects` using `storage.foldername(name)` with `auth.uid()::text`
- Scopes upload policies `TO authenticated` and avatars SELECT policy `TO public`
- Creates `file_metadata` table with FK to `auth.users` with `ON DELETE CASCADE`
- Enables RLS on `file_metadata` with policies using `(select auth.uid())`
- Uses `timestamptz` for time columns, indexes `user_id`, and `IF NOT EXISTS` for idempotency
**Scorer:** Binary pass/fail (17 vitest assertions)
| Test | What it checks |
| --- | --- |
| migration file exists | A `.sql` file exists in `supabase/migrations/` |
| creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` |
| creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` |
| avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) |
| avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) |
| storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` |
| storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` |
| public read policy for avatars | A SELECT policy on storage.objects for avatars allows public/anon access |
| documents bucket is fully private | Policies for documents restrict all operations to authenticated owner |
| creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata |
| file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` |
| RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` |
| file_metadata policies use (select auth.uid()) | Subselect form in policies |
| uses timestamptz for time columns | No plain `timestamp` in file_metadata |
| index on file_metadata user_id | `CREATE INDEX` on user_id column |
| idempotent DDL | Uses `IF NOT EXISTS` patterns |
| overall quality score | At least 11/15 best-practice signals present |

View File

@@ -0,0 +1,144 @@
# Scenario: storage-rls-user-folders
## Summary
The agent must create a SQL migration that sets up Supabase Storage buckets
with RLS policies for a user-content application. The migration must configure
an avatars bucket (public reads, authenticated uploads restricted to user
folders) and a documents bucket (fully private, user-isolated), with proper
file type restrictions, storage helper functions in policies, and a
file_metadata tracking table secured with RLS.
## Real-World Justification
Why this is a common and important workflow:
1. **Storage RLS is confusing and under-documented compared to table RLS** --
Developers consistently struggle with the distinction between public/private
buckets and the RLS policies needed on `storage.objects`. Multiple GitHub
discussions show confusion about which SDK operations map to which SQL
operations (INSERT, SELECT, UPDATE, DELETE).
- Source: https://github.com/orgs/supabase/discussions/37611
- Source: https://github.com/orgs/supabase/discussions/38700
2. **User-folder isolation is the canonical storage security pattern** -- The
official Supabase docs demonstrate folder-based isolation using
`storage.foldername(name)` and `auth.uid()::text`, but developers frequently
get the casting or array indexing wrong.
- Source: https://supabase.com/docs/guides/storage/security/access-control
3. **Missing file type restrictions leads to security vulnerabilities** --
Without `allowed_mime_types` on the bucket or extension checks in RLS
policies, users can upload executable files or oversized payloads. The
Supabase security best practices guide calls this out as a common oversight.
- Source: https://supaexplorer.com/guides/supabase-security-best-practices
- Source: https://supabase.com/docs/guides/storage/buckets/fundamentals
## Skill References Exercised
Which reference files the agent should consult and what each teaches:
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/storage-access-control.md` | Bucket visibility, RLS on storage.objects, storage helper functions, SDK-to-SQL operation mapping | User-folder policies using `storage.foldername()`, separate SELECT/INSERT policies |
| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on the file_metadata tracking table |
| `references/db-rls-common-mistakes.md` | Missing TO clause, missing SELECT policy for UPDATE | Use `TO authenticated` (or `TO public` for public reads), include SELECT policy |
| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT subquery | Use `(select auth.uid())` in both storage and table policies |
| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | file_metadata.user_id references auth.users with cascade |
| `references/db-schema-timestamps.md` | Use timestamptz not timestamp | Time columns on file_metadata use timestamptz |
| `references/db-perf-indexes.md` | Index columns used in policy lookups | Index user_id on file_metadata |
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout |
## Workspace Setup
What the workspace starts with before the agent runs:
- Pre-initialized Supabase project (`supabase/config.toml` exists)
- Empty `supabase/migrations/` directory
- The agent creates migration files within this structure
## Agent Task (PROMPT.md draft)
The prompt to give the agent. Written as a developer would ask it:
> I need to set up file storage for my app. There are two use cases:
>
> 1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but
> only the owning user can upload or replace their own. Only allow image
> files (JPEG, PNG, WebP). Max 2MB.
>
> 2. **Documents** -- Users upload private documents that only they can access.
> Max 50MB. No file type restriction.
>
> Create a SQL migration that:
> - Configures both storage buckets
> - Adds RLS policies on `storage.objects` so each user can only access their
> own folder (folder name = user ID)
> - Creates a `file_metadata` table to track uploaded files (file name, bucket,
> size, user reference) with appropriate security
>
> Users are authenticated via Supabase Auth.
## Evaluation Criteria
What vitest should assert on the agent's output. Each assertion tests a
specific quality signal:
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
| 2 | creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` | correctness |
| 3 | creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` | correctness |
| 4 | avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) | security |
| 5 | avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) | security |
| 6 | storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` | security |
| 7 | storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` | security |
| 8 | public read policy for avatars | A SELECT policy on storage.objects for avatars bucket allows public/anon access | correctness |
| 9 | documents bucket is fully private | Policies for documents bucket restrict all operations to authenticated owner | security |
| 10 | creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata | correctness |
| 11 | file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` | correctness |
| 12 | RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` | security |
| 13 | file_metadata policies use (select auth.uid()) | Subselect form in policies | performance |
| 14 | uses timestamptz for time columns | No plain `timestamp` in file_metadata | correctness |
| 15 | index on file_metadata user_id | `CREATE INDEX` on user_id column | performance |
| 16 | idempotent DDL | Uses `IF NOT EXISTS` patterns | idempotency |
| 17 | overall quality score | At least 11/15 best-practice signals present | overall |
## Reasoning
Step-by-step reasoning for why this scenario is well-designed:
1. **Baseline differentiator:** An agent without the skill would likely: (a)
confuse public bucket visibility with unrestricted upload access, (b) write
storage policies without using `storage.foldername()` or get the array
indexing wrong, (c) forget to set `allowed_mime_types` on the bucket itself,
(d) omit the `TO authenticated` clause on storage policies, (e) use bare
`auth.uid()` instead of the subselect form, (f) skip the `::text` cast when
comparing auth.uid() to folder names. These are all Supabase-specific
patterns that require reading the skill references.
2. **Skill value:** The storage-access-control reference explicitly documents:
the public vs private bucket distinction, the `storage.foldername()` helper
function pattern, the SDK-to-SQL operation mapping, and bucket configuration
with mime types and size limits. Combined with the database security
references (RLS mandatory, common mistakes, performance), this scenario
exercises 8 reference files.
3. **Testability:** Bucket configuration (INSERT INTO storage.buckets), storage
helper function usage (storage.foldername), policy clauses (TO
authenticated, TO public), mime types, file size limits, and all table-level
patterns (RLS, FK, indexes, timestamptz) are reliably detectable via regex
on SQL text.
4. **Realism:** Nearly every Supabase application that handles user-generated
content needs avatar uploads and document storage. This is a day-one task
for any SaaS product. The GitHub discussions linked above show dozens of
developers hitting exactly these issues when setting up storage for the
first time.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~30-45% of assertions expected to pass
- With skill: ~85-95% of assertions expected to pass

View File

@@ -0,0 +1,139 @@
# Scenario: team-rls-security-definer
## Summary
The agent must create a SQL migration for a team-based project management app
where users belong to organizations via a membership table. The migration must
define tables for organizations, memberships, and projects, then secure them
with RLS policies that use a `security definer` helper function in a private
schema to efficiently resolve team membership without per-row joins.
## Real-World Justification
Why this is a common and important workflow:
1. **Multi-tenant team access is the most-asked RLS question on Supabase** --
The official Supabase GitHub has multiple high-engagement discussions about
how to write RLS policies that check team/org membership without causing
performance issues or security holes.
- Source: https://github.com/supabase/supabase/discussions/4509
- Source: https://github.com/supabase/supabase/discussions/811
2. **security_definer in public schema is a documented security anti-pattern** --
Developers frequently place security_definer functions in the public schema,
inadvertently exposing them via the PostgREST API. The Supabase docs and
community discussions explicitly warn against this.
- Source: https://github.com/supabase/supabase/discussions/3269
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
3. **RLS policy performance with joins is a top pain point** -- Naive policies
that join against a memberships table execute per-row, causing severe
performance degradation. The recommended pattern is a security_definer
function that caches results via subselect.
- Source: https://github.com/orgs/supabase/discussions/1148
- Source: https://makerkit.dev/blog/tutorials/supabase-rls-best-practices
## Skill References Exercised
Which reference files the agent should consult and what each teaches:
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on organizations, memberships, and projects |
| `references/db-rls-policy-types.md` | PERMISSIVE vs RESTRICTIVE policies | Use PERMISSIVE policies for team OR owner access patterns |
| `references/db-rls-common-mistakes.md` | Missing TO clause, user_metadata pitfalls | Always use `TO authenticated` on all policies |
| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT, use security_definer for joins | Use `(select auth.uid())` and a private-schema helper function |
| `references/db-security-functions.md` | security_definer in private schema with search_path = '' | Create helper function in private schema, revoke default permissions |
| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | Reference auth.users with cascade on memberships |
| `references/db-schema-timestamps.md` | Use timestamptz not timestamp | All time columns use timestamptz |
| `references/db-perf-indexes.md` | Index columns used in RLS policies | Index user_id and org_id columns used in policy lookups |
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout the migration |
## Workspace Setup
What the workspace starts with before the agent runs:
- Pre-initialized Supabase project (`supabase/config.toml` exists)
- Empty `supabase/migrations/` directory
- The agent creates migration files within this structure
## Agent Task (PROMPT.md draft)
The prompt to give the agent. Written as a developer would ask it:
> I'm building a project management app where users can belong to multiple
> organizations. Each organization has projects that all members can view and
> edit.
>
> Create a SQL migration with:
>
> 1. An `organizations` table (name, slug)
> 2. A `memberships` table linking users to organizations with a role column
> (owner, admin, member)
> 3. A `projects` table (name, description, status) belonging to an organization
>
> Set up Row Level Security so:
> - Users can only see organizations they belong to
> - Users can only see and manage projects in their organizations
> - Only org owners can delete projects
>
> The migration should handle the case where a user is deleted from auth.
## Evaluation Criteria
What vitest should assert on the agent's output. Each assertion tests a
specific quality signal:
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
| 2 | creates organizations table | SQL contains `CREATE TABLE` for organizations | correctness |
| 3 | creates memberships table | SQL contains `CREATE TABLE` for memberships | correctness |
| 4 | creates projects table | SQL contains `CREATE TABLE` for projects | correctness |
| 5 | enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables | security |
| 6 | FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade | correctness |
| 7 | org_id FK on projects | projects references organizations | correctness |
| 8 | private schema created | `CREATE SCHEMA ... private` present | security |
| 9 | security_definer helper function | A function in the private schema with `SECURITY DEFINER` and `SET search_path = ''` | security |
| 10 | policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() | performance |
| 11 | policies use TO authenticated | All policies scoped to authenticated role | security |
| 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance |
| 13 | uses timestamptz | No plain `timestamp` for time columns | correctness |
| 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency |
| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
| 16 | overall quality score | At least 10/14 best-practice signals present | overall |
## Reasoning
Step-by-step reasoning for why this scenario is well-designed:
1. **Baseline differentiator:** An agent without the skill would likely put the
security_definer function in the public schema, omit `SET search_path = ''`,
use bare `auth.uid()` instead of the subselect form, write inline joins in
policies instead of using a helper function, and possibly forget `TO
authenticated` on some policies. These are all patterns that require specific
knowledge of Supabase conventions.
2. **Skill value:** The skill explicitly teaches: (a) private schema for
security_definer functions, (b) `SET search_path = ''` to prevent injection,
(c) `(select auth.uid())` for per-statement caching, (d) using
security_definer functions to avoid per-row joins in policies, (e) `TO
authenticated` on every policy. This is a scenario where reading 5+ reference
files materially improves the output.
3. **Testability:** Every assertion checks for specific SQL patterns via regex.
The private schema, security_definer, search_path, subselect auth.uid(), TO
authenticated, indexes, and timestamptz are all reliably detectable in SQL
text without runtime execution.
4. **Realism:** Multi-tenant team-based access control is one of the most common
Supabase use cases. The GitHub discussions linked above have hundreds of
comments from developers working on exactly this pattern. Project management
apps (Notion, Linear, Asana clones) are a canonical example.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~35-50% of assertions expected to pass
- With skill: ~85-95% of assertions expected to pass

View File

@@ -2,10 +2,12 @@ import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join, resolve } from "node:path";
import { runAgent } from "./runner/agent.js";
import { uploadToBraintrust } from "./runner/braintrust.js";
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js";
import { createWorkspace } from "./runner/scaffold.js";
import { runTests } from "./runner/test.js";
import { buildTranscriptSummary } from "./runner/transcript.js";
import type { EvalRunResult, EvalScenario } from "./types.js";
// ---------------------------------------------------------------------------
@@ -19,6 +21,12 @@ const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
const scenarioFilter = process.env.EVAL_SCENARIO;
const runBaseline = process.env.EVAL_BASELINE === "true";
// Run-level timestamp shared across all scenarios in a single invocation
const runTimestamp = new Date()
.toISOString()
.replace(/[:.]/g, "-")
.replace("Z", "");
// ---------------------------------------------------------------------------
// Discover scenarios
// ---------------------------------------------------------------------------
@@ -58,10 +66,9 @@ async function runEval(
): Promise<EvalRunResult> {
const evalsDir = findEvalsDir();
const evalDir = join(evalsDir, scenario.id);
const variant = skillEnabled ? "with-skill" : "baseline";
console.log(
`\n--- ${scenario.id} (${skillEnabled ? "with-skill" : "baseline"}) ---`,
);
console.log(`\n--- ${scenario.id} (${variant}) ---`);
// 1. Create isolated workspace
const { workspacePath, cleanup } = createWorkspace({
@@ -104,7 +111,10 @@ async function runEval(
// 5. Collect modified files
const filesModified = listModifiedFiles(workspacePath, evalDir);
return {
// 6. Build transcript summary
const summary = buildTranscriptSummary(agentResult.events);
const result: EvalRunResult = {
scenario: scenario.id,
agent: "claude-code",
model,
@@ -116,7 +126,22 @@ async function runEval(
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
filesModified,
toolCallCount: summary.toolCalls.length,
costUsd: summary.totalCostUsd ?? undefined,
};
// 7. Persist results
const resultDir = createResultDir(runTimestamp, scenario.id, variant);
result.resultsDir = resultDir;
saveRunArtifacts({
resultDir,
rawTranscript: agentResult.rawTranscript,
testOutput: testResult.output,
result,
transcriptSummary: summary,
});
return result;
} catch (error) {
const err = error as Error;
return {
@@ -175,7 +200,9 @@ async function main() {
}
}
printSummary(results);
// Use the results dir from the first result (all share the same timestamp)
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
printSummary(results, resultsDir);
if (process.env.BRAINTRUST_UPLOAD === "true") {
console.log("\nUploading to Braintrust...");

View File

@@ -1,13 +1,27 @@
import { spawn } from "node:child_process";
import { resolveClaudeBin } from "./preflight.js";
import {
extractFinalOutput,
parseStreamJsonOutput,
type TranscriptEvent,
} from "./transcript.js";
export interface AgentRunResult {
  /** Extracted final text output (backward-compatible). */
  output: string;
  /** Wall-clock runtime of the agent subprocess, in milliseconds. */
  duration: number;
  /** Raw NDJSON transcript string from stream-json. */
  rawTranscript: string;
  /** Parsed transcript events. */
  events: TranscriptEvent[];
}
/**
* Invoke Claude Code in print mode as a subprocess.
*
* Uses --output-format stream-json to capture structured NDJSON events
* including tool calls, results, and reasoning steps.
*
* The agent operates in the workspace directory and can read/write files.
* When the skill is installed (symlinked into workspace), Claude Code
* discovers it automatically and uses it for guidance.
@@ -23,14 +37,22 @@ export async function runAgent(opts: {
const args = [
"-p", // Print mode (non-interactive)
"--verbose",
"--output-format",
"text",
"stream-json",
"--model",
opts.model,
"--no-session-persistence",
"--dangerously-skip-permissions",
"--tools",
"Edit,Write,Bash,Read,Glob,Grep",
// Disable all MCP servers so the agent uses only local filesystem tools.
// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
// leak in and the agent may apply migrations to a remote project
// instead of creating local files.
"--mcp-config",
'{"mcpServers":{}}',
"--strict-mcp-config",
];
// Disable skills for baseline runs so the agent relies on innate knowledge
@@ -46,8 +68,10 @@ export async function runAgent(opts: {
}
}
const claudeBin = resolveClaudeBin();
return new Promise<AgentRunResult>((resolve) => {
const child = spawn("claude", args, {
const child = spawn(claudeBin, args, {
cwd: opts.cwd,
env,
stdio: ["pipe", "pipe", "pipe"],
@@ -73,9 +97,15 @@ export async function runAgent(opts: {
child.on("close", () => {
clearTimeout(timer);
const rawTranscript = stdout || stderr;
const events = parseStreamJsonOutput(rawTranscript);
const output = extractFinalOutput(events) || rawTranscript;
resolve({
output: stdout || stderr,
output,
duration: Date.now() - start,
rawTranscript,
events,
});
});
});

View File

@@ -0,0 +1,56 @@
import { mkdirSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/** Absolute path of the evals package root (packages/evals). */
function evalsRoot(): string {
  // This module lives in packages/evals/src/runner, so the package
  // root is exactly two directories up from here.
  const srcDir = join(__dirname, "..");
  return join(srcDir, "..");
}
/**
 * Create (if needed) the directory holding artifacts for one scenario run.
 *
 * Layout: results/<runTimestamp>/<scenarioId>/<variant>
 *
 * @returns the absolute path of the created directory.
 */
export function createResultDir(
  runTimestamp: string,
  scenarioId: string,
  variant: "with-skill" | "baseline",
): string {
  const root = evalsRoot();
  const target = join(root, "results", runTimestamp, scenarioId, variant);
  // recursive: true makes repeated calls for the same run a no-op.
  mkdirSync(target, { recursive: true });
  return target;
}
/**
 * Persist every artifact produced by a single eval run into resultDir:
 * the raw agent transcript, the captured test output, and a structured
 * result.json that embeds the parsed transcript summary.
 */
export function saveRunArtifacts(opts: {
  resultDir: string;
  rawTranscript: string;
  testOutput: string;
  result: EvalRunResult;
  transcriptSummary: TranscriptSummary;
}): void {
  // Small helper so each artifact is written with identical options.
  const write = (fileName: string, contents: string): void => {
    writeFileSync(join(opts.resultDir, fileName), contents, "utf-8");
  };

  // Raw NDJSON stream exactly as emitted by the agent.
  write("transcript.jsonl", opts.rawTranscript);
  // Vitest output captured for this scenario.
  write("test-output.txt", opts.testOutput);
  // Structured run result with the transcript summary attached.
  write(
    "result.json",
    JSON.stringify(
      { ...opts.result, transcript: opts.transcriptSummary },
      null,
      2,
    ),
  );
}

View File

@@ -1,10 +1,61 @@
import { execFileSync } from "node:child_process";
import { existsSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Resolve the `claude` binary path.
 *
 * Resolution order:
 * 1. The package-local node_modules/.bin/claude (installed via
 *    @anthropic-ai/claude-code in package.json).
 * 2. A global `claude` found on PATH.
 *
 * @throws Error with actionable install instructions when neither exists.
 */
export function resolveClaudeBin(): string {
  // This module lives in packages/evals/src/runner; the package's
  // node_modules/.bin directory is two levels up.
  const localBin = join(
    __dirname,
    "..",
    "..",
    "node_modules",
    ".bin",
    "claude",
  );
  if (existsSync(localBin)) {
    return localBin;
  }

  // No local install — probe PATH by asking the CLI for its version.
  try {
    execFileSync("claude", ["--version"], {
      stdio: "ignore",
      timeout: 10_000,
    });
    return "claude";
  } catch {
    const message = [
      "claude CLI not found.",
      "",
      "Install it in one of these ways:",
      " npm install (uses @anthropic-ai/claude-code from package.json)",
      " npm i -g @anthropic-ai/claude-code",
      "",
      "Ensure ANTHROPIC_API_KEY is set in the environment.",
    ].join("\n");
    throw new Error(message);
  }
}
/**
* Verify the host environment has everything needed before spending
* API credits on an eval run.
*
* Checks: Node >= 20, Docker running, claude CLI available.
* Checks: Node >= 20, Docker running, claude CLI available, API key set.
*/
export function preflight(): void {
const errors: string[] = [];
@@ -24,12 +75,16 @@ export function preflight(): void {
// Claude CLI available
try {
execFileSync("claude", ["--version"], {
stdio: "ignore",
timeout: 10_000,
});
} catch {
errors.push("claude CLI not found on PATH");
resolveClaudeBin();
} catch (err) {
errors.push((err as Error).message);
}
// API key
if (!process.env.ANTHROPIC_API_KEY) {
errors.push(
"ANTHROPIC_API_KEY is not set. Claude Code requires this for authentication.",
);
}
if (errors.length > 0) {

View File

@@ -46,7 +46,10 @@ export function listModifiedFiles(
}
/** Print a summary table of eval results. */
export function printSummary(results: EvalRunResult[]): void {
export function printSummary(
results: EvalRunResult[],
resultsDir?: string,
): void {
console.log("\n=== Eval Results ===\n");
for (const r of results) {
@@ -65,4 +68,8 @@ export function printSummary(results: EvalRunResult[]): void {
const passed = results.filter((r) => r.status === "passed").length;
console.log(`\nTotal: ${passed}/${results.length} passed`);
if (resultsDir) {
console.log(`\nResults saved to: ${resultsDir}`);
}
}

View File

@@ -78,17 +78,24 @@ export async function runTests(opts: {
function parseTestOutput(output: string): TestResult {
// Parse vitest output for pass/fail counts
// Format: "Tests N passed (M)" or "Tests N failed | M passed (T)"
const testsLine = output.match(
// Vitest formats:
// All passing: "Tests N passed (N)"
// Mixed: "Tests N failed | M passed (T)"
// All failing: "Tests N failed (N)"
const mixedOrPassing = output.match(
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
);
const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/);
let passedCount = 0;
let totalCount = 0;
if (testsLine) {
passedCount = Number.parseInt(testsLine[2], 10);
totalCount = Number.parseInt(testsLine[3], 10);
if (mixedOrPassing) {
passedCount = Number.parseInt(mixedOrPassing[2], 10);
totalCount = Number.parseInt(mixedOrPassing[3], 10);
} else if (allFailing) {
passedCount = 0;
totalCount = Number.parseInt(allFailing[2], 10);
}
const passed = totalCount > 0 && passedCount === totalCount;

View File

@@ -0,0 +1,154 @@
/**
 * One parsed event from the agent's NDJSON (stream-json) output.
 *
 * Only `type` is guaranteed; the remaining payload varies with the event
 * type (the functions below handle "system", "assistant", "user" and
 * "result" events).
 */
export interface TranscriptEvent {
  /** Event discriminator, e.g. "system", "assistant", "user", "result". */
  type: string;
  // Event-type-specific payload; fields are unknown until narrowed.
  [key: string]: unknown;
}
/** One tool invocation extracted from an assistant "tool_use" block. */
export interface ToolCallSummary {
  /** Tool name as reported by the agent ("unknown" when absent). */
  tool: string;
  /** The tool_use block id; used to match the later tool_result block. */
  toolUseId: string;
  /** Raw input object passed to the tool (empty object when absent). */
  input: Record<string, unknown>;
  /** First ~200 chars of output for quick scanning */
  outputPreview: string;
}
/** Aggregate view of one agent run, derived from its stream-json events. */
export interface TranscriptSummary {
  /** Turn count from the "result" event (0 when not reported). */
  totalTurns: number;
  /** Run duration in milliseconds from the "result" event (0 when not reported). */
  totalDurationMs: number;
  /** Total cost in USD, or null when the "result" event does not report it. */
  totalCostUsd: number | null;
  /** Model id from the "system"/"init" event, or null when not reported. */
  model: string | null;
  /** Every tool call the agent made, in order of occurrence. */
  toolCalls: ToolCallSummary[];
  /** Final text output (the "result" event's result string, if any). */
  finalOutput: string;
}
/**
 * Parse a single NDJSON line into a transcript event.
 *
 * @param line - One raw line of stream-json output.
 * @returns The parsed event, or null when the line is blank or not valid JSON.
 */
export function parseStreamJsonLine(line: string): TranscriptEvent | null {
  const candidate = line.trim();
  if (candidate.length === 0) {
    return null;
  }
  let parsed: TranscriptEvent;
  try {
    parsed = JSON.parse(candidate) as TranscriptEvent;
  } catch {
    // Malformed JSON (e.g. interleaved log noise) is silently dropped.
    return null;
  }
  return parsed;
}
/**
 * Parse raw NDJSON stdout into an array of events.
 *
 * Blank lines and lines that are not valid JSON are skipped, as are lines
 * whose parsed value is falsy (e.g. `null`, `0`, `""`).
 *
 * @param raw - The full stdout of a stream-json run.
 * @returns All successfully parsed events, in input order.
 */
export function parseStreamJsonOutput(raw: string): TranscriptEvent[] {
  return raw.split("\n").flatMap((line) => {
    const candidate = line.trim();
    if (candidate.length === 0) return [];
    let parsed: TranscriptEvent | null;
    try {
      parsed = JSON.parse(candidate) as TranscriptEvent;
    } catch {
      parsed = null;
    }
    // Keep only truthy parsed values, matching the original filter.
    return parsed ? [parsed] : [];
  });
}
/**
 * Extract the final text output from parsed events (for backward compat).
 *
 * Prefers the string `result` field of a "result" event; otherwise falls
 * back to the concatenated text blocks of the most recent assistant
 * message that has any. Returns "" when neither exists.
 */
export function extractFinalOutput(events: TranscriptEvent[]): string {
  // First choice: a "result" event carrying a string result.
  for (const event of events) {
    if (event.type !== "result") continue;
    const value = (event as Record<string, unknown>).result;
    if (typeof value === "string") return value;
  }
  // Fallback: walk assistant messages newest-first and join their text blocks.
  for (const event of [...events].reverse()) {
    if (event.type !== "assistant") continue;
    const message = (event as Record<string, unknown>).message as
      | Record<string, unknown>
      | undefined;
    const blocks = message?.content;
    if (!Array.isArray(blocks)) continue;
    const texts: string[] = [];
    for (const block of blocks) {
      if (block.type === "text" && typeof block.text === "string") {
        texts.push(block.text as string);
      }
    }
    if (texts.length > 0) return texts.join("\n");
  }
  return "";
}
/**
 * Walk parsed events to build a transcript summary.
 *
 * - "system"/"init" supplies the model id.
 * - "assistant" messages contribute tool_use blocks (one ToolCallSummary each).
 * - "user" messages contribute tool_result blocks, matched back to their
 *   tool call by `tool_use_id`; the first ~200 chars become the preview.
 * - "result" supplies final output, duration, cost, and turn count.
 */
export function buildTranscriptSummary(
  events: TranscriptEvent[],
): TranscriptSummary {
  const calls: ToolCallSummary[] = [];
  let summaryText = "";
  let durationMs = 0;
  let costUsd: number | null = null;
  let modelName: string | null = null;
  let turns = 0;

  for (const raw of events) {
    const record = raw as Record<string, unknown>;
    switch (record.type) {
      case "system": {
        // Only the init event carries the model id.
        if (record.subtype === "init") {
          modelName = typeof record.model === "string" ? record.model : null;
        }
        break;
      }
      case "assistant": {
        const content = (record.message as Record<string, unknown> | undefined)
          ?.content;
        if (!Array.isArray(content)) break;
        for (const block of content) {
          if (block.type !== "tool_use") continue;
          calls.push({
            tool: block.name ?? "unknown",
            toolUseId: block.id ?? "",
            input: block.input ?? {},
            outputPreview: "",
          });
        }
        break;
      }
      case "user": {
        const content = (record.message as Record<string, unknown> | undefined)
          ?.content;
        if (!Array.isArray(content)) break;
        for (const block of content) {
          if (block.type !== "tool_result") continue;
          const call = calls.find((c) => c.toolUseId === block.tool_use_id);
          if (!call) continue;
          const text =
            typeof block.content === "string"
              ? block.content
              : JSON.stringify(block.content);
          call.outputPreview = text.slice(0, 200);
        }
        break;
      }
      case "result": {
        summaryText = typeof record.result === "string" ? record.result : "";
        durationMs =
          typeof record.duration_ms === "number" ? record.duration_ms : 0;
        costUsd =
          typeof record.total_cost_usd === "number"
            ? record.total_cost_usd
            : null;
        turns = typeof record.num_turns === "number" ? record.num_turns : 0;
        break;
      }
      default:
        break;
    }
  }

  return {
    totalTurns: turns,
    totalDurationMs: durationMs,
    totalCostUsd: costUsd,
    model: modelName,
    toolCalls: calls,
    finalOutput: summaryText,
  };
}

View File

@@ -32,4 +32,10 @@ export interface EvalRunResult {
/** Files the agent created or modified in the workspace */
filesModified: string[];
error?: string;
/** Path to the persisted results directory for this run */
resultsDir?: string;
/** Number of tool calls the agent made */
toolCallCount?: number;
/** Total cost in USD (from stream-json result event) */
costUsd?: number;
}