From e03bc99ebba0c459dc086c6f40a6271a77972db5 Mon Sep 17 00:00:00 2001 From: Pedro Rodrigues Date: Fri, 20 Feb 2026 15:02:59 +0000 Subject: [PATCH] more two scenarios and claude code cli is now a dependency --- .gitignore | 3 + packages/evals/AGENTS.md | 6 + .../evals/storage-rls-user-folders/EVAL.ts | 252 ++++++++++++++ .../evals/storage-rls-user-folders/PROMPT.md | 12 + .../storage-rls-user-folders/package.json | 5 + .../supabase/config.toml | 64 ++++ .../supabase/migrations/.gitkeep | 0 .../evals/team-rls-security-definer/EVAL.ts | 201 +++++++++++ .../evals/team-rls-security-definer/PROMPT.md | 14 + .../team-rls-security-definer/package.json | 5 + .../supabase/config.toml | 111 ++++++ packages/evals/package-lock.json | 328 ++++++++++++++++++ packages/evals/package.json | 1 + packages/evals/scenarios/SCENARIOS.md | 118 +++++++ .../scenarios/storage-rls-user-folders.md | 144 ++++++++ .../scenarios/team-rls-security-definer.md | 139 ++++++++ packages/evals/src/runner.ts | 37 +- packages/evals/src/runner/agent.ts | 36 +- packages/evals/src/runner/persist.ts | 56 +++ packages/evals/src/runner/preflight.ts | 69 +++- packages/evals/src/runner/results.ts | 9 +- packages/evals/src/runner/test.ts | 17 +- packages/evals/src/runner/transcript.ts | 154 ++++++++ packages/evals/src/types.ts | 6 + 24 files changed, 1766 insertions(+), 21 deletions(-) create mode 100644 packages/evals/evals/storage-rls-user-folders/EVAL.ts create mode 100644 packages/evals/evals/storage-rls-user-folders/PROMPT.md create mode 100644 packages/evals/evals/storage-rls-user-folders/package.json create mode 100644 packages/evals/evals/storage-rls-user-folders/supabase/config.toml create mode 100644 packages/evals/evals/storage-rls-user-folders/supabase/migrations/.gitkeep create mode 100644 packages/evals/evals/team-rls-security-definer/EVAL.ts create mode 100644 packages/evals/evals/team-rls-security-definer/PROMPT.md create mode 100644 packages/evals/evals/team-rls-security-definer/package.json 
create mode 100644 packages/evals/evals/team-rls-security-definer/supabase/config.toml create mode 100644 packages/evals/scenarios/storage-rls-user-folders.md create mode 100644 packages/evals/scenarios/team-rls-security-definer.md create mode 100644 packages/evals/src/runner/persist.ts create mode 100644 packages/evals/src/runner/transcript.ts diff --git a/.gitignore b/.gitignore index 63d70a0..a9f4037 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ dist/ # Generated skills in any dot directory .*/skills/ .claude/ + +# Eval results (local debugging artifacts) +packages/evals/results/ diff --git a/packages/evals/AGENTS.md b/packages/evals/AGENTS.md index 4fc2025..aeebdd7 100644 --- a/packages/evals/AGENTS.md +++ b/packages/evals/AGENTS.md @@ -20,6 +20,12 @@ hidden tests check the result. Binary pass/fail. The agent is **Claude Code** invoked via `claude -p` (print mode). It operates on a real filesystem in a temp directory and can read/write files freely. +**Important**: MCP servers are disabled via `--strict-mcp-config` with an empty +config. This ensures the agent uses only local tools (Bash, Edit, Write, Read, +Glob, Grep) and cannot access remote services like Supabase MCP or Neon. All +work must happen on the local filesystem — e.g., creating migration files in +`supabase/migrations/`, not applying them to a remote project. 
+ ## Eval Structure Each eval lives in `evals/{scenario-name}/`: diff --git a/packages/evals/evals/storage-rls-user-folders/EVAL.ts b/packages/evals/evals/storage-rls-user-folders/EVAL.ts new file mode 100644 index 0000000..d4697f2 --- /dev/null +++ b/packages/evals/evals/storage-rls-user-folders/EVAL.ts @@ -0,0 +1,252 @@ +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import { expect, test } from "vitest"; + +const supabaseDir = join(process.cwd(), "supabase"); +const migrationsDir = join(supabaseDir, "migrations"); + +/** Find all .sql migration files (agent may create one or more). */ +function findMigrationFiles(): string[] { + if (!existsSync(migrationsDir)) return []; + return readdirSync(migrationsDir) + .filter((f) => f.endsWith(".sql")) + .map((f) => join(migrationsDir, f)); +} + +/** Read and concatenate all migration SQL files. */ +function getMigrationSQL(): string { + const files = findMigrationFiles(); + if (files.length === 0) + throw new Error("No migration file found in supabase/migrations/"); + return files.map((f) => readFileSync(f, "utf-8")).join("\n"); +} + +test("migration file exists", () => { + expect(findMigrationFiles().length).toBeGreaterThan(0); +}); + +test("creates avatars bucket", () => { + const sql = getMigrationSQL().toLowerCase(); + // Should insert into storage.buckets with id 'avatars' and public = true + expect(sql).toMatch(/storage\.buckets/); + expect(sql).toMatch(/avatars/); + expect(sql).toMatch(/public/); + // Verify it's marked as a public bucket (true) + const avatarsBlock = sql.match( + /insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/, + ); + expect(avatarsBlock).not.toBeNull(); + if (avatarsBlock) { + expect(avatarsBlock[0]).toMatch(/true/); + } +}); + +test("creates documents bucket", () => { + const sql = getMigrationSQL().toLowerCase(); + // Should insert into storage.buckets with id 'documents' and public = false + expect(sql).toMatch(/documents/); + 
const documentsBlock = sql.match( + /insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/, + ); + expect(documentsBlock).not.toBeNull(); + if (documentsBlock) { + expect(documentsBlock[0]).toMatch(/false/); + } +}); + +test("avatars bucket has mime type restriction", () => { + const sql = getMigrationSQL().toLowerCase(); + // Should have allowed_mime_types with image types + expect(sql).toMatch(/allowed_mime_types/); + // Check for image MIME types (jpeg, png, webp) + expect(sql).toMatch(/image\/jpeg/); + expect(sql).toMatch(/image\/png/); + expect(sql).toMatch(/image\/webp/); +}); + +test("avatars bucket has file size limit", () => { + const sql = getMigrationSQL().toLowerCase(); + // Should have file_size_limit set to approximately 2MB (2097152 bytes or 2MB string) + expect(sql).toMatch(/file_size_limit/); + // Accept either numeric bytes (2097152) or string form (2MB, 2MiB, 2 * 1024 * 1024) + const hasNumericLimit = /2097152/.test(sql); + const hasStringLimit = /2\s*m/i.test(sql); + const hasCalcLimit = /2\s*\*\s*1024\s*\*\s*1024/.test(sql); + expect(hasNumericLimit || hasStringLimit || hasCalcLimit).toBe(true); +}); + +test("storage policy uses foldername or path for user isolation", () => { + const sql = getMigrationSQL().toLowerCase(); + // Should use storage.foldername(name) with auth.uid()::text for folder isolation + const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql); + // Also accept direct path matching patterns like (name ~ '^user-id/') + const usesPathMatch = + /\(\s*storage\.foldername\s*\(/.test(sql) || + /\bname\b.*auth\.uid\(\)/.test(sql); + expect(usesFoldername || usesPathMatch).toBe(true); + // Should cast auth.uid() to text for comparison with folder name + expect(sql).toMatch(/auth\.uid\(\)\s*::\s*text/); +}); + +test("storage policy uses TO authenticated", () => { + const sql = getMigrationSQL().toLowerCase(); + // Storage upload/delete/update policies should use TO authenticated + const policyBlocks = 
sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const storagePolicies = policyBlocks.filter((p) => + p.toLowerCase().includes("storage.objects"), + ); + // At least one storage policy should have TO authenticated + const hasAuthenticatedPolicy = storagePolicies.some((p) => + /to\s+(authenticated|public)/.test(p.toLowerCase()), + ); + expect(hasAuthenticatedPolicy).toBe(true); + // Specifically, upload/insert policies should be TO authenticated (not public) + const insertPolicies = storagePolicies.filter((p) => + /for\s+insert/.test(p.toLowerCase()), + ); + for (const policy of insertPolicies) { + expect(policy.toLowerCase()).toMatch(/to\s+authenticated/); + } +}); + +test("public read policy for avatars", () => { + const sql = getMigrationSQL().toLowerCase(); + // A SELECT policy on storage.objects for avatars bucket should allow public/anon access + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const avatarSelectPolicies = policyBlocks.filter( + (p) => + p.toLowerCase().includes("storage.objects") && + /for\s+select/.test(p.toLowerCase()) && + p.toLowerCase().includes("avatars"), + ); + expect(avatarSelectPolicies.length).toBeGreaterThan(0); + // Should use TO public (or TO anon) for public read access + const hasPublicAccess = avatarSelectPolicies.some( + (p) => + /to\s+public/.test(p.toLowerCase()) || /to\s+anon/.test(p.toLowerCase()), + ); + expect(hasPublicAccess).toBe(true); +}); + +test("documents bucket is fully private", () => { + const sql = getMigrationSQL().toLowerCase(); + // All policies for documents bucket should restrict to authenticated owner + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const documentPolicies = policyBlocks.filter( + (p) => + p.toLowerCase().includes("storage.objects") && + p.toLowerCase().includes("documents"), + ); + expect(documentPolicies.length).toBeGreaterThan(0); + // None should allow public/anon access + for (const policy of documentPolicies) { + expect(policy).not.toMatch(/to\s+public/); + expect(policy).not.toMatch(/to\s+anon/); + } + // All should be scoped to authenticated + for (const policy of documentPolicies) { + expect(policy).toMatch(/to\s+authenticated/); + } +}); + +test("creates file_metadata table", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/create\s+table/); + expect(sql).toMatch(/file_metadata/); +}); + +test("file_metadata has FK to auth.users with CASCADE", () => { + const sql = getMigrationSQL().toLowerCase(); + // Find the file_metadata CREATE TABLE block or the surrounding context + expect(sql).toMatch(/references\s+auth\.users/); + expect(sql).toMatch(/on\s+delete\s+cascade/); +}); + +test("RLS enabled on file_metadata", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch( + /alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/, + ); +}); + +test("file_metadata policies use (select auth.uid())", () => { + const sql = getMigrationSQL(); + // Find policies that reference file_metadata + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const metadataPolicies = policyBlocks.filter((p) => + p.toLowerCase().includes("file_metadata"), + ); + // Each policy that uses auth.uid() should use the subselect form + for (const policy of metadataPolicies) { + if (policy.includes("auth.uid()")) { + expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i); + } + } +}); + +test("uses timestamptz for time columns", () => { + const sql = getMigrationSQL().toLowerCase(); + // Match "timestamp" that is NOT followed by "tz" or "with time zone" + const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; + // Only check if the migration defines time-related columns + if ( + sql.includes("created_at") || + sql.includes("updated_at") || + sql.includes("uploaded_at") + ) { + expect(sql).not.toMatch(hasPlainTimestamp); + } +}); + +test("index on file_metadata user_id", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/create\s+index/); + // Should index user_id on file_metadata + expect(sql).toMatch(/file_metadata/); + expect(sql).toMatch(/user_id/); +}); + +test("idempotent DDL", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/if\s+not\s+exists/); +}); + +test("overall quality score", () => { + const sql = getMigrationSQL().toLowerCase(); + // A high-quality migration should contain most of these best-practice signals + const signals = [ + // 1. Avatars bucket is public + /insert\s+into\s+storage\.buckets[\s\S]*?avatars/, + // 2. Documents bucket exists + /insert\s+into\s+storage\.buckets[\s\S]*?documents/, + // 3. MIME type restriction + /allowed_mime_types/, + // 4. File size limit + /file_size_limit/, + // 5. Storage foldername helper + /storage\.foldername/, + // 6. auth.uid()::text cast + /auth\.uid\(\)\s*::\s*text/, + // 7. TO authenticated on policies + /to\s+authenticated/, + // 8. Public read for avatars + /to\s+(public|anon)/, + // 9. RLS on file_metadata + /enable\s+row\s+level\s+security/, + // 10. 
FK to auth.users with cascade + /on\s+delete\s+cascade/, + // 11. (select auth.uid()) subselect form + /\(select\s+auth\.uid\(\)\)/, + // 12. Index on user_id + /create\s+index/, + // 13. timestamptz usage + /timestamptz/, + // 14. IF NOT EXISTS for idempotency + /if\s+not\s+exists/, + // 15. file_metadata table + /create\s+table[\s\S]*?file_metadata/, + ]; + const matches = signals.filter((r) => r.test(sql)); + // Require at least 11 of 15 best-practice signals + expect(matches.length).toBeGreaterThanOrEqual(11); +}); diff --git a/packages/evals/evals/storage-rls-user-folders/PROMPT.md b/packages/evals/evals/storage-rls-user-folders/PROMPT.md new file mode 100644 index 0000000..b04d041 --- /dev/null +++ b/packages/evals/evals/storage-rls-user-folders/PROMPT.md @@ -0,0 +1,12 @@ +I need to set up file storage for my app. There are two use cases: + +1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but only the owning user can upload or replace their own. Only allow image files (JPEG, PNG, WebP). Max 2MB. + +2. **Documents** -- Users upload private documents that only they can access. Max 50MB. No file type restriction. + +The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration that: +- Configures both storage buckets +- Adds RLS policies on `storage.objects` so each user can only access their own folder (folder name = user ID) +- Creates a `file_metadata` table to track uploaded files (file name, bucket, size, user reference) with appropriate security + +Users are authenticated via Supabase Auth. 
diff --git a/packages/evals/evals/storage-rls-user-folders/package.json b/packages/evals/evals/storage-rls-user-folders/package.json new file mode 100644 index 0000000..1e523fb --- /dev/null +++ b/packages/evals/evals/storage-rls-user-folders/package.json @@ -0,0 +1,5 @@ +{ + "name": "storage-rls-user-folders", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/storage-rls-user-folders/supabase/config.toml b/packages/evals/evals/storage-rls-user-folders/supabase/config.toml new file mode 100644 index 0000000..1eea0ee --- /dev/null +++ b/packages/evals/evals/storage-rls-user-folders/supabase/config.toml @@ -0,0 +1,64 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "storage-rls-user-folders" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. 
+port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. +enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, users need to confirm their email address before signing in. 
+enable_confirmations = false diff --git a/packages/evals/evals/storage-rls-user-folders/supabase/migrations/.gitkeep b/packages/evals/evals/storage-rls-user-folders/supabase/migrations/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/packages/evals/evals/team-rls-security-definer/EVAL.ts b/packages/evals/evals/team-rls-security-definer/EVAL.ts new file mode 100644 index 0000000..4729c3e --- /dev/null +++ b/packages/evals/evals/team-rls-security-definer/EVAL.ts @@ -0,0 +1,201 @@ +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import { expect, test } from "vitest"; + +const supabaseDir = join(process.cwd(), "supabase"); +const migrationsDir = join(supabaseDir, "migrations"); + +/** Find all .sql migration files (agent may create one or multiple). */ +function findMigrationFiles(): string[] { + if (!existsSync(migrationsDir)) return []; + return readdirSync(migrationsDir) + .filter((f) => f.endsWith(".sql")) + .map((f) => join(migrationsDir, f)); +} + +/** Concatenate all migration SQL into a single string for assertions. 
*/ +function getMigrationSQL(): string { + const files = findMigrationFiles(); + if (files.length === 0) + throw new Error("No migration file found in supabase/migrations/"); + return files.map((f) => readFileSync(f, "utf-8")).join("\n"); +} + +test("migration file exists", () => { + expect(findMigrationFiles().length).toBeGreaterThan(0); +}); + +test("creates organizations table", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/create\s+table[\s\S]*?organizations/); +}); + +test("creates memberships table", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/create\s+table[\s\S]*?memberships/); +}); + +test("creates projects table", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/create\s+table[\s\S]*?projects/); +}); + +test("enables RLS on all tables", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch( + /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/, + ); + expect(sql).toMatch( + /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/, + ); + expect(sql).toMatch( + /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/, + ); +}); + +test("FK to auth.users with ON DELETE CASCADE", () => { + const sql = getMigrationSQL().toLowerCase(); + // memberships should reference auth.users with cascade delete + expect(sql).toMatch(/references\s+auth\.users/); + expect(sql).toMatch(/on\s+delete\s+cascade/); +}); + +test("org_id FK on projects", () => { + const sql = getMigrationSQL().toLowerCase(); + // projects should have a foreign key referencing organizations + expect(sql).toMatch( + /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/, + ); +}); + +test("private schema created", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/create\s+schema[\s\S]*?private/); +}); + +test("security_definer helper function", () => { + const sql = 
getMigrationSQL().toLowerCase(); + // Function should be in the private schema with SECURITY DEFINER and search_path = '' + expect(sql).toMatch(/private\./); + expect(sql).toMatch(/security\s+definer/); + expect(sql).toMatch(/set\s+search_path\s*=\s*''/); +}); + +test("policies use (select auth.uid())", () => { + const sql = getMigrationSQL(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + expect(policyBlocks.length).toBeGreaterThan(0); + for (const policy of policyBlocks) { + if (policy.includes("auth.uid()")) { + // The subselect form: (select auth.uid()) + expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i); + } + } +}); + +test("policies use TO authenticated", () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + expect(policyBlocks.length).toBeGreaterThan(0); + for (const policy of policyBlocks) { + expect(policy).toMatch(/to\s+authenticated/); + } +}); + +test("index on membership lookup columns", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/create\s+index/); + // Should index user_id and/or org_id on memberships for policy lookups + const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? 
[]; + const indexesUserOrOrg = indexBlocks.filter( + (idx) => + idx.includes("user_id") || + idx.includes("org_id") || + idx.includes("organization_id"), + ); + expect(indexesUserOrOrg.length).toBeGreaterThanOrEqual(1); +}); + +test("uses timestamptz", () => { + const sql = getMigrationSQL().toLowerCase(); + // Match "timestamp" that is NOT followed by "tz" or "with time zone" + const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; + // Only fail if the migration defines time columns with plain timestamp + if ( + sql.includes("created_at") || + sql.includes("updated_at") || + sql.includes("_at ") + ) { + expect(sql).not.toMatch(hasPlainTimestamp); + } +}); + +test("idempotent DDL", () => { + const sql = getMigrationSQL().toLowerCase(); + expect(sql).toMatch(/if\s+not\s+exists/); +}); + +test("delete policy restricted to owner role", () => { + const sql = getMigrationSQL().toLowerCase(); + // Look for a delete policy on projects that checks for owner (or admin) role + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const deletePolicy = policyBlocks.find( + (p) => + p.toLowerCase().includes("delete") && p.toLowerCase().includes("project"), + ); + expect(deletePolicy).toBeDefined(); + // The delete policy should check for an owner/admin role + expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/); +}); + +test("overall quality score", () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + // A high-quality migration should contain most of these best-practice signals + const signals = [ + // 1. RLS enabled on all three tables + /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) && + /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) && + /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ), + // 2. 
FK to auth.users with cascade + /references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql), + // 3. Private schema created + /create\s+schema[\s\S]*?private/.test(sql), + // 4. security_definer with search_path + /security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql), + // 5. Subselect auth.uid() + /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), + // 6. TO authenticated on policies + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)), + // 7. Indexes on lookup columns + /create\s+index/.test(sql), + // 8. timestamptz (no plain timestamp) + !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql), + // 9. Idempotent DDL + /if\s+not\s+exists/.test(sql), + // 10. Delete policy checks owner role + policyBlocks.some( + (p) => + p.toLowerCase().includes("delete") && + p.toLowerCase().includes("project") && + /owner|admin/.test(p.toLowerCase()), + ), + // 11. org_id FK on projects + /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql), + // 12. Multiple policies (at least one per table) + policyBlocks.length >= 3, + // 13. Membership role column exists + /role/.test(sql), + // 14. Private schema function referenced in policies + /private\./.test(sql), + ]; + const passed = signals.filter(Boolean).length; + expect(passed).toBeGreaterThanOrEqual(10); +}); diff --git a/packages/evals/evals/team-rls-security-definer/PROMPT.md b/packages/evals/evals/team-rls-security-definer/PROMPT.md new file mode 100644 index 0000000..870e6e4 --- /dev/null +++ b/packages/evals/evals/team-rls-security-definer/PROMPT.md @@ -0,0 +1,14 @@ +I'm building a project management app where users can belong to multiple organizations. Each organization has projects that all members can view and edit. + +The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration with: + +1. An `organizations` table (name, slug) +2. 
A `memberships` table linking users to organizations with a role column (owner, admin, member) +3. A `projects` table (name, description, status) belonging to an organization + +Set up Row Level Security so: +- Users can only see organizations they belong to +- Users can only see and manage projects in their organizations +- Only org owners can delete projects + +The migration should handle the case where a user is deleted from auth. diff --git a/packages/evals/evals/team-rls-security-definer/package.json b/packages/evals/evals/team-rls-security-definer/package.json new file mode 100644 index 0000000..79b0d39 --- /dev/null +++ b/packages/evals/evals/team-rls-security-definer/package.json @@ -0,0 +1,5 @@ +{ + "name": "team-rls-security-definer", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/team-rls-security-definer/supabase/config.toml b/packages/evals/evals/team-rls-security-definer/supabase/config.toml new file mode 100644 index 0000000..38710cf --- /dev/null +++ b/packages/evals/evals/team-rls-security-definer/supabase/config.toml @@ -0,0 +1,111 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "team-rls-security-definer" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. 
+max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. +port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[db.migrations] +# If disabled, migrations will be skipped during a db push or reset. +enabled = true +schema_paths = [] + +[db.seed] +# If enabled, seeds the database after migrations during a db reset. +enabled = true +# Specifies an ordered list of seed files to load during db reset. +sql_paths = ["./seed.sql"] + +[realtime] +enabled = true + +[studio] +enabled = true +# Port to use for Supabase Studio. +port = 54323 +# External URL of the API server that frontend connects to. +api_url = "http://127.0.0.1" + +[inbucket] +enabled = true +# Port to use for the email testing server web interface. +port = 54324 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# If disabled, the refresh token will never expire. 
+enable_refresh_token_rotation = true +# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds. +# Requires enable_refresh_token_rotation = true. +refresh_token_reuse_interval = 10 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. +enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, a user will be required to confirm any email change on both the old, and new email +# addresses. If disabled, only the new email is required to confirm. +double_confirm_changes = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false + +[edge_runtime] +enabled = true +# Configure one of the supported request policies: `oneshot`, `per_worker`. +policy = "per_worker" +# Port to attach the Chrome inspector for debugging edge functions. +inspector_port = 8083 + +[analytics] +enabled = true +port = 54327 +# Configure one of the supported backends: `postgres`, `bigquery`. 
+backend = "postgres" diff --git a/packages/evals/package-lock.json b/packages/evals/package-lock.json index 5dcf1f8..fd8428c 100644 --- a/packages/evals/package-lock.json +++ b/packages/evals/package-lock.json @@ -9,6 +9,7 @@ "version": "1.0.0", "license": "MIT", "dependencies": { + "@anthropic-ai/claude-code": "^2.1.49", "braintrust": "^3.0.0" }, "devDependencies": { @@ -18,6 +19,29 @@ "vitest": "^3.1.0" } }, + "node_modules/@anthropic-ai/claude-code": { + "version": "2.1.49", + "resolved": "https://registry.npmjs.org/@anthropic-ai/claude-code/-/claude-code-2.1.49.tgz", + "integrity": "sha512-PonEmTZlB5IZbBu9TmtOpGZnupU7OxOXTsJKcXE/4Ak5qp3ptN1wSBRdgKYnn6GDYhXijTXuVVwrCQU+NAgwPA==", + "license": "SEE LICENSE IN README.md", + "bin": { + "claude": "cli.js" + }, + "engines": { + "node": ">=18.0.0" + }, + "optionalDependencies": { + "@img/sharp-darwin-arm64": "^0.34.2", + "@img/sharp-darwin-x64": "^0.34.2", + "@img/sharp-linux-arm": "^0.34.2", + "@img/sharp-linux-arm64": "^0.34.2", + "@img/sharp-linux-x64": "^0.34.2", + "@img/sharp-linuxmusl-arm64": "^0.34.2", + "@img/sharp-linuxmusl-x64": "^0.34.2", + "@img/sharp-win32-arm64": "^0.34.2", + "@img/sharp-win32-x64": "^0.34.2" + } + }, "node_modules/@colors/colors": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz", @@ -444,6 +468,310 @@ "node": ">=18" } }, + "node_modules/@img/sharp-darwin-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz", + "integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-arm64": "1.2.4" + } + }, + 
"node_modules/@img/sharp-darwin-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz", + "integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-libvips-darwin-arm64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz", + "integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==", + "cpu": [ + "arm64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "darwin" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-darwin-x64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz", + "integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==", + "cpu": [ + "x64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "darwin" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-arm": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz", + "integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==", + "cpu": [ + "arm" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + 
"node_modules/@img/sharp-libvips-linux-arm64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz", + "integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==", + "cpu": [ + "arm64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linux-x64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz", + "integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==", + "cpu": [ + "x64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linuxmusl-arm64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz", + "integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==", + "cpu": [ + "arm64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-libvips-linuxmusl-x64": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz", + "integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==", + "cpu": [ + "x64" + ], + "license": "LGPL-3.0-or-later", + "optional": true, + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-linux-arm": { + "version": "0.34.5", + "resolved": 
"https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz", + "integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==", + "cpu": [ + "arm" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-arm": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz", + "integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-linux-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz", + "integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-linuxmusl-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz", + "integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==", + "cpu": [ + "arm64" + ], + "license": 
"Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" + } + }, + "node_modules/@img/sharp-linuxmusl-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz", + "integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-x64": "1.2.4" + } + }, + "node_modules/@img/sharp-win32-arm64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz", + "integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, + "node_modules/@img/sharp-win32-x64": { + "version": "0.34.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz", + "integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, "node_modules/@jridgewell/sourcemap-codec": { "version": "1.5.5", 
"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", diff --git a/packages/evals/package.json b/packages/evals/package.json index 253caa1..9909434 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -10,6 +10,7 @@ "eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts" }, "dependencies": { + "@anthropic-ai/claude-code": "^2.1.49", "braintrust": "^3.0.0" }, "devDependencies": { diff --git a/packages/evals/scenarios/SCENARIOS.md b/packages/evals/scenarios/SCENARIOS.md index 8d51641..c7185b3 100644 --- a/packages/evals/scenarios/SCENARIOS.md +++ b/packages/evals/scenarios/SCENARIOS.md @@ -49,3 +49,121 @@ The agent initializes a Supabase project and creates a migration file that: | index on user_id | `CREATE INDEX` on the FK column | | IF NOT EXISTS | Idempotent migration | | overall quality | At least 4/5 best-practice signals present | + +## Scenario 2: team-rls-security-definer + +**Description:** Create a SQL migration for a team-based project management app +where users belong to organizations via a membership table. The migration must +define tables for organizations, memberships, and projects, then secure them +with RLS policies that use a `security definer` helper function in a private +schema to efficiently resolve team membership without per-row joins. + +**Setup:** The workspace starts with a pre-initialized Supabase project +(`supabase/config.toml` exists, empty `supabase/migrations/` directory). The +agent creates migration files within this structure. 
+ +**Expected skill files read:** + +- `SKILL.md` (skill body with reference file index) +- `references/db-rls-mandatory.md` +- `references/db-rls-policy-types.md` +- `references/db-rls-common-mistakes.md` +- `references/db-rls-performance.md` +- `references/db-security-functions.md` +- `references/db-schema-auth-fk.md` +- `references/db-schema-timestamps.md` +- `references/db-perf-indexes.md` +- `references/db-migrations-idempotent.md` + +**Expected result:** + +The agent creates a migration file that: + +- Creates organizations, memberships, and projects tables with `timestamptz` columns +- Has `user_id` FK to `auth.users(id)` with `ON DELETE CASCADE` on memberships +- Has `org_id` FK on projects referencing organizations +- Enables RLS on all three tables +- Creates a private schema with a `security definer` helper function (`SET search_path = ''`) +- Creates RLS policies using `(select auth.uid())` with `TO authenticated` +- Creates indexes on membership lookup columns (user_id, org_id) +- Has a delete policy on projects restricted to owner role +- Uses `IF NOT EXISTS` for idempotency + +**Scorer:** Binary pass/fail (16 vitest assertions) + +| Test | What it checks | +| --- | --- | +| migration file exists | A `.sql` file exists in `supabase/migrations/` | +| creates organizations table | SQL contains `CREATE TABLE` for organizations | +| creates memberships table | SQL contains `CREATE TABLE` for memberships | +| creates projects table | SQL contains `CREATE TABLE` for projects | +| enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables | +| FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade | +| org_id FK on projects | projects references organizations | +| private schema created | `CREATE SCHEMA ... 
private` present | +| security_definer helper function | Function in private schema with `SECURITY DEFINER` and `SET search_path = ''` | +| policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() | +| policies use TO authenticated | All policies scoped to authenticated role | +| index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | +| uses timestamptz | No plain `timestamp` for time columns | +| idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | +| delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | +| overall quality score | At least 10/14 best-practice signals present | + +## Scenario 3: storage-rls-user-folders + +**Description:** Create a SQL migration that sets up Supabase Storage buckets +with RLS policies for user-content. An avatars bucket (public reads, +authenticated uploads restricted to user folders) and a documents bucket (fully +private, user-isolated), with file type restrictions, storage helper functions +in policies, and a file_metadata tracking table secured with RLS. + +**Setup:** Pre-initialized Supabase project (`supabase/config.toml` exists) +with an empty `supabase/migrations/` directory. The agent creates migration +files within this structure. 
+ +**Expected skill files read:** + +- `SKILL.md` (skill body with reference file index) +- `references/storage-access-control.md` +- `references/db-rls-mandatory.md` +- `references/db-rls-common-mistakes.md` +- `references/db-rls-performance.md` +- `references/db-schema-auth-fk.md` +- `references/db-schema-timestamps.md` +- `references/db-perf-indexes.md` +- `references/db-migrations-idempotent.md` + +**Expected result:** + +The agent creates a migration file that: + +- Inserts avatars bucket into `storage.buckets` with `public = true`, MIME type restrictions, and file size limit +- Inserts documents bucket with `public = false` +- Creates RLS policies on `storage.objects` using `storage.foldername(name)` with `auth.uid()::text` +- Scopes upload policies `TO authenticated` and avatars SELECT policy `TO public` +- Creates `file_metadata` table with FK to `auth.users` with `ON DELETE CASCADE` +- Enables RLS on `file_metadata` with policies using `(select auth.uid())` +- Uses `timestamptz` for time columns, indexes `user_id`, and `IF NOT EXISTS` for idempotency + +**Scorer:** Binary pass/fail (17 vitest assertions) + +| Test | What it checks | +| --- | --- | +| migration file exists | A `.sql` file exists in `supabase/migrations/` | +| creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` | +| creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` | +| avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) | +| avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) | +| storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` | +| storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` | +| public read policy for avatars | A SELECT policy on storage.objects for avatars allows 
public/anon access | +| documents bucket is fully private | Policies for documents restrict all operations to authenticated owner | +| creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata | +| file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` | +| RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` | +| file_metadata policies use (select auth.uid()) | Subselect form in policies | +| uses timestamptz for time columns | No plain `timestamp` in file_metadata | +| index on file_metadata user_id | `CREATE INDEX` on user_id column | +| idempotent DDL | Uses `IF NOT EXISTS` patterns | +| overall quality score | At least 11/15 best-practice signals present | diff --git a/packages/evals/scenarios/storage-rls-user-folders.md b/packages/evals/scenarios/storage-rls-user-folders.md new file mode 100644 index 0000000..ae953fa --- /dev/null +++ b/packages/evals/scenarios/storage-rls-user-folders.md @@ -0,0 +1,144 @@ +# Scenario: storage-rls-user-folders + +## Summary + +The agent must create a SQL migration that sets up Supabase Storage buckets +with RLS policies for a user-content application. The migration must configure +an avatars bucket (public reads, authenticated uploads restricted to user +folders) and a documents bucket (fully private, user-isolated), with proper +file type restrictions, storage helper functions in policies, and a +file_metadata tracking table secured with RLS. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **Storage RLS is confusing and under-documented compared to table RLS** -- + Developers consistently struggle with the distinction between public/private + buckets and the RLS policies needed on `storage.objects`. Multiple GitHub + discussions show confusion about which SDK operations map to which SQL + operations (INSERT, SELECT, UPDATE, DELETE). 
+ - Source: https://github.com/orgs/supabase/discussions/37611 + - Source: https://github.com/orgs/supabase/discussions/38700 + +2. **User-folder isolation is the canonical storage security pattern** -- The + official Supabase docs demonstrate folder-based isolation using + `storage.foldername(name)` and `auth.uid()::text`, but developers frequently + get the casting or array indexing wrong. + - Source: https://supabase.com/docs/guides/storage/security/access-control + +3. **Missing file type restrictions leads to security vulnerabilities** -- + Without `allowed_mime_types` on the bucket or extension checks in RLS + policies, users can upload executable files or oversized payloads. The + Supabase security best practices guide calls this out as a common oversight. + - Source: https://supaexplorer.com/guides/supabase-security-best-practices + - Source: https://supabase.com/docs/guides/storage/buckets/fundamentals + +## Skill References Exercised + +Which reference files the agent should consult and what each teaches: + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/storage-access-control.md` | Bucket visibility, RLS on storage.objects, storage helper functions, SDK-to-SQL operation mapping | User-folder policies using `storage.foldername()`, separate SELECT/INSERT policies | +| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on the file_metadata tracking table | +| `references/db-rls-common-mistakes.md` | Missing TO clause, missing SELECT policy for UPDATE | Use `TO authenticated` (or `TO public` for public reads), include SELECT policy | +| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT subquery | Use `(select auth.uid())` in both storage and table policies | +| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | file_metadata.user_id references auth.users with cascade | +| `references/db-schema-timestamps.md` | Use timestamptz not 
timestamp | Time columns on file_metadata use timestamptz | +| `references/db-perf-indexes.md` | Index columns used in policy lookups | Index user_id on file_metadata | +| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout | + +## Workspace Setup + +What the workspace starts with before the agent runs: + +- Pre-initialized Supabase project (`supabase/config.toml` exists) +- Empty `supabase/migrations/` directory +- The agent creates migration files within this structure + +## Agent Task (PROMPT.md draft) + +The prompt to give the agent. Written as a developer would ask it: + +> I need to set up file storage for my app. There are two use cases: +> +> 1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but +> only the owning user can upload or replace their own. Only allow image +> files (JPEG, PNG, WebP). Max 2MB. +> +> 2. **Documents** -- Users upload private documents that only they can access. +> Max 50MB. No file type restriction. +> +> Create a SQL migration that: +> - Configures both storage buckets +> - Adds RLS policies on `storage.objects` so each user can only access their +> own folder (folder name = user ID) +> - Creates a `file_metadata` table to track uploaded files (file name, bucket, +> size, user reference) with appropriate security +> +> Users are authenticated via Supabase Auth. + +## Evaluation Criteria + +What vitest should assert on the agent's output. 
Each assertion tests a +specific quality signal: + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure | +| 2 | creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` | correctness | +| 3 | creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` | correctness | +| 4 | avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) | security | +| 5 | avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) | security | +| 6 | storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` | security | +| 7 | storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` | security | +| 8 | public read policy for avatars | A SELECT policy on storage.objects for avatars bucket allows public/anon access | correctness | +| 9 | documents bucket is fully private | Policies for documents bucket restrict all operations to authenticated owner | security | +| 10 | creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata | correctness | +| 11 | file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` | correctness | +| 12 | RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` | security | +| 13 | file_metadata policies use (select auth.uid()) | Subselect form in policies | performance | +| 14 | uses timestamptz for time columns | No plain `timestamp` in file_metadata | correctness | +| 15 | index on file_metadata user_id | `CREATE INDEX` on user_id column | performance | +| 16 | idempotent DDL | Uses `IF NOT EXISTS` patterns | idempotency | +| 17 | overall quality score | At least 
11/15 best-practice signals present | overall | + +## Reasoning + +Step-by-step reasoning for why this scenario is well-designed: + +1. **Baseline differentiator:** An agent without the skill would likely: (a) + confuse public bucket visibility with unrestricted upload access, (b) write + storage policies without using `storage.foldername()` or get the array + indexing wrong, (c) forget to set `allowed_mime_types` on the bucket itself, + (d) omit the `TO authenticated` clause on storage policies, (e) use bare + `auth.uid()` instead of the subselect form, (f) skip the `::text` cast when + comparing auth.uid() to folder names. These are all Supabase-specific + patterns that require reading the skill references. + +2. **Skill value:** The storage-access-control reference explicitly documents: + the public vs private bucket distinction, the `storage.foldername()` helper + function pattern, the SDK-to-SQL operation mapping, and bucket configuration + with mime types and size limits. Combined with the database security + references (RLS mandatory, common mistakes, performance), this scenario + exercises 8 reference files. + +3. **Testability:** Bucket configuration (INSERT INTO storage.buckets), storage + helper function usage (storage.foldername), policy clauses (TO + authenticated, TO public), mime types, file size limits, and all table-level + patterns (RLS, FK, indexes, timestamptz) are reliably detectable via regex + on SQL text. + +4. **Realism:** Nearly every Supabase application that handles user-generated + content needs avatar uploads and document storage. This is a day-one task + for any SaaS product. The GitHub discussions linked above show dozens of + developers hitting exactly these issues when setting up storage for the + first time. 
+ +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~30-45% of assertions expected to pass +- With skill: ~85-95% of assertions expected to pass \ No newline at end of file diff --git a/packages/evals/scenarios/team-rls-security-definer.md b/packages/evals/scenarios/team-rls-security-definer.md new file mode 100644 index 0000000..ec9ad2a --- /dev/null +++ b/packages/evals/scenarios/team-rls-security-definer.md @@ -0,0 +1,139 @@ +# Scenario: team-rls-security-definer + +## Summary + +The agent must create a SQL migration for a team-based project management app +where users belong to organizations via a membership table. The migration must +define tables for organizations, memberships, and projects, then secure them +with RLS policies that use a `security definer` helper function in a private +schema to efficiently resolve team membership without per-row joins. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **Multi-tenant team access is the most-asked RLS question on Supabase** -- + The official Supabase GitHub has multiple high-engagement discussions about + how to write RLS policies that check team/org membership without causing + performance issues or security holes. + - Source: https://github.com/supabase/supabase/discussions/4509 + - Source: https://github.com/supabase/supabase/discussions/811 + +2. **security_definer in public schema is a documented security anti-pattern** -- + Developers frequently place security_definer functions in the public schema, + inadvertently exposing them via the PostgREST API. The Supabase docs and + community discussions explicitly warn against this. + - Source: https://github.com/supabase/supabase/discussions/3269 + - Source: https://supabase.com/docs/guides/database/postgres/row-level-security + +3. **RLS policy performance with joins is a top pain point** -- Naive policies + that join against a memberships table execute per-row, causing severe + performance degradation. 
The recommended pattern is a security_definer + function that caches results via subselect. + - Source: https://github.com/orgs/supabase/discussions/1148 + - Source: https://makerkit.dev/blog/tutorials/supabase-rls-best-practices + +## Skill References Exercised + +Which reference files the agent should consult and what each teaches: + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on organizations, memberships, and projects | +| `references/db-rls-policy-types.md` | PERMISSIVE vs RESTRICTIVE policies | Use PERMISSIVE policies for team OR owner access patterns | +| `references/db-rls-common-mistakes.md` | Missing TO clause, user_metadata pitfalls | Always use `TO authenticated` on all policies | +| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT, use security_definer for joins | Use `(select auth.uid())` and a private-schema helper function | +| `references/db-security-functions.md` | security_definer in private schema with search_path = '' | Create helper function in private schema, revoke default permissions | +| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | Reference auth.users with cascade on memberships | +| `references/db-schema-timestamps.md` | Use timestamptz not timestamp | All time columns use timestamptz | +| `references/db-perf-indexes.md` | Index columns used in RLS policies | Index user_id and org_id columns used in policy lookups | +| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout the migration | + +## Workspace Setup + +What the workspace starts with before the agent runs: + +- Pre-initialized Supabase project (`supabase/config.toml` exists) +- Empty `supabase/migrations/` directory +- The agent creates migration files within this structure + +## Agent Task (PROMPT.md draft) + +The prompt to give the agent. 
Written as a developer would ask it: + +> I'm building a project management app where users can belong to multiple +> organizations. Each organization has projects that all members can view and +> edit. +> +> Create a SQL migration with: +> +> 1. An `organizations` table (name, slug) +> 2. A `memberships` table linking users to organizations with a role column +> (owner, admin, member) +> 3. A `projects` table (name, description, status) belonging to an organization +> +> Set up Row Level Security so: +> - Users can only see organizations they belong to +> - Users can only see and manage projects in their organizations +> - Only org owners can delete projects +> +> The migration should handle the case where a user is deleted from auth. + +## Evaluation Criteria + +What vitest should assert on the agent's output. Each assertion tests a +specific quality signal: + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure | +| 2 | creates organizations table | SQL contains `CREATE TABLE` for organizations | correctness | +| 3 | creates memberships table | SQL contains `CREATE TABLE` for memberships | correctness | +| 4 | creates projects table | SQL contains `CREATE TABLE` for projects | correctness | +| 5 | enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables | security | +| 6 | FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade | correctness | +| 7 | org_id FK on projects | projects references organizations | correctness | +| 8 | private schema created | `CREATE SCHEMA ... 
private` present | security | +| 9 | security_definer helper function | A function in the private schema with `SECURITY DEFINER` and `SET search_path = ''` | security | +| 10 | policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() | performance | +| 11 | policies use TO authenticated | All policies scoped to authenticated role | security | +| 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance | +| 13 | uses timestamptz | No plain `timestamp` for time columns | correctness | +| 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency | +| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security | +| 16 | overall quality score | At least 10/14 best-practice signals present | overall | + +## Reasoning + +Step-by-step reasoning for why this scenario is well-designed: + +1. **Baseline differentiator:** An agent without the skill would likely put the + security_definer function in the public schema, omit `SET search_path = ''`, + use bare `auth.uid()` instead of the subselect form, write inline joins in + policies instead of using a helper function, and possibly forget `TO + authenticated` on some policies. These are all patterns that require specific + knowledge of Supabase conventions. + +2. **Skill value:** The skill explicitly teaches: (a) private schema for + security_definer functions, (b) `SET search_path = ''` to prevent injection, + (c) `(select auth.uid())` for per-statement caching, (d) using + security_definer functions to avoid per-row joins in policies, (e) `TO + authenticated` on every policy. This is a scenario where reading 5+ reference + files materially improves the output. + +3. **Testability:** Every assertion checks for specific SQL patterns via regex. 
+ The private schema, security_definer, search_path, subselect auth.uid(), TO + authenticated, indexes, and timestamptz are all reliably detectable in SQL + text without runtime execution. + +4. **Realism:** Multi-tenant team-based access control is one of the most common + Supabase use cases. The GitHub discussions linked above have hundreds of + comments from developers working on exactly this pattern. Project management + apps (Notion, Linear, Asana clones) are a canonical example. + +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~35-50% of assertions expected to pass +- With skill: ~85-95% of assertions expected to pass \ No newline at end of file diff --git a/packages/evals/src/runner.ts b/packages/evals/src/runner.ts index ca65f2d..404568e 100644 --- a/packages/evals/src/runner.ts +++ b/packages/evals/src/runner.ts @@ -2,10 +2,12 @@ import { existsSync, readdirSync, readFileSync } from "node:fs"; import { join, resolve } from "node:path"; import { runAgent } from "./runner/agent.js"; import { uploadToBraintrust } from "./runner/braintrust.js"; +import { createResultDir, saveRunArtifacts } from "./runner/persist.js"; import { preflight } from "./runner/preflight.js"; import { listModifiedFiles, printSummary } from "./runner/results.js"; import { createWorkspace } from "./runner/scaffold.js"; import { runTests } from "./runner/test.js"; +import { buildTranscriptSummary } from "./runner/transcript.js"; import type { EvalRunResult, EvalScenario } from "./types.js"; // --------------------------------------------------------------------------- @@ -19,6 +21,12 @@ const model = process.env.EVAL_MODEL ?? 
DEFAULT_MODEL; const scenarioFilter = process.env.EVAL_SCENARIO; const runBaseline = process.env.EVAL_BASELINE === "true"; +// Run-level timestamp shared across all scenarios in a single invocation +const runTimestamp = new Date() + .toISOString() + .replace(/[:.]/g, "-") + .replace("Z", ""); + // --------------------------------------------------------------------------- // Discover scenarios // --------------------------------------------------------------------------- @@ -58,10 +66,9 @@ async function runEval( ): Promise<EvalRunResult> { const evalsDir = findEvalsDir(); const evalDir = join(evalsDir, scenario.id); + const variant = skillEnabled ? "with-skill" : "baseline"; - console.log( - `\n--- ${scenario.id} (${skillEnabled ? "with-skill" : "baseline"}) ---`, - ); + console.log(`\n--- ${scenario.id} (${variant}) ---`); // 1. Create isolated workspace const { workspacePath, cleanup } = createWorkspace({ @@ -104,7 +111,10 @@ async function runEval( // 5. Collect modified files const filesModified = listModifiedFiles(workspacePath, evalDir); - return { + // 6. Build transcript summary + const summary = buildTranscriptSummary(agentResult.events); + + const result: EvalRunResult = { scenario: scenario.id, agent: "claude-code", model, @@ -116,7 +126,22 @@ async function runEval( testsPassed: testResult.passedCount, testsTotal: testResult.totalCount, filesModified, + toolCallCount: summary.toolCalls.length, + costUsd: summary.totalCostUsd ?? undefined, }; + + // 7.
Persist results + const resultDir = createResultDir(runTimestamp, scenario.id, variant); + result.resultsDir = resultDir; + saveRunArtifacts({ + resultDir, + rawTranscript: agentResult.rawTranscript, + testOutput: testResult.output, + result, + transcriptSummary: summary, + }); + + return result; } catch (error) { const err = error as Error; return { @@ -175,7 +200,9 @@ async function main() { } } - printSummary(results); + // Use the results dir from the first result (all share the same timestamp) + const resultsDir = results.find((r) => r.resultsDir)?.resultsDir; + printSummary(results, resultsDir); if (process.env.BRAINTRUST_UPLOAD === "true") { console.log("\nUploading to Braintrust..."); diff --git a/packages/evals/src/runner/agent.ts b/packages/evals/src/runner/agent.ts index 25b5cdd..694f82f 100644 --- a/packages/evals/src/runner/agent.ts +++ b/packages/evals/src/runner/agent.ts @@ -1,13 +1,27 @@ import { spawn } from "node:child_process"; +import { resolveClaudeBin } from "./preflight.js"; +import { + extractFinalOutput, + parseStreamJsonOutput, + type TranscriptEvent, +} from "./transcript.js"; export interface AgentRunResult { + /** Extracted final text output (backward-compatible). */ output: string; duration: number; + /** Raw NDJSON transcript string from stream-json. */ + rawTranscript: string; + /** Parsed transcript events. */ + events: TranscriptEvent[]; } /** * Invoke Claude Code in print mode as a subprocess. * + * Uses --output-format stream-json to capture structured NDJSON events + * including tool calls, results, and reasoning steps. + * * The agent operates in the workspace directory and can read/write files. * When the skill is installed (symlinked into workspace), Claude Code * discovers it automatically and uses it for guidance. 
@@ -23,14 +37,22 @@ export async function runAgent(opts: { const args = [ "-p", // Print mode (non-interactive) + "--verbose", "--output-format", - "text", + "stream-json", "--model", opts.model, "--no-session-persistence", "--dangerously-skip-permissions", "--tools", "Edit,Write,Bash,Read,Glob,Grep", + // Disable all MCP servers so the agent uses only local filesystem tools. + // Without this, MCP tools from the parent env (e.g. Supabase, Neon) + // leak in and the agent may apply migrations to a remote project + // instead of creating local files. + "--mcp-config", + '{"mcpServers":{}}', + "--strict-mcp-config", ]; // Disable skills for baseline runs so the agent relies on innate knowledge @@ -46,8 +68,10 @@ export async function runAgent(opts: { } } + const claudeBin = resolveClaudeBin(); + return new Promise((resolve) => { - const child = spawn("claude", args, { + const child = spawn(claudeBin, args, { cwd: opts.cwd, env, stdio: ["pipe", "pipe", "pipe"], @@ -73,9 +97,15 @@ export async function runAgent(opts: { child.on("close", () => { clearTimeout(timer); + const rawTranscript = stdout || stderr; + const events = parseStreamJsonOutput(rawTranscript); + const output = extractFinalOutput(events) || rawTranscript; + resolve({ - output: stdout || stderr, + output, duration: Date.now() - start, + rawTranscript, + events, }); }); }); diff --git a/packages/evals/src/runner/persist.ts b/packages/evals/src/runner/persist.ts new file mode 100644 index 0000000..fb4ea7f --- /dev/null +++ b/packages/evals/src/runner/persist.ts @@ -0,0 +1,56 @@ +import { mkdirSync, writeFileSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; +import type { EvalRunResult } from "../types.js"; +import type { TranscriptSummary } from "./transcript.js"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +/** Resolve the evals package root (packages/evals). 
*/ +function evalsRoot(): string { + // __dirname is packages/evals/src/runner + return join(__dirname, "..", ".."); +} + +/** Create the results directory for a single scenario run. Returns the path. */ +export function createResultDir( + runTimestamp: string, + scenarioId: string, + variant: "with-skill" | "baseline", +): string { + const dir = join(evalsRoot(), "results", runTimestamp, scenarioId, variant); + mkdirSync(dir, { recursive: true }); + return dir; +} + +/** Save all artifacts for a single eval run. */ +export function saveRunArtifacts(opts: { + resultDir: string; + rawTranscript: string; + testOutput: string; + result: EvalRunResult; + transcriptSummary: TranscriptSummary; +}): void { + writeFileSync( + join(opts.resultDir, "transcript.jsonl"), + opts.rawTranscript, + "utf-8", + ); + + writeFileSync( + join(opts.resultDir, "test-output.txt"), + opts.testOutput, + "utf-8", + ); + + writeFileSync( + join(opts.resultDir, "result.json"), + JSON.stringify( + { ...opts.result, transcript: opts.transcriptSummary }, + null, + 2, + ), + "utf-8", + ); +} diff --git a/packages/evals/src/runner/preflight.ts b/packages/evals/src/runner/preflight.ts index 11691ea..2c7d8eb 100644 --- a/packages/evals/src/runner/preflight.ts +++ b/packages/evals/src/runner/preflight.ts @@ -1,10 +1,61 @@ import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +/** + * Resolve the `claude` binary path. + * + * Looks in the following order: + * 1. Local node_modules/.bin/claude (installed via @anthropic-ai/claude-code) + * 2. Global `claude` on PATH + * + * Throws with an actionable message when neither is found. 
+ */ +export function resolveClaudeBin(): string { + // packages/evals/node_modules/.bin/claude + const localBin = join( + __dirname, + "..", + "..", + "node_modules", + ".bin", + "claude", + ); + if (existsSync(localBin)) { + return localBin; + } + + // Fall back to PATH + try { + execFileSync("claude", ["--version"], { + stdio: "ignore", + timeout: 10_000, + }); + return "claude"; + } catch { + throw new Error( + [ + "claude CLI not found.", + "", + "Install it in one of these ways:", + " npm install (uses @anthropic-ai/claude-code from package.json)", + " npm i -g @anthropic-ai/claude-code", + "", + "Ensure ANTHROPIC_API_KEY is set in the environment.", + ].join("\n"), + ); + } +} /** * Verify the host environment has everything needed before spending * API credits on an eval run. * - * Checks: Node >= 20, Docker running, claude CLI available. + * Checks: Node >= 20, Docker running, claude CLI available, API key set. */ export function preflight(): void { const errors: string[] = []; @@ -24,12 +75,16 @@ export function preflight(): void { // Claude CLI available try { - execFileSync("claude", ["--version"], { - stdio: "ignore", - timeout: 10_000, - }); - } catch { - errors.push("claude CLI not found on PATH"); + resolveClaudeBin(); + } catch (err) { + errors.push((err as Error).message); + } + + // API key + if (!process.env.ANTHROPIC_API_KEY) { + errors.push( + "ANTHROPIC_API_KEY is not set. Claude Code requires this for authentication.", + ); } if (errors.length > 0) { diff --git a/packages/evals/src/runner/results.ts b/packages/evals/src/runner/results.ts index 471946a..1e17cfa 100644 --- a/packages/evals/src/runner/results.ts +++ b/packages/evals/src/runner/results.ts @@ -46,7 +46,10 @@ export function listModifiedFiles( } /** Print a summary table of eval results. 
*/ -export function printSummary(results: EvalRunResult[]): void { +export function printSummary( + results: EvalRunResult[], + resultsDir?: string, +): void { console.log("\n=== Eval Results ===\n"); for (const r of results) { @@ -65,4 +68,8 @@ export function printSummary(results: EvalRunResult[]): void { const passed = results.filter((r) => r.status === "passed").length; console.log(`\nTotal: ${passed}/${results.length} passed`); + + if (resultsDir) { + console.log(`\nResults saved to: ${resultsDir}`); + } } diff --git a/packages/evals/src/runner/test.ts b/packages/evals/src/runner/test.ts index f84497a..3d9ec31 100644 --- a/packages/evals/src/runner/test.ts +++ b/packages/evals/src/runner/test.ts @@ -78,17 +78,24 @@ export async function runTests(opts: { function parseTestOutput(output: string): TestResult { // Parse vitest output for pass/fail counts - // Format: "Tests N passed (M)" or "Tests N failed | M passed (T)" - const testsLine = output.match( + // Vitest formats: + // All passing: "Tests N passed (N)" + // Mixed: "Tests N failed | M passed (T)" + // All failing: "Tests N failed (N)" + const mixedOrPassing = output.match( /Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/, ); + const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/); let passedCount = 0; let totalCount = 0; - if (testsLine) { - passedCount = Number.parseInt(testsLine[2], 10); - totalCount = Number.parseInt(testsLine[3], 10); + if (mixedOrPassing) { + passedCount = Number.parseInt(mixedOrPassing[2], 10); + totalCount = Number.parseInt(mixedOrPassing[3], 10); + } else if (allFailing) { + passedCount = 0; + totalCount = Number.parseInt(allFailing[2], 10); } const passed = totalCount > 0 && passedCount === totalCount; diff --git a/packages/evals/src/runner/transcript.ts b/packages/evals/src/runner/transcript.ts new file mode 100644 index 0000000..509d3a3 --- /dev/null +++ b/packages/evals/src/runner/transcript.ts @@ -0,0 +1,154 @@ +export interface 
TranscriptEvent { + type: string; + [key: string]: unknown; +} + +export interface ToolCallSummary { + tool: string; + toolUseId: string; + input: Record<string, unknown>; + /** First ~200 chars of output for quick scanning */ + outputPreview: string; +} + +export interface TranscriptSummary { + totalTurns: number; + totalDurationMs: number; + totalCostUsd: number | null; + model: string | null; + toolCalls: ToolCallSummary[]; + finalOutput: string; +} + +/** Parse a single NDJSON line. Returns null on empty or invalid input. */ +export function parseStreamJsonLine(line: string): TranscriptEvent | null { + const trimmed = line.trim(); + if (!trimmed) return null; + try { + return JSON.parse(trimmed) as TranscriptEvent; + } catch { + return null; + } +} + +/** Parse raw NDJSON stdout into an array of events. */ +export function parseStreamJsonOutput(raw: string): TranscriptEvent[] { + const events: TranscriptEvent[] = []; + for (const line of raw.split("\n")) { + const event = parseStreamJsonLine(line); + if (event) events.push(event); + } + return events; +} + +/** Extract the final text output from parsed events (for backward compat).
*/ +export function extractFinalOutput(events: TranscriptEvent[]): string { + // Prefer the result event + for (const event of events) { + if (event.type === "result") { + const result = (event as Record<string, unknown>).result; + if (typeof result === "string") return result; + } + } + + // Fallback: concatenate text blocks from the last assistant message + for (let i = events.length - 1; i >= 0; i--) { + const event = events[i]; + if (event.type === "assistant") { + const msg = (event as Record<string, unknown>).message as + | Record<string, unknown> + | undefined; + const content = msg?.content; + if (Array.isArray(content)) { + const texts = content + .filter( + (b: Record<string, unknown>) => + b.type === "text" && typeof b.text === "string", + ) + .map((b: Record<string, unknown>) => b.text as string); + if (texts.length > 0) return texts.join("\n"); + } + } + } + + return ""; +} + +/** Walk parsed events to build a transcript summary. */ +export function buildTranscriptSummary( + events: TranscriptEvent[], +): TranscriptSummary { + const toolCalls: ToolCallSummary[] = []; + let finalOutput = ""; + let totalDurationMs = 0; + let totalCostUsd: number | null = null; + let model: string | null = null; + let totalTurns = 0; + + for (const event of events) { + const e = event as Record<string, unknown>; + + // System init: extract model + if (e.type === "system" && e.subtype === "init") { + model = typeof e.model === "string" ? e.model : null; + } + + // Assistant messages: extract tool_use blocks + if (e.type === "assistant") { + const msg = e.message as Record<string, unknown> | undefined; + const content = msg?.content; + if (Array.isArray(content)) { + for (const block of content) { + if (block.type === "tool_use") { + toolCalls.push({ + tool: block.name ?? "unknown", + toolUseId: block.id ?? "", + input: block.input ??
{}, + outputPreview: "", + }); + } + } + } + } + + // User messages: extract tool_result blocks and match to tool calls + if (e.type === "user") { + const msg = e.message as Record<string, unknown> | undefined; + const content = msg?.content; + if (Array.isArray(content)) { + for (const block of content) { + if (block.type === "tool_result") { + const matching = toolCalls.find( + (tc) => tc.toolUseId === block.tool_use_id, + ); + if (matching) { + const text = + typeof block.content === "string" + ? block.content + : JSON.stringify(block.content); + matching.outputPreview = text.slice(0, 200); + } + } + } + } + } + + // Result event: final output, cost, duration, turns + if (e.type === "result") { + finalOutput = typeof e.result === "string" ? e.result : ""; + totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0; + totalCostUsd = + typeof e.total_cost_usd === "number" ? e.total_cost_usd : null; + totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0; + } + } + + return { + totalTurns, + totalDurationMs, + totalCostUsd, + model, + toolCalls, + finalOutput, + }; +} diff --git a/packages/evals/src/types.ts b/packages/evals/src/types.ts index 7307709..4b97534 100644 --- a/packages/evals/src/types.ts +++ b/packages/evals/src/types.ts @@ -32,4 +32,10 @@ export interface EvalRunResult { /** Files the agent created or modified in the workspace */ filesModified: string[]; error?: string; + /** Path to the persisted results directory for this run */ + resultsDir?: string; + /** Number of tool calls the agent made */ + toolCallCount?: number; + /** Total cost in USD (from stream-json result event) */ + costUsd?: number; }