From 34e807a3f6d1feed309795b628340d62c087ae5e Mon Sep 17 00:00:00 2001 From: Pedro Rodrigues Date: Wed, 25 Feb 2026 19:50:54 +0000 Subject: [PATCH] replace vitest for braintrust assertions --- .../evals/auth-fk-cascade-delete/EVAL.ts | 85 +++ .../evals/auth-fk-cascade-delete/PROMPT.md | 7 + .../evals/auth-fk-cascade-delete/package.json | 5 + .../supabase/config.toml | 111 +++ .../evals/evals/auth-rls-new-project/EVAL.ts | 231 +++--- .../evals/auth-rls-new-project/PROMPT.md | 25 +- .../evals/cli-hallucinated-commands/EVAL.ts | 128 ++++ .../evals/cli-hallucinated-commands/PROMPT.md | 9 + .../cli-hallucinated-commands/package.json | 5 + .../supabase/config.toml | 64 ++ .../supabase/functions/process-order/index.ts | 29 + .../collaborative-rooms-realtime/EVAL.ts | 677 +++++++++--------- .../connection-pooling-prisma/.env.example | 3 + .../evals/connection-pooling-prisma/EVAL.ts | 134 ++++ .../evals/connection-pooling-prisma/PROMPT.md | 3 + .../connection-pooling-prisma/package.json | 5 + .../prisma/schema.prisma | 29 + .../supabase/config.toml | 111 +++ .../evals/edge-function-hello-world/EVAL.ts | 255 +++---- packages/evals/evals/eval-utils.ts | 113 ++- .../evals/extension-wrong-schema/EVAL.ts | 100 +++ .../evals/extension-wrong-schema/PROMPT.md | 11 + .../evals/extension-wrong-schema/package.json | 5 + .../supabase/config.toml | 111 +++ .../evals/postgrest-schema-cache/EVAL.ts | 114 +++ .../evals/postgrest-schema-cache/PROMPT.md | 9 + .../evals/postgrest-schema-cache/package.json | 5 + .../supabase/config.toml | 111 +++ .../20240101000000_create_products.sql | 8 + .../evals/rls-update-needs-select/EVAL.ts | 122 ++++ .../evals/rls-update-needs-select/PROMPT.md | 7 + .../rls-update-needs-select/package.json | 5 + .../supabase/config.toml | 111 +++ .../rls-user-metadata-role-check/EVAL.ts | 123 ++++ .../rls-user-metadata-role-check/PROMPT.md | 7 + .../rls-user-metadata-role-check/package.json | 5 + .../supabase/config.toml | 111 +++ 
.../evals/service-role-edge-function/EVAL.ts | 102 +++ .../service-role-edge-function/PROMPT.md | 9 + .../service-role-edge-function/package.json | 5 + .../supabase/config.toml | 64 ++ .../20240101000000_create_reports_table.sql | 10 + .../evals/storage-rls-user-folders/EVAL.ts | 508 +++++++------ .../evals/team-rls-security-definer/EVAL.ts | 390 +++++----- packages/evals/package.json | 3 +- packages/evals/scenarios/SCENARIOS.md | 11 +- .../evals/scenarios/auth-fk-cascade-delete.md | 84 +++ .../evals/scenarios/auth-rls-new-project.md | 8 +- .../scenarios/cli-hallucinated-commands.md | 120 ++++ .../scenarios/collaborative-rooms-realtime.md | 1 + .../scenarios/connection-pooling-prisma.md | 80 +++ .../scenarios/edge-function-hello-world.md | 1 + .../evals/scenarios/extension-wrong-schema.md | 89 +++ .../evals/scenarios/postgrest-schema-cache.md | 89 +++ .../scenarios/rls-update-needs-select.md | 85 +++ .../scenarios/rls-user-metadata-role-check.md | 85 +++ .../scenarios/service-role-edge-function.md | 86 +++ .../scenarios/storage-rls-user-folders.md | 3 +- .../scenarios/team-rls-security-definer.md | 8 +- packages/evals/src/eval-types.ts | 21 + packages/evals/src/runner.ts | 145 ++-- packages/evals/src/runner/braintrust.ts | 8 +- packages/evals/src/runner/persist.ts | 7 +- packages/evals/src/runner/scorers.ts | 2 +- packages/evals/src/runner/test.ts | 143 ---- packages/evals/src/types.ts | 13 +- 66 files changed, 3940 insertions(+), 1234 deletions(-) create mode 100644 packages/evals/evals/auth-fk-cascade-delete/EVAL.ts create mode 100644 packages/evals/evals/auth-fk-cascade-delete/PROMPT.md create mode 100644 packages/evals/evals/auth-fk-cascade-delete/package.json create mode 100644 packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml create mode 100644 packages/evals/evals/cli-hallucinated-commands/EVAL.ts create mode 100644 packages/evals/evals/cli-hallucinated-commands/PROMPT.md create mode 100644 
packages/evals/evals/cli-hallucinated-commands/package.json create mode 100644 packages/evals/evals/cli-hallucinated-commands/supabase/config.toml create mode 100644 packages/evals/evals/cli-hallucinated-commands/supabase/functions/process-order/index.ts create mode 100644 packages/evals/evals/connection-pooling-prisma/.env.example create mode 100644 packages/evals/evals/connection-pooling-prisma/EVAL.ts create mode 100644 packages/evals/evals/connection-pooling-prisma/PROMPT.md create mode 100644 packages/evals/evals/connection-pooling-prisma/package.json create mode 100644 packages/evals/evals/connection-pooling-prisma/prisma/schema.prisma create mode 100644 packages/evals/evals/connection-pooling-prisma/supabase/config.toml create mode 100644 packages/evals/evals/extension-wrong-schema/EVAL.ts create mode 100644 packages/evals/evals/extension-wrong-schema/PROMPT.md create mode 100644 packages/evals/evals/extension-wrong-schema/package.json create mode 100644 packages/evals/evals/extension-wrong-schema/supabase/config.toml create mode 100644 packages/evals/evals/postgrest-schema-cache/EVAL.ts create mode 100644 packages/evals/evals/postgrest-schema-cache/PROMPT.md create mode 100644 packages/evals/evals/postgrest-schema-cache/package.json create mode 100644 packages/evals/evals/postgrest-schema-cache/supabase/config.toml create mode 100644 packages/evals/evals/postgrest-schema-cache/supabase/migrations/20240101000000_create_products.sql create mode 100644 packages/evals/evals/rls-update-needs-select/EVAL.ts create mode 100644 packages/evals/evals/rls-update-needs-select/PROMPT.md create mode 100644 packages/evals/evals/rls-update-needs-select/package.json create mode 100644 packages/evals/evals/rls-update-needs-select/supabase/config.toml create mode 100644 packages/evals/evals/rls-user-metadata-role-check/EVAL.ts create mode 100644 packages/evals/evals/rls-user-metadata-role-check/PROMPT.md create mode 100644 
packages/evals/evals/rls-user-metadata-role-check/package.json create mode 100644 packages/evals/evals/rls-user-metadata-role-check/supabase/config.toml create mode 100644 packages/evals/evals/service-role-edge-function/EVAL.ts create mode 100644 packages/evals/evals/service-role-edge-function/PROMPT.md create mode 100644 packages/evals/evals/service-role-edge-function/package.json create mode 100644 packages/evals/evals/service-role-edge-function/supabase/config.toml create mode 100644 packages/evals/evals/service-role-edge-function/supabase/migrations/20240101000000_create_reports_table.sql create mode 100644 packages/evals/scenarios/auth-fk-cascade-delete.md create mode 100644 packages/evals/scenarios/cli-hallucinated-commands.md create mode 100644 packages/evals/scenarios/connection-pooling-prisma.md create mode 100644 packages/evals/scenarios/extension-wrong-schema.md create mode 100644 packages/evals/scenarios/postgrest-schema-cache.md create mode 100644 packages/evals/scenarios/rls-update-needs-select.md create mode 100644 packages/evals/scenarios/rls-user-metadata-role-check.md create mode 100644 packages/evals/scenarios/service-role-edge-function.md create mode 100644 packages/evals/src/eval-types.ts delete mode 100644 packages/evals/src/runner/test.ts diff --git a/packages/evals/evals/auth-fk-cascade-delete/EVAL.ts b/packages/evals/evals/auth-fk-cascade-delete/EVAL.ts new file mode 100644 index 0000000..a9c1d60 --- /dev/null +++ b/packages/evals/evals/auth-fk-cascade-delete/EVAL.ts @@ -0,0 +1,85 @@ +export const expectedReferenceFiles = [ + "db-schema-auth-fk.md", + "db-security-functions.md", + "db-rls-mandatory.md", + "db-rls-common-mistakes.md", +]; + +import type { EvalAssertion } from "../../src/eval-types.js"; + +import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; + +export const assertions: EvalAssertion[] = [ + { + name: "migration file exists", + check: () => findMigrationFiles().length > 0, + }, + { + name: "creates profiles 
table", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return /create\s+table/.test(sql) && /profiles/.test(sql); + }, + }, + { + name: "FK references auth.users", + check: () => + /references\s+auth\.users/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "ON DELETE CASCADE present", + check: () => /on\s+delete\s+cascade/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "RLS enabled on profiles", + check: () => + /alter\s+table.*profiles.*enable\s+row\s+level\s+security/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "trigger function uses SECURITY DEFINER", + check: () => /security\s+definer/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "trigger function sets search_path", + check: () => + /set\s+search_path\s*=\s*''/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "trigger created on auth.users", + check: () => + /create\s+trigger[\s\S]*?on\s+auth\.users/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "policies scoped to authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return ( + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)) + ); + }, + }, + { + name: "overall quality: demonstrates Supabase best practices", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const signals = [ + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql), + /alter\s+table.*profiles.*enable\s+row\s+level\s+security/.test(sql), + /security\s+definer/.test(sql), + /set\s+search_path\s*=\s*''/.test(sql), + /create\s+trigger[\s\S]*?on\s+auth\.users/.test(sql), + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)), + ]; + return signals.filter(Boolean).length >= 5; + }, + }, +]; diff --git a/packages/evals/evals/auth-fk-cascade-delete/PROMPT.md b/packages/evals/evals/auth-fk-cascade-delete/PROMPT.md new file mode 100644 index 0000000..c0932d6 --- /dev/null +++ b/packages/evals/evals/auth-fk-cascade-delete/PROMPT.md @@ -0,0 +1,7 @@ +I'm building a Supabase app and need to set up a `profiles` table. Every user who signs up should automatically get a profile row containing their `id`, `email`, and `full_name` (pulled from signup metadata). + +Please create a SQL migration in `supabase/migrations/` that: + +1. Creates the `profiles` table linked to Supabase Auth users +2. Sets up a trigger so a profile row is created automatically whenever a new user signs up +3. 
Enables Row Level Security so users can only read and update their own profile diff --git a/packages/evals/evals/auth-fk-cascade-delete/package.json b/packages/evals/evals/auth-fk-cascade-delete/package.json new file mode 100644 index 0000000..3803299 --- /dev/null +++ b/packages/evals/evals/auth-fk-cascade-delete/package.json @@ -0,0 +1,5 @@ +{ + "name": "auth-fk-cascade-delete", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml b/packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml new file mode 100644 index 0000000..aa01e0a --- /dev/null +++ b/packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml @@ -0,0 +1,111 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "auth-fk-cascade-delete" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. 
+port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[db.migrations] +# If disabled, migrations will be skipped during a db push or reset. +enabled = true +schema_paths = [] + +[db.seed] +# If enabled, seeds the database after migrations during a db reset. +enabled = true +# Specifies an ordered list of seed files to load during db reset. +sql_paths = ["./seed.sql"] + +[realtime] +enabled = true + +[studio] +enabled = true +# Port to use for Supabase Studio. +port = 54323 +# External URL of the API server that frontend connects to. +api_url = "http://127.0.0.1" + +[inbucket] +enabled = true +# Port to use for the email testing server web interface. +port = 54324 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# If disabled, the refresh token will never expire. +enable_refresh_token_rotation = true +# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds. +# Requires enable_refresh_token_rotation = true. +refresh_token_reuse_interval = 10 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. 
+enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, a user will be required to confirm any email change on both the old, and new email +# addresses. If disabled, only the new email is required to confirm. +double_confirm_changes = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false + +[edge_runtime] +enabled = true +# Configure one of the supported request policies: `oneshot`, `per_worker`. +policy = "per_worker" +# Port to attach the Chrome inspector for debugging edge functions. +inspector_port = 8083 + +[analytics] +enabled = true +port = 54327 +# Configure one of the supported backends: `postgres`, `bigquery`. +backend = "postgres" diff --git a/packages/evals/evals/auth-rls-new-project/EVAL.ts b/packages/evals/evals/auth-rls-new-project/EVAL.ts index 4975ed5..c6860cb 100644 --- a/packages/evals/evals/auth-rls-new-project/EVAL.ts +++ b/packages/evals/evals/auth-rls-new-project/EVAL.ts @@ -1,97 +1,150 @@ +export const expectedReferenceFiles = [ + "dev-getting-started.md", + "db-rls-mandatory.md", + "db-rls-policy-types.md", + "db-rls-common-mistakes.md", + "db-schema-auth-fk.md", + "db-schema-timestamps.md", + "db-migrations-idempotent.md", +]; + import { existsSync } from "node:fs"; import { join } from "node:path"; -import { expect, test } from "vitest"; +import type { EvalAssertion } from "../../src/eval-types.js"; import { + anonSeeesNoRows, findMigrationFiles, getMigrationSQL, - supabaseDir, + getSupabaseDir, + queryTable, + tableExists, } from "../eval-utils.ts"; -test("supabase project initialized (config.toml exists)", () => { - expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true); -}); - -test("migration file exists in supabase/migrations/", () => { - expect(findMigrationFiles().length).toBeGreaterThan(0); -}); - -test("creates tasks table", () => { - const sql = 
getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+table/); - expect(sql).toMatch(/tasks/); -}); - -test("enables RLS on tasks table", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/alter\s+table.*tasks.*enable\s+row\s+level\s+security/); -}); - -test("has foreign key to auth.users", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/references\s+auth\.users/); -}); - -test("uses ON DELETE CASCADE for auth FK", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/on\s+delete\s+cascade/); -}); - -test("uses (select auth.uid()) not bare auth.uid() in policies", () => { - const sql = getMigrationSQL(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - for (const policy of policyBlocks) { - if (policy.includes("auth.uid()")) { - // The subselect form: (select auth.uid()) - expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i); - } - } -}); - -test("policies use TO authenticated", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; - expect(policyBlocks.length).toBeGreaterThan(0); - for (const policy of policyBlocks) { - expect(policy).toMatch(/to\s+authenticated/); - } -}); - -test("uses timestamptz not plain timestamp for time columns", () => { - const sql = getMigrationSQL().toLowerCase(); - // Match "timestamp" that is NOT followed by "tz" or "with time zone" - const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; - // Only fail if the migration defines time columns with plain timestamp - if ( - sql.includes("created_at") || - sql.includes("updated_at") || - sql.includes("due_date") - ) { - expect(sql).not.toMatch(hasPlainTimestamp); - } -}); - -test("creates index on user_id column", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+index/); - expect(sql).toMatch(/user_id/); -}); - -test("migration is idempotent (uses IF NOT EXISTS)", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/if\s+not\s+exists/); -}); - -test("overall quality: demonstrates Supabase best practices", () => { - const sql = getMigrationSQL().toLowerCase(); - // A high-quality migration should contain most of these patterns - const signals = [ - /enable\s+row\s+level\s+security/, - /\(select\s+auth\.uid\(\)\)/, - /to\s+authenticated/, - /on\s+delete\s+cascade/, - /create\s+index/, - ]; - const matches = signals.filter((r) => r.test(sql)); - expect(matches.length).toBeGreaterThanOrEqual(4); -}); +export const assertions: EvalAssertion[] = [ + { + name: "supabase project initialized (config.toml exists)", + check: () => existsSync(join(getSupabaseDir(), "config.toml")), + }, + { + name: "migration file exists in supabase/migrations/", + check: () => findMigrationFiles().length > 0, + }, + { + name: "creates tasks table", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return /create\s+table/.test(sql) && /tasks/.test(sql); + }, + }, + { + name: "enables RLS on tasks table", + check: () => + 
/alter\s+table.*tasks.*enable\s+row\s+level\s+security/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "has foreign key to auth.users", + check: () => + /references\s+auth\.users/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "uses ON DELETE CASCADE for auth FK", + check: () => /on\s+delete\s+cascade/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "uses (select auth.uid()) not bare auth.uid() in policies", + check: () => { + const sql = getMigrationSQL(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + for (const policy of policyBlocks) { + if ( + policy.includes("auth.uid()") && + !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy) + ) { + return false; + } + } + return true; + }, + }, + { + name: "policies use TO authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return ( + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)) + ); + }, + }, + { + name: "uses timestamptz not plain timestamp for time columns", + check: () => { + const rawSql = getMigrationSQL().toLowerCase(); + const sql = rawSql.replace(/--[^\n]*/g, ""); + const hasPlainTimestamp = + /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; + if ( + sql.includes("created_at") || + sql.includes("updated_at") || + sql.includes("due_date") + ) { + return !hasPlainTimestamp.test(sql); + } + return true; + }, + }, + { + name: "creates index on user_id column", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return /create\s+index/.test(sql) && /user_id/.test(sql); + }, + }, + { + name: "does not use SERIAL or BIGSERIAL for primary key", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return !/\bserial\b/.test(sql) && !/\bbigserial\b/.test(sql); + }, + }, + { + name: "migration is idempotent (uses IF NOT EXISTS)", + check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()), + 
}, + { + name: "overall quality: demonstrates Supabase best practices", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const signals = [ + /enable\s+row\s+level\s+security/, + /\(select\s+auth\.uid\(\)\)/, + /to\s+authenticated/, + /on\s+delete\s+cascade/, + /create\s+index/, + ]; + return signals.filter((r) => r.test(sql)).length >= 4; + }, + }, + { + name: "tasks table exists in the database after migration", + check: () => tableExists("tasks"), + timeout: 10_000, + }, + { + name: "tasks table is queryable with service role", + check: async () => { + const { error } = await queryTable("tasks", "service_role"); + return error === null; + }, + timeout: 10_000, + }, + { + name: "tasks table returns no rows for anon (RLS is active)", + check: () => anonSeeesNoRows("tasks"), + timeout: 10_000, + }, +]; diff --git a/packages/evals/evals/auth-rls-new-project/PROMPT.md b/packages/evals/evals/auth-rls-new-project/PROMPT.md index a0c2601..d9a65c1 100644 --- a/packages/evals/evals/auth-rls-new-project/PROMPT.md +++ b/packages/evals/evals/auth-rls-new-project/PROMPT.md @@ -1,16 +1,15 @@ -I'm starting a new Supabase project from scratch for a task management app. Users should sign up with email/password, and each user should only see their own tasks. +I'm building a task management app. Users sign up with email/password and should only see their own tasks. -Set up the project: +A Supabase project is already initialized and running locally. The `supabase/` directory and `config.toml` are already set up — do not run `supabase init` or `supabase start`. -1. Initialize the Supabase project with the CLI (`npx supabase init`) -2. Start the local Supabase stack (`npx supabase start`) -3. 
Create a SQL migration for a tasks table with columns: title (text), description (text), status (text), and due_date +Create a SQL migration for a tasks table: -The migration must: - -- Create the tasks table with proper column types -- Link tasks to authenticated users -- Enable Row Level Security -- Create policies so users can only CRUD their own tasks -- Add appropriate indexes -- Be idempotent (safe to run multiple times) +1. Create a new migration file with `npx supabase migration new` +2. Write the migration SQL with: + - A `tasks` table with columns: title (text), description (text), status (text), due_date (timestamptz) + - Link tasks to authenticated users (foreign key to `auth.users`) + - Enable Row Level Security + - RLS policies so users can only CRUD their own tasks + - Appropriate indexes + - Idempotent (safe to run multiple times) +3. Apply the migration with `npx supabase db push` diff --git a/packages/evals/evals/cli-hallucinated-commands/EVAL.ts b/packages/evals/evals/cli-hallucinated-commands/EVAL.ts new file mode 100644 index 0000000..08386bb --- /dev/null +++ b/packages/evals/evals/cli-hallucinated-commands/EVAL.ts @@ -0,0 +1,128 @@ +export const expectedReferenceFiles = [ + "dev-getting-started.md", + "edge-fun-quickstart.md", +]; + +import { readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { EvalAssertion } from "../../src/eval-types.js"; + +const cwd = process.cwd(); + +function findReferenceFile(): string | null { + const candidates = readdirSync(cwd).filter((f) => { + const lower = f.toLowerCase(); + return ( + lower === "cli_reference.md" || + lower === "cli-reference.md" || + lower === "clireference.md" + ); + }); + return candidates.length > 0 ? 
join(cwd, candidates[0]) : null; +} + +function getReferenceContent(): string { + const file = findReferenceFile(); + if (!file) throw new Error("CLI_REFERENCE.md not found in project root"); + return readFileSync(file, "utf-8"); +} + +export const assertions: EvalAssertion[] = [ + { + name: "CLI_REFERENCE.md exists in project root", + check: () => findReferenceFile() !== null, + }, + { + name: "no hallucinated functions log command", + check: () => { + const content = getReferenceContent(); + return ( + !/`supabase\s+functions\s+log`/.test(content) && + !/^\s*npx\s+supabase\s+functions\s+log\b/m.test(content) && + !/^\s*supabase\s+functions\s+log\b/m.test(content) + ); + }, + }, + { + name: "no hallucinated db query command", + check: () => { + const content = getReferenceContent(); + return ( + !/`supabase\s+db\s+query`/.test(content) && + !/^\s*npx\s+supabase\s+db\s+query\b/m.test(content) && + !/^\s*supabase\s+db\s+query\b/m.test(content) + ); + }, + }, + { + name: "mentions supabase functions serve for local development", + check: () => + /supabase\s+functions\s+serve/.test(getReferenceContent().toLowerCase()), + }, + { + name: "mentions supabase functions deploy", + check: () => + /supabase\s+functions\s+deploy/.test(getReferenceContent().toLowerCase()), + }, + { + name: "mentions psql or SQL Editor or connection string for ad-hoc SQL", + check: () => { + const content = getReferenceContent().toLowerCase(); + return ( + /\bpsql\b/.test(content) || + /sql\s+editor/.test(content) || + /connection\s+string/.test(content) || + /supabase\s+db\s+dump/.test(content) + ); + }, + }, + { + name: "mentions supabase db push or supabase db reset for migrations", + check: () => { + const content = getReferenceContent().toLowerCase(); + return ( + /supabase\s+db\s+push/.test(content) || + /supabase\s+db\s+reset/.test(content) + ); + }, + }, + { + name: "mentions supabase start for local stack", + check: () => /supabase\s+start/.test(getReferenceContent().toLowerCase()), + 
}, + { + name: "mentions Dashboard or Logs Explorer for production log viewing", + check: () => { + const content = getReferenceContent().toLowerCase(); + return /\bdashboard\b/.test(content) || /logs\s+explorer/.test(content); + }, + }, + { + name: "overall quality: uses real CLI commands throughout", + check: () => { + const content = getReferenceContent().toLowerCase(); + const signals = [ + /supabase\s+start/, + /supabase\s+stop/, + /supabase\s+functions\s+serve/, + /supabase\s+functions\s+deploy/, + /supabase\s+db\s+(push|reset|diff)/, + /\bpsql\b|\bsql\s+editor\b|\bconnection\s+string\b/, + /\bdashboard\b|\blogs\s+explorer\b/, + ]; + const hallucinations = [ + /`supabase\s+functions\s+log`/, + /^\s*npx\s+supabase\s+functions\s+log\b/m, + /^\s*supabase\s+functions\s+log\b/m, + /`supabase\s+db\s+query`/, + /^\s*npx\s+supabase\s+db\s+query\b/m, + /^\s*supabase\s+db\s+query\b/m, + ]; + const positiveMatches = signals.filter((r) => r.test(content)).length; + const hallucinationMatches = hallucinations.filter((r) => + r.test(content), + ).length; + return positiveMatches >= 5 && hallucinationMatches === 0; + }, + }, +]; diff --git a/packages/evals/evals/cli-hallucinated-commands/PROMPT.md b/packages/evals/evals/cli-hallucinated-commands/PROMPT.md new file mode 100644 index 0000000..588de46 --- /dev/null +++ b/packages/evals/evals/cli-hallucinated-commands/PROMPT.md @@ -0,0 +1,9 @@ +I'm onboarding a new developer to my Supabase project. Create a `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet of Supabase CLI commands we use day-to-day. It should cover: + +1. Starting and stopping the local dev stack +2. Managing database migrations (push, reset, diff) +3. Working with the `process-order` Edge Function (local dev and deploy) +4. How to view Edge Function logs (both local dev and production) +5. How to run ad-hoc SQL queries against the database (local and remote) + +Include the actual commands with brief explanations. 
diff --git a/packages/evals/evals/cli-hallucinated-commands/package.json b/packages/evals/evals/cli-hallucinated-commands/package.json new file mode 100644 index 0000000..7fc92a3 --- /dev/null +++ b/packages/evals/evals/cli-hallucinated-commands/package.json @@ -0,0 +1,5 @@ +{ + "name": "cli-hallucinated-commands", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/cli-hallucinated-commands/supabase/config.toml b/packages/evals/evals/cli-hallucinated-commands/supabase/config.toml new file mode 100644 index 0000000..a1435f6 --- /dev/null +++ b/packages/evals/evals/cli-hallucinated-commands/supabase/config.toml @@ -0,0 +1,64 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "cli-hallucinated-commands" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. 
+port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. +enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false diff --git a/packages/evals/evals/cli-hallucinated-commands/supabase/functions/process-order/index.ts b/packages/evals/evals/cli-hallucinated-commands/supabase/functions/process-order/index.ts new file mode 100644 index 0000000..b7e1641 --- /dev/null +++ b/packages/evals/evals/cli-hallucinated-commands/supabase/functions/process-order/index.ts @@ -0,0 +1,29 @@ +import { createClient } from "jsr:@supabase/supabase-js@2"; + +Deno.serve(async (req) => { + try { + const { orderId } = await req.json(); + + const supabase = createClient( + Deno.env.get("SUPABASE_URL") ?? "", + Deno.env.get("SUPABASE_ANON_KEY") ?? 
"", + ); + + const { data, error } = await supabase + .from("orders") + .select("*") + .eq("id", orderId) + .single(); + + if (error) throw error; + + return new Response(JSON.stringify({ order: data }), { + headers: { "Content-Type": "application/json" }, + }); + } catch (err) { + return new Response(JSON.stringify({ error: String(err) }), { + status: 500, + headers: { "Content-Type": "application/json" }, + }); + } +}); diff --git a/packages/evals/evals/collaborative-rooms-realtime/EVAL.ts b/packages/evals/evals/collaborative-rooms-realtime/EVAL.ts index 7466303..e80243f 100644 --- a/packages/evals/evals/collaborative-rooms-realtime/EVAL.ts +++ b/packages/evals/evals/collaborative-rooms-realtime/EVAL.ts @@ -1,333 +1,354 @@ -import { expect, test } from "vitest"; +export const expectedReferenceFiles = [ + "db-rls-mandatory.md", + "db-rls-common-mistakes.md", + "db-rls-performance.md", + "db-security-functions.md", + "db-schema-auth-fk.md", + "db-schema-timestamps.md", + "db-schema-realtime.md", + "db-perf-indexes.md", + "db-migrations-idempotent.md", + "realtime-setup-auth.md", + "realtime-broadcast-database.md", + "realtime-setup-channels.md", +]; + +import type { EvalAssertion } from "../../src/eval-types.js"; import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; -test("migration file exists", () => { - expect(findMigrationFiles().length).toBeGreaterThan(0); -}); - -test("creates rooms table", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+table[\s\S]*?rooms/); -}); - -test("creates room_members table", () => { - const sql = getMigrationSQL().toLowerCase(); - // Accept room_members, members, memberships, room_users, etc. 
- const hasMembership = - /create\s+table[\s\S]*?room_members/.test(sql) || - /create\s+table[\s\S]*?room_users/.test(sql) || - /create\s+table[\s\S]*?memberships/.test(sql); - expect(hasMembership).toBe(true); -}); - -test("creates content table", () => { - const sql = getMigrationSQL().toLowerCase(); - // Accept content, contents, items, room_content, room_items, documents, etc. - const hasContent = - /create\s+table[\s\S]*?content/.test(sql) || - /create\s+table[\s\S]*?items/.test(sql) || - /create\s+table[\s\S]*?documents/.test(sql) || - /create\s+table[\s\S]*?posts/.test(sql) || - /create\s+table[\s\S]*?messages/.test(sql); - expect(hasContent).toBe(true); -}); - -test("room_members has role column with owner/editor/viewer", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/role/); - // Should define the three roles somewhere (enum, check constraint, or comment) - expect(sql).toMatch(/owner/); - expect(sql).toMatch(/editor/); - expect(sql).toMatch(/viewer/); -}); - -test("enables RLS on all application tables", () => { - const sql = getMigrationSQL().toLowerCase(); - // Must enable RLS on rooms - expect(sql).toMatch( - /alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/, - ); - // Must enable RLS on membership table - const hasMembershipRls = - /alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) || - /alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) || - /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ); - expect(hasMembershipRls).toBe(true); - // Must enable RLS on content table (accept various names) - const hasContentRls = - /alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) || - /alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) || - /alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) || - 
/alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) || - /alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ); - expect(hasContentRls).toBe(true); -}); - -test("FK to auth.users with ON DELETE CASCADE", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/references\s+auth\.users/); - expect(sql).toMatch(/on\s+delete\s+cascade/); -}); - -test("content has room_id FK referencing rooms", () => { - const sql = getMigrationSQL().toLowerCase(); - // Content table should have a foreign key to rooms - expect(sql).toMatch(/room_id[\s\S]*?references[\s\S]*?rooms/); -}); - -test("policies use (select auth.uid())", () => { - const sql = getMigrationSQL(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - expect(policyBlocks.length).toBeGreaterThan(0); - for (const policy of policyBlocks) { - if (policy.includes("auth.uid()")) { - expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i); - } - } -}); - -test("policies use TO authenticated", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; - // Filter to only application table policies (not realtime.messages which may use different roles) - const appPolicies = policyBlocks.filter( - (p) => !p.includes("realtime.messages"), - ); - expect(appPolicies.length).toBeGreaterThan(0); - for (const policy of appPolicies) { - expect(policy).toMatch(/to\s+authenticated/); - } -}); - -test("private schema with security_definer helper function", () => { - const sql = getMigrationSQL().toLowerCase(); - // Private schema should be created - expect(sql).toMatch(/create\s+schema[\s\S]*?private/); - // A function in the private schema with SECURITY DEFINER - expect(sql).toMatch(/private\./); - expect(sql).toMatch(/security\s+definer/); - expect(sql).toMatch(/set\s+search_path\s*=\s*''/); -}); - -test("role-based write policies: content INSERT/UPDATE restricted to owner or editor", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - // Find INSERT or UPDATE policies on the content table - const writePolicies = policyBlocks.filter( - (p) => - (/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) && - (p.includes("content") || - p.includes("items") || - p.includes("documents") || - p.includes("posts") || - p.includes("messages")), - ); - // At least one write policy should check for owner or editor role - const checksRole = writePolicies.some( - (p) => p.includes("owner") || p.includes("editor"), - ); - expect(checksRole).toBe(true); -}); - -test("viewer role is read-only (no write access to content)", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; - // Find content write policies (INSERT, UPDATE, DELETE) - const contentWritePolicies = policyBlocks.filter( - (p) => - /for\s+(insert|update|delete)/.test(p) && - (p.includes("content") || - p.includes("items") || - p.includes("documents") || - p.includes("posts") || - p.includes("messages")), - ); - // None of the write policies should grant access to viewer role - // They should either explicitly check for owner/editor OR exclude viewer - if (contentWritePolicies.length > 0) { - const anyGrantsViewer = contentWritePolicies.some((p) => { - // If the policy doesn't mention any role, it's too permissive - const mentionsRole = - p.includes("owner") || p.includes("editor") || p.includes("viewer"); - if (!mentionsRole) return true; // no role check = viewer could write - // If it specifically includes viewer in a write context, that's wrong +export const assertions: EvalAssertion[] = [ + { + name: "migration file exists", + check: () => findMigrationFiles().length > 0, + }, + { + name: "creates rooms table", + check: () => + /create\s+table[\s\S]*?rooms/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "creates room_members table", + check: () => { + const sql = getMigrationSQL().toLowerCase(); return ( - p.includes("viewer") && !p.includes("owner") && !p.includes("editor") + /create\s+table[\s\S]*?room_members/.test(sql) || + /create\s+table[\s\S]*?room_users/.test(sql) || + /create\s+table[\s\S]*?memberships/.test(sql) ); - }); - expect(anyGrantsViewer).toBe(false); - } -}); - -test("indexes on membership lookup columns", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+index/); - const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? 
[]; - // Should index user_id and/or room_id on the membership table - const membershipIndexes = indexBlocks.filter( - (idx) => - idx.toLowerCase().includes("user_id") || - idx.toLowerCase().includes("room_id"), - ); - expect(membershipIndexes.length).toBeGreaterThanOrEqual(1); -}); - -test("uses timestamptz not plain timestamp", () => { - const sql = getMigrationSQL().toLowerCase(); - // Match "timestamp" that is NOT followed by "tz" or "with time zone" - const hasPlainTimestamp = - /(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/; - // Only fail if the migration defines time columns with plain timestamp - if ( - sql.includes("created_at") || - sql.includes("updated_at") || - sql.includes("_at ") - ) { - expect(sql).not.toMatch(hasPlainTimestamp); - } -}); - -test("idempotent DDL", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/if\s+not\s+exists/); -}); - -test("realtime publication enabled for content table", () => { - const sql = getMigrationSQL().toLowerCase(); - // Should add the content table to supabase_realtime publication - expect(sql).toMatch(/alter\s+publication\s+supabase_realtime\s+add\s+table/); -}); - -test("broadcast trigger for content changes", () => { - const sql = getMigrationSQL().toLowerCase(); - // Should use realtime.broadcast_changes() or realtime.send() in a trigger - const usesBroadcastChanges = /realtime\.broadcast_changes/.test(sql); - const usesRealtimeSend = /realtime\.send/.test(sql); - expect(usesBroadcastChanges || usesRealtimeSend).toBe(true); - // Should create a trigger on the content table - expect(sql).toMatch(/create\s+trigger/); -}); - -test("broadcast trigger function uses security definer", () => { - const sql = getMigrationSQL().toLowerCase(); - // Find function definitions that reference realtime.broadcast_changes or realtime.send - const functionBlocks = - sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? 
[]; - const realtimeFunctions = functionBlocks.filter( - (f) => - f.toLowerCase().includes("realtime.broadcast_changes") || - f.toLowerCase().includes("realtime.send"), - ); - expect(realtimeFunctions.length).toBeGreaterThan(0); - // The trigger function should have security definer and search_path - const hasSecurityDefiner = realtimeFunctions.some( - (f) => - /security\s+definer/.test(f.toLowerCase()) && - /set\s+search_path\s*=\s*''/.test(f.toLowerCase()), - ); - expect(hasSecurityDefiner).toBe(true); -}); - -test("RLS policies on realtime.messages", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - const realtimePolicies = policyBlocks.filter((p) => - p.includes("realtime.messages"), - ); - expect(realtimePolicies.length).toBeGreaterThan(0); - // At least one policy should target authenticated users - const hasAuthPolicy = realtimePolicies.some( - (p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p), - ); - expect(hasAuthPolicy).toBe(true); -}); - -test("realtime policy checks extension column", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - const realtimePolicies = policyBlocks.filter((p) => - p.includes("realtime.messages"), - ); - // At least one realtime policy should reference the extension column - const checksExtension = realtimePolicies.some( - (p) => - p.includes("extension") && - (p.includes("broadcast") || p.includes("presence")), - ); - expect(checksExtension).toBe(true); -}); - -test("overall quality score", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - - const signals = [ - // 1. RLS enabled on rooms - /alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ), - // 2. 
RLS enabled on membership table - /alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ), - // 3. RLS enabled on content table - /alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ), - // 4. FK to auth.users with cascade - /references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql), - // 5. Private schema created - /create\s+schema[\s\S]*?private/.test(sql), - // 6. security_definer with search_path - /security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql), - // 7. Subselect auth.uid() - /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), - // 8. TO authenticated on policies - policyBlocks.length > 0 && - policyBlocks.filter((p) => !p.includes("realtime.messages")).length > 0 && - policyBlocks - .filter((p) => !p.includes("realtime.messages")) - .every((p) => /to\s+authenticated/.test(p)), - // 9. Indexes on lookup columns - /create\s+index/.test(sql), - // 10. timestamptz usage (accepts both timestamptz and timestamp with time zone) - /timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql), - // 11. IF NOT EXISTS for idempotency - /if\s+not\s+exists/.test(sql), - // 12. Role-based policies (owner/editor/viewer) - sql.includes("owner") && sql.includes("editor") && sql.includes("viewer"), - // 13. Realtime publication - /alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql), - // 14. Broadcast trigger (broadcast_changes or realtime.send) - /realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql), - // 15. Trigger creation - /create\s+trigger/.test(sql), - // 16. RLS on realtime.messages - policyBlocks.some((p) => p.includes("realtime.messages")), - // 17. Extension check in realtime policy - policyBlocks - .filter((p) => p.includes("realtime.messages")) - .some((p) => p.includes("extension")), - // 18. 
room_id FK on content - /room_id[\s\S]*?references[\s\S]*?rooms/.test(sql), - ]; - const passed = signals.filter(Boolean).length; - expect(passed).toBeGreaterThanOrEqual(13); -}); + }, + }, + { + name: "creates content table", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /create\s+table[\s\S]*?content/.test(sql) || + /create\s+table[\s\S]*?items/.test(sql) || + /create\s+table[\s\S]*?documents/.test(sql) || + /create\s+table[\s\S]*?posts/.test(sql) || + /create\s+table[\s\S]*?messages/.test(sql) + ); + }, + }, + { + name: "room_members has role column with owner/editor/viewer", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /role/.test(sql) && + /owner/.test(sql) && + /editor/.test(sql) && + /viewer/.test(sql) + ); + }, + }, + { + name: "enables RLS on all application tables", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const roomsRls = + /alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ); + const membershipRls = + /alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) || + /alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) || + /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ); + const contentRls = + /alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) || + /alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) || + /alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) || + /alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) || + /alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ); + return roomsRls && membershipRls && contentRls; + }, + }, + { + name: "FK to auth.users with ON DELETE CASCADE", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( 
+ /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql) + ); + }, + }, + { + name: "content has room_id FK referencing rooms", + check: () => + /room_id[\s\S]*?references[\s\S]*?rooms/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "policies use (select auth.uid())", + check: () => { + const sql = getMigrationSQL(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + if (policyBlocks.length === 0) return false; + for (const policy of policyBlocks) { + if ( + policy.includes("auth.uid()") && + !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy) + ) { + return false; + } + } + return true; + }, + }, + { + name: "policies use TO authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const appPolicies = policyBlocks.filter( + (p) => !p.includes("realtime.messages"), + ); + return ( + appPolicies.length > 0 && + appPolicies.every((p) => /to\s+authenticated/.test(p)) + ); + }, + }, + { + name: "private schema with security_definer helper function", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /create\s+schema[\s\S]*?private/.test(sql) && + /private\./.test(sql) && + /security\s+definer/.test(sql) && + /set\s+search_path\s*=\s*''/.test(sql) + ); + }, + }, + { + name: "role-based write policies: content INSERT/UPDATE restricted to owner or editor", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const writePolicies = policyBlocks.filter( + (p) => + (/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) && + (p.includes("content") || + p.includes("items") || + p.includes("documents") || + p.includes("posts") || + p.includes("messages")), + ); + return writePolicies.some( + (p) => p.includes("owner") || p.includes("editor"), + ); + }, + }, + { + name: "viewer role is read-only (no write access to content)", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const contentWritePolicies = policyBlocks.filter( + (p) => + /for\s+(insert|update|delete)/.test(p) && + (p.includes("content") || + p.includes("items") || + p.includes("documents") || + p.includes("posts") || + p.includes("messages")), + ); + if (contentWritePolicies.length === 0) return true; + return !contentWritePolicies.some((p) => { + const mentionsRole = + p.includes("owner") || p.includes("editor") || p.includes("viewer"); + if (!mentionsRole) return true; + return ( + p.includes("viewer") && !p.includes("owner") && !p.includes("editor") + ); + }); + }, + }, + { + name: "indexes on membership lookup columns", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + if (!/create\s+index/.test(sql)) return false; + const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? 
[]; + return ( + indexBlocks.filter( + (idx) => + idx.toLowerCase().includes("user_id") || + idx.toLowerCase().includes("room_id"), + ).length >= 1 + ); + }, + }, + { + name: "uses timestamptz not plain timestamp", + check: () => { + const rawSql = getMigrationSQL().toLowerCase(); + const sql = rawSql.replace(/--[^\n]*/g, ""); + const hasPlainTimestamp = + /(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/; + if ( + sql.includes("created_at") || + sql.includes("updated_at") || + sql.includes("_at ") + ) { + return !hasPlainTimestamp.test(sql); + } + return true; + }, + }, + { + name: "idempotent DDL", + check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "realtime publication enabled for content table", + check: () => + /alter\s+publication\s+supabase_realtime\s+add\s+table/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "broadcast trigger for content changes", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + (/realtime\.broadcast_changes/.test(sql) || + /realtime\.send/.test(sql)) && + /create\s+trigger/.test(sql) + ); + }, + }, + { + name: "broadcast trigger function uses security definer", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const functionBlocks = + sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? []; + const realtimeFunctions = functionBlocks.filter( + (f) => + f.toLowerCase().includes("realtime.broadcast_changes") || + f.toLowerCase().includes("realtime.send"), + ); + if (realtimeFunctions.length === 0) return false; + return realtimeFunctions.some( + (f) => + /security\s+definer/.test(f.toLowerCase()) && + /set\s+search_path\s*=\s*''/.test(f.toLowerCase()), + ); + }, + }, + { + name: "RLS policies on realtime.messages", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const realtimePolicies = policyBlocks.filter((p) => + p.includes("realtime.messages"), + ); + if (realtimePolicies.length === 0) return false; + return realtimePolicies.some( + (p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p), + ); + }, + }, + { + name: "realtime policy checks extension column", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const realtimePolicies = policyBlocks.filter((p) => + p.includes("realtime.messages"), + ); + return realtimePolicies.some( + (p) => + p.includes("extension") && + (p.includes("broadcast") || p.includes("presence")), + ); + }, + }, + { + name: "overall quality score", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const signals = [ + /alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ), + /alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ), + /alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ), + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql), + /create\s+schema[\s\S]*?private/.test(sql), + /security\s+definer/.test(sql) && + /set\s+search_path\s*=\s*''/.test(sql), + /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), + policyBlocks.length > 0 && + policyBlocks.filter((p) => !p.includes("realtime.messages")).length > + 0 && + policyBlocks + .filter((p) => !p.includes("realtime.messages")) + .every((p) => /to\s+authenticated/.test(p)), + /create\s+index/.test(sql), + /timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql), + /if\s+not\s+exists/.test(sql), + sql.includes("owner") && + sql.includes("editor") && + sql.includes("viewer"), + /alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql), + 
/realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql), + /create\s+trigger/.test(sql), + policyBlocks.some((p) => p.includes("realtime.messages")), + policyBlocks + .filter((p) => p.includes("realtime.messages")) + .some((p) => p.includes("extension")), + /room_id[\s\S]*?references[\s\S]*?rooms/.test(sql), + ]; + return signals.filter(Boolean).length >= 13; + }, + }, +]; diff --git a/packages/evals/evals/connection-pooling-prisma/.env.example b/packages/evals/evals/connection-pooling-prisma/.env.example new file mode 100644 index 0000000..b7d545a --- /dev/null +++ b/packages/evals/evals/connection-pooling-prisma/.env.example @@ -0,0 +1,3 @@ +# Direct connection to the database — used for migrations +# Replace with your Supabase project's direct connection string +DATABASE_URL="postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres" diff --git a/packages/evals/evals/connection-pooling-prisma/EVAL.ts b/packages/evals/evals/connection-pooling-prisma/EVAL.ts new file mode 100644 index 0000000..e907d8c --- /dev/null +++ b/packages/evals/evals/connection-pooling-prisma/EVAL.ts @@ -0,0 +1,134 @@ +export const expectedReferenceFiles = [ + "db-conn-pooling.md", + "db-migrations-idempotent.md", + "db-schema-auth-fk.md", +]; + +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { EvalAssertion } from "../../src/eval-types.js"; + +const cwd = process.cwd(); + +function findPrismaSchema(): string | null { + const candidates = [ + join(cwd, "prisma", "schema.prisma"), + join(cwd, "schema.prisma"), + ]; + for (const p of candidates) { + if (existsSync(p)) return p; + } + const prismaDir = join(cwd, "prisma"); + if (existsSync(prismaDir)) { + const files = readdirSync(prismaDir).filter((f) => f.endsWith(".prisma")); + if (files.length > 0) return join(prismaDir, files[0]); + } + return null; +} + +function getPrismaSchema(): string { + const file = findPrismaSchema(); + if 
(!file) throw new Error("No .prisma schema file found"); + return readFileSync(file, "utf-8"); +} + +function findEnvFiles(): string[] { + const found: string[] = []; + for (const name of [ + ".env", + ".env.example", + ".env.local", + ".env.production", + ".env.development", + ]) { + const p = join(cwd, name); + if (existsSync(p)) found.push(p); + } + return found; +} + +function getAllEnvContent(): string { + return findEnvFiles() + .map((f) => readFileSync(f, "utf-8")) + .join("\n"); +} + +function getAllOutputContent(): string { + const parts: string[] = []; + const schema = findPrismaSchema(); + if (schema) parts.push(readFileSync(schema, "utf-8")); + parts.push(getAllEnvContent()); + const mdFiles = readdirSync(cwd).filter((f) => f.endsWith(".md")); + for (const f of mdFiles) { + parts.push(readFileSync(join(cwd, f), "utf-8")); + } + return parts.join("\n"); +} + +export const assertions: EvalAssertion[] = [ + { + name: "prisma schema file exists", + check: () => findPrismaSchema() !== null, + }, + { + name: "prisma schema references pooler port 6543", + check: () => /6543/.test(getAllOutputContent()), + }, + { + name: "pgbouncer=true param present", + check: () => + /pgbouncer\s*=\s*true/.test(getAllOutputContent().toLowerCase()), + }, + { + name: "DIRECT_URL provided for migrations", + check: () => { + const allContent = `${getPrismaSchema().toLowerCase()}\n${getAllEnvContent().toLowerCase()}`; + return /directurl/.test(allContent) || /direct_url/.test(allContent); + }, + }, + { + name: "datasource block references directUrl or DIRECT_URL env var", + check: () => { + const schema = getPrismaSchema().toLowerCase(); + const datasourceBlock = + schema.match(/datasource\s+\w+\s*\{[\s\S]*?\}/)?.[0] ?? 
""; + return ( + /directurl/.test(datasourceBlock) || /direct_url/.test(datasourceBlock) + ); + }, + }, + { + name: "connection limit set to 1 for serverless", + check: () => { + const content = getAllOutputContent().toLowerCase(); + return ( + /connection_limit\s*=\s*1/.test(content) || + /connection_limit:\s*1/.test(content) || + /connectionlimit\s*=\s*1/.test(content) + ); + }, + }, + { + name: "explanation distinguishes port 6543 vs 5432", + check: () => { + const content = getAllOutputContent(); + return /6543/.test(content) && /5432/.test(content); + }, + }, + { + name: "overall quality: demonstrates correct Prisma + Supabase pooler setup", + check: () => { + const schema = getPrismaSchema().toLowerCase(); + const envContent = getAllEnvContent().toLowerCase(); + const allContent = `${schema}\n${envContent}`; + const signals = [ + /6543/, + /pgbouncer\s*=\s*true/, + /directurl|direct_url/, + /connection_limit\s*=\s*1|connection_limit:\s*1/, + /5432/, + ]; + return signals.filter((r) => r.test(allContent)).length >= 4; + }, + }, +]; diff --git a/packages/evals/evals/connection-pooling-prisma/PROMPT.md b/packages/evals/evals/connection-pooling-prisma/PROMPT.md new file mode 100644 index 0000000..f5a8cab --- /dev/null +++ b/packages/evals/evals/connection-pooling-prisma/PROMPT.md @@ -0,0 +1,3 @@ +I'm deploying my Supabase app on Vercel using Prisma. I keep getting "prepared statement already exists" errors in production. My current `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string on port 5432 with no pooler settings. + +Fix the Prisma configuration so it works correctly with Supabase's connection pooler for serverless deployments. Make any changes needed to `prisma/schema.prisma` and update the `.env.example` file with the correct connection string format. 
diff --git a/packages/evals/evals/connection-pooling-prisma/package.json b/packages/evals/evals/connection-pooling-prisma/package.json new file mode 100644 index 0000000..5340c2f --- /dev/null +++ b/packages/evals/evals/connection-pooling-prisma/package.json @@ -0,0 +1,5 @@ +{ + "name": "connection-pooling-prisma", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/connection-pooling-prisma/prisma/schema.prisma b/packages/evals/evals/connection-pooling-prisma/prisma/schema.prisma new file mode 100644 index 0000000..6e285bb --- /dev/null +++ b/packages/evals/evals/connection-pooling-prisma/prisma/schema.prisma @@ -0,0 +1,29 @@ +// This is your Prisma schema file, +// learn more about it in the docs: https://pris.ly/d/prisma-schema + +generator client { + provider = "prisma-client-js" +} + +datasource db { + provider = "postgresql" + url = env("DATABASE_URL") +} + +model User { + id String @id @default(cuid()) + email String @unique + name String? + createdAt DateTime @default(now()) + posts Post[] +} + +model Post { + id String @id @default(cuid()) + title String + content String? + published Boolean @default(false) + author User @relation(fields: [authorId], references: [id]) + authorId String + createdAt DateTime @default(now()) +} diff --git a/packages/evals/evals/connection-pooling-prisma/supabase/config.toml b/packages/evals/evals/connection-pooling-prisma/supabase/config.toml new file mode 100644 index 0000000..7715259 --- /dev/null +++ b/packages/evals/evals/connection-pooling-prisma/supabase/config.toml @@ -0,0 +1,111 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "connection-pooling-prisma" + +[api] +enabled = true +# Port to use for the API URL. 
+port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. +port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[db.migrations] +# If disabled, migrations will be skipped during a db push or reset. +enabled = true +schema_paths = [] + +[db.seed] +# If enabled, seeds the database after migrations during a db reset. +enabled = true +# Specifies an ordered list of seed files to load during db reset. +sql_paths = ["./seed.sql"] + +[realtime] +enabled = true + +[studio] +enabled = true +# Port to use for Supabase Studio. +port = 54323 +# External URL of the API server that frontend connects to. +api_url = "http://127.0.0.1" + +[inbucket] +enabled = true +# Port to use for the email testing server web interface. +port = 54324 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). 
+file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# If disabled, the refresh token will never expire. +enable_refresh_token_rotation = true +# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds. +# Requires enable_refresh_token_rotation = true. +refresh_token_reuse_interval = 10 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. +enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, a user will be required to confirm any email change on both the old, and new email +# addresses. If disabled, only the new email is required to confirm. +double_confirm_changes = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false + +[edge_runtime] +enabled = true +# Configure one of the supported request policies: `oneshot`, `per_worker`. +policy = "per_worker" +# Port to attach the Chrome inspector for debugging edge functions. +inspector_port = 8083 + +[analytics] +enabled = true +port = 54327 +# Configure one of the supported backends: `postgres`, `bigquery`. 
+backend = "postgres" diff --git a/packages/evals/evals/edge-function-hello-world/EVAL.ts b/packages/evals/evals/edge-function-hello-world/EVAL.ts index 9c90d49..83f329a 100644 --- a/packages/evals/evals/edge-function-hello-world/EVAL.ts +++ b/packages/evals/evals/edge-function-hello-world/EVAL.ts @@ -1,26 +1,31 @@ +export const expectedReferenceFiles = [ + "edge-fun-quickstart.md", + "edge-fun-project-structure.md", + "edge-pat-cors.md", + "edge-pat-error-handling.md", + "dev-getting-started.md", +]; + import { existsSync, readdirSync } from "node:fs"; import { join } from "node:path"; -import { expect, test } from "vitest"; +import type { EvalAssertion } from "../../src/eval-types.js"; import { findFunctionFile, findSharedCorsFile, - functionsDir, getFunctionCode, + getFunctionsDir, getSharedCode, - supabaseDir, + getSupabaseDir, } from "../eval-utils.ts"; const FUNCTION_NAME = "hello-world"; -const helloWorldDir = join(functionsDir, FUNCTION_NAME); -/** Read function code + all shared modules combined. */ function getAllCode(): string { const code = getFunctionCode(FUNCTION_NAME); return `${code}\n${getSharedCode()}`; } -/** Extract the code after the first `catch` keyword to the end of the function. 
*/ function getCatchBlockCode(): string { const code = getFunctionCode(FUNCTION_NAME); const catchIndex = code.search(/\bcatch\b/); @@ -28,121 +33,123 @@ function getCatchBlockCode(): string { return code.slice(catchIndex); } -test("supabase project initialized", () => { - expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true); -}); - -test("function directory exists", () => { - expect(existsSync(helloWorldDir)).toBe(true); -}); - -test("function index file exists", () => { - expect(findFunctionFile(FUNCTION_NAME)).not.toBeNull(); -}); - -test("uses Deno.serve", () => { - const code = getFunctionCode(FUNCTION_NAME); - expect(code).toMatch(/Deno\.serve/); -}); - -test("returns JSON response", () => { - // Check both the function file and shared modules for JSON response patterns - const allCode = getAllCode(); - const hasContentTypeHeader = - /content-type['"]\s*:\s*['"]application\/json/i.test(allCode); - const hasResponseJson = /Response\.json/i.test(allCode); - const hasJsonStringify = /JSON\.stringify/i.test(allCode); - expect(hasContentTypeHeader || hasResponseJson || hasJsonStringify).toBe( - true, - ); -}); - -test("handles OPTIONS preflight", () => { - // OPTIONS handling may be in the function itself or in a shared CORS helper - const allCode = getAllCode(); - expect(allCode).toMatch(/['"]OPTIONS['"]/); - expect(allCode).toMatch(/\.method/); -}); - -test("defines CORS headers", () => { - const allCode = getAllCode(); - expect(allCode).toMatch(/Access-Control-Allow-Origin/); -}); - -test("CORS allows required headers", () => { - const allCode = getAllCode().toLowerCase(); - // Must include authorization and apikey in allowed headers - expect(allCode).toMatch(/access-control-allow-headers/); - expect(allCode).toMatch(/authorization/); - expect(allCode).toMatch(/apikey/); -}); - -test("error response has CORS headers", () => { - const catchCode = getCatchBlockCode(); - expect(catchCode.length).toBeGreaterThan(0); - // The catch block should either 
directly reference CORS headers, or call - // a shared helper that includes them (e.g. errorResponse, corsHeaders). - const sharedCode = getSharedCode(); - // Direct CORS reference in catch block - const directCors = - /corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode); - // Calls a shared helper that itself includes CORS headers - const callsSharedHelper = - /errorResponse|jsonResponse|json_response|error_response/i.test( - catchCode, - ) && /Access-Control-Allow-Origin/i.test(sharedCode); - expect(directCors || callsSharedHelper).toBe(true); -}); - -test("has try-catch for error handling", () => { - const code = getFunctionCode(FUNCTION_NAME); - expect(code).toMatch(/\btry\s*\{/); - expect(code).toMatch(/\bcatch\b/); -}); - -test("returns proper error status code", () => { - const catchCode = getCatchBlockCode(); - expect(catchCode.length).toBeGreaterThan(0); - // Error response should use status 400 or 500 (not default 200). - // Match object-style { status: 500 } or function-call-style fn('msg', 500) - const hasObjectStatus = /status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode); - const hasFnArgStatus = /[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test( - catchCode, - ); - expect(hasObjectStatus || hasFnArgStatus).toBe(true); -}); - -test("shared CORS module exists", () => { - expect(findSharedCorsFile()).not.toBeNull(); -}); - -test("function imports from shared", () => { - const code = getFunctionCode(FUNCTION_NAME); - // Should import from ../_shared/ relative path - expect(code).toMatch(/from\s+['"]\.\.\/(_shared|_utils)/); -}); - -test("function uses hyphenated name", () => { - // The function directory should use hyphens, not underscores - const dirs = existsSync(functionsDir) ? 
readdirSync(functionsDir) : []; - const helloDir = dirs.find((d) => d.includes("hello") && d.includes("world")); - expect(helloDir).toBeDefined(); - expect(helloDir).toMatch(/^hello-world$/); -}); - -test("overall quality: demonstrates Edge Function best practices", () => { - const allCode = getAllCode().toLowerCase(); - // A high-quality Edge Function should contain most of these patterns - const signals = [ - /deno\.serve/, // Modern Deno.serve API - /['"]options['"]/, // OPTIONS preflight handling - /access-control-allow-origin/, // CORS headers defined - /\btry\s*\{/, // Error handling with try-catch - /status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/, // Proper error status codes - /from\s+['"]\.\.\/(_shared|_utils)/, // Imports from shared directory - /authorization/, // Allows authorization header in CORS - /apikey/, // Allows apikey header in CORS - ]; - const matches = signals.filter((r) => r.test(allCode)); - expect(matches.length).toBeGreaterThanOrEqual(6); -}); +export const assertions: EvalAssertion[] = [ + { + name: "supabase project initialized", + check: () => existsSync(join(getSupabaseDir(), "config.toml")), + }, + { + name: "function directory exists", + check: () => existsSync(join(getFunctionsDir(), FUNCTION_NAME)), + }, + { + name: "function index file exists", + check: () => findFunctionFile(FUNCTION_NAME) !== null, + }, + { + name: "uses Deno.serve", + check: () => /Deno\.serve/.test(getFunctionCode(FUNCTION_NAME)), + }, + { + name: "returns JSON response", + check: () => { + const allCode = getAllCode(); + return ( + /content-type['"]\s*:\s*['"]application\/json/i.test(allCode) || + /Response\.json/i.test(allCode) || + /JSON\.stringify/i.test(allCode) + ); + }, + }, + { + name: "handles OPTIONS preflight", + check: () => { + const allCode = getAllCode(); + return /['"]OPTIONS['"]/.test(allCode) && /\.method/.test(allCode); + }, + }, + { + name: "defines CORS headers", + check: () => 
/Access-Control-Allow-Origin/.test(getAllCode()), + }, + { + name: "CORS allows required headers", + check: () => { + const allCode = getAllCode().toLowerCase(); + return ( + /access-control-allow-headers/.test(allCode) && + /authorization/.test(allCode) && + /apikey/.test(allCode) + ); + }, + }, + { + name: "error response has CORS headers", + check: () => { + const catchCode = getCatchBlockCode(); + if (catchCode.length === 0) return false; + const sharedCode = getSharedCode(); + const directCors = + /corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode); + const callsSharedHelper = + /errorResponse|jsonResponse|json_response|error_response/i.test( + catchCode, + ) && /Access-Control-Allow-Origin/i.test(sharedCode); + return directCors || callsSharedHelper; + }, + }, + { + name: "has try-catch for error handling", + check: () => { + const code = getFunctionCode(FUNCTION_NAME); + return /\btry\s*\{/.test(code) && /\bcatch\b/.test(code); + }, + }, + { + name: "returns proper error status code", + check: () => { + const catchCode = getCatchBlockCode(); + if (catchCode.length === 0) return false; + return ( + /status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode) || + /[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test(catchCode) + ); + }, + }, + { + name: "shared CORS module exists", + check: () => findSharedCorsFile() !== null, + }, + { + name: "function imports from shared", + check: () => + /from\s+['"]\.\.\/(_shared|_utils)/.test(getFunctionCode(FUNCTION_NAME)), + }, + { + name: "function uses hyphenated name", + check: () => { + const dirs = existsSync(getFunctionsDir()) ? 
readdirSync(getFunctionsDir()) : []; + const helloDir = dirs.find( + (d) => d.includes("hello") && d.includes("world"), + ); + return helloDir !== undefined && /^hello-world$/.test(helloDir); + }, + }, + { + name: "overall quality: demonstrates Edge Function best practices", + check: () => { + const allCode = getAllCode().toLowerCase(); + const signals = [ + /deno\.serve/, + /['"]options['"]/, + /access-control-allow-origin/, + /\btry\s*\{/, + /status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/, + /from\s+['"]\.\.\/(_shared|_utils)/, + /authorization/, + /apikey/, + ]; + return signals.filter((r) => r.test(allCode)).length >= 6; + }, + }, +]; diff --git a/packages/evals/evals/eval-utils.ts b/packages/evals/evals/eval-utils.ts index c3b875c..84b8f90 100644 --- a/packages/evals/evals/eval-utils.ts +++ b/packages/evals/evals/eval-utils.ts @@ -2,12 +2,90 @@ import { existsSync, readdirSync, readFileSync, statSync } from "node:fs"; import { join } from "node:path"; // --------------------------------------------------------------------------- -// Common paths +// Runtime DB helpers (use only in async tests) // --------------------------------------------------------------------------- -export const supabaseDir = join(process.cwd(), "supabase"); -export const migrationsDir = join(supabaseDir, "migrations"); -export const functionsDir = join(supabaseDir, "functions"); +const SUPABASE_URL = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321"; +const SERVICE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY ?? ""; +const ANON_KEY = process.env.SUPABASE_ANON_KEY ?? ""; + +/** Execute a raw SQL query via PostgREST's /rpc endpoint or via the REST API. */ +async function pgRest( + table: string, + options: { select?: string; role?: "service_role" | "anon" } = {}, +): Promise<{ data: Record[]; error: string | null }> { + const key = options.role === "anon" ? ANON_KEY : SERVICE_KEY; + const select = options.select ?? 
"*"; + const res = await fetch(`${SUPABASE_URL}/rest/v1/${table}?select=${select}`, { + headers: { + apikey: key, + Authorization: `Bearer ${key}`, + "Content-Type": "application/json", + }, + }); + + if (!res.ok) { + const body = await res.text(); + return { data: [], error: `HTTP ${res.status}: ${body}` }; + } + + const data = (await res.json()) as Record[]; + return { data, error: null }; +} + +/** + * Check whether a table is visible through the PostgREST API. + * Uses the service role key (bypasses RLS). + */ +export async function tableExists(tableName: string): Promise { + const { error } = await pgRest(tableName); + // A 404 or PGRST116 means the table/view doesn't exist in the schema cache. + return error === null || !error.includes("404"); +} + +/** + * Query rows from a table. + * @param tableName - table to query + * @param role - "service_role" bypasses RLS; "anon" respects RLS policies + */ +export async function queryTable( + tableName: string, + role: "service_role" | "anon" = "service_role", +): Promise<{ data: Record[]; error: string | null }> { + return pgRest(tableName, { role }); +} + +/** + * Return true if the table exists AND is empty when queried as anon + * (i.e., RLS is blocking access as expected for an unauthenticated user). + */ +export async function anonSeeesNoRows(tableName: string): Promise { + const { data, error } = await pgRest(tableName, { role: "anon" }); + return error === null && data.length === 0; +} + +// --------------------------------------------------------------------------- +// Common paths +// +// These are FUNCTIONS, not constants, so they re-evaluate process.cwd() on +// every call. The runner does `process.chdir(workspacePath)` before running +// assertions, so all path helpers resolve relative to the correct workspace. +// --------------------------------------------------------------------------- + +/** Returns the supabase/ directory under the current working directory. 
*/ +export function getSupabaseDir(): string { + return join(process.cwd(), "supabase"); +} + +/** Returns the supabase/migrations/ directory. */ +export function getMigrationsDir(): string { + return join(getSupabaseDir(), "migrations"); +} + +/** Returns the supabase/functions/ directory. */ +export function getFunctionsDir(): string { + return join(getSupabaseDir(), "functions"); +} // --------------------------------------------------------------------------- // Migration helpers @@ -15,10 +93,11 @@ export const functionsDir = join(supabaseDir, "functions"); /** Find all .sql migration files (agent may create one or more). */ export function findMigrationFiles(): string[] { - if (!existsSync(migrationsDir)) return []; - return readdirSync(migrationsDir) + const dir = getMigrationsDir(); + if (!existsSync(dir)) return []; + return readdirSync(dir) .filter((f) => f.endsWith(".sql")) - .map((f) => join(migrationsDir, f)); + .map((f) => join(dir, f)); } /** Read and concatenate all migration SQL files. */ @@ -39,7 +118,7 @@ export function getMigrationSQL(): string { * @param functionName - directory name under supabase/functions/ (e.g. "hello-world") */ export function findFunctionFile(functionName: string): string | null { - const fnDir = join(functionsDir, functionName); + const fnDir = join(getFunctionsDir(), functionName); if (!existsSync(fnDir)) return null; const files = readdirSync(fnDir).filter( (f) => f.startsWith("index.") && (f.endsWith(".ts") || f.endsWith(".tsx")), @@ -61,12 +140,13 @@ export function getFunctionCode(functionName: string): string { /** Find a shared CORS module under supabase/functions/_shared/ (or similar _-prefixed dir). 
*/ export function findSharedCorsFile(): string | null { - if (!existsSync(functionsDir)) return null; - const sharedDirs = readdirSync(functionsDir).filter( - (d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(), + const fnDir = getFunctionsDir(); + if (!existsSync(fnDir)) return null; + const sharedDirs = readdirSync(fnDir).filter( + (d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(), ); for (const dir of sharedDirs) { - const dirPath = join(functionsDir, dir); + const dirPath = join(fnDir, dir); const files = readdirSync(dirPath).filter((f) => f.includes("cors")); if (files.length > 0) return join(dirPath, files[0]); } @@ -75,13 +155,14 @@ export function findSharedCorsFile(): string | null { /** Read and concatenate all .ts/.tsx files from _-prefixed shared directories. */ export function getSharedCode(): string { - if (!existsSync(functionsDir)) return ""; - const sharedDirs = readdirSync(functionsDir).filter( - (d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(), + const fnDir = getFunctionsDir(); + if (!existsSync(fnDir)) return ""; + const sharedDirs = readdirSync(fnDir).filter( + (d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(), ); const parts: string[] = []; for (const dir of sharedDirs) { - const dirPath = join(functionsDir, dir); + const dirPath = join(fnDir, dir); const files = readdirSync(dirPath).filter( (f) => f.endsWith(".ts") || f.endsWith(".tsx"), ); diff --git a/packages/evals/evals/extension-wrong-schema/EVAL.ts b/packages/evals/evals/extension-wrong-schema/EVAL.ts new file mode 100644 index 0000000..0147649 --- /dev/null +++ b/packages/evals/evals/extension-wrong-schema/EVAL.ts @@ -0,0 +1,100 @@ +export const expectedReferenceFiles = [ + "db-schema-extensions.md", + "db-rls-mandatory.md", + "db-migrations-idempotent.md", + "db-schema-auth-fk.md", + "db-rls-common-mistakes.md", +]; + +import type { EvalAssertion } from "../../src/eval-types.js"; + +import { 
findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; + +export const assertions: EvalAssertion[] = [ + { + name: "migration file exists", + check: () => findMigrationFiles().length > 0, + }, + { + name: "extension installed in extensions schema", + check: () => + /create\s+extension[\s\S]*?with\s+schema\s+extensions/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "IF NOT EXISTS on extension creation", + check: () => + /create\s+extension\s+if\s+not\s+exists/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "vector column with correct dimensions", + check: () => + /(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "HNSW index used instead of IVFFlat", + check: () => /using\s+hnsw/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "RLS enabled on documents table", + check: () => + /alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "FK to auth.users with ON DELETE CASCADE", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql) + ); + }, + }, + { + name: "policies use TO authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return ( + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)) + ); + }, + }, + { + name: "idempotent table creation (IF NOT EXISTS)", + check: () => + /create\s+table\s+if\s+not\s+exists/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "overall quality: demonstrates pgvector best practices", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const signals = [ + /create\s+extension[\s\S]*?with\s+schema\s+extensions/.test(sql), + /create\s+extension\s+if\s+not\s+exists/.test(sql), + /(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test(sql), + /using\s+hnsw/.test(sql), + /alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ), + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql), + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)), + /if\s+not\s+exists/.test(sql), + ]; + return signals.filter(Boolean).length >= 6; + }, + }, +]; diff --git a/packages/evals/evals/extension-wrong-schema/PROMPT.md b/packages/evals/evals/extension-wrong-schema/PROMPT.md new file mode 100644 index 0000000..75c1533 --- /dev/null +++ b/packages/evals/evals/extension-wrong-schema/PROMPT.md @@ -0,0 +1,11 @@ +I'm building a semantic search feature for my app. I need to store document embeddings generated by OpenAI's ada-002 model (1536 dimensions) and let users search their own documents. + +Create a migration in `supabase/migrations/` that: + +1. Enables the pgvector extension +2. Creates a `documents` table with: + - An `embedding` column (1536 dimensions) + - A `content` text column + - A `user_id` column linked to the authenticated user +3. Adds a vector similarity search index +4. 
Ensures users can only see and manage their own documents diff --git a/packages/evals/evals/extension-wrong-schema/package.json b/packages/evals/evals/extension-wrong-schema/package.json new file mode 100644 index 0000000..f650af9 --- /dev/null +++ b/packages/evals/evals/extension-wrong-schema/package.json @@ -0,0 +1,5 @@ +{ + "name": "extension-wrong-schema", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/extension-wrong-schema/supabase/config.toml b/packages/evals/evals/extension-wrong-schema/supabase/config.toml new file mode 100644 index 0000000..d0d3d30 --- /dev/null +++ b/packages/evals/evals/extension-wrong-schema/supabase/config.toml @@ -0,0 +1,111 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "extension-wrong-schema" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. 
+port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[db.migrations] +# If disabled, migrations will be skipped during a db push or reset. +enabled = true +schema_paths = [] + +[db.seed] +# If enabled, seeds the database after migrations during a db reset. +enabled = true +# Specifies an ordered list of seed files to load during db reset. +sql_paths = ["./seed.sql"] + +[realtime] +enabled = true + +[studio] +enabled = true +# Port to use for Supabase Studio. +port = 54323 +# External URL of the API server that frontend connects to. +api_url = "http://127.0.0.1" + +[inbucket] +enabled = true +# Port to use for the email testing server web interface. +port = 54324 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# If disabled, the refresh token will never expire. +enable_refresh_token_rotation = true +# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds. +# Requires enable_refresh_token_rotation = true. +refresh_token_reuse_interval = 10 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. 
+enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, a user will be required to confirm any email change on both the old, and new email +# addresses. If disabled, only the new email is required to confirm. +double_confirm_changes = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false + +[edge_runtime] +enabled = true +# Configure one of the supported request policies: `oneshot`, `per_worker`. +policy = "per_worker" +# Port to attach the Chrome inspector for debugging edge functions. +inspector_port = 8083 + +[analytics] +enabled = true +port = 54327 +# Configure one of the supported backends: `postgres`, `bigquery`. +backend = "postgres" diff --git a/packages/evals/evals/postgrest-schema-cache/EVAL.ts b/packages/evals/evals/postgrest-schema-cache/EVAL.ts new file mode 100644 index 0000000..43cf599 --- /dev/null +++ b/packages/evals/evals/postgrest-schema-cache/EVAL.ts @@ -0,0 +1,114 @@ +export const expectedReferenceFiles = [ + "db-rls-views.md", + "db-migrations-idempotent.md", + "db-rls-mandatory.md", + "db-rls-performance.md", + "db-schema-timestamps.md", +]; + +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import type { EvalAssertion } from "../../src/eval-types.js"; + +const migrationsDir = join(process.cwd(), "supabase", "migrations"); +const STARTER_MIGRATION = "20240101000000_create_products.sql"; + +function findAgentMigrationFiles(): string[] { + if (!existsSync(migrationsDir)) return []; + return readdirSync(migrationsDir) + .filter((f) => f.endsWith(".sql") && f !== STARTER_MIGRATION) + .map((f) => join(migrationsDir, f)); +} + +function getAgentMigrationSQL(): string { + const files = findAgentMigrationFiles(); + if (files.length === 0) + throw new Error( + "No agent-created migration file found in supabase/migrations/", + ); + return 
files.map((f) => readFileSync(f, "utf-8")).join("\n"); +} + +export const assertions: EvalAssertion[] = [ + { + name: "new migration file exists", + check: () => findAgentMigrationFiles().length > 0, + }, + { + name: "ADD COLUMN IF NOT EXISTS for description", + check: () => + /add\s+column\s+if\s+not\s+exists\s+description/.test( + getAgentMigrationSQL().toLowerCase(), + ), + }, + { + name: "ADD COLUMN IF NOT EXISTS for published_at", + check: () => + /add\s+column\s+if\s+not\s+exists\s+published_at/.test( + getAgentMigrationSQL().toLowerCase(), + ), + }, + { + name: "published_at uses timestamptz not plain timestamp", + check: () => { + const sql = getAgentMigrationSQL().toLowerCase(); + return ( + /published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test( + sql, + ) && + !/published_at\s+timestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test( + sql, + ) + ); + }, + }, + { + name: "view public_products is created", + check: () => + /create\s+(or\s+replace\s+)?view\s+public_products/.test( + getAgentMigrationSQL().toLowerCase(), + ), + }, + { + name: "view uses security_invoker = true", + check: () => + /security_invoker\s*=\s*true/.test(getAgentMigrationSQL().toLowerCase()), + }, + { + name: "SELECT policy on products for authenticated role", + check: () => { + const sql = getAgentMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return policyBlocks.some( + (p) => + p.includes("select") && + p.includes("products") && + /to\s+authenticated/.test(p), + ); + }, + }, + { + name: "NOTIFY pgrst reload schema is present", + check: () => /notify\s+pgrst/.test(getAgentMigrationSQL().toLowerCase()), + }, + { + name: "overall quality: demonstrates PostgREST and schema best practices", + check: () => { + const sql = getAgentMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const signals = [ + /add\s+column\s+if\s+not\s+exists/.test(sql), + /published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test( + sql, + ), + /create\s+(or\s+replace\s+)?view\s+public_products/.test(sql), + /security_invoker\s*=\s*true/.test(sql), + policyBlocks.some( + (p) => p.includes("select") && /to\s+authenticated/.test(p), + ), + /notify\s+pgrst/.test(sql), + ]; + return signals.filter(Boolean).length >= 5; + }, + }, +]; diff --git a/packages/evals/evals/postgrest-schema-cache/PROMPT.md b/packages/evals/evals/postgrest-schema-cache/PROMPT.md new file mode 100644 index 0000000..6604169 --- /dev/null +++ b/packages/evals/evals/postgrest-schema-cache/PROMPT.md @@ -0,0 +1,9 @@ +I'm building a product catalog with Supabase. We already have a `products` table (see the existing migration in `supabase/migrations/`), but we need to expand it. + +Please create a new migration file in `supabase/migrations/` that: + +1. Adds two new columns to the `products` table: `description` (text) and `published_at` (timestamp) +2. Creates a view called `public_products` that shows only products where `published_at` is not null +3. Adds a policy so any authenticated user can view published products + +Make sure the migration is safe to run multiple times. 
diff --git a/packages/evals/evals/postgrest-schema-cache/package.json b/packages/evals/evals/postgrest-schema-cache/package.json new file mode 100644 index 0000000..28a1f3b --- /dev/null +++ b/packages/evals/evals/postgrest-schema-cache/package.json @@ -0,0 +1,5 @@ +{ + "name": "postgrest-schema-cache", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/postgrest-schema-cache/supabase/config.toml b/packages/evals/evals/postgrest-schema-cache/supabase/config.toml new file mode 100644 index 0000000..9fb569e --- /dev/null +++ b/packages/evals/evals/postgrest-schema-cache/supabase/config.toml @@ -0,0 +1,111 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "postgrest-schema-cache" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. +port = 54329 +# Specifies when a server connection can be reused by other clients. 
+# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[db.migrations] +# If disabled, migrations will be skipped during a db push or reset. +enabled = true +schema_paths = [] + +[db.seed] +# If enabled, seeds the database after migrations during a db reset. +enabled = true +# Specifies an ordered list of seed files to load during db reset. +sql_paths = ["./seed.sql"] + +[realtime] +enabled = true + +[studio] +enabled = true +# Port to use for Supabase Studio. +port = 54323 +# External URL of the API server that frontend connects to. +api_url = "http://127.0.0.1" + +[inbucket] +enabled = true +# Port to use for the email testing server web interface. +port = 54324 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# If disabled, the refresh token will never expire. +enable_refresh_token_rotation = true +# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds. +# Requires enable_refresh_token_rotation = true. +refresh_token_reuse_interval = 10 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. +enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. 
+enable_signup = true +# If enabled, a user will be required to confirm any email change on both the old, and new email +# addresses. If disabled, only the new email is required to confirm. +double_confirm_changes = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false + +[edge_runtime] +enabled = true +# Configure one of the supported request policies: `oneshot`, `per_worker`. +policy = "per_worker" +# Port to attach the Chrome inspector for debugging edge functions. +inspector_port = 8083 + +[analytics] +enabled = true +port = 54327 +# Configure one of the supported backends: `postgres`, `bigquery`. +backend = "postgres" diff --git a/packages/evals/evals/postgrest-schema-cache/supabase/migrations/20240101000000_create_products.sql b/packages/evals/evals/postgrest-schema-cache/supabase/migrations/20240101000000_create_products.sql new file mode 100644 index 0000000..4206a87 --- /dev/null +++ b/packages/evals/evals/postgrest-schema-cache/supabase/migrations/20240101000000_create_products.sql @@ -0,0 +1,8 @@ +-- Initial products table +create table if not exists products ( + id bigint primary key generated always as identity, + name text not null, + price numeric(10, 2) not null default 0 +); + +alter table products enable row level security; diff --git a/packages/evals/evals/rls-update-needs-select/EVAL.ts b/packages/evals/evals/rls-update-needs-select/EVAL.ts new file mode 100644 index 0000000..a19dd35 --- /dev/null +++ b/packages/evals/evals/rls-update-needs-select/EVAL.ts @@ -0,0 +1,122 @@ +export const expectedReferenceFiles = [ + "db-rls-common-mistakes.md", + "db-rls-policy-types.md", + "db-rls-performance.md", + "db-rls-mandatory.md", + "db-schema-timestamps.md", +]; + +import type { EvalAssertion } from "../../src/eval-types.js"; + +import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; + +export const assertions: EvalAssertion[] = [ + { + name: "migration file exists", + check: () => 
findMigrationFiles().length > 0, + }, + { + name: "creates orders table", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return /create\s+table/.test(sql) && /orders/.test(sql); + }, + }, + { + name: "enables RLS on orders table", + check: () => + /alter\s+table.*orders.*enable\s+row\s+level\s+security/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "has SELECT policy on orders", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return policyBlocks.some((p) => p.includes("for select")); + }, + }, + { + name: "has UPDATE policy with WITH CHECK on orders", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const updatePolicy = policyBlocks.find((p) => p.includes("for update")); + return updatePolicy !== undefined && /with\s+check/.test(updatePolicy); + }, + }, + { + name: "all policies use TO authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return ( + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)) + ); + }, + }, + { + name: "uses (select auth.uid()) not bare auth.uid() in policies", + check: () => { + const sql = getMigrationSQL(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + for (const policy of policyBlocks) { + if ( + policy.includes("auth.uid()") && + !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy) + ) { + return false; + } + } + return true; + }, + }, + { + name: "uses timestamptz not plain timestamp for created_at", + check: () => { + const rawSql = getMigrationSQL().toLowerCase(); + const sql = rawSql.replace(/--[^\n]*/g, ""); + const hasPlainTimestamp = + /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; + if (sql.includes("created_at")) { + return !hasPlainTimestamp.test(sql); + } + return true; + }, + }, + { + name: "FK to auth.users with ON DELETE CASCADE", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql) + ); + }, + }, + { + name: "overall quality: demonstrates Supabase best practices", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const signals = [ + /alter\s+table.*orders.*enable\s+row\s+level\s+security/.test(sql), + policyBlocks.some((p) => p.includes("for select")), + policyBlocks.some( + (p) => p.includes("for update") && /with\s+check/.test(p), + ), + /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)), + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql), + !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test( + sql.replace(/--[^\n]*/g, ""), + ), + ]; + return signals.filter(Boolean).length >= 5; + }, + }, +]; diff --git a/packages/evals/evals/rls-update-needs-select/PROMPT.md b/packages/evals/evals/rls-update-needs-select/PROMPT.md new file mode 100644 index 0000000..f298669 --- /dev/null +++ b/packages/evals/evals/rls-update-needs-select/PROMPT.md @@ -0,0 +1,7 @@ +I'm building an e-commerce app and need a migration for an `orders` table. 
Each order has a `status` (text), `total` (numeric), and `created_at` timestamp. Orders belong to users — each order should have a `user_id` that links to the authenticated user who placed it. + +Users need to be able to: +- View their own orders +- Update the status of their own orders + +Please create the migration in `supabase/migrations/`. diff --git a/packages/evals/evals/rls-update-needs-select/package.json b/packages/evals/evals/rls-update-needs-select/package.json new file mode 100644 index 0000000..e07e8e9 --- /dev/null +++ b/packages/evals/evals/rls-update-needs-select/package.json @@ -0,0 +1,5 @@ +{ + "name": "rls-update-needs-select", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/rls-update-needs-select/supabase/config.toml b/packages/evals/evals/rls-update-needs-select/supabase/config.toml new file mode 100644 index 0000000..ba7f194 --- /dev/null +++ b/packages/evals/evals/rls-update-needs-select/supabase/config.toml @@ -0,0 +1,111 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "rls-update-needs-select" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. 
+shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. +port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[db.migrations] +# If disabled, migrations will be skipped during a db push or reset. +enabled = true +schema_paths = [] + +[db.seed] +# If enabled, seeds the database after migrations during a db reset. +enabled = true +# Specifies an ordered list of seed files to load during db reset. +sql_paths = ["./seed.sql"] + +[realtime] +enabled = true + +[studio] +enabled = true +# Port to use for Supabase Studio. +port = 54323 +# External URL of the API server that frontend connects to. +api_url = "http://127.0.0.1" + +[inbucket] +enabled = true +# Port to use for the email testing server web interface. +port = 54324 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# If disabled, the refresh token will never expire. +enable_refresh_token_rotation = true +# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds. 
+# Requires enable_refresh_token_rotation = true. +refresh_token_reuse_interval = 10 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. +enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, a user will be required to confirm any email change on both the old, and new email +# addresses. If disabled, only the new email is required to confirm. +double_confirm_changes = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false + +[edge_runtime] +enabled = true +# Configure one of the supported request policies: `oneshot`, `per_worker`. +policy = "per_worker" +# Port to attach the Chrome inspector for debugging edge functions. +inspector_port = 8083 + +[analytics] +enabled = true +port = 54327 +# Configure one of the supported backends: `postgres`, `bigquery`. +backend = "postgres" diff --git a/packages/evals/evals/rls-user-metadata-role-check/EVAL.ts b/packages/evals/evals/rls-user-metadata-role-check/EVAL.ts new file mode 100644 index 0000000..f9da3db --- /dev/null +++ b/packages/evals/evals/rls-user-metadata-role-check/EVAL.ts @@ -0,0 +1,123 @@ +export const expectedReferenceFiles = [ + "db-rls-common-mistakes.md", + "db-rls-policy-types.md", + "db-rls-performance.md", + "db-rls-mandatory.md", + "db-schema-auth-fk.md", +]; + +import type { EvalAssertion } from "../../src/eval-types.js"; + +import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; + +export const assertions: EvalAssertion[] = [ + { + name: "migration file exists in supabase/migrations/", + check: () => findMigrationFiles().length > 0, + }, + { + name: "creates documents table", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return /create\s+table/.test(sql) && /documents/.test(sql); + }, + }, + { + name: "RLS enabled on documents table", + check: 
() => + /alter\s+table.*documents.*enable\s+row\s+level\s+security/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "uses app_metadata not user_metadata for role check", + check: () => /app_metadata/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "user_metadata does not appear in policy USING clauses", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return policyBlocks.every((p) => !p.includes("user_metadata")); + }, + }, + { + name: "has at least two SELECT policies (owner and admin)", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const hasOwnerPolicy = policyBlocks.some( + (p) => + (p.includes("select") || !p.includes("insert")) && + (p.includes("user_id") || + p.includes("owner") || + p.includes("auth.uid")), + ); + const hasAdminPolicy = policyBlocks.some((p) => + p.includes("app_metadata"), + ); + return hasOwnerPolicy && hasAdminPolicy; + }, + }, + { + name: "policies use TO authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return ( + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)) + ); + }, + }, + { + name: "uses (select auth.uid()) subselect form in policies", + check: () => { + const sql = getMigrationSQL(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + for (const policy of policyBlocks) { + if ( + policy.includes("auth.uid()") && + !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy) + ) { + return false; + } + } + return true; + }, + }, + { + name: "FK to auth.users with ON DELETE CASCADE", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql) + ); + }, + }, + { + name: "overall quality: demonstrates Supabase best practices", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const signals = [ + /alter\s+table.*documents.*enable\s+row\s+level\s+security/.test(sql), + /app_metadata/.test(sql), + policyBlocks.every((p) => !p.includes("user_metadata")), + /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)), + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql), + policyBlocks.some( + (p) => + p.includes("user_id") || + p.includes("owner") || + p.includes("auth.uid"), + ) && policyBlocks.some((p) => p.includes("app_metadata")), + ]; + return signals.filter(Boolean).length >= 5; + }, + }, +]; diff --git a/packages/evals/evals/rls-user-metadata-role-check/PROMPT.md b/packages/evals/evals/rls-user-metadata-role-check/PROMPT.md new file mode 100644 index 0000000..8c686b8 --- /dev/null +++ b/packages/evals/evals/rls-user-metadata-role-check/PROMPT.md @@ -0,0 +1,7 @@ +I'm building a document management app on Supabase. I need a migration for a `documents` table. Each document has a `title` (text), `content` (text), and belongs to a user (the owner). + +The access rules are: +- Regular users can only read their own documents. +- Admin users — identified by a role field stored in their JWT — should be able to read all documents. + +Please create the migration in `supabase/migrations/`. The Supabase project is already initialized. 
diff --git a/packages/evals/evals/rls-user-metadata-role-check/package.json b/packages/evals/evals/rls-user-metadata-role-check/package.json new file mode 100644 index 0000000..27bce2a --- /dev/null +++ b/packages/evals/evals/rls-user-metadata-role-check/package.json @@ -0,0 +1,5 @@ +{ + "name": "rls-user-metadata-role-check", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/rls-user-metadata-role-check/supabase/config.toml b/packages/evals/evals/rls-user-metadata-role-check/supabase/config.toml new file mode 100644 index 0000000..f88384a --- /dev/null +++ b/packages/evals/evals/rls-user-metadata-role-check/supabase/config.toml @@ -0,0 +1,111 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "rls-user-metadata-role-check" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. +extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. 
+port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[db.migrations] +# If disabled, migrations will be skipped during a db push or reset. +enabled = true +schema_paths = [] + +[db.seed] +# If enabled, seeds the database after migrations during a db reset. +enabled = true +# Specifies an ordered list of seed files to load during db reset. +sql_paths = ["./seed.sql"] + +[realtime] +enabled = true + +[studio] +enabled = true +# Port to use for Supabase Studio. +port = 54323 +# External URL of the API server that frontend connects to. +api_url = "http://127.0.0.1" + +[inbucket] +enabled = true +# Port to use for the email testing server web interface. +port = 54324 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# If disabled, the refresh token will never expire. +enable_refresh_token_rotation = true +# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds. +# Requires enable_refresh_token_rotation = true. +refresh_token_reuse_interval = 10 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. 
+enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, a user will be required to confirm any email change on both the old, and new email +# addresses. If disabled, only the new email is required to confirm. +double_confirm_changes = true +# If enabled, users need to confirm their email address before signing in. +enable_confirmations = false + +[edge_runtime] +enabled = true +# Configure one of the supported request policies: `oneshot`, `per_worker`. +policy = "per_worker" +# Port to attach the Chrome inspector for debugging edge functions. +inspector_port = 8083 + +[analytics] +enabled = true +port = 54327 +# Configure one of the supported backends: `postgres`, `bigquery`. +backend = "postgres" diff --git a/packages/evals/evals/service-role-edge-function/EVAL.ts b/packages/evals/evals/service-role-edge-function/EVAL.ts new file mode 100644 index 0000000..10d81b6 --- /dev/null +++ b/packages/evals/evals/service-role-edge-function/EVAL.ts @@ -0,0 +1,102 @@ +export const expectedReferenceFiles = [ + "db-security-service-role.md", + "edge-fun-quickstart.md", + "edge-db-supabase-client.md", + "edge-pat-cors.md", + "edge-pat-error-handling.md", +]; + +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import type { EvalAssertion } from "../../src/eval-types.js"; + +import { + findFunctionFile, + getFunctionCode, + getSharedCode, + getSupabaseDir, +} from "../eval-utils.ts"; + +const FUNCTION_NAME = "admin-reports"; + +function getAllCode(): string { + const code = getFunctionCode(FUNCTION_NAME); + return `${code}\n${getSharedCode()}`; +} + +export const assertions: EvalAssertion[] = [ + { + name: "supabase project initialized (config.toml exists)", + check: () => existsSync(join(getSupabaseDir(), "config.toml")), + }, + { + name: "edge function file exists", + check: () => findFunctionFile(FUNCTION_NAME) !== null, + }, + { + name: "uses Deno.env.get for 
service role key", + check: () => + /Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test( + getAllCode(), + ), + }, + { + name: "no hardcoded service role key", + check: () => { + const allCode = getAllCode(); + const lines = allCode.split("\n"); + const nonCommentLines = lines.filter( + (line) => !line.trimStart().startsWith("//"), + ); + return !nonCommentLines.some((line) => + /(['"`])eyJ[A-Za-z0-9_-]+\.\1?|(['"`])eyJ[A-Za-z0-9_-]+/.test(line), + ); + }, + }, + { + name: "createClient called with service role env var as second argument", + check: () => { + const allCode = getAllCode(); + return ( + /createClient/i.test(allCode) && + /Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test( + allCode, + ) + ); + }, + }, + { + name: "service role key env var name does not use NEXT_PUBLIC_ prefix", + check: () => !/NEXT_PUBLIC_[^'"]*service[_-]?role/i.test(getAllCode()), + }, + { + name: "CORS headers present", + check: () => /Access-Control-Allow-Origin/.test(getAllCode()), + }, + { + name: "returns JSON response", + check: () => { + const allCode = getAllCode(); + return ( + /content-type['"]\s*:\s*['"]application\/json/i.test(allCode) || + /Response\.json/i.test(allCode) || + /JSON\.stringify/i.test(allCode) + ); + }, + }, + { + name: "overall quality: demonstrates service role Edge Function best practices", + check: () => { + const allCode = getAllCode(); + const signals: RegExp[] = [ + /Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i, + /Access-Control-Allow-Origin/, + /createClient/i, + /\btry\s*\{/, + /Response\.json|JSON\.stringify/, + /Deno\.serve/, + ]; + return signals.filter((r) => r.test(allCode)).length >= 5; + }, + }, +]; diff --git a/packages/evals/evals/service-role-edge-function/PROMPT.md b/packages/evals/evals/service-role-edge-function/PROMPT.md new file mode 100644 index 0000000..beb22a5 --- /dev/null +++ b/packages/evals/evals/service-role-edge-function/PROMPT.md @@ -0,0 +1,9 @@ +I'm building an 
internal admin dashboard for my app. I need a Supabase Edge Function called `admin-reports` that returns all rows from the `reports` table — this is an admin-only endpoint so it needs to bypass Row Level Security. + +Create the function at `supabase/functions/admin-reports/index.ts`. Use environment variables for any Supabase keys — do not hardcode them in the source code. + +The function should: + +1. Return all rows from the `reports` table as a JSON response +2. Work when called from a browser (handle CORS) +3. Handle errors gracefully diff --git a/packages/evals/evals/service-role-edge-function/package.json b/packages/evals/evals/service-role-edge-function/package.json new file mode 100644 index 0000000..82dd807 --- /dev/null +++ b/packages/evals/evals/service-role-edge-function/package.json @@ -0,0 +1,5 @@ +{ + "name": "service-role-edge-function", + "private": true, + "type": "module" +} diff --git a/packages/evals/evals/service-role-edge-function/supabase/config.toml b/packages/evals/evals/service-role-edge-function/supabase/config.toml new file mode 100644 index 0000000..1c6f6c4 --- /dev/null +++ b/packages/evals/evals/service-role-edge-function/supabase/config.toml @@ -0,0 +1,64 @@ +# For detailed configuration reference documentation, visit: +# https://supabase.com/docs/guides/local-development/cli/config +# A string used to distinguish different Supabase projects on the same host. Defaults to the +# working directory name when running `supabase init`. +project_id = "service-role-edge-function" + +[api] +enabled = true +# Port to use for the API URL. +port = 54321 +# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API +# endpoints. `public` and `graphql_public` schemas are included by default. +schemas = ["public", "graphql_public"] +# Extra schemas to add to the search_path of every request. 
+extra_search_path = ["public", "extensions"] +# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size +# for accidental or malicious requests. +max_rows = 1000 + +[db] +# Port to use for the local database URL. +port = 54322 +# Port used by db diff command to initialize the shadow database. +shadow_port = 54320 +# The database major version to use. This has to be the same as your remote database's. Run `SHOW +# server_version;` on the remote database to check. +major_version = 17 + +[db.pooler] +enabled = false +# Port to use for the local connection pooler. +port = 54329 +# Specifies when a server connection can be reused by other clients. +# Configure one of the supported pooler modes: `transaction`, `session`. +pool_mode = "transaction" +# How many server connections to allow per user/database pair. +default_pool_size = 20 +# Maximum number of client connections allowed. +max_client_conn = 100 + +[storage] +enabled = true +# The maximum file size allowed (e.g. "5MB", "500KB"). +file_size_limit = "50MiB" + +[auth] +enabled = true +# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used +# in emails. +site_url = "http://127.0.0.1:3000" +# A list of *exact* URLs that auth providers are permitted to redirect to post authentication. +additional_redirect_urls = ["https://127.0.0.1:3000"] +# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week). +jwt_expiry = 3600 +# Allow/disallow new user signups to your project. +enable_signup = true +# Allow/disallow anonymous sign-ins to your project. +enable_anonymous_sign_ins = false + +[auth.email] +# Allow/disallow new user signups via email to your project. +enable_signup = true +# If enabled, users need to confirm their email address before signing in. 
+enable_confirmations = false diff --git a/packages/evals/evals/service-role-edge-function/supabase/migrations/20240101000000_create_reports_table.sql b/packages/evals/evals/service-role-edge-function/supabase/migrations/20240101000000_create_reports_table.sql new file mode 100644 index 0000000..e20892e --- /dev/null +++ b/packages/evals/evals/service-role-edge-function/supabase/migrations/20240101000000_create_reports_table.sql @@ -0,0 +1,10 @@ +-- Create the reports table +create table if not exists public.reports ( + id uuid primary key default gen_random_uuid(), + title text not null, + content text, + created_at timestamptz not null default now() +); + +-- Enable Row Level Security (browser clients use anon key and are restricted by default) +alter table public.reports enable row level security; diff --git a/packages/evals/evals/storage-rls-user-folders/EVAL.ts b/packages/evals/evals/storage-rls-user-folders/EVAL.ts index f56d76e..8add642 100644 --- a/packages/evals/evals/storage-rls-user-folders/EVAL.ts +++ b/packages/evals/evals/storage-rls-user-folders/EVAL.ts @@ -1,263 +1,253 @@ -import { expect, test } from "vitest"; +export const expectedReferenceFiles = [ + "storage-access-control.md", + "db-rls-mandatory.md", + "db-rls-common-mistakes.md", + "db-rls-performance.md", + "db-schema-auth-fk.md", + "db-schema-timestamps.md", + "db-perf-indexes.md", + "db-migrations-idempotent.md", +]; + +import type { EvalAssertion } from "../../src/eval-types.js"; import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; -test("migration file exists", () => { - expect(findMigrationFiles().length).toBeGreaterThan(0); -}); - -test("creates avatars bucket", () => { - const sql = getMigrationSQL().toLowerCase(); - // Should insert into storage.buckets with id 'avatars' and public = true - expect(sql).toMatch(/storage\.buckets/); - expect(sql).toMatch(/avatars/); - expect(sql).toMatch(/public/); - // Verify it's marked as a public bucket (true) - const 
avatarsBlock = sql.match( - /insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/, - ); - expect(avatarsBlock).not.toBeNull(); - if (avatarsBlock) { - expect(avatarsBlock[0]).toMatch(/true/); - } -}); - -test("creates documents bucket", () => { - const sql = getMigrationSQL().toLowerCase(); - // Should insert into storage.buckets with id 'documents' and public = false - expect(sql).toMatch(/documents/); - const documentsBlock = sql.match( - /insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/, - ); - expect(documentsBlock).not.toBeNull(); - if (documentsBlock) { - expect(documentsBlock[0]).toMatch(/false/); - } -}); - -test("avatars bucket has mime type restriction", () => { - const sql = getMigrationSQL().toLowerCase(); - // Should have allowed_mime_types with image types - expect(sql).toMatch(/allowed_mime_types/); - // Check for image MIME types (jpeg, png, webp) - expect(sql).toMatch(/image\/jpeg/); - expect(sql).toMatch(/image\/png/); - expect(sql).toMatch(/image\/webp/); -}); - -test("avatars bucket has file size limit", () => { - const sql = getMigrationSQL().toLowerCase(); - // Should have file_size_limit set to approximately 2MB (2097152 bytes or 2MB string) - expect(sql).toMatch(/file_size_limit/); - // Accept either numeric bytes (2097152) or string form (2MB, 2MiB, 2 * 1024 * 1024) - const hasNumericLimit = /2097152/.test(sql); - const hasStringLimit = /2\s*m/i.test(sql); - const hasCalcLimit = /2\s*\*\s*1024\s*\*\s*1024/.test(sql); - expect(hasNumericLimit || hasStringLimit || hasCalcLimit).toBe(true); -}); - -test("storage policy uses foldername or path for user isolation", () => { - const sql = getMigrationSQL().toLowerCase(); - // Should use storage.foldername(name) with auth.uid()::text for folder isolation - const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql); - // Also accept direct path matching patterns like (name ~ '^user-id/') - const usesPathMatch = - /\(\s*storage\.foldername\s*\(/.test(sql) || - 
/\bname\b.*auth\.uid\(\)/.test(sql); - expect(usesFoldername || usesPathMatch).toBe(true); - // Should cast auth.uid() to text for comparison with folder name - expect(sql).toMatch(/auth\.uid\(\)\s*::\s*text/); -}); - -test("storage policy uses TO authenticated", () => { - const sql = getMigrationSQL().toLowerCase(); - // Storage upload/delete/update policies should target authenticated users. - // Accepted forms: - // 1. Explicit TO authenticated - // 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated) - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - const storagePolicies = policyBlocks.filter((p) => - p.toLowerCase().includes("storage.objects"), - ); - // At least one storage policy should restrict to authenticated users - const hasAuthenticatedPolicy = storagePolicies.some( - (p) => - /to\s+(authenticated|public)/.test(p.toLowerCase()) || - /auth\.uid\(\)/.test(p.toLowerCase()), - ); - expect(hasAuthenticatedPolicy).toBe(true); - // Insert policies must restrict to authenticated users (explicit TO or auth.uid() check) - const insertPolicies = storagePolicies.filter((p) => - /for\s+insert/.test(p.toLowerCase()), - ); - for (const policy of insertPolicies) { - const hasExplicitTo = /to\s+authenticated/.test(policy.toLowerCase()); - const hasAuthUidCheck = /auth\.uid\(\)/.test(policy.toLowerCase()); - expect(hasExplicitTo || hasAuthUidCheck).toBe(true); - } -}); - -test("public read policy for avatars", () => { - const sql = getMigrationSQL().toLowerCase(); - // A SELECT policy on storage.objects for avatars bucket should allow public/anon access. - // Accepted forms: - // 1. Explicit TO public / TO anon - // 2. No TO clause (defaults to public role, granting all access) - // 3. No auth.uid() restriction in USING (open to everyone) - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; - const avatarSelectPolicies = policyBlocks.filter( - (p) => - p.toLowerCase().includes("storage.objects") && - /for\s+select/.test(p.toLowerCase()) && - p.toLowerCase().includes("avatars"), - ); - expect(avatarSelectPolicies.length).toBeGreaterThan(0); - // Should allow public access: explicit TO public/anon, or no TO clause without auth.uid() restriction - const hasPublicAccess = avatarSelectPolicies.some((p) => { - const lower = p.toLowerCase(); - const hasExplicitPublic = - /to\s+public/.test(lower) || /to\s+anon/.test(lower); - // No TO clause and no auth.uid() restriction means open to all - const hasNoToClause = !/\bto\s+\w+/.test(lower); - const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower); - return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction); - }); - expect(hasPublicAccess).toBe(true); -}); - -test("documents bucket is fully private", () => { - const sql = getMigrationSQL().toLowerCase(); - // All policies for documents bucket should restrict to authenticated owner. - // Accepted forms: - // 1. Explicit TO authenticated - // 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated) - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; - const documentPolicies = policyBlocks.filter( - (p) => - p.toLowerCase().includes("storage.objects") && - p.toLowerCase().includes("documents"), - ); - expect(documentPolicies.length).toBeGreaterThan(0); - // None should allow public/anon access - for (const policy of documentPolicies) { - expect(policy).not.toMatch(/to\s+public/); - expect(policy).not.toMatch(/to\s+anon/); - } - // All should be scoped to authenticated (explicit TO or auth.uid() check) - for (const policy of documentPolicies) { - const hasExplicitTo = /to\s+authenticated/.test(policy); - const hasAuthUidCheck = /auth\.uid\(\)/.test(policy); - expect(hasExplicitTo || hasAuthUidCheck).toBe(true); - } -}); - -test("creates file_metadata table", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+table/); - expect(sql).toMatch(/file_metadata/); -}); - -test("file_metadata has FK to auth.users with CASCADE", () => { - const sql = getMigrationSQL().toLowerCase(); - // Find the file_metadata CREATE TABLE block or the surrounding context - expect(sql).toMatch(/references\s+auth\.users/); - expect(sql).toMatch(/on\s+delete\s+cascade/); -}); - -test("RLS enabled on file_metadata", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch( - /alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/, - ); -}); - -test("file_metadata policies use (select auth.uid())", () => { - const sql = getMigrationSQL(); - // Find policies that reference file_metadata - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; - const metadataPolicies = policyBlocks.filter((p) => - p.toLowerCase().includes("file_metadata"), - ); - // Each policy that uses auth.uid() should use the subselect form - for (const policy of metadataPolicies) { - if (policy.includes("auth.uid()")) { - expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i); - } - } -}); - -test("uses timestamptz for time columns", () => { - const sql = getMigrationSQL().toLowerCase(); - // Only check if the migration defines time-related columns - if ( - sql.includes("created_at") || - sql.includes("updated_at") || - sql.includes("uploaded_at") - ) { - // Check column definitions for plain "timestamp" (not timestamptz / timestamp with time zone). - // Only match timestamp as a column type — look for column_name followed by timestamp. - // Exclude matches inside trigger/function bodies and RETURNS TRIGGER. - const columnDefs = sql.match( - /(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g, - ); - if (columnDefs) { - for (const def of columnDefs) { - // Each match should use timestamptz or "timestamp with time zone" - expect(def).toMatch(/timestamptz|timestamp\s+with\s+time\s+zone/); +export const assertions: EvalAssertion[] = [ + { + name: "migration file exists", + check: () => findMigrationFiles().length > 0, + }, + { + name: "creates avatars bucket", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + if ( + !/storage\.buckets/.test(sql) || + !/avatars/.test(sql) || + !/public/.test(sql) + ) + return false; + const avatarsBlock = sql.match( + /insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/, + ); + return avatarsBlock !== null && /true/.test(avatarsBlock[0]); + }, + }, + { + name: "creates documents bucket", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + if (!/documents/.test(sql)) return false; + const documentsBlock = sql.match( + /insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/, + ); + return documentsBlock !== null && /false/.test(documentsBlock[0]); + }, 
+ }, + { + name: "avatars bucket has mime type restriction", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /allowed_mime_types/.test(sql) && + /image\/jpeg/.test(sql) && + /image\/png/.test(sql) && + /image\/webp/.test(sql) + ); + }, + }, + { + name: "avatars bucket has file size limit", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + if (!/file_size_limit/.test(sql)) return false; + return ( + /2097152/.test(sql) || + /2\s*m/i.test(sql) || + /2\s*\*\s*1024\s*\*\s*1024/.test(sql) + ); + }, + }, + { + name: "storage policy uses foldername or path for user isolation", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql); + const usesPathMatch = + /\(\s*storage\.foldername\s*\(/.test(sql) || + /\bname\b.*auth\.uid\(\)/.test(sql); + return ( + (usesFoldername || usesPathMatch) && + /auth\.uid\(\)\s*::\s*text/.test(sql) + ); + }, + }, + { + name: "storage policy uses TO authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const storagePolicies = policyBlocks.filter((p) => + p.toLowerCase().includes("storage.objects"), + ); + const hasAuthenticatedPolicy = storagePolicies.some( + (p) => + /to\s+(authenticated|public)/.test(p.toLowerCase()) || + /auth\.uid\(\)/.test(p.toLowerCase()), + ); + if (!hasAuthenticatedPolicy) return false; + const insertPolicies = storagePolicies.filter((p) => + /for\s+insert/.test(p.toLowerCase()), + ); + return insertPolicies.every( + (p) => + /to\s+authenticated/.test(p.toLowerCase()) || + /auth\.uid\(\)/.test(p.toLowerCase()), + ); + }, + }, + { + name: "public read policy for avatars", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const avatarSelectPolicies = policyBlocks.filter( + (p) => + p.toLowerCase().includes("storage.objects") && + /for\s+select/.test(p.toLowerCase()) && + p.toLowerCase().includes("avatars"), + ); + if (avatarSelectPolicies.length === 0) return false; + return avatarSelectPolicies.some((p) => { + const lower = p.toLowerCase(); + const hasExplicitPublic = + /to\s+public/.test(lower) || /to\s+anon/.test(lower); + const hasNoToClause = !/\bto\s+\w+/.test(lower); + const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower); + return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction); + }); + }, + }, + { + name: "documents bucket is fully private", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const documentPolicies = policyBlocks.filter( + (p) => + p.toLowerCase().includes("storage.objects") && + p.toLowerCase().includes("documents"), + ); + if (documentPolicies.length === 0) return false; + return documentPolicies.every( + (p) => + !/to\s+public/.test(p) && + !/to\s+anon/.test(p) && + (/to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p)), + ); + }, + }, + { + name: "creates file_metadata table", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return /create\s+table/.test(sql) && /file_metadata/.test(sql); + }, + }, + { + name: "file_metadata has FK to auth.users with CASCADE", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql) + ); + }, + }, + { + name: "RLS enabled on file_metadata", + check: () => + /alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "file_metadata policies use (select auth.uid())", + check: () => { + const sql = getMigrationSQL(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const metadataPolicies = policyBlocks.filter((p) => + p.toLowerCase().includes("file_metadata"), + ); + for (const policy of metadataPolicies) { + if ( + policy.includes("auth.uid()") && + !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy) + ) { + return false; + } } - } - } -}); - -test("index on file_metadata user_id", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+index/); - // Should index user_id on file_metadata - expect(sql).toMatch(/file_metadata/); - expect(sql).toMatch(/user_id/); -}); - -test("idempotent DDL", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/if\s+not\s+exists/); -}); - -test("overall quality score", () => { - const sql = getMigrationSQL().toLowerCase(); - // A high-quality migration should contain most of these best-practice signals - const signals = [ - // 1. Avatars bucket is public - /insert\s+into\s+storage\.buckets[\s\S]*?avatars/, - // 2. Documents bucket exists - /insert\s+into\s+storage\.buckets[\s\S]*?documents/, - // 3. MIME type restriction - /allowed_mime_types/, - // 4. File size limit - /file_size_limit/, - // 5. Storage foldername helper - /storage\.foldername/, - // 6. auth.uid()::text cast - /auth\.uid\(\)\s*::\s*text/, - // 7. TO authenticated on policies - /to\s+authenticated/, - // 8. Public read for avatars - /to\s+(public|anon)/, - // 9. RLS on file_metadata - /enable\s+row\s+level\s+security/, - // 10. FK to auth.users with cascade - /on\s+delete\s+cascade/, - // 11. (select auth.uid()) subselect form - /\(select\s+auth\.uid\(\)\)/, - // 12. Index on user_id - /create\s+index/, - // 13. timestamptz usage - /timestamptz/, - // 14. IF NOT EXISTS for idempotency - /if\s+not\s+exists/, - // 15. 
file_metadata table - /create\s+table[\s\S]*?file_metadata/, - ]; - const matches = signals.filter((r) => r.test(sql)); - // Require at least 11 of 15 best-practice signals - expect(matches.length).toBeGreaterThanOrEqual(11); -}); + return true; + }, + }, + { + name: "uses timestamptz for time columns", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + if ( + !sql.includes("created_at") && + !sql.includes("updated_at") && + !sql.includes("uploaded_at") + ) { + return true; + } + const columnDefs = sql.match( + /(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g, + ); + if (!columnDefs) return true; + return columnDefs.every((def) => + /timestamptz|timestamp\s+with\s+time\s+zone/.test(def), + ); + }, + }, + { + name: "index on file_metadata user_id", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /create\s+index/.test(sql) && + /file_metadata/.test(sql) && + /user_id/.test(sql) + ); + }, + }, + { + name: "idempotent DDL", + check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "overall quality score", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const signals = [ + /insert\s+into\s+storage\.buckets[\s\S]*?avatars/, + /insert\s+into\s+storage\.buckets[\s\S]*?documents/, + /allowed_mime_types/, + /file_size_limit/, + /storage\.foldername/, + /auth\.uid\(\)\s*::\s*text/, + /to\s+authenticated/, + /to\s+(public|anon)/, + /enable\s+row\s+level\s+security/, + /on\s+delete\s+cascade/, + /\(select\s+auth\.uid\(\)\)/, + /create\s+index/, + /timestamptz/, + /if\s+not\s+exists/, + /create\s+table[\s\S]*?file_metadata/, + ]; + return signals.filter((r) => r.test(sql)).length >= 11; + }, + }, +]; diff --git a/packages/evals/evals/team-rls-security-definer/EVAL.ts b/packages/evals/evals/team-rls-security-definer/EVAL.ts index fe2245c..0060ab5 100644 --- a/packages/evals/evals/team-rls-security-definer/EVAL.ts +++ b/packages/evals/evals/team-rls-security-definer/EVAL.ts @@ 
-1,182 +1,216 @@ -import { expect, test } from "vitest"; +export const expectedReferenceFiles = [ + "db-rls-mandatory.md", + "db-rls-policy-types.md", + "db-rls-common-mistakes.md", + "db-rls-performance.md", + "db-security-functions.md", + "db-schema-auth-fk.md", + "db-schema-timestamps.md", + "db-perf-indexes.md", + "db-migrations-idempotent.md", +]; + +import type { EvalAssertion } from "../../src/eval-types.js"; import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts"; -test("migration file exists", () => { - expect(findMigrationFiles().length).toBeGreaterThan(0); -}); - -test("creates organizations table", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+table[\s\S]*?organizations/); -}); - -test("creates memberships table", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+table[\s\S]*?memberships/); -}); - -test("creates projects table", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+table[\s\S]*?projects/); -}); - -test("enables RLS on all tables", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch( - /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/, - ); - expect(sql).toMatch( - /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/, - ); - expect(sql).toMatch( - /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/, - ); -}); - -test("FK to auth.users with ON DELETE CASCADE", () => { - const sql = getMigrationSQL().toLowerCase(); - // memberships should reference auth.users with cascade delete - expect(sql).toMatch(/references\s+auth\.users/); - expect(sql).toMatch(/on\s+delete\s+cascade/); -}); - -test("org_id FK on projects", () => { - const sql = getMigrationSQL().toLowerCase(); - // projects should have a foreign key referencing organizations - expect(sql).toMatch( - /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/, - ); -}); 
- -test("private schema created", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+schema[\s\S]*?private/); -}); - -test("security_definer helper function", () => { - const sql = getMigrationSQL().toLowerCase(); - // Function should be in the private schema with SECURITY DEFINER and search_path = '' - expect(sql).toMatch(/private\./); - expect(sql).toMatch(/security\s+definer/); - expect(sql).toMatch(/set\s+search_path\s*=\s*''/); -}); - -test("policies use (select auth.uid())", () => { - const sql = getMigrationSQL(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - expect(policyBlocks.length).toBeGreaterThan(0); - for (const policy of policyBlocks) { - if (policy.includes("auth.uid()")) { - // The subselect form: (select auth.uid()) - expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i); - } - } -}); - -test("policies use TO authenticated", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - expect(policyBlocks.length).toBeGreaterThan(0); - for (const policy of policyBlocks) { - expect(policy).toMatch(/to\s+authenticated/); - } -}); - -test("index on membership lookup columns", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/create\s+index/); - // Should index user_id and/or org_id on memberships for policy lookups - const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? 
[]; - const indexesUserOrOrg = indexBlocks.filter( - (idx) => - idx.includes("user_id") || - idx.includes("org_id") || - idx.includes("organization_id"), - ); - expect(indexesUserOrOrg.length).toBeGreaterThanOrEqual(1); -}); - -test("uses timestamptz", () => { - const sql = getMigrationSQL().toLowerCase(); - // Match "timestamp" that is NOT followed by "tz" or "with time zone" - const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; - // Only fail if the migration defines time columns with plain timestamp - if ( - sql.includes("created_at") || - sql.includes("updated_at") || - sql.includes("_at ") - ) { - expect(sql).not.toMatch(hasPlainTimestamp); - } -}); - -test("idempotent DDL", () => { - const sql = getMigrationSQL().toLowerCase(); - expect(sql).toMatch(/if\s+not\s+exists/); -}); - -test("delete policy restricted to owner role", () => { - const sql = getMigrationSQL().toLowerCase(); - // Look for a delete policy on projects that checks for owner (or admin) role - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - const deletePolicy = policyBlocks.find( - (p) => - p.toLowerCase().includes("delete") && p.toLowerCase().includes("project"), - ); - expect(deletePolicy).toBeDefined(); - // The delete policy should check for an owner/admin role - expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/); -}); - -test("overall quality score", () => { - const sql = getMigrationSQL().toLowerCase(); - const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; - // A high-quality migration should contain most of these best-practice signals - const signals = [ - // 1. 
RLS enabled on all three tables - /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) && - /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, - ) && - /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test( - sql, +export const assertions: EvalAssertion[] = [ + { + name: "migration file exists", + check: () => findMigrationFiles().length > 0, + }, + { + name: "creates organizations table", + check: () => + /create\s+table[\s\S]*?organizations/.test( + getMigrationSQL().toLowerCase(), ), - // 2. FK to auth.users with cascade - /references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql), - // 3. Private schema created - /create\s+schema[\s\S]*?private/.test(sql), - // 4. security_definer with search_path - /security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql), - // 5. Subselect auth.uid() - /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), - // 6. TO authenticated on policies - policyBlocks.length > 0 && - policyBlocks.every((p) => /to\s+authenticated/.test(p)), - // 7. Indexes on lookup columns - /create\s+index/.test(sql), - // 8. timestamptz (no plain timestamp) - !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql), - // 9. Idempotent DDL - /if\s+not\s+exists/.test(sql), - // 10. Delete policy checks owner role - policyBlocks.some( - (p) => - p.toLowerCase().includes("delete") && - p.toLowerCase().includes("project") && - /owner|admin/.test(p.toLowerCase()), - ), - // 11. org_id FK on projects - /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql), - // 12. Multiple policies (at least one per table) - policyBlocks.length >= 3, - // 13. Membership role column exists - /role/.test(sql), - // 14. 
Private schema function referenced in policies - /private\./.test(sql), - ]; - const passed = signals.filter(Boolean).length; - expect(passed).toBeGreaterThanOrEqual(10); -}); + }, + { + name: "creates memberships table", + check: () => + /create\s+table[\s\S]*?memberships/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "creates projects table", + check: () => + /create\s+table[\s\S]*?projects/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "enables RLS on all tables", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) && + /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) && + /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) + ); + }, + }, + { + name: "FK to auth.users with ON DELETE CASCADE", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql) + ); + }, + }, + { + name: "org_id FK on projects", + check: () => + /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test( + getMigrationSQL().toLowerCase(), + ), + }, + { + name: "private schema created", + check: () => + /create\s+schema[\s\S]*?private/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "security_definer helper function", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + return ( + /private\./.test(sql) && + /security\s+definer/.test(sql) && + /set\s+search_path\s*=\s*''/.test(sql) + ); + }, + }, + { + name: "policies use (select auth.uid())", + check: () => { + const sql = getMigrationSQL(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + if (policyBlocks.length === 0) return false; + for (const policy of policyBlocks) { + if ( + policy.includes("auth.uid()") && + !/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy) + ) { + return false; + } + } + return true; + }, + }, + { + name: "policies use TO authenticated", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + return ( + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)) + ); + }, + }, + { + name: "index on membership lookup columns", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + if (!/create\s+index/.test(sql)) return false; + const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? []; + return ( + indexBlocks.filter( + (idx) => + idx.includes("user_id") || + idx.includes("org_id") || + idx.includes("organization_id"), + ).length >= 1 + ); + }, + }, + { + name: "uses timestamptz", + check: () => { + const rawSql = getMigrationSQL().toLowerCase(); + const sql = rawSql.replace(/--[^\n]*/g, ""); + const hasPlainTimestamp = + /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/; + if ( + sql.includes("created_at") || + sql.includes("updated_at") || + sql.includes("_at ") + ) { + return !hasPlainTimestamp.test(sql); + } + return true; + }, + }, + { + name: "idempotent DDL", + check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "stable or immutable on helper function", + check: () => + /\bstable\b|\bimmutable\b/.test(getMigrationSQL().toLowerCase()), + }, + { + name: "delete policy restricted to owner role", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? 
[]; + const deletePolicy = policyBlocks.find( + (p) => + p.toLowerCase().includes("delete") && + p.toLowerCase().includes("project"), + ); + if (!deletePolicy) return false; + return /owner|admin/.test(deletePolicy.toLowerCase()); + }, + }, + { + name: "overall quality score", + check: () => { + const sql = getMigrationSQL().toLowerCase(); + const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? []; + const signals = [ + /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) && + /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ) && + /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test( + sql, + ), + /references\s+auth\.users/.test(sql) && + /on\s+delete\s+cascade/.test(sql), + /create\s+schema[\s\S]*?private/.test(sql), + /security\s+definer/.test(sql) && + /set\s+search_path\s*=\s*''/.test(sql), + /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql), + policyBlocks.length > 0 && + policyBlocks.every((p) => /to\s+authenticated/.test(p)), + /create\s+index/.test(sql), + !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test( + sql.replace(/--[^\n]*/g, ""), + ), + /if\s+not\s+exists/.test(sql), + policyBlocks.some( + (p) => + p.toLowerCase().includes("delete") && + p.toLowerCase().includes("project") && + /owner|admin/.test(p.toLowerCase()), + ), + /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql), + policyBlocks.length >= 3, + /role/.test(sql), + /private\./.test(sql), + /\bstable\b|\bimmutable\b/.test(sql), + ]; + return signals.filter(Boolean).length >= 11; + }, + }, +]; diff --git a/packages/evals/package.json b/packages/evals/package.json index 84e9d30..63d6988 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -17,7 +17,6 @@ "devDependencies": { "@types/node": "^20.10.0", "tsx": "^4.7.0", - "typescript": "^5.3.0", - "vitest": "^3.1.0" + "typescript": "^5.3.0" } } diff --git 
a/packages/evals/scenarios/SCENARIOS.md b/packages/evals/scenarios/SCENARIOS.md index bf1e12b..56e8550 100644 --- a/packages/evals/scenarios/SCENARIOS.md +++ b/packages/evals/scenarios/SCENARIOS.md @@ -6,5 +6,12 @@ | 2 | [team-rls-security-definer](team-rls-security-definer.md) | Team-based RLS with security definer helper in a private schema | | 3 | [storage-rls-user-folders](storage-rls-user-folders.md) | Storage buckets with RLS policies for user-isolated folders | | 4 | [edge-function-hello-world](edge-function-hello-world.md) | Hello-world Edge Function with CORS and shared utilities | -| 5 | edge-function-stripe-webhook | Stripe webhook Edge Function with signature verification and orders migration | -| 6 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization | \ No newline at end of file +| 5 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization | +| 6 | [auth-fk-cascade-delete](auth-fk-cascade-delete.md) | Profiles table with auth.users FK cascade and auto-create trigger | +| 7 | [rls-update-needs-select](rls-update-needs-select.md) | Orders table where UPDATE silently fails without a matching SELECT policy | +| 8 | [extension-wrong-schema](extension-wrong-schema.md) | pgvector extension setup with correct schema placement, HNSW index, and user-scoped RLS | +| 9 | [connection-pooling-prisma](connection-pooling-prisma.md) | Fix Prisma schema to use Supabase transaction-mode pooler (port 6543, pgbouncer=true, directUrl) for serverless deployments | +| 10 | [cli-hallucinated-commands](cli-hallucinated-commands.md) | CLI cheat-sheet that must use only real Supabase CLI commands, avoiding hallucinated `supabase functions log` and `supabase db query` | +| 11 | [postgrest-schema-cache](postgrest-schema-cache.md) | Add columns and a view to an existing table, with 
NOTIFY pgrst to reload the PostgREST schema cache | +| 12 | [rls-user-metadata-role-check](rls-user-metadata-role-check.md) | Documents table with owner and admin RLS — must use app_metadata not user_metadata for role authorization | +| 13 | [service-role-edge-function](service-role-edge-function.md) | Admin Edge Function that bypasses RLS using the service role key via env vars, never hardcoded | diff --git a/packages/evals/scenarios/auth-fk-cascade-delete.md b/packages/evals/scenarios/auth-fk-cascade-delete.md new file mode 100644 index 0000000..ec0dd3d --- /dev/null +++ b/packages/evals/scenarios/auth-fk-cascade-delete.md @@ -0,0 +1,84 @@ +# Scenario: auth-fk-cascade-delete + +## Summary + +The agent must create a `profiles` table that references `auth.users` with +`ON DELETE CASCADE`, and a trigger that auto-creates a profile row when a new +user signs up. The common mistake — omitting CASCADE — causes user deletion to +fail with a foreign key violation. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **Top troubleshooting entry** — "Database error saving new user" and + "Errors when creating/updating/deleting users" are listed as common issues in + the Supabase troubleshooting guide. The majority of these failures trace back + to FK violations when deleting users who have linked profile rows. + - Source: https://supabase.com/docs/guides/troubleshooting +2. **Auth trigger pattern ubiquity** — The `handle_new_user` trigger on + `auth.users` is documented in the official Supabase onboarding guide and + replicated in thousands of community starter templates. Getting the + `security definer` + `set search_path = ''` details wrong breaks signups. + - Source: https://supabase.com/docs/guides/database/postgres/cascade-deletes +3. 
**Community-reported cascade omission** — Multiple GitHub issues report + unexpected FK violation errors when calling `auth.admin.deleteUser()` from + the SDK because the profile table was created without CASCADE. + - Source: https://github.com/supabase/supabase/issues (search: "deleteUser foreign key violation") + +## Skill References Exercised + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-schema-auth-fk.md` | ON DELETE CASCADE requirement for auth.users FKs | `REFERENCES auth.users(id) ON DELETE CASCADE` | +| `references/db-security-functions.md` | security definer + set search_path = '' for trigger functions | Correct trigger function definition | +| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS enabled on profiles | +| `references/db-rls-common-mistakes.md` | TO clause and subselect auth.uid() | Correct policy scoping | + +## Workspace Setup + +- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations) + +## Agent Task (PROMPT.md draft) + +> Set up a `profiles` table for my Supabase app. Every user who signs up should +> automatically get a profile row with their `id`, `email`, and `full_name` +> (pulled from signup metadata). The profiles table should go in +> `supabase/migrations/` as a SQL migration. Users should only be able to read +> and update their own profile.
+ +## Evaluation Criteria + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure | +| 2 | creates profiles table | SQL contains `CREATE TABLE` and `profiles` | correctness | +| 3 | FK references auth.users | `REFERENCES auth.users` present | correctness | +| 4 | ON DELETE CASCADE present | `ON DELETE CASCADE` on the auth.users FK | correctness | +| 5 | RLS enabled on profiles | `ALTER TABLE profiles ENABLE ROW LEVEL SECURITY` | security | +| 6 | trigger function uses security definer | `SECURITY DEFINER` in the trigger function definition | security | +| 7 | trigger function sets search_path | `SET search_path = ''` or `set search_path` in trigger function | security | +| 8 | trigger created on auth.users | `CREATE TRIGGER ... ON auth.users` | correctness | +| 9 | policies scoped to authenticated | `TO authenticated` in policy definitions | security | + +## Reasoning + +1. **Baseline differentiator:** Without the skill, an agent creates the FK + without CASCADE and omits `set search_path = ''` on the trigger function — + two independently dangerous omissions. +2. **Skill value:** `db-schema-auth-fk.md` is explicitly about this exact + scenario; `db-security-functions.md` covers the trigger security requirements. +3. **Testability:** CASCADE and search_path are simple string patterns. Trigger + creation on `auth.users` is a unique structural signal. +4. **Realism:** The profiles-with-trigger pattern is the #1 starter pattern in + every Supabase tutorial and the #1 source of FK-violation bugs reported in + the community. 
+ +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~35% of assertions expected to pass (table and FK likely, but + no CASCADE, no search_path, weak policies) +- With skill: ~90% of assertions expected to pass +- **pass_threshold:** 8 diff --git a/packages/evals/scenarios/auth-rls-new-project.md b/packages/evals/scenarios/auth-rls-new-project.md index ed15dc4..58ebde8 100644 --- a/packages/evals/scenarios/auth-rls-new-project.md +++ b/packages/evals/scenarios/auth-rls-new-project.md @@ -85,8 +85,9 @@ specific quality signal: | 8 | TO authenticated | Policies scoped to authenticated role | security | | 9 | timestamptz | No plain `timestamp` for time columns | correctness | | 10 | index on user_id | `CREATE INDEX` on the FK column | performance | -| 11 | IF NOT EXISTS | Idempotent migration | idempotency | -| 12 | overall quality | At least 4/5 best-practice signals present | overall | +| 11 | no SERIAL/BIGSERIAL | PK does not use error-prone serial type | correctness | +| 12 | IF NOT EXISTS | Idempotent migration | idempotency | +| 13 | overall quality | At least 4/5 best-practice signals present | overall | ## Reasoning @@ -121,4 +122,5 @@ Step-by-step reasoning for why this scenario is well-designed: **Rating:** EASY - Without skill: ~50-65% of assertions expected to pass -- With skill: ~90-100% of assertions expected to pass \ No newline at end of file +- With skill: ~90-100% of assertions expected to pass +- **pass_threshold:** 10 \ No newline at end of file diff --git a/packages/evals/scenarios/cli-hallucinated-commands.md b/packages/evals/scenarios/cli-hallucinated-commands.md new file mode 100644 index 0000000..7c21dec --- /dev/null +++ b/packages/evals/scenarios/cli-hallucinated-commands.md @@ -0,0 +1,120 @@ +# Scenario: cli-hallucinated-commands + +## Summary + +The agent must create a Supabase CLI reference cheat-sheet (`CLI_REFERENCE.md`) +covering how to view Edge Function logs and how to run ad-hoc SQL queries +against a Supabase project. 
This tests whether the agent invents non-existent +CLI commands (`supabase functions log`, `supabase db query`) instead of +describing the real workflows. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **`supabase functions log` is a persistent hallucination** — LLMs frequently + suggest `supabase functions log` (singular) or `supabase functions logs` as + CLI commands to stream deployed function logs. Neither command exists in the + Supabase CLI. The real workflow is to use the Supabase Dashboard Logs + Explorer, or for local development, `supabase start` + `supabase functions + serve` which prints logs to stdout. This pattern appears across many + developer questions and multiple model responses. + - Source: https://supabase.com/docs/reference/cli/supabase-functions + +2. **`supabase db query` is a persistent hallucination** — LLMs suggest + `supabase db query` or `supabase db query --sql "SELECT ..."` as a way to + run ad-hoc SQL via the CLI. This command does not exist. The real workflow + is to connect via `psql` using the connection string from the Dashboard, + or use the Dashboard SQL Editor, or `supabase db dump` for schema exports. + - Source: https://supabase.com/docs/reference/cli/supabase-db + +3. **Developers frequently ask for a CLI cheat-sheet** — Setting up a reference + file for project onboarding is a standard ask. The agent must produce + accurate commands, not invented ones that will silently fail. 
 + +## Skill References Exercised + +Which reference files the agent should consult and what each teaches: + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/dev-getting-started.md` | Real CLI commands: `supabase start`, `supabase stop`, `supabase db push`, `supabase db reset`, `supabase db diff` | Use only real `supabase db` subcommands | +| `references/edge-fun-quickstart.md` | Real Edge Function CLI: `supabase functions new`, `supabase functions serve`, `supabase functions deploy` | Use real function commands, not invented log commands | + +## Workspace Setup + +What the workspace starts with before the agent runs: + +- A pre-initialized `supabase/config.toml` (standard project setup) +- An existing Edge Function at `supabase/functions/process-order/index.ts` +- The agent is expected to create `CLI_REFERENCE.md` in the project root + +## Agent Task (PROMPT.md draft) + +The prompt to give the agent. Written as a developer would ask it — no hints +about what the tests check: + +> I'm onboarding a new developer to my Supabase project. Create a +> `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet +> of Supabase CLI commands we use day-to-day. It should cover: +> +> 1. Starting and stopping the local dev stack +> 2. Managing database migrations (push, reset, diff) +> 3. Working with the `process-order` Edge Function (local dev and deploy) +> 4. How to view Edge Function logs (both local dev and production) +> 5. How to run ad-hoc SQL queries against the database (local and remote) +> +> Include the actual commands with brief explanations. + +## Evaluation Criteria + +What the eval should assert on the agent's output. 
Each assertion tests a +specific quality signal: + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | CLI_REFERENCE.md exists | `CLI_REFERENCE.md` file exists in project root | structure | +| 2 | no hallucinated functions log command | File does NOT contain `supabase functions log` (without 's' as a complete command) | correctness | +| 3 | no hallucinated db query command | File does NOT contain `supabase db query` | correctness | +| 4 | mentions supabase functions serve for local | File contains `supabase functions serve` | correctness | +| 5 | mentions supabase functions deploy | File contains `supabase functions deploy` | correctness | +| 6 | mentions psql or connection string for SQL | File contains `psql` or `connection string` or `SQL Editor` or `db dump` | correctness | +| 7 | mentions supabase db push or reset | File contains `supabase db push` or `supabase db reset` | correctness | +| 8 | mentions supabase start | File contains `supabase start` | correctness | +| 9 | mentions Dashboard for production logs | File mentions `Dashboard` or `Logs Explorer` for production log viewing | correctness | + +## Reasoning + +Step-by-step reasoning for why this scenario is well-designed: + +1. **Baseline differentiator:** An agent without the skill is very likely to + hallucinate both `supabase functions log` and `supabase db query` since + these are plausible-sounding commands that follow the CLI's pattern. + Multiple real-world LLM responses have included these exact commands. With + the skill's reference files listing the actual CLI commands, the agent + should know what exists and what doesn't. + +2. **Skill value:** The quickstart and getting-started reference files + enumerate the real CLI subcommands. 
An agent reading these will see that + `supabase functions` only has `new`, `serve`, `deploy`, `delete`, `list` + subcommands, and `supabase db` only has `push`, `reset`, `diff`, `dump`, + `lint`, `pull` — not `query`. This directly prevents the hallucination. + +3. **Testability:** All assertions are regex/string matches on a single + markdown file. No runtime execution or migration parsing needed. Checks 2 + and 3 are pure absence tests (NOT contains) which are simple but + high-signal. + +4. **Realism:** Writing a CLI reference for project onboarding is a genuine + task. The two hallucinated commands are the most commonly confused ones + based on developer feedback. Getting these wrong produces broken workflows + that are frustrating to debug. + +## Difficulty + +**Rating:** EASY + +- Without skill: ~30-50% of assertions expected to pass (likely fails checks + 2 and/or 3 due to hallucination, may also miss Dashboard mention for logs) +- With skill: ~90-100% of assertions expected to pass +- **pass_threshold:** 9 diff --git a/packages/evals/scenarios/collaborative-rooms-realtime.md b/packages/evals/scenarios/collaborative-rooms-realtime.md index 2ea414d..fbc00eb 100644 --- a/packages/evals/scenarios/collaborative-rooms-realtime.md +++ b/packages/evals/scenarios/collaborative-rooms-realtime.md @@ -154,3 +154,4 @@ Step-by-step reasoning for why this scenario is well-designed: - Without skill: ~25-40% of assertions expected to pass - With skill: ~80-90% of assertions expected to pass +- **pass_threshold:** 17 diff --git a/packages/evals/scenarios/connection-pooling-prisma.md b/packages/evals/scenarios/connection-pooling-prisma.md new file mode 100644 index 0000000..540f96c --- /dev/null +++ b/packages/evals/scenarios/connection-pooling-prisma.md @@ -0,0 +1,80 @@ +# Scenario: connection-pooling-prisma + +## Summary + +The agent must produce a `DATABASE_URL` configuration and Prisma schema setup +that correctly uses Supabase's transaction-mode pooler (port 6543) with 
the +`?pgbouncer=true` parameter to disable prepared statements. Without this, Prisma +throws "prepared statement already exists" errors in serverless environments. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **Top troubleshooting entry** — "Error: prepared statement XXX already exists" + is listed in the Supabase troubleshooting guide under Database Issues as a + direct consequence of using transaction-mode pooling without disabling + prepared statements. + - Source: https://supabase.com/docs/guides/troubleshooting +2. **Serverless deployment reality** — Vercel and other serverless platforms + are the most popular Supabase deployment targets. Each function invocation + creates a new connection, making transaction-mode pooling mandatory. The + Prisma + Supabase combination is the most-searched configuration pairing. + - Source: https://supabase.com/docs/guides/database/connecting-to-postgres +3. **Connection exhaustion** — Using session mode (port 5432) in serverless + leads to "remaining connection slots are reserved" errors — another top + troubleshooting entry. The fix requires switching to port 6543. 
+ - Source: https://supabase.com/docs/guides/troubleshooting + +## Skill References Exercised + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-conn-pooling.md` | Transaction mode port 6543, pgbouncer=true for Prisma | Correct DATABASE_URL with port 6543 and ?pgbouncer=true | +| `references/db-migrations-idempotent.md` | Migration file conventions and naming | Migration file in supabase/migrations/ | +| `references/db-schema-auth-fk.md` | Schema best practices for user-linked tables | Proper FK patterns if schema is involved | + +## Workspace Setup + +- A workspace with `supabase/config.toml` already initialized +- A `prisma/schema.prisma` starter file with a placeholder `DATABASE_URL` using + direct connection (port 5432, no pgbouncer flag) + +## Agent Task (PROMPT.md draft) + +> I'm deploying my Supabase app on Vercel using Prisma. I keep getting +> "prepared statement already exists" errors in production. My current +> `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string. +> Fix the Prisma configuration so it works correctly with Supabase's connection +> pooler. + +## Evaluation Criteria + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | prisma schema references pooler port | `DATABASE_URL` or connection hint references port `6543` | correctness | +| 2 | pgbouncer=true param present | `?pgbouncer=true` or `pgbouncer=true` in the connection URL or env comment | correctness | +| 3 | DIRECT_URL provided for migrations | A separate `directUrl` or `DIRECT_URL` variable defined for Prisma migrations | correctness | +| 4 | connection limit set to 1 | `connection_limit=1` in the pooler URL or Prisma datasource | performance | +| 5 | explanation distinguishes port 6543 vs 5432 | Output or comments distinguish transaction mode (6543) from direct (5432) | correctness | + +## Reasoning + +1. 
**Baseline differentiator:** An agent without the skill typically updates + the port or adds pgbouncer but forgets `DIRECT_URL` for migrations, or sets + `max` connections too high, or uses session mode instead of transaction mode. +2. **Skill value:** `db-conn-pooling.md` provides the exact pattern: port 6543, + `?pgbouncer=true`, `max: 1` per serverless instance. +3. **Testability:** Port numbers and query parameters are directly readable as + string patterns in the output files. +4. **Realism:** "Prisma prepared statement already exists on Supabase" is one + of the most-searched Supabase error messages on Stack Overflow and GitHub. + +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~30% of assertions expected to pass (agent may change port but + likely misses pgbouncer param and DIRECT_URL) +- With skill: ~90% of assertions expected to pass +- **pass_threshold:** 4 diff --git a/packages/evals/scenarios/edge-function-hello-world.md b/packages/evals/scenarios/edge-function-hello-world.md index 719221c..9246f97 100644 --- a/packages/evals/scenarios/edge-function-hello-world.md +++ b/packages/evals/scenarios/edge-function-hello-world.md @@ -127,3 +127,4 @@ Step-by-step reasoning for why this scenario is well-designed: - Without skill: ~45-60% of assertions expected to pass - With skill: ~90-100% of assertions expected to pass +- **pass_threshold:** 13 diff --git a/packages/evals/scenarios/extension-wrong-schema.md b/packages/evals/scenarios/extension-wrong-schema.md new file mode 100644 index 0000000..e354e6e --- /dev/null +++ b/packages/evals/scenarios/extension-wrong-schema.md @@ -0,0 +1,89 @@ +# Scenario: extension-wrong-schema + +## Summary + +The agent must create a migration that enables the `pgvector` extension and +creates a `documents` table with a vector column and an HNSW index. The trap +is installing the extension in the `public` schema (the default) instead of +the `extensions` schema, and using IVFFlat without a `lists` parameter. 
+ +## Real-World Justification + +Why this is a common and important workflow: + +1. **Known schema pollution issue** — Installing extensions in `public` exposes + extension functions and types through the PostgREST API, which can reveal + internal details and cause "42501: permission denied" errors. The Supabase + troubleshooting guide covers permission errors as a category. + - Source: https://supabase.com/docs/guides/troubleshooting +2. **IVFFlat without lists = error** — The Supabase troubleshooting guide + contains a dedicated entry: "Increase vector lookup speeds by applying an + HNSW index" which warns against IVFFlat and notes its required `lists` + parameter. Missing this causes a CREATE INDEX error. + - Source: https://supabase.com/docs/guides/troubleshooting +3. **pgvector adoption** — Vector/AI embeddings are the fastest-growing + Supabase use case. Nearly every AI-powered Supabase project starts with + the pgvector extension setup. Getting the schema right from the start + prevents later schema drift. + - Source: https://supabase.com/docs/guides/database/extensions/pgvector + +## Skill References Exercised + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-schema-extensions.md` | Install extensions in `extensions` schema, not `public`; HNSW over IVFFlat; IVFFlat needs `lists` | `CREATE EXTENSION ... 
WITH SCHEMA extensions`; HNSW index | +| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS on embeddings table | +| `references/db-migrations-idempotent.md` | IF NOT EXISTS for extensions and tables | `CREATE EXTENSION IF NOT EXISTS` | +| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | User-linked embeddings | +| `references/db-rls-common-mistakes.md` | TO authenticated, subselect auth.uid() | Policy correctness | + +## Workspace Setup + +- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations) + +## Agent Task (PROMPT.md draft) + +> I'm building a semantic search feature. Create a migration that: +> 1. Enables the pgvector extension +> 2. Creates a `documents` table with an `embedding` column (1536 dimensions +> for OpenAI ada-002), a `content` text column, and a `user_id` +> 3. Adds a vector similarity search index +> 4. Users should only see their own documents +> Put the migration in `supabase/migrations/`. + +## Evaluation Criteria + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure | +| 2 | extension in extensions schema | `WITH SCHEMA extensions` in the CREATE EXTENSION statement | correctness | +| 3 | IF NOT EXISTS on extension | `CREATE EXTENSION IF NOT EXISTS` | idempotency | +| 4 | vector column with correct dimensions | `vector(1536)` or `extensions.vector(1536)` in table | correctness | +| 5 | HNSW index used not IVFFlat | `USING hnsw` present in CREATE INDEX | correctness | +| 6 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security | +| 7 | FK to auth.users with CASCADE | `REFERENCES auth.users ... 
ON DELETE CASCADE` | correctness | +| 8 | policies TO authenticated | `TO authenticated` in policy definitions | security | +| 9 | idempotent table creation | `CREATE TABLE IF NOT EXISTS` | idempotency | + +## Reasoning + +1. **Baseline differentiator:** Agents without the skill write `CREATE + EXTENSION vector;` (wrong schema), use IVFFlat (wrong index type for most + cases), and skip the `lists` parameter requirement. +2. **Skill value:** `db-schema-extensions.md` explicitly shows the `WITH + SCHEMA extensions` pattern and recommends HNSW over IVFFlat with the + specific note about `lists` being required for IVFFlat. +3. **Testability:** Schema placement in the extension creation line and index + type are directly checkable with regex. +4. **Realism:** pgvector + OpenAI embeddings is the top "AI + Supabase" + tutorial path, and extension schema mistakes are a documented source of + permission errors. + +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~35% of assertions expected to pass (extension enabled but + wrong schema, wrong index type, weak policies) +- With skill: ~90% of assertions expected to pass +- **pass_threshold:** 8 diff --git a/packages/evals/scenarios/postgrest-schema-cache.md b/packages/evals/scenarios/postgrest-schema-cache.md new file mode 100644 index 0000000..e07778c --- /dev/null +++ b/packages/evals/scenarios/postgrest-schema-cache.md @@ -0,0 +1,89 @@ +# Scenario: postgrest-schema-cache + +## Summary + +The agent must create a migration that adds new columns to an existing table +and create a view that uses those columns, including the correct `NOTIFY +pgrst, 'reload schema'` call to force PostgREST to pick up the schema changes. +Without this, the API returns 400 errors for the new columns even after +migration. + +## Real-World Justification + +Why this is a common and important workflow: + +1. 
**Direct troubleshooting entry** — "PostgREST not recognizing new columns, + tables, views or functions" and "Reload/refresh postgrest schema" (400 + bad_request error) are explicitly listed in the Supabase troubleshooting + guide. This is among the most confusing errors for new Supabase developers — + the migration ran successfully but the API still returns errors. + - Source: https://supabase.com/docs/guides/troubleshooting +2. **Schema cache invalidation** — PostgREST caches the database schema at + startup and reloads it only when notified. Migrations that add new objects + must explicitly call `NOTIFY pgrst, 'reload schema'` at the end of the + migration file for the changes to be reflected immediately in local + development. + - Source: https://supabase.com/docs/guides/api/rest/generating-types +3. **Views and RLS** — Creating a view over a user-owned table requires + understanding that RLS applies to the underlying tables, and the view itself + should use `security_invoker = true` to preserve RLS context. + - Source: https://supabase.com/docs/guides/database/views + +## Skill References Exercised + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-rls-views.md` | Views need security_invoker to respect RLS | `WITH (security_invoker = true)` on view | +| `references/db-migrations-idempotent.md` | ADD COLUMN IF NOT EXISTS; IF NOT EXISTS patterns | Idempotent column additions | +| `references/db-rls-mandatory.md` | RLS on base tables | RLS enabled on base table | +| `references/db-rls-performance.md` | (select auth.uid()) subselect | Subselect form in policies | +| `references/db-schema-timestamps.md` | timestamptz for new columns | timestamptz on added columns | + +## Workspace Setup + +- A workspace with `supabase/config.toml` and a starter migration that creates + a basic `products` table (id, name, price) with RLS enabled but no policies. 
+ +## Agent Task (PROMPT.md draft) + +> Our `products` table needs two new columns: `description` (text) and +> `published_at` (timestamp). Also create a view called `public_products` that +> shows only products where `published_at` is not null. Add a policy so any +> authenticated user can view published products. Put changes in a new +> migration file in `supabase/migrations/`. + +## Evaluation Criteria + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | new migration file exists | A second `.sql` file in `supabase/migrations/` | structure | +| 2 | ADD COLUMN IF NOT EXISTS for description | `ADD COLUMN IF NOT EXISTS description` | idempotency | +| 3 | ADD COLUMN IF NOT EXISTS for published_at | `ADD COLUMN IF NOT EXISTS published_at` | idempotency | +| 4 | published_at uses timestamptz | `published_at timestamptz` not plain `timestamp` | correctness | +| 5 | view created | `CREATE OR REPLACE VIEW public_products` or similar | correctness | +| 6 | view uses security_invoker | `security_invoker = true` on the view | security | +| 7 | SELECT policy on products for authenticated | A FOR SELECT policy on products with TO authenticated | security | +| 8 | NOTIFY pgrst reload present | `NOTIFY pgrst` in the migration | correctness | + +## Reasoning + +1. **Baseline differentiator:** Agents without the skill add columns correctly + but miss `IF NOT EXISTS`, use plain `timestamp`, forget `security_invoker` + on the view, and almost never include the `NOTIFY pgrst` call. +2. **Skill value:** The NOTIFY pattern and security_invoker requirement are + non-obvious details that the reference files teach explicitly. +3. **Testability:** `NOTIFY pgrst` is a unique string that either appears or + doesn't; `security_invoker` is similarly specific. +4. 
**Realism:** Iterative schema evolution (adding columns to existing tables) + is the most common database task after initial setup, and the PostgREST + cache invalidation issue is a universal source of confusion. + +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~40% of assertions expected to pass (columns added and view + created, but no IF NOT EXISTS, wrong timestamp type, no NOTIFY, no + security_invoker) +- With skill: ~88% of assertions expected to pass +- **pass_threshold:** 7 diff --git a/packages/evals/scenarios/rls-update-needs-select.md b/packages/evals/scenarios/rls-update-needs-select.md new file mode 100644 index 0000000..e39e4a8 --- /dev/null +++ b/packages/evals/scenarios/rls-update-needs-select.md @@ -0,0 +1,85 @@ +# Scenario: rls-update-needs-select + +## Summary + +The agent must write a migration for an `orders` table where users can view and +update only their own orders. The classic trap is writing an UPDATE policy +without a matching SELECT policy — causing UPDATE to silently affect zero rows +because RLS cannot find any rows to update. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **"Why is my UPDATE returning empty data?"** — The Supabase troubleshooting + guide lists "Why is my select returning an empty data array and I have data + in the table?" which is the same root symptom. UPDATE with no SELECT policy + silently returns `{data: [], count: 0}` with no error, making it extremely + hard to diagnose. + - Source: https://supabase.com/docs/guides/troubleshooting +2. **Documented RLS behavior** — The official RLS docs state that UPDATE + requires a SELECT policy to identify which rows are accessible for + modification. This is non-obvious and contradicts most developers' + expectations from SQL semantics. + - Source: https://supabase.com/docs/guides/database/postgres/row-level-security +3. 
**WITH CHECK requirement** — An UPDATE policy also needs a `WITH CHECK` + clause to prevent users from updating a row to a state that would no longer + be visible to them (e.g., changing their own `user_id`). Missing this allows + data ownership hijacking. + - Source: https://supabase.com/docs/guides/database/postgres/row-level-security + +## Skill References Exercised + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-rls-common-mistakes.md` | UPDATE needs a SELECT policy; WITH CHECK clause | Separate SELECT and UPDATE policies, WITH CHECK | +| `references/db-rls-policy-types.md` | USING vs WITH CHECK semantics | WITH CHECK on UPDATE policy | +| `references/db-rls-performance.md` | (select auth.uid()) subquery caching | Subselect form in all USING/WITH CHECK | +| `references/db-rls-mandatory.md` | Enable RLS, TO authenticated | Full mandatory boilerplate | +| `references/db-schema-timestamps.md` | timestamptz for time columns | timestamptz not timestamp | + +## Workspace Setup + +- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations) + +## Agent Task (PROMPT.md draft) + +> Create a migration for an `orders` table. Each order has a `status` (text), +> `total` (numeric), and `created_at`. Orders belong to users. Users should be +> able to view their own orders and update the status of their own orders. +> Put the migration in `supabase/migrations/`. 
+ +## Evaluation Criteria + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure | +| 2 | creates orders table | SQL contains `CREATE TABLE` and `orders` | correctness | +| 3 | RLS enabled | `ALTER TABLE orders ENABLE ROW LEVEL SECURITY` | security | +| 4 | has SELECT policy | A `FOR SELECT` policy exists on orders | correctness | +| 5 | has UPDATE policy with WITH CHECK | A `FOR UPDATE` policy with `WITH CHECK` clause exists | correctness | +| 6 | all policies TO authenticated | Every `CREATE POLICY` has `TO authenticated` | security | +| 7 | uses (select auth.uid()) | Subselect form in policy USING clauses | performance | +| 8 | uses timestamptz not timestamp | `created_at timestamptz` not plain `timestamp` | correctness | +| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness | + +## Reasoning + +1. **Baseline differentiator:** Without the skill, agents write only an UPDATE + policy (or a single ALL policy), skip WITH CHECK, and use bare `auth.uid()` + calls. The result is a migration that looks complete but breaks silently. +2. **Skill value:** `db-rls-common-mistakes.md` explicitly covers this + UPDATE-needs-SELECT behavior with working examples. +3. **Testability:** The presence of both `FOR SELECT` and `FOR UPDATE` with + `WITH CHECK` is directly detectable via regex on the SQL. +4. **Realism:** "My UPDATE isn't working, returns empty" is among the most + common questions from developers new to RLS in the Supabase community. 
+ +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~40% of assertions expected to pass (table and RLS likely, + but wrong policy structure) +- With skill: ~92% of assertions expected to pass +- **pass_threshold:** 8 diff --git a/packages/evals/scenarios/rls-user-metadata-role-check.md b/packages/evals/scenarios/rls-user-metadata-role-check.md new file mode 100644 index 0000000..5eeeea0 --- /dev/null +++ b/packages/evals/scenarios/rls-user-metadata-role-check.md @@ -0,0 +1,85 @@ +# Scenario: rls-user-metadata-role-check + +## Summary + +The agent must write a migration for a `documents` table where admin users can +read all documents and regular users can only read their own. The dangerous +trap is checking `user_metadata` for the admin role — users can write to their +own `user_metadata`, so this check is bypassable. The correct pattern uses +`app_metadata`. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **Explicit troubleshooting + security entry** — The Supabase troubleshooting + guide covers "Database API 42501 errors" related to auth claims and RLS. + Using user_metadata for authorization is one of the most dangerous patterns, + documented as a common mistake in the Supabase RLS guides. + - Source: https://supabase.com/docs/guides/troubleshooting +2. **Privilege escalation vulnerability** — Any authenticated user can call + `supabase.auth.updateUser({ data: { role: 'admin' } })` to set their own + `user_metadata`. An RLS policy checking `user_metadata->>'role' = 'admin'` + gives every user admin access to all documents. + - Source: https://supabase.com/docs/guides/database/postgres/row-level-security +3. **app_metadata is server-only** — `app_metadata` can only be set via the + Admin API or auth hooks, making it safe for authorization. This distinction + is taught in the skill but frequently missed by developers. 
+ - Source: https://supabase.com/docs/guides/auth/managing-user-data + +## Skill References Exercised + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-rls-common-mistakes.md` | app_metadata not user_metadata for authorization | `auth.jwt() -> 'app_metadata' ->> 'role'` | +| `references/db-rls-policy-types.md` | PERMISSIVE policies combine with OR; multiple policies for different roles | Separate owner and admin policies | +| `references/db-rls-performance.md` | (select auth.uid()) subquery; (select auth.jwt()) caching | Subselect form for JWT lookups | +| `references/db-rls-mandatory.md` | RLS enabled, TO authenticated | Full boilerplate | +| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | Correct user linkage | + +## Workspace Setup + +- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations) + +## Agent Task (PROMPT.md draft) + +> Create a migration for a `documents` table. Each document has a `title` +> (text), `content` (text), and an owner. Regular users can only see their own +> documents. Admin users (identified by a role field in their JWT) should be +> able to see all documents. Put the migration in `supabase/migrations/`. 
+ +## Evaluation Criteria + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure | +| 2 | creates documents table | SQL contains `CREATE TABLE` and `documents` | correctness | +| 3 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security | +| 4 | uses app_metadata not user_metadata | JWT role check uses `app_metadata` not `user_metadata` | security | +| 5 | no user_metadata role check | `user_metadata` does not appear in policy USING clauses | security | +| 6 | two separate policies or one covering both | At least one SELECT policy for owner and one for admin role | correctness | +| 7 | TO authenticated on all policies | `TO authenticated` in every policy | security | +| 8 | (select auth.uid()) subselect form | Subselect form used not bare auth.uid() | performance | +| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness | + +## Reasoning + +1. **Baseline differentiator:** Agents without the skill almost universally + reach for `user_metadata` when asked about "a role field in their JWT" — + it is the more discoverable but dangerous field. Only the skill explicitly + flags this as an authorization anti-pattern. +2. **Skill value:** `db-rls-common-mistakes.md` section 2 directly addresses + this with the exact `app_metadata` pattern. +3. **Testability:** Checking for `app_metadata` presence and `user_metadata` + absence in policy USING clauses is a precise regex assertion. +4. **Realism:** Role-based access in a multi-tenant app is one of the most + common RLS patterns requested, and the metadata confusion is universal. 
+ +## Difficulty + +**Rating:** MEDIUM + +- Without skill: ~30% of assertions expected to pass (table and RLS likely, + but user_metadata used, subselect missing) +- With skill: ~90% of assertions expected to pass +- **pass_threshold:** 8 diff --git a/packages/evals/scenarios/service-role-edge-function.md b/packages/evals/scenarios/service-role-edge-function.md new file mode 100644 index 0000000..11ddbc8 --- /dev/null +++ b/packages/evals/scenarios/service-role-edge-function.md @@ -0,0 +1,86 @@ +# Scenario: service-role-edge-function + +## Summary + +The agent must create a simple Edge Function that performs an admin operation +(listing all users' records) using the service role key server-side, while +a companion migration shows the table uses the anon key for browser access. +The trap is accidentally exposing the service role key or using it in +client-facing code. + +## Real-World Justification + +Why this is a common and important workflow: + +1. **Dedicated troubleshooting entry** — The Supabase troubleshooting guide + contains "Why is my service role key client getting RLS errors or not + returning data?" — developers incorrectly use the service role key in + contexts where it should not be used, or use the anon key where service role + is needed. + - Source: https://supabase.com/docs/guides/troubleshooting +2. **Most dangerous Supabase mistake** — Exposing the service role key in + browser JavaScript bypasses all RLS and gives every visitor full database + access. This appears in multiple Supabase blog posts and community warnings. + - Source: https://supabase.com/docs/guides/api/api-keys +3. **Environment variable leakage** — The troubleshooting guide warns about + "Inspecting edge function environment variables" as a debugging topic. + Developers must use `Deno.env.get()` not hardcoded keys, and never use + `NEXT_PUBLIC_` prefix for the service role key. 
+ - Source: https://supabase.com/docs/guides/troubleshooting + +## Skill References Exercised + +| Reference File | What It Teaches | What the Agent Should Apply | +|---|---|---| +| `references/db-security-service-role.md` | Never expose service role key in browser, use env vars | `Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')` in edge function | +| `references/edge-fun-quickstart.md` | Edge function file structure and exports | Correct `index.ts` in `supabase/functions/` | +| `references/edge-db-supabase-client.md` | Creating supabase client in edge functions | `createClient` with service role for admin ops | +| `references/edge-pat-cors.md` | CORS headers for browser requests | CORS on the response | +| `references/edge-pat-error-handling.md` | Error responses | Proper error handling | + +## Workspace Setup + +- Empty workspace with a pre-initialized `supabase/config.toml` +- A migration creating a `reports` table already exists in `supabase/migrations/` + +## Agent Task (PROMPT.md draft) + +> Create an Edge Function called `admin-reports` that returns all rows from +> the `reports` table, bypassing RLS (it's an admin-only endpoint). The +> function should be in `supabase/functions/admin-reports/index.ts`. Use +> environment variables for any keys — do not hardcode them. 
+ +## Evaluation Criteria + +| # | Test Name | What It Checks | Quality Dimension | +|---|-----------|----------------|-------------------| +| 1 | edge function file exists | `supabase/functions/admin-reports/index.ts` exists | structure | +| 2 | uses Deno.env.get for service key | `Deno.env.get` used to retrieve the service role key | security | +| 3 | no hardcoded service role key | No JWT-like string literal (`eyJ`) as the service role value | security | +| 4 | createClient called with service role | `createClient` receives the service role env var as second arg | correctness | +| 5 | service role key not NEXT_PUBLIC prefixed | No `NEXT_PUBLIC_` prefix on service role variable name | security | +| 6 | CORS headers present | `Access-Control-Allow-Origin` in response headers | correctness | +| 7 | returns JSON response | `Response` with JSON body and content-type | correctness | + +## Reasoning + +1. **Baseline differentiator:** Agents without the skill sometimes hardcode a + placeholder key string, forget CORS, or use the wrong env variable name + pattern. +2. **Skill value:** `db-security-service-role.md` is explicit about env var + naming rules and the `NEXT_PUBLIC_` anti-pattern. `edge-fun-quickstart.md` + teaches the Deno.env.get pattern. +3. **Testability:** Checking for `eyJ` hardcoded strings and `NEXT_PUBLIC_` + prefixes are reliable negative assertions. `Deno.env.get` is a positive + string check. +4. **Realism:** Admin Edge Functions that bypass RLS are an extremely common + pattern for dashboards and data exports. 
+ +## Difficulty + +**Rating:** EASY + +- Without skill: ~50% of assertions expected to pass (file exists, createClient + present, but key handling likely wrong) +- With skill: ~93% of assertions expected to pass +- **pass_threshold:** 6 diff --git a/packages/evals/scenarios/storage-rls-user-folders.md b/packages/evals/scenarios/storage-rls-user-folders.md index ae953fa..17673ed 100644 --- a/packages/evals/scenarios/storage-rls-user-folders.md +++ b/packages/evals/scenarios/storage-rls-user-folders.md @@ -141,4 +141,5 @@ Step-by-step reasoning for why this scenario is well-designed: **Rating:** MEDIUM - Without skill: ~30-45% of assertions expected to pass -- With skill: ~85-95% of assertions expected to pass \ No newline at end of file +- With skill: ~85-95% of assertions expected to pass +- **pass_threshold:** 14 \ No newline at end of file diff --git a/packages/evals/scenarios/team-rls-security-definer.md b/packages/evals/scenarios/team-rls-security-definer.md index ec9ad2a..0feb7d8 100644 --- a/packages/evals/scenarios/team-rls-security-definer.md +++ b/packages/evals/scenarios/team-rls-security-definer.md @@ -100,8 +100,9 @@ specific quality signal: | 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance | | 13 | uses timestamptz | No plain `timestamp` for time columns | correctness | | 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... 
IF EXISTS` patterns | idempotency | -| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security | -| 16 | overall quality score | At least 10/14 best-practice signals present | overall | +| 15 | stable or immutable on helper function | Helper function marked STABLE or IMMUTABLE for performance | performance | +| 16 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security | +| 17 | overall quality score | At least 11/15 best-practice signals present | overall | ## Reasoning @@ -136,4 +137,5 @@ Step-by-step reasoning for why this scenario is well-designed: **Rating:** MEDIUM - Without skill: ~35-50% of assertions expected to pass -- With skill: ~85-95% of assertions expected to pass \ No newline at end of file +- With skill: ~85-95% of assertions expected to pass +- **pass_threshold:** 13 \ No newline at end of file diff --git a/packages/evals/src/eval-types.ts b/packages/evals/src/eval-types.ts new file mode 100644 index 0000000..6901109 --- /dev/null +++ b/packages/evals/src/eval-types.ts @@ -0,0 +1,21 @@ +/** + * A single assertion to run against the agent's workspace output. + * + * Used by EVAL.ts files to declare what the agent's work should produce. + * The runner executes these in-process (no test framework required). 
+ */ +export interface EvalAssertion { + /** Human-readable name shown in Braintrust and local output */ + name: string; + /** Return true = pass, false/throw = fail */ + check: () => boolean | Promise<boolean>; + /** Timeout in ms for async checks (default: no timeout) */ + timeout?: number; +} + +/** Result of running a single EvalAssertion */ +export interface AssertionResult { + name: string; + passed: boolean; + error?: string; +} diff --git a/packages/evals/src/runner.ts b/packages/evals/src/runner.ts index cb41340..3396f02 100644 --- a/packages/evals/src/runner.ts +++ b/packages/evals/src/runner.ts @@ -1,11 +1,8 @@ import { existsSync, readdirSync, readFileSync } from "node:fs"; import { join, resolve } from "node:path"; +import type { AssertionResult, EvalAssertion } from "./eval-types.js"; import { runAgent } from "./runner/agent.js"; -import { - initBraintrustLogger, - logScenarioToLogger, - uploadToBraintrust, -} from "./runner/braintrust.js"; +import { uploadToBraintrust } from "./runner/braintrust.js"; import { createResultDir, saveRunArtifacts } from "./runner/persist.js"; import { preflight } from "./runner/preflight.js"; import { listModifiedFiles, printSummary } from "./runner/results.js"; @@ -22,7 +19,6 @@ import { startSupabase, stopSupabase, } from "./runner/supabase-setup.js"; -import { runTests } from "./runner/test.js"; import { buildTranscriptSummary, type TranscriptSummary, @@ -92,6 +88,40 @@ function getPassThreshold(scenarioId: string): number | null { return match ?
Number.parseInt(match[1], 10) : null; } +// --------------------------------------------------------------------------- +// In-process assertion runner (replaces vitest subprocess) +// --------------------------------------------------------------------------- + +async function runAssertions( + assertions: EvalAssertion[], +): Promise<AssertionResult[]> { + return Promise.all( + assertions.map(async (a) => { + try { + let result: boolean; + if (a.timeout) { + const timeoutPromise = new Promise<never>((_, reject) => + setTimeout( + () => + reject(new Error(`Assertion timed out after ${a.timeout}ms`)), + a.timeout, + ), + ); + result = await Promise.race([ + Promise.resolve(a.check()), + timeoutPromise, + ]); + } else { + result = await Promise.resolve(a.check()); + } + return { name: a.name, passed: Boolean(result) }; + } catch (e) { + return { name: a.name, passed: false, error: String(e) }; + } + }), + ); +} + // --------------------------------------------------------------------------- // Run a single eval // --------------------------------------------------------------------------- @@ -106,18 +136,28 @@ async function runEval( console.log(`\n--- ${scenario.id} (${variant}) ---`); + // Load assertions and expected reference files from EVAL.ts + const evalFilePath = existsSync(join(evalDir, "EVAL.tsx")) + ? join(evalDir, "EVAL.tsx") + : join(evalDir, "EVAL.ts"); + + const { + assertions = [] as EvalAssertion[], + expectedReferenceFiles = [] as string[], + } = await import(evalFilePath).catch(() => ({ + assertions: [] as EvalAssertion[], + expectedReferenceFiles: [] as string[], + })); + + const passThreshold = getPassThreshold(scenario.id); + const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim(); + // 1. Create isolated workspace - const { workspacePath, cleanup } = createWorkspace({ - evalDir, - skillEnabled, - }); + const { workspacePath, cleanup } = createWorkspace({ evalDir, skillEnabled }); console.log(` Workspace: ${workspacePath}`); try { - // 2. 
Read the prompt - const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim(); - - // 3. Run the agent + // 2. Run the agent console.log(` Running agent (${model})...`); const startedAt = Date.now(); const agentResult = await runAgent({ @@ -132,54 +172,48 @@ async function runEval( ` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`, ); - // 4. Run the hidden tests - const evalFilePath = existsSync(join(evalDir, "EVAL.tsx")) - ? join(evalDir, "EVAL.tsx") - : join(evalDir, "EVAL.ts"); - - const passThreshold = getPassThreshold(scenario.id); - - console.log(" Running tests..."); - const testResult = await runTests({ - workspacePath, - evalFilePath, - passThreshold: passThreshold ?? undefined, + // 3. Run assertions in-process from the workspace directory so that + // eval-utils.ts helpers resolve paths relative to the workspace. + console.log(" Running assertions..."); + const prevCwd = process.cwd(); + process.chdir(workspacePath); + const assertionResults = await runAssertions(assertions).finally(() => { + process.chdir(prevCwd); }); + const passedCount = assertionResults.filter((a) => a.passed).length; + const totalCount = assertionResults.length; + + const passed = passThreshold + ? totalCount > 0 && passedCount >= passThreshold + : totalCount > 0 && passedCount === totalCount; const pct = - testResult.totalCount > 0 - ? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1) - : "0.0"; + totalCount > 0 ? ((passedCount / totalCount) * 100).toFixed(1) : "0.0"; const thresholdInfo = passThreshold - ? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%` + ? `, threshold: ${((passThreshold / totalCount) * 100).toFixed(0)}%` : ""; console.log( - ` Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`, + ` Assertions: ${passedCount}/${totalCount} passed (${pct}%${thresholdInfo})`, ); - // 5. Collect modified files + // 4. 
Collect modified files const filesModified = listModifiedFiles(workspacePath, evalDir); - // 6. Build transcript summary + // 5. Build transcript summary const summary = buildTranscriptSummary(agentResult.events); - // 7. Load expectedReferenceFiles from EVAL.ts (if declared) - const { expectedReferenceFiles = [] } = await import(evalFilePath).catch( - () => ({ expectedReferenceFiles: [] as string[] }), - ); - - // 8. Run scorers + // 6. Run scorers const skillScore = skillUsageScorer(summary, skillName); const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles); const assertScore = assertionsPassedScorer({ - testsPassed: testResult.passedCount, - testsTotal: testResult.totalCount, - status: testResult.passed ? "passed" : "failed", + testsPassed: passedCount, + testsTotal: totalCount, + status: passed ? "passed" : "failed", } as EvalRunResult); const finalScore = finalResultScorer({ - status: testResult.passed ? "passed" : "failed", - testsPassed: testResult.passedCount, - testsTotal: testResult.totalCount, + status: passed ? "passed" : "failed", + testsPassed: passedCount, + testsTotal: totalCount, passThreshold: passThreshold ?? undefined, } as EvalRunResult); @@ -188,18 +222,17 @@ async function runEval( agent: "claude-code", model, skillEnabled, - status: testResult.passed ? "passed" : "failed", + status: passed ? "passed" : "failed", duration: agentResult.duration, - testOutput: testResult.output, agentOutput: agentResult.output, - testsPassed: testResult.passedCount, - testsTotal: testResult.totalCount, + testsPassed: passedCount, + testsTotal: totalCount, passThreshold: passThreshold ?? undefined, + assertionResults, filesModified, toolCallCount: summary.toolCalls.length, costUsd: summary.totalCostUsd ?? 
undefined, prompt, - individualTests: testResult.individualTests, startedAt, durationApiMs: summary.totalDurationApiMs, totalInputTokens: summary.totalInputTokens, @@ -225,7 +258,7 @@ async function runEval( saveRunArtifacts({ resultDir, rawTranscript: agentResult.rawTranscript, - testOutput: testResult.output, + assertionResults, result, transcriptSummary: summary, }); @@ -241,7 +274,6 @@ async function runEval( skillEnabled, status: "error", duration: 0, - testOutput: "", agentOutput: "", testsPassed: 0, testsTotal: 0, @@ -281,7 +313,7 @@ async function main() { startSupabase(); const keys = getKeys(); - // Inject keys into process.env so EVAL.ts tests can connect to the real DB. + // Inject keys into process.env so assertions can connect to the real DB. process.env.SUPABASE_URL = keys.apiUrl; process.env.SUPABASE_ANON_KEY = keys.anonKey; process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey; @@ -291,7 +323,6 @@ async function main() { const transcripts = new Map(); const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true"; - const logger = braintrustUpload ? initBraintrustLogger() : undefined; try { for (const scenario of scenarios) { @@ -304,15 +335,9 @@ async function main() { if (transcript) { transcripts.set(result.scenario, transcript); } - - // Log immediately after each scenario for real-time visibility. 
- if (logger) { - logScenarioToLogger(logger, result, transcript); - } } } finally { stopSupabase(); - await logger?.flush(); } // Use the results dir from the first result (all share the same timestamp) diff --git a/packages/evals/src/runner/braintrust.ts b/packages/evals/src/runner/braintrust.ts index ad5f5db..0504175 100644 --- a/packages/evals/src/runner/braintrust.ts +++ b/packages/evals/src/runner/braintrust.ts @@ -70,7 +70,7 @@ export function logScenarioToLogger( status: r.status, agentOutput: r.agentOutput, filesModified: r.filesModified, - testOutput: r.testOutput, + assertionResults: r.assertionResults, }, expected: { testsTotal: r.testsTotal }, scores, @@ -106,7 +106,7 @@ export function logScenarioToLogger( status: r.status, agentOutput: r.agentOutput, filesModified: r.filesModified, - testOutput: r.testOutput, + assertionResults: r.assertionResults, }, expected: { testsTotal: r.testsTotal }, scores, @@ -121,7 +121,7 @@ export function logScenarioToLogger( * * Each EvalRunResult becomes a row in the experiment with: * - input: scenario ID, prompt content, skillEnabled flag - * - output: status, agent output, files modified, test output + * - output: status, agent output, files modified, assertion results * - expected: total tests, pass threshold * - scores: skill_usage, reference_files_usage, assertions_passed, final_result * - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost @@ -172,7 +172,7 @@ export async function uploadToBraintrust( status: r.status, agentOutput: r.agentOutput, filesModified: r.filesModified, - testOutput: r.testOutput, + assertionResults: r.assertionResults, }; const expected = { diff --git a/packages/evals/src/runner/persist.ts b/packages/evals/src/runner/persist.ts index 6694efb..312b1b8 100644 --- a/packages/evals/src/runner/persist.ts +++ b/packages/evals/src/runner/persist.ts @@ -1,6 +1,7 @@ import { mkdirSync, writeFileSync } from "node:fs"; import { 
dirname, join } from "node:path"; import { fileURLToPath } from "node:url"; +import type { AssertionResult } from "../eval-types.js"; import type { EvalRunResult } from "../types.js"; import type { TranscriptSummary } from "./transcript.js"; @@ -32,7 +33,7 @@ export function createResultDir( export function saveRunArtifacts(opts: { resultDir: string; rawTranscript: string; - testOutput: string; + assertionResults: AssertionResult[]; result: EvalRunResult; transcriptSummary: TranscriptSummary; }): void { @@ -43,8 +44,8 @@ export function saveRunArtifacts(opts: { ); writeFileSync( - join(opts.resultDir, "test-output.txt"), - opts.testOutput, + join(opts.resultDir, "assertions.json"), + JSON.stringify(opts.assertionResults, null, 2), "utf-8", ); diff --git a/packages/evals/src/runner/scorers.ts b/packages/evals/src/runner/scorers.ts index d9c9a3c..0b54173 100644 --- a/packages/evals/src/runner/scorers.ts +++ b/packages/evals/src/runner/scorers.ts @@ -63,7 +63,7 @@ export function referenceFilesUsageScorer( } /** - * assertionsPassedScorer — ratio of vitest assertions passed vs total. + * assertionsPassedScorer — ratio of assertions passed vs total. 
*/ export function assertionsPassedScorer(result: EvalRunResult): ScoreResult { const score = diff --git a/packages/evals/src/runner/test.ts b/packages/evals/src/runner/test.ts deleted file mode 100644 index aeb2fa3..0000000 --- a/packages/evals/src/runner/test.ts +++ /dev/null @@ -1,143 +0,0 @@ -import { execFile } from "node:child_process"; -import { copyFileSync, existsSync, writeFileSync } from "node:fs"; -import { dirname, join } from "node:path"; -import { fileURLToPath } from "node:url"; -import { promisify } from "node:util"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -const exec = promisify(execFile); - -export interface TestResult { - passed: boolean; - output: string; - /** Number of tests that passed */ - passedCount: number; - /** Total number of tests */ - totalCount: number; - /** Per-test pass/fail extracted from vitest verbose output */ - individualTests: Record; -} - -/** - * Run the hidden EVAL.ts tests against the agent's workspace. - * - * 1. Copy EVAL.ts into the workspace (agent is done, safe to expose) - * 2. Run vitest against it - * 3. Parse the output for pass/fail - */ -export async function runTests(opts: { - workspacePath: string; - evalFilePath: string; - passThreshold?: number; -}): Promise { - // Copy the hidden test file into the workspace - const evalFileName = opts.evalFilePath.endsWith(".tsx") - ? "EVAL.tsx" - : "EVAL.ts"; - const destPath = join(opts.workspacePath, evalFileName); - copyFileSync(opts.evalFilePath, destPath); - - // Copy shared eval-utils.ts if it exists alongside the eval scenarios - const evalUtilsSrc = join( - dirname(dirname(opts.evalFilePath)), - "eval-utils.ts", - ); - if (existsSync(evalUtilsSrc)) { - copyFileSync(evalUtilsSrc, join(opts.workspacePath, "eval-utils.ts")); - } - - // Write a minimal vitest config that overrides the default include pattern - // so EVAL.ts (without .test. or .spec.) is picked up. 
- const vitestConfigPath = join(opts.workspacePath, "vitest.config.mjs"); - if (!existsSync(vitestConfigPath)) { - // Alias ../eval-utils.ts → ./eval-utils.ts so the import resolves in - // the flat workspace (source tree has EVAL.ts one level deeper). - const evalUtilsDest = join(opts.workspacePath, "eval-utils.ts"); - const aliasBlock = existsSync(evalUtilsDest) - ? `resolve: { alias: { "../eval-utils.ts": "./eval-utils.ts" } },` - : ""; - writeFileSync( - vitestConfigPath, - `export default { ${aliasBlock} test: { include: ["EVAL.{ts,tsx}"] } };\n`, - ); - } - - // Use the vitest binary from the evals package (always available) - const evalsVitest = join( - __dirname, - "..", - "..", - "node_modules", - ".bin", - "vitest", - ); - const vitestBin = join(opts.workspacePath, "node_modules", ".bin", "vitest"); - const cmd = existsSync(vitestBin) ? vitestBin : evalsVitest; - const args = ["run", evalFileName, "--reporter=verbose", "--no-color"]; - - try { - const { stdout, stderr } = await exec(cmd, args, { - cwd: opts.workspacePath, - timeout: 60_000, - env: { ...process.env }, - maxBuffer: 5 * 1024 * 1024, - }); - - const output = `${stdout}\n${stderr}`; - return parseTestOutput(output, opts.passThreshold); - } catch (error) { - const err = error as Error & { stdout?: string; stderr?: string }; - const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`; - return parseTestOutput(output, opts.passThreshold); - } -} - -/** - * Extract per-test pass/fail from vitest verbose output. 
- * - * Vitest verbose format: - * ✓ EVAL.ts > test name here 0ms → passed - * × EVAL.ts > test name here 2ms → failed - */ -function parseIndividualTests(output: string): Record { - const results: Record = {}; - const re = /[✓×]\s+EVAL\.tsx?\s+>\s+(.+?)\s+\d+ms/g; - for (const match of output.matchAll(re)) { - const testName = match[1].trim(); - const didPass = output[match.index] === "✓"; - results[testName] = didPass; - } - return results; -} - -function parseTestOutput(output: string, passThreshold?: number): TestResult { - // Parse vitest output for pass/fail counts - // Vitest formats: - // All passing: "Tests N passed (N)" - // Mixed: "Tests N failed | M passed (T)" - // All failing: "Tests N failed (N)" - const mixedOrPassing = output.match( - /Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/, - ); - const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/); - - let passedCount = 0; - let totalCount = 0; - - if (mixedOrPassing) { - passedCount = Number.parseInt(mixedOrPassing[2], 10); - totalCount = Number.parseInt(mixedOrPassing[3], 10); - } else if (allFailing) { - passedCount = 0; - totalCount = Number.parseInt(allFailing[2], 10); - } - - const passed = passThreshold - ? 
totalCount > 0 && passedCount >= passThreshold - : totalCount > 0 && passedCount === totalCount; - const individualTests = parseIndividualTests(output); - - return { passed, output, passedCount, totalCount, individualTests }; -} diff --git a/packages/evals/src/types.ts b/packages/evals/src/types.ts index 029667a..c9c9971 100644 --- a/packages/evals/src/types.ts +++ b/packages/evals/src/types.ts @@ -1,3 +1,5 @@ +import type { AssertionResult } from "./eval-types.js"; + export interface EvalScenario { /** Directory name under evals/ */ id: string; @@ -23,14 +25,17 @@ export interface EvalRunResult { skillEnabled: boolean; status: "passed" | "failed" | "error"; duration: number; - testOutput: string; + /** Raw test runner output (for debugging) */ + testOutput?: string; agentOutput: string; - /** Number of vitest tests that passed */ + /** Number of assertions that passed */ testsPassed: number; - /** Total number of vitest tests */ + /** Total number of assertions */ testsTotal: number; /** Minimum tests required to pass (from scenario config) */ passThreshold?: number; + /** Per-assertion pass/fail results */ + assertionResults?: AssertionResult[]; /** Files the agent created or modified in the workspace */ filesModified: string[]; error?: string; @@ -42,8 +47,6 @@ export interface EvalRunResult { costUsd?: number; /** The PROMPT.md content sent to the agent */ prompt?: string; - /** Per-test pass/fail results from vitest */ - individualTests?: Record; /** Epoch ms when the agent run started (for Braintrust span timing) */ startedAt?: number; /** API-only latency in ms (excludes local processing overhead) */