mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
replace vitest for braintrust assertions
This commit is contained in:
85
packages/evals/evals/auth-fk-cascade-delete/EVAL.ts
Normal file
85
packages/evals/evals/auth-fk-cascade-delete/EVAL.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-schema-auth-fk.md",
|
||||
"db-security-functions.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates profiles table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /profiles/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK references auth.users",
|
||||
check: () =>
|
||||
/references\s+auth\.users/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "ON DELETE CASCADE present",
|
||||
check: () => /on\s+delete\s+cascade/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on profiles",
|
||||
check: () =>
|
||||
/alter\s+table.*profiles.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "trigger function uses SECURITY DEFINER",
|
||||
check: () => /security\s+definer/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "trigger function sets search_path",
|
||||
check: () =>
|
||||
/set\s+search_path\s*=\s*''/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "trigger created on auth.users",
|
||||
check: () =>
|
||||
/create\s+trigger[\s\S]*?on\s+auth\.users/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "policies scoped to authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
/alter\s+table.*profiles.*enable\s+row\s+level\s+security/.test(sql),
|
||||
/security\s+definer/.test(sql),
|
||||
/set\s+search_path\s*=\s*''/.test(sql),
|
||||
/create\s+trigger[\s\S]*?on\s+auth\.users/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
7
packages/evals/evals/auth-fk-cascade-delete/PROMPT.md
Normal file
7
packages/evals/evals/auth-fk-cascade-delete/PROMPT.md
Normal file
@@ -0,0 +1,7 @@
|
||||
I'm building a Supabase app and need to set up a `profiles` table. Every user who signs up should automatically get a profile row containing their `id`, `email`, and `full_name` (pulled from signup metadata).
|
||||
|
||||
Please create a SQL migration in `supabase/migrations/` that:
|
||||
|
||||
1. Creates the `profiles` table linked to Supabase Auth users
|
||||
2. Sets up a trigger so a profile row is created automatically whenever a new user signs up
|
||||
3. Enables Row Level Security so users can only read and update their own profile
|
||||
5
packages/evals/evals/auth-fk-cascade-delete/package.json
Normal file
5
packages/evals/evals/auth-fk-cascade-delete/package.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "auth-fk-cascade-delete",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
111
packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml
Normal file
111
packages/evals/evals/auth-fk-cascade-delete/supabase/config.toml
Normal file
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "auth-fk-cascade-delete"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
@@ -1,97 +1,150 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"dev-getting-started.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-migrations-idempotent.md",
|
||||
];
|
||||
|
||||
import { existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { expect, test } from "vitest";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import {
|
||||
anonSeeesNoRows,
|
||||
findMigrationFiles,
|
||||
getMigrationSQL,
|
||||
supabaseDir,
|
||||
getSupabaseDir,
|
||||
queryTable,
|
||||
tableExists,
|
||||
} from "../eval-utils.ts";
|
||||
|
||||
test("supabase project initialized (config.toml exists)", () => {
|
||||
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true);
|
||||
});
|
||||
|
||||
test("migration file exists in supabase/migrations/", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates tasks table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table/);
|
||||
expect(sql).toMatch(/tasks/);
|
||||
});
|
||||
|
||||
test("enables RLS on tasks table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/alter\s+table.*tasks.*enable\s+row\s+level\s+security/);
|
||||
});
|
||||
|
||||
test("has foreign key to auth.users", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
});
|
||||
|
||||
test("uses ON DELETE CASCADE for auth FK", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("uses (select auth.uid()) not bare auth.uid() in policies", () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
// The subselect form: (select auth.uid())
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("policies use TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
expect(policy).toMatch(/to\s+authenticated/);
|
||||
}
|
||||
});
|
||||
|
||||
test("uses timestamptz not plain timestamp for time columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
// Only fail if the migration defines time columns with plain timestamp
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("due_date")
|
||||
) {
|
||||
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||
}
|
||||
});
|
||||
|
||||
test("creates index on user_id column", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
expect(sql).toMatch(/user_id/);
|
||||
});
|
||||
|
||||
test("migration is idempotent (uses IF NOT EXISTS)", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("overall quality: demonstrates Supabase best practices", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// A high-quality migration should contain most of these patterns
|
||||
const signals = [
|
||||
/enable\s+row\s+level\s+security/,
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
/to\s+authenticated/,
|
||||
/on\s+delete\s+cascade/,
|
||||
/create\s+index/,
|
||||
];
|
||||
const matches = signals.filter((r) => r.test(sql));
|
||||
expect(matches.length).toBeGreaterThanOrEqual(4);
|
||||
});
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "supabase project initialized (config.toml exists)",
|
||||
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
|
||||
},
|
||||
{
|
||||
name: "migration file exists in supabase/migrations/",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates tasks table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /tasks/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "enables RLS on tasks table",
|
||||
check: () =>
|
||||
/alter\s+table.*tasks.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "has foreign key to auth.users",
|
||||
check: () =>
|
||||
/references\s+auth\.users/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "uses ON DELETE CASCADE for auth FK",
|
||||
check: () => /on\s+delete\s+cascade/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "uses (select auth.uid()) not bare auth.uid() in policies",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz not plain timestamp for time columns",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("due_date")
|
||||
) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates index on user_id column",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+index/.test(sql) && /user_id/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "does not use SERIAL or BIGSERIAL for primary key",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return !/\bserial\b/.test(sql) && !/\bbigserial\b/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "migration is idempotent (uses IF NOT EXISTS)",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const signals = [
|
||||
/enable\s+row\s+level\s+security/,
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
/to\s+authenticated/,
|
||||
/on\s+delete\s+cascade/,
|
||||
/create\s+index/,
|
||||
];
|
||||
return signals.filter((r) => r.test(sql)).length >= 4;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tasks table exists in the database after migration",
|
||||
check: () => tableExists("tasks"),
|
||||
timeout: 10_000,
|
||||
},
|
||||
{
|
||||
name: "tasks table is queryable with service role",
|
||||
check: async () => {
|
||||
const { error } = await queryTable("tasks", "service_role");
|
||||
return error === null;
|
||||
},
|
||||
timeout: 10_000,
|
||||
},
|
||||
{
|
||||
name: "tasks table returns no rows for anon (RLS is active)",
|
||||
check: () => anonSeeesNoRows("tasks"),
|
||||
timeout: 10_000,
|
||||
},
|
||||
];
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
I'm starting a new Supabase project from scratch for a task management app. Users should sign up with email/password, and each user should only see their own tasks.
|
||||
I'm building a task management app. Users sign up with email/password and should only see their own tasks.
|
||||
|
||||
Set up the project:
|
||||
A Supabase project is already initialized and running locally. The `supabase/` directory and `config.toml` are already set up — do not run `supabase init` or `supabase start`.
|
||||
|
||||
1. Initialize the Supabase project with the CLI (`npx supabase init`)
|
||||
2. Start the local Supabase stack (`npx supabase start`)
|
||||
3. Create a SQL migration for a tasks table with columns: title (text), description (text), status (text), and due_date
|
||||
Create a SQL migration for a tasks table:
|
||||
|
||||
The migration must:
|
||||
|
||||
- Create the tasks table with proper column types
|
||||
- Link tasks to authenticated users
|
||||
- Enable Row Level Security
|
||||
- Create policies so users can only CRUD their own tasks
|
||||
- Add appropriate indexes
|
||||
- Be idempotent (safe to run multiple times)
|
||||
1. Create a new migration file with `npx supabase migration new`
|
||||
2. Write the migration SQL with:
|
||||
- A `tasks` table with columns: title (text), description (text), status (text), due_date (timestamptz)
|
||||
- Link tasks to authenticated users (foreign key to `auth.users`)
|
||||
- Enable Row Level Security
|
||||
- RLS policies so users can only CRUD their own tasks
|
||||
- Appropriate indexes
|
||||
- Idempotent (safe to run multiple times)
|
||||
3. Apply the migration with `npx supabase db push`
|
||||
|
||||
128
packages/evals/evals/cli-hallucinated-commands/EVAL.ts
Normal file
128
packages/evals/evals/cli-hallucinated-commands/EVAL.ts
Normal file
@@ -0,0 +1,128 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"dev-getting-started.md",
|
||||
"edge-fun-quickstart.md",
|
||||
];
|
||||
|
||||
import { readdirSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
const cwd = process.cwd();
|
||||
|
||||
function findReferenceFile(): string | null {
|
||||
const candidates = readdirSync(cwd).filter((f) => {
|
||||
const lower = f.toLowerCase();
|
||||
return (
|
||||
lower === "cli_reference.md" ||
|
||||
lower === "cli-reference.md" ||
|
||||
lower === "clireference.md"
|
||||
);
|
||||
});
|
||||
return candidates.length > 0 ? join(cwd, candidates[0]) : null;
|
||||
}
|
||||
|
||||
function getReferenceContent(): string {
|
||||
const file = findReferenceFile();
|
||||
if (!file) throw new Error("CLI_REFERENCE.md not found in project root");
|
||||
return readFileSync(file, "utf-8");
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "CLI_REFERENCE.md exists in project root",
|
||||
check: () => findReferenceFile() !== null,
|
||||
},
|
||||
{
|
||||
name: "no hallucinated functions log command",
|
||||
check: () => {
|
||||
const content = getReferenceContent();
|
||||
return (
|
||||
!/`supabase\s+functions\s+log`/.test(content) &&
|
||||
!/^\s*npx\s+supabase\s+functions\s+log\b/m.test(content) &&
|
||||
!/^\s*supabase\s+functions\s+log\b/m.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "no hallucinated db query command",
|
||||
check: () => {
|
||||
const content = getReferenceContent();
|
||||
return (
|
||||
!/`supabase\s+db\s+query`/.test(content) &&
|
||||
!/^\s*npx\s+supabase\s+db\s+query\b/m.test(content) &&
|
||||
!/^\s*supabase\s+db\s+query\b/m.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "mentions supabase functions serve for local development",
|
||||
check: () =>
|
||||
/supabase\s+functions\s+serve/.test(getReferenceContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "mentions supabase functions deploy",
|
||||
check: () =>
|
||||
/supabase\s+functions\s+deploy/.test(getReferenceContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "mentions psql or SQL Editor or connection string for ad-hoc SQL",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
return (
|
||||
/\bpsql\b/.test(content) ||
|
||||
/sql\s+editor/.test(content) ||
|
||||
/connection\s+string/.test(content) ||
|
||||
/supabase\s+db\s+dump/.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "mentions supabase db push or supabase db reset for migrations",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
return (
|
||||
/supabase\s+db\s+push/.test(content) ||
|
||||
/supabase\s+db\s+reset/.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "mentions supabase start for local stack",
|
||||
check: () => /supabase\s+start/.test(getReferenceContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "mentions Dashboard or Logs Explorer for production log viewing",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
return /\bdashboard\b/.test(content) || /logs\s+explorer/.test(content);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: uses real CLI commands throughout",
|
||||
check: () => {
|
||||
const content = getReferenceContent().toLowerCase();
|
||||
const signals = [
|
||||
/supabase\s+start/,
|
||||
/supabase\s+stop/,
|
||||
/supabase\s+functions\s+serve/,
|
||||
/supabase\s+functions\s+deploy/,
|
||||
/supabase\s+db\s+(push|reset|diff)/,
|
||||
/\bpsql\b|\bsql\s+editor\b|\bconnection\s+string\b/,
|
||||
/\bdashboard\b|\blogs\s+explorer\b/,
|
||||
];
|
||||
const hallucinations = [
|
||||
/`supabase\s+functions\s+log`/,
|
||||
/^\s*npx\s+supabase\s+functions\s+log\b/m,
|
||||
/^\s*supabase\s+functions\s+log\b/m,
|
||||
/`supabase\s+db\s+query`/,
|
||||
/^\s*npx\s+supabase\s+db\s+query\b/m,
|
||||
/^\s*supabase\s+db\s+query\b/m,
|
||||
];
|
||||
const positiveMatches = signals.filter((r) => r.test(content)).length;
|
||||
const hallucinationMatches = hallucinations.filter((r) =>
|
||||
r.test(content),
|
||||
).length;
|
||||
return positiveMatches >= 5 && hallucinationMatches === 0;
|
||||
},
|
||||
},
|
||||
];
|
||||
9
packages/evals/evals/cli-hallucinated-commands/PROMPT.md
Normal file
9
packages/evals/evals/cli-hallucinated-commands/PROMPT.md
Normal file
@@ -0,0 +1,9 @@
|
||||
I'm onboarding a new developer to my Supabase project. Create a `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet of Supabase CLI commands we use day-to-day. It should cover:
|
||||
|
||||
1. Starting and stopping the local dev stack
|
||||
2. Managing database migrations (push, reset, diff)
|
||||
3. Working with the `process-order` Edge Function (local dev and deploy)
|
||||
4. How to view Edge Function logs (both local dev and production)
|
||||
5. How to run ad-hoc SQL queries against the database (local and remote)
|
||||
|
||||
Include the actual commands with brief explanations.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "cli-hallucinated-commands",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "cli-hallucinated-commands"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
@@ -0,0 +1,29 @@
|
||||
import { createClient } from "jsr:@supabase/supabase-js@2";
|
||||
|
||||
Deno.serve(async (req) => {
|
||||
try {
|
||||
const { orderId } = await req.json();
|
||||
|
||||
const supabase = createClient(
|
||||
Deno.env.get("SUPABASE_URL") ?? "",
|
||||
Deno.env.get("SUPABASE_ANON_KEY") ?? "",
|
||||
);
|
||||
|
||||
const { data, error } = await supabase
|
||||
.from("orders")
|
||||
.select("*")
|
||||
.eq("id", orderId)
|
||||
.single();
|
||||
|
||||
if (error) throw error;
|
||||
|
||||
return new Response(JSON.stringify({ order: data }), {
|
||||
headers: { "Content-Type": "application/json" },
|
||||
});
|
||||
} catch (err) {
|
||||
return new Response(JSON.stringify({ error: String(err) }), {
|
||||
status: 500,
|
||||
headers: { "Content-Type": "application/json" },
|
||||
});
|
||||
}
|
||||
});
|
||||
@@ -1,333 +1,354 @@
|
||||
import { expect, test } from "vitest";
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-performance.md",
|
||||
"db-security-functions.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-schema-realtime.md",
|
||||
"db-perf-indexes.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"realtime-setup-auth.md",
|
||||
"realtime-broadcast-database.md",
|
||||
"realtime-setup-channels.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
test("migration file exists", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates rooms table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?rooms/);
|
||||
});
|
||||
|
||||
test("creates room_members table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Accept room_members, members, memberships, room_users, etc.
|
||||
const hasMembership =
|
||||
/create\s+table[\s\S]*?room_members/.test(sql) ||
|
||||
/create\s+table[\s\S]*?room_users/.test(sql) ||
|
||||
/create\s+table[\s\S]*?memberships/.test(sql);
|
||||
expect(hasMembership).toBe(true);
|
||||
});
|
||||
|
||||
test("creates content table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Accept content, contents, items, room_content, room_items, documents, etc.
|
||||
const hasContent =
|
||||
/create\s+table[\s\S]*?content/.test(sql) ||
|
||||
/create\s+table[\s\S]*?items/.test(sql) ||
|
||||
/create\s+table[\s\S]*?documents/.test(sql) ||
|
||||
/create\s+table[\s\S]*?posts/.test(sql) ||
|
||||
/create\s+table[\s\S]*?messages/.test(sql);
|
||||
expect(hasContent).toBe(true);
|
||||
});
|
||||
|
||||
test("room_members has role column with owner/editor/viewer", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/role/);
|
||||
// Should define the three roles somewhere (enum, check constraint, or comment)
|
||||
expect(sql).toMatch(/owner/);
|
||||
expect(sql).toMatch(/editor/);
|
||||
expect(sql).toMatch(/viewer/);
|
||||
});
|
||||
|
||||
test("enables RLS on all application tables", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Must enable RLS on rooms
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
// Must enable RLS on membership table
|
||||
const hasMembershipRls =
|
||||
/alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
expect(hasMembershipRls).toBe(true);
|
||||
// Must enable RLS on content table (accept various names)
|
||||
const hasContentRls =
|
||||
/alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
expect(hasContentRls).toBe(true);
|
||||
});
|
||||
|
||||
test("FK to auth.users with ON DELETE CASCADE", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("content has room_id FK referencing rooms", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Content table should have a foreign key to rooms
|
||||
expect(sql).toMatch(/room_id[\s\S]*?references[\s\S]*?rooms/);
|
||||
});
|
||||
|
||||
test("policies use (select auth.uid())", () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("policies use TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// Filter to only application table policies (not realtime.messages which may use different roles)
|
||||
const appPolicies = policyBlocks.filter(
|
||||
(p) => !p.includes("realtime.messages"),
|
||||
);
|
||||
expect(appPolicies.length).toBeGreaterThan(0);
|
||||
for (const policy of appPolicies) {
|
||||
expect(policy).toMatch(/to\s+authenticated/);
|
||||
}
|
||||
});
|
||||
|
||||
test("private schema with security_definer helper function", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Private schema should be created
|
||||
expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
|
||||
// A function in the private schema with SECURITY DEFINER
|
||||
expect(sql).toMatch(/private\./);
|
||||
expect(sql).toMatch(/security\s+definer/);
|
||||
expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
|
||||
});
|
||||
|
||||
test("role-based write policies: content INSERT/UPDATE restricted to owner or editor", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// Find INSERT or UPDATE policies on the content table
|
||||
const writePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
(/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
// At least one write policy should check for owner or editor role
|
||||
const checksRole = writePolicies.some(
|
||||
(p) => p.includes("owner") || p.includes("editor"),
|
||||
);
|
||||
expect(checksRole).toBe(true);
|
||||
});
|
||||
|
||||
test("viewer role is read-only (no write access to content)", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// Find content write policies (INSERT, UPDATE, DELETE)
|
||||
const contentWritePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
/for\s+(insert|update|delete)/.test(p) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
// None of the write policies should grant access to viewer role
|
||||
// They should either explicitly check for owner/editor OR exclude viewer
|
||||
if (contentWritePolicies.length > 0) {
|
||||
const anyGrantsViewer = contentWritePolicies.some((p) => {
|
||||
// If the policy doesn't mention any role, it's too permissive
|
||||
const mentionsRole =
|
||||
p.includes("owner") || p.includes("editor") || p.includes("viewer");
|
||||
if (!mentionsRole) return true; // no role check = viewer could write
|
||||
// If it specifically includes viewer in a write context, that's wrong
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates rooms table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?rooms/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "creates room_members table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
p.includes("viewer") && !p.includes("owner") && !p.includes("editor")
|
||||
/create\s+table[\s\S]*?room_members/.test(sql) ||
|
||||
/create\s+table[\s\S]*?room_users/.test(sql) ||
|
||||
/create\s+table[\s\S]*?memberships/.test(sql)
|
||||
);
|
||||
});
|
||||
expect(anyGrantsViewer).toBe(false);
|
||||
}
|
||||
});
|
||||
|
||||
test("indexes on membership lookup columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
// Should index user_id and/or room_id on the membership table
|
||||
const membershipIndexes = indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.toLowerCase().includes("user_id") ||
|
||||
idx.toLowerCase().includes("room_id"),
|
||||
);
|
||||
expect(membershipIndexes.length).toBeGreaterThanOrEqual(1);
|
||||
});
|
||||
|
||||
test("uses timestamptz not plain timestamp", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||
const hasPlainTimestamp =
|
||||
/(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
// Only fail if the migration defines time columns with plain timestamp
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||
}
|
||||
});
|
||||
|
||||
test("idempotent DDL", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("realtime publication enabled for content table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should add the content table to supabase_realtime publication
|
||||
expect(sql).toMatch(/alter\s+publication\s+supabase_realtime\s+add\s+table/);
|
||||
});
|
||||
|
||||
test("broadcast trigger for content changes", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should use realtime.broadcast_changes() or realtime.send() in a trigger
|
||||
const usesBroadcastChanges = /realtime\.broadcast_changes/.test(sql);
|
||||
const usesRealtimeSend = /realtime\.send/.test(sql);
|
||||
expect(usesBroadcastChanges || usesRealtimeSend).toBe(true);
|
||||
// Should create a trigger on the content table
|
||||
expect(sql).toMatch(/create\s+trigger/);
|
||||
});
|
||||
|
||||
test("broadcast trigger function uses security definer", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Find function definitions that reference realtime.broadcast_changes or realtime.send
|
||||
const functionBlocks =
|
||||
sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? [];
|
||||
const realtimeFunctions = functionBlocks.filter(
|
||||
(f) =>
|
||||
f.toLowerCase().includes("realtime.broadcast_changes") ||
|
||||
f.toLowerCase().includes("realtime.send"),
|
||||
);
|
||||
expect(realtimeFunctions.length).toBeGreaterThan(0);
|
||||
// The trigger function should have security definer and search_path
|
||||
const hasSecurityDefiner = realtimeFunctions.some(
|
||||
(f) =>
|
||||
/security\s+definer/.test(f.toLowerCase()) &&
|
||||
/set\s+search_path\s*=\s*''/.test(f.toLowerCase()),
|
||||
);
|
||||
expect(hasSecurityDefiner).toBe(true);
|
||||
});
|
||||
|
||||
test("RLS policies on realtime.messages", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
expect(realtimePolicies.length).toBeGreaterThan(0);
|
||||
// At least one policy should target authenticated users
|
||||
const hasAuthPolicy = realtimePolicies.some(
|
||||
(p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p),
|
||||
);
|
||||
expect(hasAuthPolicy).toBe(true);
|
||||
});
|
||||
|
||||
test("realtime policy checks extension column", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
// At least one realtime policy should reference the extension column
|
||||
const checksExtension = realtimePolicies.some(
|
||||
(p) =>
|
||||
p.includes("extension") &&
|
||||
(p.includes("broadcast") || p.includes("presence")),
|
||||
);
|
||||
expect(checksExtension).toBe(true);
|
||||
});
|
||||
|
||||
test("overall quality score", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
|
||||
const signals = [
|
||||
// 1. RLS enabled on rooms
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
// 2. RLS enabled on membership table
|
||||
/alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
// 3. RLS enabled on content table
|
||||
/alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
// 4. FK to auth.users with cascade
|
||||
/references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
|
||||
// 5. Private schema created
|
||||
/create\s+schema[\s\S]*?private/.test(sql),
|
||||
// 6. security_definer with search_path
|
||||
/security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql),
|
||||
// 7. Subselect auth.uid()
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
// 8. TO authenticated on policies
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.filter((p) => !p.includes("realtime.messages")).length > 0 &&
|
||||
policyBlocks
|
||||
.filter((p) => !p.includes("realtime.messages"))
|
||||
.every((p) => /to\s+authenticated/.test(p)),
|
||||
// 9. Indexes on lookup columns
|
||||
/create\s+index/.test(sql),
|
||||
// 10. timestamptz usage (accepts both timestamptz and timestamp with time zone)
|
||||
/timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql),
|
||||
// 11. IF NOT EXISTS for idempotency
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
// 12. Role-based policies (owner/editor/viewer)
|
||||
sql.includes("owner") && sql.includes("editor") && sql.includes("viewer"),
|
||||
// 13. Realtime publication
|
||||
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql),
|
||||
// 14. Broadcast trigger (broadcast_changes or realtime.send)
|
||||
/realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql),
|
||||
// 15. Trigger creation
|
||||
/create\s+trigger/.test(sql),
|
||||
// 16. RLS on realtime.messages
|
||||
policyBlocks.some((p) => p.includes("realtime.messages")),
|
||||
// 17. Extension check in realtime policy
|
||||
policyBlocks
|
||||
.filter((p) => p.includes("realtime.messages"))
|
||||
.some((p) => p.includes("extension")),
|
||||
// 18. room_id FK on content
|
||||
/room_id[\s\S]*?references[\s\S]*?rooms/.test(sql),
|
||||
];
|
||||
const passed = signals.filter(Boolean).length;
|
||||
expect(passed).toBeGreaterThanOrEqual(13);
|
||||
});
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates content table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/create\s+table[\s\S]*?content/.test(sql) ||
|
||||
/create\s+table[\s\S]*?items/.test(sql) ||
|
||||
/create\s+table[\s\S]*?documents/.test(sql) ||
|
||||
/create\s+table[\s\S]*?posts/.test(sql) ||
|
||||
/create\s+table[\s\S]*?messages/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "room_members has role column with owner/editor/viewer",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/role/.test(sql) &&
|
||||
/owner/.test(sql) &&
|
||||
/editor/.test(sql) &&
|
||||
/viewer/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "enables RLS on all application tables",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const roomsRls =
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
const membershipRls =
|
||||
/alter\s+table[\s\S]*?room_members[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?room_users[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
const contentRls =
|
||||
/alter\s+table[\s\S]*?content[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?items[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?posts[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) ||
|
||||
/alter\s+table[\s\S]*?messages[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
);
|
||||
return roomsRls && membershipRls && contentRls;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "content has room_id FK referencing rooms",
|
||||
check: () =>
|
||||
/room_id[\s\S]*?references[\s\S]*?rooms/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "policies use (select auth.uid())",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
if (policyBlocks.length === 0) return false;
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const appPolicies = policyBlocks.filter(
|
||||
(p) => !p.includes("realtime.messages"),
|
||||
);
|
||||
return (
|
||||
appPolicies.length > 0 &&
|
||||
appPolicies.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "private schema with security_definer helper function",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/create\s+schema[\s\S]*?private/.test(sql) &&
|
||||
/private\./.test(sql) &&
|
||||
/security\s+definer/.test(sql) &&
|
||||
/set\s+search_path\s*=\s*''/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "role-based write policies: content INSERT/UPDATE restricted to owner or editor",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const writePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
(/for\s+(insert|update|all)/.test(p) || /insert|update/.test(p)) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
return writePolicies.some(
|
||||
(p) => p.includes("owner") || p.includes("editor"),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "viewer role is read-only (no write access to content)",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const contentWritePolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
/for\s+(insert|update|delete)/.test(p) &&
|
||||
(p.includes("content") ||
|
||||
p.includes("items") ||
|
||||
p.includes("documents") ||
|
||||
p.includes("posts") ||
|
||||
p.includes("messages")),
|
||||
);
|
||||
if (contentWritePolicies.length === 0) return true;
|
||||
return !contentWritePolicies.some((p) => {
|
||||
const mentionsRole =
|
||||
p.includes("owner") || p.includes("editor") || p.includes("viewer");
|
||||
if (!mentionsRole) return true;
|
||||
return (
|
||||
p.includes("viewer") && !p.includes("owner") && !p.includes("editor")
|
||||
);
|
||||
});
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "indexes on membership lookup columns",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/create\s+index/.test(sql)) return false;
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.toLowerCase().includes("user_id") ||
|
||||
idx.toLowerCase().includes("room_id"),
|
||||
).length >= 1
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz not plain timestamp",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/(?:created_at|updated_at|invited_at|joined_at)\s+timestamp(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent DDL",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "realtime publication enabled for content table",
|
||||
check: () =>
|
||||
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "broadcast trigger for content changes",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
(/realtime\.broadcast_changes/.test(sql) ||
|
||||
/realtime\.send/.test(sql)) &&
|
||||
/create\s+trigger/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "broadcast trigger function uses security definer",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const functionBlocks =
|
||||
sql.match(/create[\s\S]*?function[\s\S]*?\$\$[\s\S]*?\$\$/gi) ?? [];
|
||||
const realtimeFunctions = functionBlocks.filter(
|
||||
(f) =>
|
||||
f.toLowerCase().includes("realtime.broadcast_changes") ||
|
||||
f.toLowerCase().includes("realtime.send"),
|
||||
);
|
||||
if (realtimeFunctions.length === 0) return false;
|
||||
return realtimeFunctions.some(
|
||||
(f) =>
|
||||
/security\s+definer/.test(f.toLowerCase()) &&
|
||||
/set\s+search_path\s*=\s*''/.test(f.toLowerCase()),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "RLS policies on realtime.messages",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
if (realtimePolicies.length === 0) return false;
|
||||
return realtimePolicies.some(
|
||||
(p) => /to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "realtime policy checks extension column",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const realtimePolicies = policyBlocks.filter((p) =>
|
||||
p.includes("realtime.messages"),
|
||||
);
|
||||
return realtimePolicies.some(
|
||||
(p) =>
|
||||
p.includes("extension") &&
|
||||
(p.includes("broadcast") || p.includes("presence")),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality score",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/alter\s+table[\s\S]*?rooms[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/alter\s+table[\s\S]*?(room_members|room_users|memberships)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/alter\s+table[\s\S]*?(content|items|documents|posts|messages)[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
/create\s+schema[\s\S]*?private/.test(sql),
|
||||
/security\s+definer/.test(sql) &&
|
||||
/set\s+search_path\s*=\s*''/.test(sql),
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.filter((p) => !p.includes("realtime.messages")).length >
|
||||
0 &&
|
||||
policyBlocks
|
||||
.filter((p) => !p.includes("realtime.messages"))
|
||||
.every((p) => /to\s+authenticated/.test(p)),
|
||||
/create\s+index/.test(sql),
|
||||
/timestamptz/.test(sql) || /timestamp\s+with\s+time\s+zone/.test(sql),
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
sql.includes("owner") &&
|
||||
sql.includes("editor") &&
|
||||
sql.includes("viewer"),
|
||||
/alter\s+publication\s+supabase_realtime\s+add\s+table/.test(sql),
|
||||
/realtime\.broadcast_changes/.test(sql) || /realtime\.send/.test(sql),
|
||||
/create\s+trigger/.test(sql),
|
||||
policyBlocks.some((p) => p.includes("realtime.messages")),
|
||||
policyBlocks
|
||||
.filter((p) => p.includes("realtime.messages"))
|
||||
.some((p) => p.includes("extension")),
|
||||
/room_id[\s\S]*?references[\s\S]*?rooms/.test(sql),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 13;
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
# Direct connection to the database — used for migrations
|
||||
# Replace with your Supabase project's direct connection string
|
||||
DATABASE_URL="postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres"
|
||||
134
packages/evals/evals/connection-pooling-prisma/EVAL.ts
Normal file
134
packages/evals/evals/connection-pooling-prisma/EVAL.ts
Normal file
@@ -0,0 +1,134 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-conn-pooling.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"db-schema-auth-fk.md",
|
||||
];
|
||||
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
const cwd = process.cwd();
|
||||
|
||||
function findPrismaSchema(): string | null {
|
||||
const candidates = [
|
||||
join(cwd, "prisma", "schema.prisma"),
|
||||
join(cwd, "schema.prisma"),
|
||||
];
|
||||
for (const p of candidates) {
|
||||
if (existsSync(p)) return p;
|
||||
}
|
||||
const prismaDir = join(cwd, "prisma");
|
||||
if (existsSync(prismaDir)) {
|
||||
const files = readdirSync(prismaDir).filter((f) => f.endsWith(".prisma"));
|
||||
if (files.length > 0) return join(prismaDir, files[0]);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function getPrismaSchema(): string {
|
||||
const file = findPrismaSchema();
|
||||
if (!file) throw new Error("No .prisma schema file found");
|
||||
return readFileSync(file, "utf-8");
|
||||
}
|
||||
|
||||
function findEnvFiles(): string[] {
|
||||
const found: string[] = [];
|
||||
for (const name of [
|
||||
".env",
|
||||
".env.example",
|
||||
".env.local",
|
||||
".env.production",
|
||||
".env.development",
|
||||
]) {
|
||||
const p = join(cwd, name);
|
||||
if (existsSync(p)) found.push(p);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
function getAllEnvContent(): string {
|
||||
return findEnvFiles()
|
||||
.map((f) => readFileSync(f, "utf-8"))
|
||||
.join("\n");
|
||||
}
|
||||
|
||||
function getAllOutputContent(): string {
|
||||
const parts: string[] = [];
|
||||
const schema = findPrismaSchema();
|
||||
if (schema) parts.push(readFileSync(schema, "utf-8"));
|
||||
parts.push(getAllEnvContent());
|
||||
const mdFiles = readdirSync(cwd).filter((f) => f.endsWith(".md"));
|
||||
for (const f of mdFiles) {
|
||||
parts.push(readFileSync(join(cwd, f), "utf-8"));
|
||||
}
|
||||
return parts.join("\n");
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "prisma schema file exists",
|
||||
check: () => findPrismaSchema() !== null,
|
||||
},
|
||||
{
|
||||
name: "prisma schema references pooler port 6543",
|
||||
check: () => /6543/.test(getAllOutputContent()),
|
||||
},
|
||||
{
|
||||
name: "pgbouncer=true param present",
|
||||
check: () =>
|
||||
/pgbouncer\s*=\s*true/.test(getAllOutputContent().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "DIRECT_URL provided for migrations",
|
||||
check: () => {
|
||||
const allContent = `${getPrismaSchema().toLowerCase()}\n${getAllEnvContent().toLowerCase()}`;
|
||||
return /directurl/.test(allContent) || /direct_url/.test(allContent);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "datasource block references directUrl or DIRECT_URL env var",
|
||||
check: () => {
|
||||
const schema = getPrismaSchema().toLowerCase();
|
||||
const datasourceBlock =
|
||||
schema.match(/datasource\s+\w+\s*\{[\s\S]*?\}/)?.[0] ?? "";
|
||||
return (
|
||||
/directurl/.test(datasourceBlock) || /direct_url/.test(datasourceBlock)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "connection limit set to 1 for serverless",
|
||||
check: () => {
|
||||
const content = getAllOutputContent().toLowerCase();
|
||||
return (
|
||||
/connection_limit\s*=\s*1/.test(content) ||
|
||||
/connection_limit:\s*1/.test(content) ||
|
||||
/connectionlimit\s*=\s*1/.test(content)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "explanation distinguishes port 6543 vs 5432",
|
||||
check: () => {
|
||||
const content = getAllOutputContent();
|
||||
return /6543/.test(content) && /5432/.test(content);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates correct Prisma + Supabase pooler setup",
|
||||
check: () => {
|
||||
const schema = getPrismaSchema().toLowerCase();
|
||||
const envContent = getAllEnvContent().toLowerCase();
|
||||
const allContent = `${schema}\n${envContent}`;
|
||||
const signals = [
|
||||
/6543/,
|
||||
/pgbouncer\s*=\s*true/,
|
||||
/directurl|direct_url/,
|
||||
/connection_limit\s*=\s*1|connection_limit:\s*1/,
|
||||
/5432/,
|
||||
];
|
||||
return signals.filter((r) => r.test(allContent)).length >= 4;
|
||||
},
|
||||
},
|
||||
];
|
||||
3
packages/evals/evals/connection-pooling-prisma/PROMPT.md
Normal file
3
packages/evals/evals/connection-pooling-prisma/PROMPT.md
Normal file
@@ -0,0 +1,3 @@
|
||||
I'm deploying my Supabase app on Vercel using Prisma. I keep getting "prepared statement already exists" errors in production. My current `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string on port 5432 with no pooler settings.
|
||||
|
||||
Fix the Prisma configuration so it works correctly with Supabase's connection pooler for serverless deployments. Make any changes needed to `prisma/schema.prisma` and update the `.env.example` file with the correct connection string format.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "connection-pooling-prisma",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
// This is your Prisma schema file,
|
||||
// learn more about it in the docs: https://pris.ly/d/prisma-schema
|
||||
|
||||
generator client {
|
||||
provider = "prisma-client-js"
|
||||
}
|
||||
|
||||
datasource db {
|
||||
provider = "postgresql"
|
||||
url = env("DATABASE_URL")
|
||||
}
|
||||
|
||||
model User {
|
||||
id String @id @default(cuid())
|
||||
email String @unique
|
||||
name String?
|
||||
createdAt DateTime @default(now())
|
||||
posts Post[]
|
||||
}
|
||||
|
||||
model Post {
|
||||
id String @id @default(cuid())
|
||||
title String
|
||||
content String?
|
||||
published Boolean @default(false)
|
||||
author User @relation(fields: [authorId], references: [id])
|
||||
authorId String
|
||||
createdAt DateTime @default(now())
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "connection-pooling-prisma"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
@@ -1,26 +1,31 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"edge-fun-quickstart.md",
|
||||
"edge-fun-project-structure.md",
|
||||
"edge-pat-cors.md",
|
||||
"edge-pat-error-handling.md",
|
||||
"dev-getting-started.md",
|
||||
];
|
||||
|
||||
import { existsSync, readdirSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { expect, test } from "vitest";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import {
|
||||
findFunctionFile,
|
||||
findSharedCorsFile,
|
||||
functionsDir,
|
||||
getFunctionCode,
|
||||
getFunctionsDir,
|
||||
getSharedCode,
|
||||
supabaseDir,
|
||||
getSupabaseDir,
|
||||
} from "../eval-utils.ts";
|
||||
|
||||
const FUNCTION_NAME = "hello-world";
|
||||
const helloWorldDir = join(functionsDir, FUNCTION_NAME);
|
||||
|
||||
/** Read function code + all shared modules combined. */
|
||||
function getAllCode(): string {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
return `${code}\n${getSharedCode()}`;
|
||||
}
|
||||
|
||||
/** Extract the code after the first `catch` keyword to the end of the function. */
|
||||
function getCatchBlockCode(): string {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
const catchIndex = code.search(/\bcatch\b/);
|
||||
@@ -28,121 +33,123 @@ function getCatchBlockCode(): string {
|
||||
return code.slice(catchIndex);
|
||||
}
|
||||
|
||||
test("supabase project initialized", () => {
|
||||
expect(existsSync(join(supabaseDir, "config.toml"))).toBe(true);
|
||||
});
|
||||
|
||||
test("function directory exists", () => {
|
||||
expect(existsSync(helloWorldDir)).toBe(true);
|
||||
});
|
||||
|
||||
test("function index file exists", () => {
|
||||
expect(findFunctionFile(FUNCTION_NAME)).not.toBeNull();
|
||||
});
|
||||
|
||||
test("uses Deno.serve", () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
expect(code).toMatch(/Deno\.serve/);
|
||||
});
|
||||
|
||||
test("returns JSON response", () => {
|
||||
// Check both the function file and shared modules for JSON response patterns
|
||||
const allCode = getAllCode();
|
||||
const hasContentTypeHeader =
|
||||
/content-type['"]\s*:\s*['"]application\/json/i.test(allCode);
|
||||
const hasResponseJson = /Response\.json/i.test(allCode);
|
||||
const hasJsonStringify = /JSON\.stringify/i.test(allCode);
|
||||
expect(hasContentTypeHeader || hasResponseJson || hasJsonStringify).toBe(
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("handles OPTIONS preflight", () => {
|
||||
// OPTIONS handling may be in the function itself or in a shared CORS helper
|
||||
const allCode = getAllCode();
|
||||
expect(allCode).toMatch(/['"]OPTIONS['"]/);
|
||||
expect(allCode).toMatch(/\.method/);
|
||||
});
|
||||
|
||||
test("defines CORS headers", () => {
|
||||
const allCode = getAllCode();
|
||||
expect(allCode).toMatch(/Access-Control-Allow-Origin/);
|
||||
});
|
||||
|
||||
test("CORS allows required headers", () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
// Must include authorization and apikey in allowed headers
|
||||
expect(allCode).toMatch(/access-control-allow-headers/);
|
||||
expect(allCode).toMatch(/authorization/);
|
||||
expect(allCode).toMatch(/apikey/);
|
||||
});
|
||||
|
||||
test("error response has CORS headers", () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
expect(catchCode.length).toBeGreaterThan(0);
|
||||
// The catch block should either directly reference CORS headers, or call
|
||||
// a shared helper that includes them (e.g. errorResponse, corsHeaders).
|
||||
const sharedCode = getSharedCode();
|
||||
// Direct CORS reference in catch block
|
||||
const directCors =
|
||||
/corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode);
|
||||
// Calls a shared helper that itself includes CORS headers
|
||||
const callsSharedHelper =
|
||||
/errorResponse|jsonResponse|json_response|error_response/i.test(
|
||||
catchCode,
|
||||
) && /Access-Control-Allow-Origin/i.test(sharedCode);
|
||||
expect(directCors || callsSharedHelper).toBe(true);
|
||||
});
|
||||
|
||||
test("has try-catch for error handling", () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
expect(code).toMatch(/\btry\s*\{/);
|
||||
expect(code).toMatch(/\bcatch\b/);
|
||||
});
|
||||
|
||||
test("returns proper error status code", () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
expect(catchCode.length).toBeGreaterThan(0);
|
||||
// Error response should use status 400 or 500 (not default 200).
|
||||
// Match object-style { status: 500 } or function-call-style fn('msg', 500)
|
||||
const hasObjectStatus = /status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode);
|
||||
const hasFnArgStatus = /[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test(
|
||||
catchCode,
|
||||
);
|
||||
expect(hasObjectStatus || hasFnArgStatus).toBe(true);
|
||||
});
|
||||
|
||||
test("shared CORS module exists", () => {
|
||||
expect(findSharedCorsFile()).not.toBeNull();
|
||||
});
|
||||
|
||||
test("function imports from shared", () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
// Should import from ../_shared/ relative path
|
||||
expect(code).toMatch(/from\s+['"]\.\.\/(_shared|_utils)/);
|
||||
});
|
||||
|
||||
test("function uses hyphenated name", () => {
|
||||
// The function directory should use hyphens, not underscores
|
||||
const dirs = existsSync(functionsDir) ? readdirSync(functionsDir) : [];
|
||||
const helloDir = dirs.find((d) => d.includes("hello") && d.includes("world"));
|
||||
expect(helloDir).toBeDefined();
|
||||
expect(helloDir).toMatch(/^hello-world$/);
|
||||
});
|
||||
|
||||
test("overall quality: demonstrates Edge Function best practices", () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
// A high-quality Edge Function should contain most of these patterns
|
||||
const signals = [
|
||||
/deno\.serve/, // Modern Deno.serve API
|
||||
/['"]options['"]/, // OPTIONS preflight handling
|
||||
/access-control-allow-origin/, // CORS headers defined
|
||||
/\btry\s*\{/, // Error handling with try-catch
|
||||
/status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/, // Proper error status codes
|
||||
/from\s+['"]\.\.\/(_shared|_utils)/, // Imports from shared directory
|
||||
/authorization/, // Allows authorization header in CORS
|
||||
/apikey/, // Allows apikey header in CORS
|
||||
];
|
||||
const matches = signals.filter((r) => r.test(allCode));
|
||||
expect(matches.length).toBeGreaterThanOrEqual(6);
|
||||
});
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "supabase project initialized",
|
||||
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
|
||||
},
|
||||
{
|
||||
name: "function directory exists",
|
||||
check: () => existsSync(join(getFunctionsDir(), FUNCTION_NAME)),
|
||||
},
|
||||
{
|
||||
name: "function index file exists",
|
||||
check: () => findFunctionFile(FUNCTION_NAME) !== null,
|
||||
},
|
||||
{
|
||||
name: "uses Deno.serve",
|
||||
check: () => /Deno\.serve/.test(getFunctionCode(FUNCTION_NAME)),
|
||||
},
|
||||
{
|
||||
name: "returns JSON response",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return (
|
||||
/content-type['"]\s*:\s*['"]application\/json/i.test(allCode) ||
|
||||
/Response\.json/i.test(allCode) ||
|
||||
/JSON\.stringify/i.test(allCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "handles OPTIONS preflight",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return /['"]OPTIONS['"]/.test(allCode) && /\.method/.test(allCode);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "defines CORS headers",
|
||||
check: () => /Access-Control-Allow-Origin/.test(getAllCode()),
|
||||
},
|
||||
{
|
||||
name: "CORS allows required headers",
|
||||
check: () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
return (
|
||||
/access-control-allow-headers/.test(allCode) &&
|
||||
/authorization/.test(allCode) &&
|
||||
/apikey/.test(allCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "error response has CORS headers",
|
||||
check: () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
if (catchCode.length === 0) return false;
|
||||
const sharedCode = getSharedCode();
|
||||
const directCors =
|
||||
/corsHeaders|cors_headers|Access-Control-Allow-Origin/i.test(catchCode);
|
||||
const callsSharedHelper =
|
||||
/errorResponse|jsonResponse|json_response|error_response/i.test(
|
||||
catchCode,
|
||||
) && /Access-Control-Allow-Origin/i.test(sharedCode);
|
||||
return directCors || callsSharedHelper;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "has try-catch for error handling",
|
||||
check: () => {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
return /\btry\s*\{/.test(code) && /\bcatch\b/.test(code);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "returns proper error status code",
|
||||
check: () => {
|
||||
const catchCode = getCatchBlockCode();
|
||||
if (catchCode.length === 0) return false;
|
||||
return (
|
||||
/status:\s*(400|500|4\d{2}|5\d{2})/.test(catchCode) ||
|
||||
/[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/.test(catchCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "shared CORS module exists",
|
||||
check: () => findSharedCorsFile() !== null,
|
||||
},
|
||||
{
|
||||
name: "function imports from shared",
|
||||
check: () =>
|
||||
/from\s+['"]\.\.\/(_shared|_utils)/.test(getFunctionCode(FUNCTION_NAME)),
|
||||
},
|
||||
{
|
||||
name: "function uses hyphenated name",
|
||||
check: () => {
|
||||
const dirs = existsSync(getFunctionsDir()) ? readdirSync(getFunctionsDir()) : [];
|
||||
const helloDir = dirs.find(
|
||||
(d) => d.includes("hello") && d.includes("world"),
|
||||
);
|
||||
return helloDir !== undefined && /^hello-world$/.test(helloDir);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Edge Function best practices",
|
||||
check: () => {
|
||||
const allCode = getAllCode().toLowerCase();
|
||||
const signals = [
|
||||
/deno\.serve/,
|
||||
/['"]options['"]/,
|
||||
/access-control-allow-origin/,
|
||||
/\btry\s*\{/,
|
||||
/status:\s*(400|500|4\d{2}|5\d{2})|[,(]\s*(400|500|4\d{2}|5\d{2})\s*[),]/,
|
||||
/from\s+['"]\.\.\/(_shared|_utils)/,
|
||||
/authorization/,
|
||||
/apikey/,
|
||||
];
|
||||
return signals.filter((r) => r.test(allCode)).length >= 6;
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
@@ -2,12 +2,90 @@ import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Common paths
|
||||
// Runtime DB helpers (use only in async tests)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const supabaseDir = join(process.cwd(), "supabase");
|
||||
export const migrationsDir = join(supabaseDir, "migrations");
|
||||
export const functionsDir = join(supabaseDir, "functions");
|
||||
const SUPABASE_URL = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321";
|
||||
const SERVICE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY ?? "";
|
||||
const ANON_KEY = process.env.SUPABASE_ANON_KEY ?? "";
|
||||
|
||||
/** Execute a raw SQL query via PostgREST's /rpc endpoint or via the REST API. */
|
||||
async function pgRest(
|
||||
table: string,
|
||||
options: { select?: string; role?: "service_role" | "anon" } = {},
|
||||
): Promise<{ data: Record<string, unknown>[]; error: string | null }> {
|
||||
const key = options.role === "anon" ? ANON_KEY : SERVICE_KEY;
|
||||
const select = options.select ?? "*";
|
||||
const res = await fetch(`${SUPABASE_URL}/rest/v1/${table}?select=${select}`, {
|
||||
headers: {
|
||||
apikey: key,
|
||||
Authorization: `Bearer ${key}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
const body = await res.text();
|
||||
return { data: [], error: `HTTP ${res.status}: ${body}` };
|
||||
}
|
||||
|
||||
const data = (await res.json()) as Record<string, unknown>[];
|
||||
return { data, error: null };
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether a table is visible through the PostgREST API.
|
||||
* Uses the service role key (bypasses RLS).
|
||||
*/
|
||||
export async function tableExists(tableName: string): Promise<boolean> {
|
||||
const { error } = await pgRest(tableName);
|
||||
// A 404 or PGRST116 means the table/view doesn't exist in the schema cache.
|
||||
return error === null || !error.includes("404");
|
||||
}
|
||||
|
||||
/**
|
||||
* Query rows from a table.
|
||||
* @param tableName - table to query
|
||||
* @param role - "service_role" bypasses RLS; "anon" respects RLS policies
|
||||
*/
|
||||
export async function queryTable(
|
||||
tableName: string,
|
||||
role: "service_role" | "anon" = "service_role",
|
||||
): Promise<{ data: Record<string, unknown>[]; error: string | null }> {
|
||||
return pgRest(tableName, { role });
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the table exists AND is empty when queried as anon
|
||||
* (i.e., RLS is blocking access as expected for an unauthenticated user).
|
||||
*/
|
||||
export async function anonSeeesNoRows(tableName: string): Promise<boolean> {
|
||||
const { data, error } = await pgRest(tableName, { role: "anon" });
|
||||
return error === null && data.length === 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Common paths
|
||||
//
|
||||
// These are FUNCTIONS, not constants, so they re-evaluate process.cwd() on
|
||||
// every call. The runner does `process.chdir(workspacePath)` before running
|
||||
// assertions, so all path helpers resolve relative to the correct workspace.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Returns the supabase/ directory under the current working directory. */
|
||||
export function getSupabaseDir(): string {
|
||||
return join(process.cwd(), "supabase");
|
||||
}
|
||||
|
||||
/** Returns the supabase/migrations/ directory. */
|
||||
export function getMigrationsDir(): string {
|
||||
return join(getSupabaseDir(), "migrations");
|
||||
}
|
||||
|
||||
/** Returns the supabase/functions/ directory. */
|
||||
export function getFunctionsDir(): string {
|
||||
return join(getSupabaseDir(), "functions");
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Migration helpers
|
||||
@@ -15,10 +93,11 @@ export const functionsDir = join(supabaseDir, "functions");
|
||||
|
||||
/** Find all .sql migration files (agent may create one or more). */
|
||||
export function findMigrationFiles(): string[] {
|
||||
if (!existsSync(migrationsDir)) return [];
|
||||
return readdirSync(migrationsDir)
|
||||
const dir = getMigrationsDir();
|
||||
if (!existsSync(dir)) return [];
|
||||
return readdirSync(dir)
|
||||
.filter((f) => f.endsWith(".sql"))
|
||||
.map((f) => join(migrationsDir, f));
|
||||
.map((f) => join(dir, f));
|
||||
}
|
||||
|
||||
/** Read and concatenate all migration SQL files. */
|
||||
@@ -39,7 +118,7 @@ export function getMigrationSQL(): string {
|
||||
* @param functionName - directory name under supabase/functions/ (e.g. "hello-world")
|
||||
*/
|
||||
export function findFunctionFile(functionName: string): string | null {
|
||||
const fnDir = join(functionsDir, functionName);
|
||||
const fnDir = join(getFunctionsDir(), functionName);
|
||||
if (!existsSync(fnDir)) return null;
|
||||
const files = readdirSync(fnDir).filter(
|
||||
(f) => f.startsWith("index.") && (f.endsWith(".ts") || f.endsWith(".tsx")),
|
||||
@@ -61,12 +140,13 @@ export function getFunctionCode(functionName: string): string {
|
||||
|
||||
/** Find a shared CORS module under supabase/functions/_shared/ (or similar _-prefixed dir). */
|
||||
export function findSharedCorsFile(): string | null {
|
||||
if (!existsSync(functionsDir)) return null;
|
||||
const sharedDirs = readdirSync(functionsDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(),
|
||||
const fnDir = getFunctionsDir();
|
||||
if (!existsSync(fnDir)) return null;
|
||||
const sharedDirs = readdirSync(fnDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(),
|
||||
);
|
||||
for (const dir of sharedDirs) {
|
||||
const dirPath = join(functionsDir, dir);
|
||||
const dirPath = join(fnDir, dir);
|
||||
const files = readdirSync(dirPath).filter((f) => f.includes("cors"));
|
||||
if (files.length > 0) return join(dirPath, files[0]);
|
||||
}
|
||||
@@ -75,13 +155,14 @@ export function findSharedCorsFile(): string | null {
|
||||
|
||||
/** Read and concatenate all .ts/.tsx files from _-prefixed shared directories. */
|
||||
export function getSharedCode(): string {
|
||||
if (!existsSync(functionsDir)) return "";
|
||||
const sharedDirs = readdirSync(functionsDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(functionsDir, d)).isDirectory(),
|
||||
const fnDir = getFunctionsDir();
|
||||
if (!existsSync(fnDir)) return "";
|
||||
const sharedDirs = readdirSync(fnDir).filter(
|
||||
(d) => d.startsWith("_") && statSync(join(fnDir, d)).isDirectory(),
|
||||
);
|
||||
const parts: string[] = [];
|
||||
for (const dir of sharedDirs) {
|
||||
const dirPath = join(functionsDir, dir);
|
||||
const dirPath = join(fnDir, dir);
|
||||
const files = readdirSync(dirPath).filter(
|
||||
(f) => f.endsWith(".ts") || f.endsWith(".tsx"),
|
||||
);
|
||||
|
||||
100
packages/evals/evals/extension-wrong-schema/EVAL.ts
Normal file
100
packages/evals/evals/extension-wrong-schema/EVAL.ts
Normal file
@@ -0,0 +1,100 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-schema-extensions.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "extension installed in extensions schema",
|
||||
check: () =>
|
||||
/create\s+extension[\s\S]*?with\s+schema\s+extensions/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "IF NOT EXISTS on extension creation",
|
||||
check: () =>
|
||||
/create\s+extension\s+if\s+not\s+exists/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "vector column with correct dimensions",
|
||||
check: () =>
|
||||
/(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "HNSW index used instead of IVFFlat",
|
||||
check: () => /using\s+hnsw/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on documents table",
|
||||
check: () =>
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent table creation (IF NOT EXISTS)",
|
||||
check: () =>
|
||||
/create\s+table\s+if\s+not\s+exists/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates pgvector best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/create\s+extension[\s\S]*?with\s+schema\s+extensions/.test(sql),
|
||||
/create\s+extension\s+if\s+not\s+exists/.test(sql),
|
||||
/(?:extensions\.)?vector\s*\(\s*1536\s*\)/.test(sql),
|
||||
/using\s+hnsw/.test(sql),
|
||||
/alter\s+table[\s\S]*?documents[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 6;
|
||||
},
|
||||
},
|
||||
];
|
||||
11
packages/evals/evals/extension-wrong-schema/PROMPT.md
Normal file
11
packages/evals/evals/extension-wrong-schema/PROMPT.md
Normal file
@@ -0,0 +1,11 @@
|
||||
I'm building a semantic search feature for my app. I need to store document embeddings generated by OpenAI's ada-002 model (1536 dimensions) and let users search their own documents.
|
||||
|
||||
Create a migration in `supabase/migrations/` that:
|
||||
|
||||
1. Enables the pgvector extension
|
||||
2. Creates a `documents` table with:
|
||||
- An `embedding` column (1536 dimensions)
|
||||
- A `content` text column
|
||||
- A `user_id` column linked to the authenticated user
|
||||
3. Adds a vector similarity search index
|
||||
4. Ensures users can only see and manage their own documents
|
||||
5
packages/evals/evals/extension-wrong-schema/package.json
Normal file
5
packages/evals/evals/extension-wrong-schema/package.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "extension-wrong-schema",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
111
packages/evals/evals/extension-wrong-schema/supabase/config.toml
Normal file
111
packages/evals/evals/extension-wrong-schema/supabase/config.toml
Normal file
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "extension-wrong-schema"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
114
packages/evals/evals/postgrest-schema-cache/EVAL.ts
Normal file
114
packages/evals/evals/postgrest-schema-cache/EVAL.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-views.md",
|
||||
"db-migrations-idempotent.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-performance.md",
|
||||
"db-schema-timestamps.md",
|
||||
];
|
||||
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
const migrationsDir = join(process.cwd(), "supabase", "migrations");
|
||||
const STARTER_MIGRATION = "20240101000000_create_products.sql";
|
||||
|
||||
function findAgentMigrationFiles(): string[] {
|
||||
if (!existsSync(migrationsDir)) return [];
|
||||
return readdirSync(migrationsDir)
|
||||
.filter((f) => f.endsWith(".sql") && f !== STARTER_MIGRATION)
|
||||
.map((f) => join(migrationsDir, f));
|
||||
}
|
||||
|
||||
function getAgentMigrationSQL(): string {
|
||||
const files = findAgentMigrationFiles();
|
||||
if (files.length === 0)
|
||||
throw new Error(
|
||||
"No agent-created migration file found in supabase/migrations/",
|
||||
);
|
||||
return files.map((f) => readFileSync(f, "utf-8")).join("\n");
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "new migration file exists",
|
||||
check: () => findAgentMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "ADD COLUMN IF NOT EXISTS for description",
|
||||
check: () =>
|
||||
/add\s+column\s+if\s+not\s+exists\s+description/.test(
|
||||
getAgentMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "ADD COLUMN IF NOT EXISTS for published_at",
|
||||
check: () =>
|
||||
/add\s+column\s+if\s+not\s+exists\s+published_at/.test(
|
||||
getAgentMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "published_at uses timestamptz not plain timestamp",
|
||||
check: () => {
|
||||
const sql = getAgentMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test(
|
||||
sql,
|
||||
) &&
|
||||
!/published_at\s+timestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
|
||||
sql,
|
||||
)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "view public_products is created",
|
||||
check: () =>
|
||||
/create\s+(or\s+replace\s+)?view\s+public_products/.test(
|
||||
getAgentMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "view uses security_invoker = true",
|
||||
check: () =>
|
||||
/security_invoker\s*=\s*true/.test(getAgentMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "SELECT policy on products for authenticated role",
|
||||
check: () => {
|
||||
const sql = getAgentMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return policyBlocks.some(
|
||||
(p) =>
|
||||
p.includes("select") &&
|
||||
p.includes("products") &&
|
||||
/to\s+authenticated/.test(p),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "NOTIFY pgrst reload schema is present",
|
||||
check: () => /notify\s+pgrst/.test(getAgentMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates PostgREST and schema best practices",
|
||||
check: () => {
|
||||
const sql = getAgentMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/add\s+column\s+if\s+not\s+exists/.test(sql),
|
||||
/published_at\s+timestamptz|published_at\s+timestamp\s+with\s+time\s+zone/.test(
|
||||
sql,
|
||||
),
|
||||
/create\s+(or\s+replace\s+)?view\s+public_products/.test(sql),
|
||||
/security_invoker\s*=\s*true/.test(sql),
|
||||
policyBlocks.some(
|
||||
(p) => p.includes("select") && /to\s+authenticated/.test(p),
|
||||
),
|
||||
/notify\s+pgrst/.test(sql),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
9
packages/evals/evals/postgrest-schema-cache/PROMPT.md
Normal file
9
packages/evals/evals/postgrest-schema-cache/PROMPT.md
Normal file
@@ -0,0 +1,9 @@
|
||||
I'm building a product catalog with Supabase. We already have a `products` table (see the existing migration in `supabase/migrations/`), but we need to expand it.
|
||||
|
||||
Please create a new migration file in `supabase/migrations/` that:
|
||||
|
||||
1. Adds two new columns to the `products` table: `description` (text) and `published_at` (timestamp)
|
||||
2. Creates a view called `public_products` that shows only products where `published_at` is not null
|
||||
3. Adds a policy so any authenticated user can view published products
|
||||
|
||||
Make sure the migration is safe to run multiple times.
|
||||
5
packages/evals/evals/postgrest-schema-cache/package.json
Normal file
5
packages/evals/evals/postgrest-schema-cache/package.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "postgrest-schema-cache",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
111
packages/evals/evals/postgrest-schema-cache/supabase/config.toml
Normal file
111
packages/evals/evals/postgrest-schema-cache/supabase/config.toml
Normal file
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "postgrest-schema-cache"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
@@ -0,0 +1,8 @@
|
||||
-- Initial products table
|
||||
create table if not exists products (
|
||||
id bigint primary key generated always as identity,
|
||||
name text not null,
|
||||
price numeric(10, 2) not null default 0
|
||||
);
|
||||
|
||||
alter table products enable row level security;
|
||||
122
packages/evals/evals/rls-update-needs-select/EVAL.ts
Normal file
122
packages/evals/evals/rls-update-needs-select/EVAL.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-performance.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-schema-timestamps.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates orders table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /orders/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "enables RLS on orders table",
|
||||
check: () =>
|
||||
/alter\s+table.*orders.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "has SELECT policy on orders",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return policyBlocks.some((p) => p.includes("for select"));
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "has UPDATE policy with WITH CHECK on orders",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const updatePolicy = policyBlocks.find((p) => p.includes("for update"));
|
||||
return updatePolicy !== undefined && /with\s+check/.test(updatePolicy);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "all policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses (select auth.uid()) not bare auth.uid() in policies",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz not plain timestamp for created_at",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (sql.includes("created_at")) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/alter\s+table.*orders.*enable\s+row\s+level\s+security/.test(sql),
|
||||
policyBlocks.some((p) => p.includes("for select")),
|
||||
policyBlocks.some(
|
||||
(p) => p.includes("for update") && /with\s+check/.test(p),
|
||||
),
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
!/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
|
||||
sql.replace(/--[^\n]*/g, ""),
|
||||
),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
7
packages/evals/evals/rls-update-needs-select/PROMPT.md
Normal file
7
packages/evals/evals/rls-update-needs-select/PROMPT.md
Normal file
@@ -0,0 +1,7 @@
|
||||
I'm building an e-commerce app and need a migration for an `orders` table. Each order has a `status` (text), `total` (numeric), and `created_at` timestamp. Orders belong to users — each order should have a `user_id` that links to the authenticated user who placed it.
|
||||
|
||||
Users need to be able to:
|
||||
- View their own orders
|
||||
- Update the status of their own orders
|
||||
|
||||
Please create the migration in `supabase/migrations/`.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "rls-update-needs-select",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "rls-update-needs-select"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
123
packages/evals/evals/rls-user-metadata-role-check/EVAL.ts
Normal file
123
packages/evals/evals/rls-user-metadata-role-check/EVAL.ts
Normal file
@@ -0,0 +1,123 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-performance.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-schema-auth-fk.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists in supabase/migrations/",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates documents table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /documents/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on documents table",
|
||||
check: () =>
|
||||
/alter\s+table.*documents.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "uses app_metadata not user_metadata for role check",
|
||||
check: () => /app_metadata/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "user_metadata does not appear in policy USING clauses",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return policyBlocks.every((p) => !p.includes("user_metadata"));
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "has at least two SELECT policies (owner and admin)",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const hasOwnerPolicy = policyBlocks.some(
|
||||
(p) =>
|
||||
(p.includes("select") || !p.includes("insert")) &&
|
||||
(p.includes("user_id") ||
|
||||
p.includes("owner") ||
|
||||
p.includes("auth.uid")),
|
||||
);
|
||||
const hasAdminPolicy = policyBlocks.some((p) =>
|
||||
p.includes("app_metadata"),
|
||||
);
|
||||
return hasOwnerPolicy && hasAdminPolicy;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses (select auth.uid()) subselect form in policies",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates Supabase best practices",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const signals = [
|
||||
/alter\s+table.*documents.*enable\s+row\s+level\s+security/.test(sql),
|
||||
/app_metadata/.test(sql),
|
||||
policyBlocks.every((p) => !p.includes("user_metadata")),
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql),
|
||||
policyBlocks.some(
|
||||
(p) =>
|
||||
p.includes("user_id") ||
|
||||
p.includes("owner") ||
|
||||
p.includes("auth.uid"),
|
||||
) && policyBlocks.some((p) => p.includes("app_metadata")),
|
||||
];
|
||||
return signals.filter(Boolean).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
@@ -0,0 +1,7 @@
|
||||
I'm building a document management app on Supabase. I need a migration for a `documents` table. Each document has a `title` (text), `content` (text), and belongs to a user (the owner).
|
||||
|
||||
The access rules are:
|
||||
- Regular users can only read their own documents.
|
||||
- Admin users — identified by a role field stored in their JWT — should be able to read all documents.
|
||||
|
||||
Please create the migration in `supabase/migrations/`. The Supabase project is already initialized.
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "rls-user-metadata-role-check",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "rls-user-metadata-role-check"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[db.migrations]
|
||||
# If disabled, migrations will be skipped during a db push or reset.
|
||||
enabled = true
|
||||
schema_paths = []
|
||||
|
||||
[db.seed]
|
||||
# If enabled, seeds the database after migrations during a db reset.
|
||||
enabled = true
|
||||
# Specifies an ordered list of seed files to load during db reset.
|
||||
sql_paths = ["./seed.sql"]
|
||||
|
||||
[realtime]
|
||||
enabled = true
|
||||
|
||||
[studio]
|
||||
enabled = true
|
||||
# Port to use for Supabase Studio.
|
||||
port = 54323
|
||||
# External URL of the API server that frontend connects to.
|
||||
api_url = "http://127.0.0.1"
|
||||
|
||||
[inbucket]
|
||||
enabled = true
|
||||
# Port to use for the email testing server web interface.
|
||||
port = 54324
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# If disabled, the refresh token will never expire.
|
||||
enable_refresh_token_rotation = true
|
||||
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||
# Requires enable_refresh_token_rotation = true.
|
||||
refresh_token_reuse_interval = 10
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||
# addresses. If disabled, only the new email is required to confirm.
|
||||
double_confirm_changes = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
|
||||
[edge_runtime]
|
||||
enabled = true
|
||||
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||
policy = "per_worker"
|
||||
# Port to attach the Chrome inspector for debugging edge functions.
|
||||
inspector_port = 8083
|
||||
|
||||
[analytics]
|
||||
enabled = true
|
||||
port = 54327
|
||||
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||
backend = "postgres"
|
||||
102
packages/evals/evals/service-role-edge-function/EVAL.ts
Normal file
102
packages/evals/evals/service-role-edge-function/EVAL.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
export const expectedReferenceFiles = [
|
||||
"db-security-service-role.md",
|
||||
"edge-fun-quickstart.md",
|
||||
"edge-db-supabase-client.md",
|
||||
"edge-pat-cors.md",
|
||||
"edge-pat-error-handling.md",
|
||||
];
|
||||
|
||||
import { existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import {
|
||||
findFunctionFile,
|
||||
getFunctionCode,
|
||||
getSharedCode,
|
||||
getSupabaseDir,
|
||||
} from "../eval-utils.ts";
|
||||
|
||||
const FUNCTION_NAME = "admin-reports";
|
||||
|
||||
function getAllCode(): string {
|
||||
const code = getFunctionCode(FUNCTION_NAME);
|
||||
return `${code}\n${getSharedCode()}`;
|
||||
}
|
||||
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "supabase project initialized (config.toml exists)",
|
||||
check: () => existsSync(join(getSupabaseDir(), "config.toml")),
|
||||
},
|
||||
{
|
||||
name: "edge function file exists",
|
||||
check: () => findFunctionFile(FUNCTION_NAME) !== null,
|
||||
},
|
||||
{
|
||||
name: "uses Deno.env.get for service role key",
|
||||
check: () =>
|
||||
/Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test(
|
||||
getAllCode(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "no hardcoded service role key",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
const lines = allCode.split("\n");
|
||||
const nonCommentLines = lines.filter(
|
||||
(line) => !line.trimStart().startsWith("//"),
|
||||
);
|
||||
return !nonCommentLines.some((line) =>
|
||||
/(['"`])eyJ[A-Za-z0-9_-]+\.\1?|(['"`])eyJ[A-Za-z0-9_-]+/.test(line),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "createClient called with service role env var as second argument",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return (
|
||||
/createClient/i.test(allCode) &&
|
||||
/Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i.test(
|
||||
allCode,
|
||||
)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "service role key env var name does not use NEXT_PUBLIC_ prefix",
|
||||
check: () => !/NEXT_PUBLIC_[^'"]*service[_-]?role/i.test(getAllCode()),
|
||||
},
|
||||
{
|
||||
name: "CORS headers present",
|
||||
check: () => /Access-Control-Allow-Origin/.test(getAllCode()),
|
||||
},
|
||||
{
|
||||
name: "returns JSON response",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
return (
|
||||
/content-type['"]\s*:\s*['"]application\/json/i.test(allCode) ||
|
||||
/Response\.json/i.test(allCode) ||
|
||||
/JSON\.stringify/i.test(allCode)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "overall quality: demonstrates service role Edge Function best practices",
|
||||
check: () => {
|
||||
const allCode = getAllCode();
|
||||
const signals: RegExp[] = [
|
||||
/Deno\.env\.get\(\s*['"][^'"]*service[_-]?role[^'"]*['"]\s*\)/i,
|
||||
/Access-Control-Allow-Origin/,
|
||||
/createClient/i,
|
||||
/\btry\s*\{/,
|
||||
/Response\.json|JSON\.stringify/,
|
||||
/Deno\.serve/,
|
||||
];
|
||||
return signals.filter((r) => r.test(allCode)).length >= 5;
|
||||
},
|
||||
},
|
||||
];
|
||||
@@ -0,0 +1,9 @@
|
||||
I'm building an internal admin dashboard for my app. I need a Supabase Edge Function called `admin-reports` that returns all rows from the `reports` table — this is an admin-only endpoint so it needs to bypass Row Level Security.
|
||||
|
||||
Create the function at `supabase/functions/admin-reports/index.ts`. Use environment variables for any Supabase keys — do not hardcode them in the source code.
|
||||
|
||||
The function should:
|
||||
|
||||
1. Return all rows from the `reports` table as a JSON response
|
||||
2. Work when called from a browser (handle CORS)
|
||||
3. Handle errors gracefully
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "service-role-edge-function",
|
||||
"private": true,
|
||||
"type": "module"
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
# For detailed configuration reference documentation, visit:
|
||||
# https://supabase.com/docs/guides/local-development/cli/config
|
||||
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||
# working directory name when running `supabase init`.
|
||||
project_id = "service-role-edge-function"
|
||||
|
||||
[api]
|
||||
enabled = true
|
||||
# Port to use for the API URL.
|
||||
port = 54321
|
||||
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||
schemas = ["public", "graphql_public"]
|
||||
# Extra schemas to add to the search_path of every request.
|
||||
extra_search_path = ["public", "extensions"]
|
||||
# The maximum number of rows returns from a view, table, or stored procedure. Limits payload size
|
||||
# for accidental or malicious requests.
|
||||
max_rows = 1000
|
||||
|
||||
[db]
|
||||
# Port to use for the local database URL.
|
||||
port = 54322
|
||||
# Port used by db diff command to initialize the shadow database.
|
||||
shadow_port = 54320
|
||||
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||
# server_version;` on the remote database to check.
|
||||
major_version = 17
|
||||
|
||||
[db.pooler]
|
||||
enabled = false
|
||||
# Port to use for the local connection pooler.
|
||||
port = 54329
|
||||
# Specifies when a server connection can be reused by other clients.
|
||||
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||
pool_mode = "transaction"
|
||||
# How many server connections to allow per user/database pair.
|
||||
default_pool_size = 20
|
||||
# Maximum number of client connections allowed.
|
||||
max_client_conn = 100
|
||||
|
||||
[storage]
|
||||
enabled = true
|
||||
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||
file_size_limit = "50MiB"
|
||||
|
||||
[auth]
|
||||
enabled = true
|
||||
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||
# in emails.
|
||||
site_url = "http://127.0.0.1:3000"
|
||||
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||
jwt_expiry = 3600
|
||||
# Allow/disallow new user signups to your project.
|
||||
enable_signup = true
|
||||
# Allow/disallow anonymous sign-ins to your project.
|
||||
enable_anonymous_sign_ins = false
|
||||
|
||||
[auth.email]
|
||||
# Allow/disallow new user signups via email to your project.
|
||||
enable_signup = true
|
||||
# If enabled, users need to confirm their email address before signing in.
|
||||
enable_confirmations = false
|
||||
@@ -0,0 +1,10 @@
|
||||
-- Create the reports table
|
||||
create table if not exists public.reports (
|
||||
id uuid primary key default gen_random_uuid(),
|
||||
title text not null,
|
||||
content text,
|
||||
created_at timestamptz not null default now()
|
||||
);
|
||||
|
||||
-- Enable Row Level Security (browser clients use anon key and are restricted by default)
|
||||
alter table public.reports enable row level security;
|
||||
@@ -1,263 +1,253 @@
|
||||
import { expect, test } from "vitest";
|
||||
export const expectedReferenceFiles = [
|
||||
"storage-access-control.md",
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-performance.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-perf-indexes.md",
|
||||
"db-migrations-idempotent.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
test("migration file exists", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates avatars bucket", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should insert into storage.buckets with id 'avatars' and public = true
|
||||
expect(sql).toMatch(/storage\.buckets/);
|
||||
expect(sql).toMatch(/avatars/);
|
||||
expect(sql).toMatch(/public/);
|
||||
// Verify it's marked as a public bucket (true)
|
||||
const avatarsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
|
||||
);
|
||||
expect(avatarsBlock).not.toBeNull();
|
||||
if (avatarsBlock) {
|
||||
expect(avatarsBlock[0]).toMatch(/true/);
|
||||
}
|
||||
});
|
||||
|
||||
test("creates documents bucket", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should insert into storage.buckets with id 'documents' and public = false
|
||||
expect(sql).toMatch(/documents/);
|
||||
const documentsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
|
||||
);
|
||||
expect(documentsBlock).not.toBeNull();
|
||||
if (documentsBlock) {
|
||||
expect(documentsBlock[0]).toMatch(/false/);
|
||||
}
|
||||
});
|
||||
|
||||
test("avatars bucket has mime type restriction", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should have allowed_mime_types with image types
|
||||
expect(sql).toMatch(/allowed_mime_types/);
|
||||
// Check for image MIME types (jpeg, png, webp)
|
||||
expect(sql).toMatch(/image\/jpeg/);
|
||||
expect(sql).toMatch(/image\/png/);
|
||||
expect(sql).toMatch(/image\/webp/);
|
||||
});
|
||||
|
||||
test("avatars bucket has file size limit", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should have file_size_limit set to approximately 2MB (2097152 bytes or 2MB string)
|
||||
expect(sql).toMatch(/file_size_limit/);
|
||||
// Accept either numeric bytes (2097152) or string form (2MB, 2MiB, 2 * 1024 * 1024)
|
||||
const hasNumericLimit = /2097152/.test(sql);
|
||||
const hasStringLimit = /2\s*m/i.test(sql);
|
||||
const hasCalcLimit = /2\s*\*\s*1024\s*\*\s*1024/.test(sql);
|
||||
expect(hasNumericLimit || hasStringLimit || hasCalcLimit).toBe(true);
|
||||
});
|
||||
|
||||
test("storage policy uses foldername or path for user isolation", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Should use storage.foldername(name) with auth.uid()::text for folder isolation
|
||||
const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
|
||||
// Also accept direct path matching patterns like (name ~ '^user-id/')
|
||||
const usesPathMatch =
|
||||
/\(\s*storage\.foldername\s*\(/.test(sql) ||
|
||||
/\bname\b.*auth\.uid\(\)/.test(sql);
|
||||
expect(usesFoldername || usesPathMatch).toBe(true);
|
||||
// Should cast auth.uid() to text for comparison with folder name
|
||||
expect(sql).toMatch(/auth\.uid\(\)\s*::\s*text/);
|
||||
});
|
||||
|
||||
test("storage policy uses TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Storage upload/delete/update policies should target authenticated users.
|
||||
// Accepted forms:
|
||||
// 1. Explicit TO authenticated
|
||||
// 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated)
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const storagePolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("storage.objects"),
|
||||
);
|
||||
// At least one storage policy should restrict to authenticated users
|
||||
const hasAuthenticatedPolicy = storagePolicies.some(
|
||||
(p) =>
|
||||
/to\s+(authenticated|public)/.test(p.toLowerCase()) ||
|
||||
/auth\.uid\(\)/.test(p.toLowerCase()),
|
||||
);
|
||||
expect(hasAuthenticatedPolicy).toBe(true);
|
||||
// Insert policies must restrict to authenticated users (explicit TO or auth.uid() check)
|
||||
const insertPolicies = storagePolicies.filter((p) =>
|
||||
/for\s+insert/.test(p.toLowerCase()),
|
||||
);
|
||||
for (const policy of insertPolicies) {
|
||||
const hasExplicitTo = /to\s+authenticated/.test(policy.toLowerCase());
|
||||
const hasAuthUidCheck = /auth\.uid\(\)/.test(policy.toLowerCase());
|
||||
expect(hasExplicitTo || hasAuthUidCheck).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
test("public read policy for avatars", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// A SELECT policy on storage.objects for avatars bucket should allow public/anon access.
|
||||
// Accepted forms:
|
||||
// 1. Explicit TO public / TO anon
|
||||
// 2. No TO clause (defaults to public role, granting all access)
|
||||
// 3. No auth.uid() restriction in USING (open to everyone)
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const avatarSelectPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
/for\s+select/.test(p.toLowerCase()) &&
|
||||
p.toLowerCase().includes("avatars"),
|
||||
);
|
||||
expect(avatarSelectPolicies.length).toBeGreaterThan(0);
|
||||
// Should allow public access: explicit TO public/anon, or no TO clause without auth.uid() restriction
|
||||
const hasPublicAccess = avatarSelectPolicies.some((p) => {
|
||||
const lower = p.toLowerCase();
|
||||
const hasExplicitPublic =
|
||||
/to\s+public/.test(lower) || /to\s+anon/.test(lower);
|
||||
// No TO clause and no auth.uid() restriction means open to all
|
||||
const hasNoToClause = !/\bto\s+\w+/.test(lower);
|
||||
const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower);
|
||||
return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction);
|
||||
});
|
||||
expect(hasPublicAccess).toBe(true);
|
||||
});
|
||||
|
||||
test("documents bucket is fully private", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// All policies for documents bucket should restrict to authenticated owner.
|
||||
// Accepted forms:
|
||||
// 1. Explicit TO authenticated
|
||||
// 2. auth.uid() in USING/WITH CHECK (implicitly restricts to authenticated)
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const documentPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
p.toLowerCase().includes("documents"),
|
||||
);
|
||||
expect(documentPolicies.length).toBeGreaterThan(0);
|
||||
// None should allow public/anon access
|
||||
for (const policy of documentPolicies) {
|
||||
expect(policy).not.toMatch(/to\s+public/);
|
||||
expect(policy).not.toMatch(/to\s+anon/);
|
||||
}
|
||||
// All should be scoped to authenticated (explicit TO or auth.uid() check)
|
||||
for (const policy of documentPolicies) {
|
||||
const hasExplicitTo = /to\s+authenticated/.test(policy);
|
||||
const hasAuthUidCheck = /auth\.uid\(\)/.test(policy);
|
||||
expect(hasExplicitTo || hasAuthUidCheck).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
test("creates file_metadata table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table/);
|
||||
expect(sql).toMatch(/file_metadata/);
|
||||
});
|
||||
|
||||
test("file_metadata has FK to auth.users with CASCADE", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Find the file_metadata CREATE TABLE block or the surrounding context
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("RLS enabled on file_metadata", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/,
|
||||
);
|
||||
});
|
||||
|
||||
test("file_metadata policies use (select auth.uid())", () => {
|
||||
const sql = getMigrationSQL();
|
||||
// Find policies that reference file_metadata
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const metadataPolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("file_metadata"),
|
||||
);
|
||||
// Each policy that uses auth.uid() should use the subselect form
|
||||
for (const policy of metadataPolicies) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("uses timestamptz for time columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Only check if the migration defines time-related columns
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("uploaded_at")
|
||||
) {
|
||||
// Check column definitions for plain "timestamp" (not timestamptz / timestamp with time zone).
|
||||
// Only match timestamp as a column type — look for column_name followed by timestamp.
|
||||
// Exclude matches inside trigger/function bodies and RETURNS TRIGGER.
|
||||
const columnDefs = sql.match(
|
||||
/(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g,
|
||||
);
|
||||
if (columnDefs) {
|
||||
for (const def of columnDefs) {
|
||||
// Each match should use timestamptz or "timestamp with time zone"
|
||||
expect(def).toMatch(/timestamptz|timestamp\s+with\s+time\s+zone/);
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates avatars bucket",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (
|
||||
!/storage\.buckets/.test(sql) ||
|
||||
!/avatars/.test(sql) ||
|
||||
!/public/.test(sql)
|
||||
)
|
||||
return false;
|
||||
const avatarsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
|
||||
);
|
||||
return avatarsBlock !== null && /true/.test(avatarsBlock[0]);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates documents bucket",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/documents/.test(sql)) return false;
|
||||
const documentsBlock = sql.match(
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
|
||||
);
|
||||
return documentsBlock !== null && /false/.test(documentsBlock[0]);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "avatars bucket has mime type restriction",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/allowed_mime_types/.test(sql) &&
|
||||
/image\/jpeg/.test(sql) &&
|
||||
/image\/png/.test(sql) &&
|
||||
/image\/webp/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "avatars bucket has file size limit",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/file_size_limit/.test(sql)) return false;
|
||||
return (
|
||||
/2097152/.test(sql) ||
|
||||
/2\s*m/i.test(sql) ||
|
||||
/2\s*\*\s*1024\s*\*\s*1024/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "storage policy uses foldername or path for user isolation",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
|
||||
const usesPathMatch =
|
||||
/\(\s*storage\.foldername\s*\(/.test(sql) ||
|
||||
/\bname\b.*auth\.uid\(\)/.test(sql);
|
||||
return (
|
||||
(usesFoldername || usesPathMatch) &&
|
||||
/auth\.uid\(\)\s*::\s*text/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "storage policy uses TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const storagePolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("storage.objects"),
|
||||
);
|
||||
const hasAuthenticatedPolicy = storagePolicies.some(
|
||||
(p) =>
|
||||
/to\s+(authenticated|public)/.test(p.toLowerCase()) ||
|
||||
/auth\.uid\(\)/.test(p.toLowerCase()),
|
||||
);
|
||||
if (!hasAuthenticatedPolicy) return false;
|
||||
const insertPolicies = storagePolicies.filter((p) =>
|
||||
/for\s+insert/.test(p.toLowerCase()),
|
||||
);
|
||||
return insertPolicies.every(
|
||||
(p) =>
|
||||
/to\s+authenticated/.test(p.toLowerCase()) ||
|
||||
/auth\.uid\(\)/.test(p.toLowerCase()),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "public read policy for avatars",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const avatarSelectPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
/for\s+select/.test(p.toLowerCase()) &&
|
||||
p.toLowerCase().includes("avatars"),
|
||||
);
|
||||
if (avatarSelectPolicies.length === 0) return false;
|
||||
return avatarSelectPolicies.some((p) => {
|
||||
const lower = p.toLowerCase();
|
||||
const hasExplicitPublic =
|
||||
/to\s+public/.test(lower) || /to\s+anon/.test(lower);
|
||||
const hasNoToClause = !/\bto\s+\w+/.test(lower);
|
||||
const hasNoAuthRestriction = !/auth\.uid\(\)/.test(lower);
|
||||
return hasExplicitPublic || (hasNoToClause && hasNoAuthRestriction);
|
||||
});
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "documents bucket is fully private",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const documentPolicies = policyBlocks.filter(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("storage.objects") &&
|
||||
p.toLowerCase().includes("documents"),
|
||||
);
|
||||
if (documentPolicies.length === 0) return false;
|
||||
return documentPolicies.every(
|
||||
(p) =>
|
||||
!/to\s+public/.test(p) &&
|
||||
!/to\s+anon/.test(p) &&
|
||||
(/to\s+authenticated/.test(p) || /auth\.uid\(\)/.test(p)),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "creates file_metadata table",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return /create\s+table/.test(sql) && /file_metadata/.test(sql);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "file_metadata has FK to auth.users with CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "RLS enabled on file_metadata",
|
||||
check: () =>
|
||||
/alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "file_metadata policies use (select auth.uid())",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const metadataPolicies = policyBlocks.filter((p) =>
|
||||
p.toLowerCase().includes("file_metadata"),
|
||||
);
|
||||
for (const policy of metadataPolicies) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("index on file_metadata user_id", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
// Should index user_id on file_metadata
|
||||
expect(sql).toMatch(/file_metadata/);
|
||||
expect(sql).toMatch(/user_id/);
|
||||
});
|
||||
|
||||
test("idempotent DDL", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("overall quality score", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// A high-quality migration should contain most of these best-practice signals
|
||||
const signals = [
|
||||
// 1. Avatars bucket is public
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
|
||||
// 2. Documents bucket exists
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents/,
|
||||
// 3. MIME type restriction
|
||||
/allowed_mime_types/,
|
||||
// 4. File size limit
|
||||
/file_size_limit/,
|
||||
// 5. Storage foldername helper
|
||||
/storage\.foldername/,
|
||||
// 6. auth.uid()::text cast
|
||||
/auth\.uid\(\)\s*::\s*text/,
|
||||
// 7. TO authenticated on policies
|
||||
/to\s+authenticated/,
|
||||
// 8. Public read for avatars
|
||||
/to\s+(public|anon)/,
|
||||
// 9. RLS on file_metadata
|
||||
/enable\s+row\s+level\s+security/,
|
||||
// 10. FK to auth.users with cascade
|
||||
/on\s+delete\s+cascade/,
|
||||
// 11. (select auth.uid()) subselect form
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
// 12. Index on user_id
|
||||
/create\s+index/,
|
||||
// 13. timestamptz usage
|
||||
/timestamptz/,
|
||||
// 14. IF NOT EXISTS for idempotency
|
||||
/if\s+not\s+exists/,
|
||||
// 15. file_metadata table
|
||||
/create\s+table[\s\S]*?file_metadata/,
|
||||
];
|
||||
const matches = signals.filter((r) => r.test(sql));
|
||||
// Require at least 11 of 15 best-practice signals
|
||||
expect(matches.length).toBeGreaterThanOrEqual(11);
|
||||
});
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz for time columns",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (
|
||||
!sql.includes("created_at") &&
|
||||
!sql.includes("updated_at") &&
|
||||
!sql.includes("uploaded_at")
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
const columnDefs = sql.match(
|
||||
/(?:created_at|updated_at|uploaded_at)\s+timestamp\b/g,
|
||||
);
|
||||
if (!columnDefs) return true;
|
||||
return columnDefs.every((def) =>
|
||||
/timestamptz|timestamp\s+with\s+time\s+zone/.test(def),
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "index on file_metadata user_id",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/create\s+index/.test(sql) &&
|
||||
/file_metadata/.test(sql) &&
|
||||
/user_id/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent DDL",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "overall quality score",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const signals = [
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
|
||||
/insert\s+into\s+storage\.buckets[\s\S]*?documents/,
|
||||
/allowed_mime_types/,
|
||||
/file_size_limit/,
|
||||
/storage\.foldername/,
|
||||
/auth\.uid\(\)\s*::\s*text/,
|
||||
/to\s+authenticated/,
|
||||
/to\s+(public|anon)/,
|
||||
/enable\s+row\s+level\s+security/,
|
||||
/on\s+delete\s+cascade/,
|
||||
/\(select\s+auth\.uid\(\)\)/,
|
||||
/create\s+index/,
|
||||
/timestamptz/,
|
||||
/if\s+not\s+exists/,
|
||||
/create\s+table[\s\S]*?file_metadata/,
|
||||
];
|
||||
return signals.filter((r) => r.test(sql)).length >= 11;
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
@@ -1,182 +1,216 @@
|
||||
import { expect, test } from "vitest";
|
||||
export const expectedReferenceFiles = [
|
||||
"db-rls-mandatory.md",
|
||||
"db-rls-policy-types.md",
|
||||
"db-rls-common-mistakes.md",
|
||||
"db-rls-performance.md",
|
||||
"db-security-functions.md",
|
||||
"db-schema-auth-fk.md",
|
||||
"db-schema-timestamps.md",
|
||||
"db-perf-indexes.md",
|
||||
"db-migrations-idempotent.md",
|
||||
];
|
||||
|
||||
import type { EvalAssertion } from "../../src/eval-types.js";
|
||||
|
||||
import { findMigrationFiles, getMigrationSQL } from "../eval-utils.ts";
|
||||
|
||||
test("migration file exists", () => {
|
||||
expect(findMigrationFiles().length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("creates organizations table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?organizations/);
|
||||
});
|
||||
|
||||
test("creates memberships table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?memberships/);
|
||||
});
|
||||
|
||||
test("creates projects table", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+table[\s\S]*?projects/);
|
||||
});
|
||||
|
||||
test("enables RLS on all tables", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
expect(sql).toMatch(
|
||||
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/,
|
||||
);
|
||||
});
|
||||
|
||||
test("FK to auth.users with ON DELETE CASCADE", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// memberships should reference auth.users with cascade delete
|
||||
expect(sql).toMatch(/references\s+auth\.users/);
|
||||
expect(sql).toMatch(/on\s+delete\s+cascade/);
|
||||
});
|
||||
|
||||
test("org_id FK on projects", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// projects should have a foreign key referencing organizations
|
||||
expect(sql).toMatch(
|
||||
/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/,
|
||||
);
|
||||
});
|
||||
|
||||
test("private schema created", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
|
||||
});
|
||||
|
||||
test("security_definer helper function", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Function should be in the private schema with SECURITY DEFINER and search_path = ''
|
||||
expect(sql).toMatch(/private\./);
|
||||
expect(sql).toMatch(/security\s+definer/);
|
||||
expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
|
||||
});
|
||||
|
||||
test("policies use (select auth.uid())", () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
if (policy.includes("auth.uid()")) {
|
||||
// The subselect form: (select auth.uid())
|
||||
expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("policies use TO authenticated", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
expect(policyBlocks.length).toBeGreaterThan(0);
|
||||
for (const policy of policyBlocks) {
|
||||
expect(policy).toMatch(/to\s+authenticated/);
|
||||
}
|
||||
});
|
||||
|
||||
test("index on membership lookup columns", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/create\s+index/);
|
||||
// Should index user_id and/or org_id on memberships for policy lookups
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
const indexesUserOrOrg = indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.includes("user_id") ||
|
||||
idx.includes("org_id") ||
|
||||
idx.includes("organization_id"),
|
||||
);
|
||||
expect(indexesUserOrOrg.length).toBeGreaterThanOrEqual(1);
|
||||
});
|
||||
|
||||
test("uses timestamptz", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Match "timestamp" that is NOT followed by "tz" or "with time zone"
|
||||
const hasPlainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
// Only fail if the migration defines time columns with plain timestamp
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
expect(sql).not.toMatch(hasPlainTimestamp);
|
||||
}
|
||||
});
|
||||
|
||||
test("idempotent DDL", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
expect(sql).toMatch(/if\s+not\s+exists/);
|
||||
});
|
||||
|
||||
test("delete policy restricted to owner role", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
// Look for a delete policy on projects that checks for owner (or admin) role
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const deletePolicy = policyBlocks.find(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("delete") && p.toLowerCase().includes("project"),
|
||||
);
|
||||
expect(deletePolicy).toBeDefined();
|
||||
// The delete policy should check for an owner/admin role
|
||||
expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/);
|
||||
});
|
||||
|
||||
test("overall quality score", () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
// A high-quality migration should contain most of these best-practice signals
|
||||
const signals = [
|
||||
// 1. RLS enabled on all three tables
|
||||
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
export const assertions: EvalAssertion[] = [
|
||||
{
|
||||
name: "migration file exists",
|
||||
check: () => findMigrationFiles().length > 0,
|
||||
},
|
||||
{
|
||||
name: "creates organizations table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?organizations/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
// 2. FK to auth.users with cascade
|
||||
/references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
|
||||
// 3. Private schema created
|
||||
/create\s+schema[\s\S]*?private/.test(sql),
|
||||
// 4. security_definer with search_path
|
||||
/security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql),
|
||||
// 5. Subselect auth.uid()
|
||||
/\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
|
||||
// 6. TO authenticated on policies
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p)),
|
||||
// 7. Indexes on lookup columns
|
||||
/create\s+index/.test(sql),
|
||||
// 8. timestamptz (no plain timestamp)
|
||||
!/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql),
|
||||
// 9. Idempotent DDL
|
||||
/if\s+not\s+exists/.test(sql),
|
||||
// 10. Delete policy checks owner role
|
||||
policyBlocks.some(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("delete") &&
|
||||
p.toLowerCase().includes("project") &&
|
||||
/owner|admin/.test(p.toLowerCase()),
|
||||
),
|
||||
// 11. org_id FK on projects
|
||||
/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
|
||||
// 12. Multiple policies (at least one per table)
|
||||
policyBlocks.length >= 3,
|
||||
// 13. Membership role column exists
|
||||
/role/.test(sql),
|
||||
// 14. Private schema function referenced in policies
|
||||
/private\./.test(sql),
|
||||
];
|
||||
const passed = signals.filter(Boolean).length;
|
||||
expect(passed).toBeGreaterThanOrEqual(10);
|
||||
});
|
||||
},
|
||||
{
|
||||
name: "creates memberships table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?memberships/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "creates projects table",
|
||||
check: () =>
|
||||
/create\s+table[\s\S]*?projects/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "enables RLS on all tables",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
) &&
|
||||
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
|
||||
sql,
|
||||
)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "FK to auth.users with ON DELETE CASCADE",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/references\s+auth\.users/.test(sql) &&
|
||||
/on\s+delete\s+cascade/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "org_id FK on projects",
|
||||
check: () =>
|
||||
/org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(
|
||||
getMigrationSQL().toLowerCase(),
|
||||
),
|
||||
},
|
||||
{
|
||||
name: "private schema created",
|
||||
check: () =>
|
||||
/create\s+schema[\s\S]*?private/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "security_definer helper function",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
return (
|
||||
/private\./.test(sql) &&
|
||||
/security\s+definer/.test(sql) &&
|
||||
/set\s+search_path\s*=\s*''/.test(sql)
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use (select auth.uid())",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
if (policyBlocks.length === 0) return false;
|
||||
for (const policy of policyBlocks) {
|
||||
if (
|
||||
policy.includes("auth.uid()") &&
|
||||
!/\(\s*select\s+auth\.uid\(\)\s*\)/i.test(policy)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "policies use TO authenticated",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
policyBlocks.length > 0 &&
|
||||
policyBlocks.every((p) => /to\s+authenticated/.test(p))
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "index on membership lookup columns",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
if (!/create\s+index/.test(sql)) return false;
|
||||
const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
|
||||
return (
|
||||
indexBlocks.filter(
|
||||
(idx) =>
|
||||
idx.includes("user_id") ||
|
||||
idx.includes("org_id") ||
|
||||
idx.includes("organization_id"),
|
||||
).length >= 1
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "uses timestamptz",
|
||||
check: () => {
|
||||
const rawSql = getMigrationSQL().toLowerCase();
|
||||
const sql = rawSql.replace(/--[^\n]*/g, "");
|
||||
const hasPlainTimestamp =
|
||||
/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
|
||||
if (
|
||||
sql.includes("created_at") ||
|
||||
sql.includes("updated_at") ||
|
||||
sql.includes("_at ")
|
||||
) {
|
||||
return !hasPlainTimestamp.test(sql);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "idempotent DDL",
|
||||
check: () => /if\s+not\s+exists/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "stable or immutable on helper function",
|
||||
check: () =>
|
||||
/\bstable\b|\bimmutable\b/.test(getMigrationSQL().toLowerCase()),
|
||||
},
|
||||
{
|
||||
name: "delete policy restricted to owner role",
|
||||
check: () => {
|
||||
const sql = getMigrationSQL().toLowerCase();
|
||||
const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
|
||||
const deletePolicy = policyBlocks.find(
|
||||
(p) =>
|
||||
p.toLowerCase().includes("delete") &&
|
||||
p.toLowerCase().includes("project"),
|
||||
);
|
||||
if (!deletePolicy) return false;
|
||||
return /owner|admin/.test(deletePolicy.toLowerCase());
|
||||
},
|
||||
},
|
||||
{
  // Aggregate quality gate: evaluates 15 independent best-practice signals
  // against the migration SQL and passes when at least 11 hold. Individual
  // assertions above test these signals separately; this one summarizes.
  name: "overall quality score",
  check: () => {
    const sql = getMigrationSQL().toLowerCase();
    // All CREATE POLICY statements up to their terminating semicolons.
    const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
    const signals = [
      // 1. RLS enabled on all three tables (single combined signal).
      /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ) &&
        /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ) &&
        /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
          sql,
        ),
      // 2. FK to auth.users with cascade delete.
      /references\s+auth\.users/.test(sql) &&
        /on\s+delete\s+cascade/.test(sql),
      // 3. A private schema is created for helper functions.
      /create\s+schema[\s\S]*?private/.test(sql),
      // 4. SECURITY DEFINER paired with an empty search_path.
      /security\s+definer/.test(sql) &&
        /set\s+search_path\s*=\s*''/.test(sql),
      // 5. auth.uid() wrapped in a subselect for per-statement caching.
      /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
      // 6. Every policy is scoped TO authenticated.
      policyBlocks.length > 0 &&
        policyBlocks.every((p) => /to\s+authenticated/.test(p)),
      // 7. At least one index exists.
      /create\s+index/.test(sql),
      // 8. No plain `timestamp` (comments stripped before testing).
      !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(
        sql.replace(/--[^\n]*/g, ""),
      ),
      // 9. Idempotent DDL via IF NOT EXISTS.
      /if\s+not\s+exists/.test(sql),
      // 10. Projects DELETE policy restricted to owner/admin.
      policyBlocks.some(
        (p) =>
          p.toLowerCase().includes("delete") &&
          p.toLowerCase().includes("project") &&
          /owner|admin/.test(p.toLowerCase()),
      ),
      // 11. org_id / organization_id FK back to organizations.
      /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
      // 12. At least three policies defined overall.
      policyBlocks.length >= 3,
      // 13. A role column/concept is present.
      /role/.test(sql),
      // 14. The private schema is actually referenced.
      /private\./.test(sql),
      // 15. Helper function declared STABLE or IMMUTABLE.
      /\bstable\b|\bimmutable\b/.test(sql),
    ];
    // Pass when at least 11 of the 15 signals hold.
    return signals.filter(Boolean).length >= 11;
  },
},
|
||||
];
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.10.0",
|
||||
"tsx": "^4.7.0",
|
||||
"typescript": "^5.3.0",
|
||||
"vitest": "^3.1.0"
|
||||
"typescript": "^5.3.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,5 +6,12 @@
|
||||
| 2 | [team-rls-security-definer](team-rls-security-definer.md) | Team-based RLS with security definer helper in a private schema |
|
||||
| 3 | [storage-rls-user-folders](storage-rls-user-folders.md) | Storage buckets with RLS policies for user-isolated folders |
|
||||
| 4 | [edge-function-hello-world](edge-function-hello-world.md) | Hello-world Edge Function with CORS and shared utilities |
|
||||
| 5 | edge-function-stripe-webhook | Stripe webhook Edge Function with signature verification and orders migration |
|
||||
| 6 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization |
|
||||
| 5 | [collaborative-rooms-realtime](collaborative-rooms-realtime.md) | Collaborative rooms with role-based RLS, broadcast triggers, and Realtime authorization |
|
||||
| 6 | [auth-fk-cascade-delete](auth-fk-cascade-delete.md) | Profiles table with auth.users FK cascade and auto-create trigger |
|
||||
| 7 | [rls-update-needs-select](rls-update-needs-select.md) | Orders table where UPDATE silently fails without a matching SELECT policy |
|
||||
| 8 | [extension-wrong-schema](extension-wrong-schema.md) | pgvector extension setup with correct schema placement, HNSW index, and user-scoped RLS |
|
||||
| 9 | [connection-pooling-prisma](connection-pooling-prisma.md) | Fix Prisma schema to use Supabase transaction-mode pooler (port 6543, pgbouncer=true, directUrl) for serverless deployments |
|
||||
| 10 | [cli-hallucinated-commands](cli-hallucinated-commands.md) | CLI cheat-sheet that must use only real Supabase CLI commands, avoiding hallucinated `supabase functions log` and `supabase db query` |
|
||||
| 11 | [postgrest-schema-cache](postgrest-schema-cache.md) | Add columns and a view to an existing table, with NOTIFY pgrst to reload the PostgREST schema cache |
|
||||
| 12 | [rls-user-metadata-role-check](rls-user-metadata-role-check.md) | Documents table with owner and admin RLS — must use app_metadata not user_metadata for role authorization |
|
||||
| 13 | [service-role-edge-function](service-role-edge-function.md) | Admin Edge Function that bypasses RLS using the service role key via env vars, never hardcoded |
|
||||
|
||||
84
packages/evals/scenarios/auth-fk-cascade-delete.md
Normal file
84
packages/evals/scenarios/auth-fk-cascade-delete.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# Scenario: auth-fk-cascade-delete
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a `profiles` table that references `auth.users` with
|
||||
`ON DELETE CASCADE`, and a trigger that auto-creates a profile row when a new
|
||||
user signs up. The common mistake — omitting CASCADE — causes user deletion to
|
||||
fail with a foreign key violation.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Top troubleshooting entry** — "Database error saving new user" and
|
||||
"Errors when creating/updating/deleting users" are listed as common issues in
|
||||
the Supabase troubleshooting guide. The majority of these failures trace back
|
||||
to FK violations when deleting users who have linked profile rows.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Auth trigger pattern ubiquity** — The `handle_new_user` trigger on
|
||||
`auth.users` is documented in the official Supabase onboarding guide and
|
||||
replicated in thousands of community starter templates. Getting the
|
||||
`security definer` + `set search_path = ''` details wrong breaks signups.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/cascade-deletes
|
||||
3. **Community-reported cascade omission** — Multiple GitHub issues report
|
||||
unexpected FK violation errors when calling `auth.admin.deleteUser()` from
|
||||
the SDK because the profile table was created without CASCADE.
|
||||
- Source: https://github.com/supabase/supabase/issues (multiple community reports of FK violations on `auth.admin.deleteUser()`)
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-schema-auth-fk.md` | ON DELETE CASCADE requirement for auth.users FKs | `REFERENCES auth.users(id) ON DELETE CASCADE` |
|
||||
| `references/db-security-functions.md` | security definer + set search_path = '' for trigger functions | Correct trigger function definition |
|
||||
| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS enabled on profiles |
|
||||
| `references/db-rls-common-mistakes.md` | TO clause and subselect auth.uid() | Correct policy scoping |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Set up a `profiles` table for my Supabase app. Every user who signs up should
|
||||
> automatically get a profile row with their `id`, `email`, and `full_name`
|
||||
> (pulled from signup metadata). The profiles table should go in
|
||||
> `supabase/migrations/` as a SQL migration. Users should only be able to read
|
||||
> and update their own profile.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | creates profiles table | SQL contains `CREATE TABLE` and `profiles` | correctness |
|
||||
| 3 | FK references auth.users | `REFERENCES auth.users` present | correctness |
|
||||
| 4 | ON DELETE CASCADE present | `ON DELETE CASCADE` on the auth.users FK | correctness |
|
||||
| 5 | RLS enabled on profiles | `ALTER TABLE profiles ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 6 | trigger function uses security definer | `SECURITY DEFINER` in the trigger function definition | security |
|
||||
| 7 | trigger function sets search_path | `SET search_path = ''` or `set search_path` in trigger function | security |
|
||||
| 8 | trigger created on auth.users | `CREATE TRIGGER ... ON auth.users` | correctness |
|
||||
| 9 | policies scoped to authenticated | `TO authenticated` in policy definitions | security |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Without the skill, an agent creates the FK
|
||||
without CASCADE and omits `set search_path = ''` on the trigger function —
|
||||
two independently dangerous omissions.
|
||||
2. **Skill value:** `db-schema-auth-fk.md` is explicitly about this exact
|
||||
scenario; `db-security-functions.md` covers the trigger security requirements.
|
||||
3. **Testability:** CASCADE and search_path are simple string patterns. Trigger
|
||||
creation on `auth.users` is a unique structural signal.
|
||||
4. **Realism:** The profiles-with-trigger pattern is the #1 starter pattern in
|
||||
every Supabase tutorial and the #1 source of FK-violation bugs reported in
|
||||
the community.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~35% of assertions expected to pass (table and FK likely, but
|
||||
no CASCADE, no search_path, weak policies)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
@@ -85,8 +85,9 @@ specific quality signal:
|
||||
| 8 | TO authenticated | Policies scoped to authenticated role | security |
|
||||
| 9 | timestamptz | No plain `timestamp` for time columns | correctness |
|
||||
| 10 | index on user_id | `CREATE INDEX` on the FK column | performance |
|
||||
| 11 | IF NOT EXISTS | Idempotent migration | idempotency |
|
||||
| 12 | overall quality | At least 4/5 best-practice signals present | overall |
|
||||
| 11 | no SERIAL/BIGSERIAL | PK does not use error-prone serial type | correctness |
|
||||
| 12 | IF NOT EXISTS | Idempotent migration | idempotency |
|
||||
| 13 | overall quality | At least 4/5 best-practice signals present | overall |
|
||||
|
||||
## Reasoning
|
||||
|
||||
@@ -121,4 +122,5 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
**Rating:** EASY
|
||||
|
||||
- Without skill: ~50-65% of assertions expected to pass
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- **pass_threshold:** 10
|
||||
120
packages/evals/scenarios/cli-hallucinated-commands.md
Normal file
120
packages/evals/scenarios/cli-hallucinated-commands.md
Normal file
@@ -0,0 +1,120 @@
|
||||
# Scenario: cli-hallucinated-commands
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a Supabase CLI reference cheat-sheet (`CLI_REFERENCE.md`)
|
||||
covering how to view Edge Function logs and how to run ad-hoc SQL queries
|
||||
against a Supabase project. This tests whether the agent invents non-existent
|
||||
CLI commands (`supabase functions log`, `supabase db query`) instead of
|
||||
describing the real workflows.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **`supabase functions log` is a persistent hallucination** — LLMs frequently
|
||||
suggest `supabase functions log` (singular) or `supabase functions logs` as
|
||||
CLI commands to stream deployed function logs. Neither command exists in the
|
||||
Supabase CLI. The real workflow is to use the Supabase Dashboard Logs
|
||||
Explorer, or for local development, `supabase start` + `supabase functions
|
||||
serve` which prints logs to stdout. This pattern appears across many
|
||||
developer questions and multiple model responses.
|
||||
- Source: https://supabase.com/docs/reference/cli/supabase-functions
|
||||
|
||||
2. **`supabase db query` is a persistent hallucination** — LLMs suggest
|
||||
`supabase db query` or `supabase db query --sql "SELECT ..."` as a way to
|
||||
run ad-hoc SQL via the CLI. This command does not exist. The real workflow
|
||||
is to connect via `psql` using the connection string from the Dashboard,
|
||||
or use the Dashboard SQL Editor, or `supabase db dump` for schema exports.
|
||||
- Source: https://supabase.com/docs/reference/cli/supabase-db
|
||||
|
||||
3. **Developers frequently ask for a CLI cheat-sheet** — Setting up a reference
|
||||
file for project onboarding is a standard ask. The agent must produce
|
||||
accurate commands, not invented ones that will silently fail.
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
Which reference files the agent should consult and what each teaches:
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/dev-getting-started.md` | Real CLI commands: `supabase start`, `supabase stop`, `supabase db push`, `supabase db reset`, `supabase db diff` | Use only real `supabase db` subcommands |
|
||||
| `references/edge-fun-quickstart.md` | Real Edge Function CLI: `supabase functions new`, `supabase functions serve`, `supabase functions deploy` | Use real function commands, not invented log commands |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
What the workspace starts with before the agent runs:
|
||||
|
||||
- A pre-initialized `supabase/config.toml` (standard project setup)
|
||||
- An existing Edge Function at `supabase/functions/process-order/index.ts`
|
||||
- The agent is expected to create `CLI_REFERENCE.md` in the project root
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
The prompt to give the agent. Written as a developer would ask it — no hints
|
||||
about what the tests check:
|
||||
|
||||
> I'm onboarding a new developer to my Supabase project. Create a
|
||||
> `CLI_REFERENCE.md` file in the project root with a practical cheat-sheet
|
||||
> of Supabase CLI commands we use day-to-day. It should cover:
|
||||
>
|
||||
> 1. Starting and stopping the local dev stack
|
||||
> 2. Managing database migrations (push, reset, diff)
|
||||
> 3. Working with the `process-order` Edge Function (local dev and deploy)
|
||||
> 4. How to view Edge Function logs (both local dev and production)
|
||||
> 5. How to run ad-hoc SQL queries against the database (local and remote)
|
||||
>
|
||||
> Include the actual commands with brief explanations.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
What the eval assertions should check in the agent's output. Each assertion tests a
|
||||
specific quality signal:
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | CLI_REFERENCE.md exists | `CLI_REFERENCE.md` file exists in project root | structure |
|
||||
| 2 | no hallucinated functions log command | File does NOT contain `supabase functions log` (without 's' as a complete command) | correctness |
|
||||
| 3 | no hallucinated db query command | File does NOT contain `supabase db query` | correctness |
|
||||
| 4 | mentions supabase functions serve for local | File contains `supabase functions serve` | correctness |
|
||||
| 5 | mentions supabase functions deploy | File contains `supabase functions deploy` | correctness |
|
||||
| 6 | mentions psql or connection string for SQL | File contains `psql` or `connection string` or `SQL Editor` or `db dump` | correctness |
|
||||
| 7 | mentions supabase db push or reset | File contains `supabase db push` or `supabase db reset` | correctness |
|
||||
| 8 | mentions supabase start | File contains `supabase start` | correctness |
|
||||
| 9 | mentions Dashboard for production logs | File mentions `Dashboard` or `Logs Explorer` for production log viewing | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
Step-by-step reasoning for why this scenario is well-designed:
|
||||
|
||||
1. **Baseline differentiator:** An agent without the skill is very likely to
|
||||
hallucinate both `supabase functions log` and `supabase db query` since
|
||||
these are plausible-sounding commands that follow the CLI's pattern.
|
||||
Multiple real-world LLM responses have included these exact commands. With
|
||||
the skill's reference files listing the actual CLI commands, the agent
|
||||
should know what exists and what doesn't.
|
||||
|
||||
2. **Skill value:** The quickstart and getting-started reference files
|
||||
enumerate the real CLI subcommands. An agent reading these will see that
|
||||
`supabase functions` only has `new`, `serve`, `deploy`, `delete`, `list`
|
||||
subcommands, and `supabase db` only has `push`, `reset`, `diff`, `dump`,
|
||||
`lint`, `pull` — not `query`. This directly prevents the hallucination.
|
||||
|
||||
3. **Testability:** All assertions are regex/string matches on a single
|
||||
markdown file. No runtime execution or migration parsing needed. Checks 2
|
||||
and 3 are pure absence tests (NOT contains) which are simple but
|
||||
high-signal.
|
||||
|
||||
4. **Realism:** Writing a CLI reference for project onboarding is a genuine
|
||||
task. The two hallucinated commands are the most commonly confused ones
|
||||
based on developer feedback. Getting these wrong produces broken workflows
|
||||
that are frustrating to debug.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** EASY
|
||||
|
||||
- Without skill: ~30-50% of assertions expected to pass (likely fails checks
|
||||
2 and/or 3 due to hallucination, may also miss Dashboard mention for logs)
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- **pass_threshold:** 9
|
||||
@@ -154,3 +154,4 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
|
||||
- Without skill: ~25-40% of assertions expected to pass
|
||||
- With skill: ~80-90% of assertions expected to pass
|
||||
- **pass_threshold:** 17
|
||||
|
||||
80
packages/evals/scenarios/connection-pooling-prisma.md
Normal file
80
packages/evals/scenarios/connection-pooling-prisma.md
Normal file
@@ -0,0 +1,80 @@
|
||||
# Scenario: connection-pooling-prisma
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must produce a `DATABASE_URL` configuration and Prisma schema setup
|
||||
that correctly uses Supabase's transaction-mode pooler (port 6543) with the
|
||||
`?pgbouncer=true` parameter to disable prepared statements. Without this, Prisma
|
||||
throws "prepared statement already exists" errors in serverless environments.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Top troubleshooting entry** — "Error: prepared statement XXX already exists"
|
||||
is listed in the Supabase troubleshooting guide under Database Issues as a
|
||||
direct consequence of using transaction-mode pooling without disabling
|
||||
prepared statements.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Serverless deployment reality** — Vercel and other serverless platforms
|
||||
are the most popular Supabase deployment targets. Each function invocation
|
||||
creates a new connection, making transaction-mode pooling mandatory. The
|
||||
Prisma + Supabase combination is the most-searched configuration pairing.
|
||||
- Source: https://supabase.com/docs/guides/database/connecting-to-postgres
|
||||
3. **Connection exhaustion** — Using session mode (port 5432) in serverless
|
||||
leads to "remaining connection slots are reserved" errors — another top
|
||||
troubleshooting entry. The fix requires switching to port 6543.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-conn-pooling.md` | Transaction mode port 6543, pgbouncer=true for Prisma | Correct DATABASE_URL with port 6543 and ?pgbouncer=true |
|
||||
| `references/db-migrations-idempotent.md` | Migration file conventions and naming | Migration file in supabase/migrations/ |
|
||||
| `references/db-schema-auth-fk.md` | Schema best practices for user-linked tables | Proper FK patterns if schema is involved |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- A workspace with `supabase/config.toml` already initialized
|
||||
- A `prisma/schema.prisma` starter file with a placeholder `DATABASE_URL` using
|
||||
direct connection (port 5432, no pgbouncer flag)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> I'm deploying my Supabase app on Vercel using Prisma. I keep getting
|
||||
> "prepared statement already exists" errors in production. My current
|
||||
> `DATABASE_URL` in `prisma/schema.prisma` uses the direct connection string.
|
||||
> Fix the Prisma configuration so it works correctly with Supabase's connection
|
||||
> pooler.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | prisma schema references pooler port | `DATABASE_URL` or connection hint references port `6543` | correctness |
|
||||
| 2 | pgbouncer=true param present | `?pgbouncer=true` or `pgbouncer=true` in the connection URL or env comment | correctness |
|
||||
| 3 | DIRECT_URL provided for migrations | A separate `directUrl` or `DIRECT_URL` variable defined for Prisma migrations | correctness |
|
||||
| 4 | connection limit set to 1 | `connection_limit=1` in the pooler URL or Prisma datasource | performance |
|
||||
| 5 | explanation distinguishes port 6543 vs 5432 | Output or comments distinguish transaction mode (6543) from direct (5432) | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** An agent without the skill typically updates
|
||||
the port or adds pgbouncer but forgets `DIRECT_URL` for migrations, or sets
|
||||
`max` connections too high, or uses session mode instead of transaction mode.
|
||||
2. **Skill value:** `db-conn-pooling.md` provides the exact pattern: port 6543,
|
||||
`?pgbouncer=true`, `max: 1` per serverless instance.
|
||||
3. **Testability:** Port numbers and query parameters are directly readable as
|
||||
string patterns in the output files.
|
||||
4. **Realism:** "Prisma prepared statement already exists on Supabase" is one
|
||||
of the most-searched Supabase error messages on Stack Overflow and GitHub.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~30% of assertions expected to pass (agent may change port but
|
||||
likely misses pgbouncer param and DIRECT_URL)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 7
|
||||
@@ -127,3 +127,4 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
|
||||
- Without skill: ~45-60% of assertions expected to pass
|
||||
- With skill: ~90-100% of assertions expected to pass
|
||||
- **pass_threshold:** 13
|
||||
|
||||
89
packages/evals/scenarios/extension-wrong-schema.md
Normal file
89
packages/evals/scenarios/extension-wrong-schema.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# Scenario: extension-wrong-schema
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a migration that enables the `pgvector` extension and
|
||||
creates an `embeddings` table with a vector column and an HNSW index. The trap
|
||||
is installing the extension in the `public` schema (the default) instead of
|
||||
the `extensions` schema, and using IVFFlat without a `lists` parameter.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Known schema pollution issue** — Installing extensions in `public` exposes
|
||||
extension functions and types through the PostgREST API, which can reveal
|
||||
internal details and cause "42501: permission denied" errors. The Supabase
|
||||
troubleshooting guide covers permission errors as a category.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **IVFFlat without lists = error** — The Supabase troubleshooting guide
|
||||
contains a dedicated entry: "Increase vector lookup speeds by applying an
|
||||
HNSW index" which warns against IVFFlat and notes its required `lists`
|
||||
parameter. Missing this causes a CREATE INDEX error.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
3. **pgvector adoption** — Vector/AI embeddings are the fastest-growing
|
||||
Supabase use case. Nearly every AI-powered Supabase project starts with
|
||||
the pgvector extension setup. Getting the schema right from the start
|
||||
prevents later schema drift.
|
||||
- Source: https://supabase.com/docs/guides/database/extensions/pgvector
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-schema-extensions.md` | Install extensions in `extensions` schema, not `public`; HNSW over IVFFlat; IVFFlat needs `lists` | `CREATE EXTENSION ... WITH SCHEMA extensions`; HNSW index |
|
||||
| `references/db-rls-mandatory.md` | Enable RLS on all public tables | RLS on embeddings table |
|
||||
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for extensions and tables | `CREATE EXTENSION IF NOT EXISTS` |
|
||||
| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | User-linked embeddings |
|
||||
| `references/db-rls-common-mistakes.md` | TO authenticated, subselect auth.uid() | Policy correctness |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> I'm building a semantic search feature. Create a migration that:
|
||||
> 1. Enables the pgvector extension
|
||||
> 2. Creates a `documents` table with an `embedding` column (1536 dimensions
|
||||
> for OpenAI ada-002), a `content` text column, and a `user_id`
|
||||
> 3. Adds a vector similarity search index
|
||||
> 4. Users should only see their own documents
|
||||
> Put the migration in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | extension in extensions schema | `WITH SCHEMA extensions` in the CREATE EXTENSION statement | correctness |
|
||||
| 3 | IF NOT EXISTS on extension | `CREATE EXTENSION IF NOT EXISTS` | idempotency |
|
||||
| 4 | vector column with correct dimensions | `vector(1536)` or `extensions.vector(1536)` in table | correctness |
|
||||
| 5 | HNSW index used not IVFFlat | `USING hnsw` present in CREATE INDEX | correctness |
|
||||
| 6 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 7 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
|
||||
| 8 | policies TO authenticated | `TO authenticated` in policy definitions | security |
|
||||
| 9 | idempotent table creation | `CREATE TABLE IF NOT EXISTS` | idempotency |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill write `CREATE
|
||||
EXTENSION vector;` (wrong schema), use IVFFlat (wrong index type for most
|
||||
cases), and skip the `lists` parameter requirement.
|
||||
2. **Skill value:** `db-schema-extensions.md` explicitly shows the `WITH
|
||||
SCHEMA extensions` pattern and recommends HNSW over IVFFlat with the
|
||||
specific note about `lists` being required for IVFFlat.
|
||||
3. **Testability:** Schema placement in the extension creation line and index
|
||||
type are directly checkable with regex.
|
||||
4. **Realism:** pgvector + OpenAI embeddings is the top "AI + Supabase"
|
||||
tutorial path, and extension schema mistakes are a documented source of
|
||||
permission errors.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~35% of assertions expected to pass (extension enabled but
|
||||
wrong schema, wrong index type, weak policies)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
89
packages/evals/scenarios/postgrest-schema-cache.md
Normal file
89
packages/evals/scenarios/postgrest-schema-cache.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# Scenario: postgrest-schema-cache
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a migration that adds new columns to an existing table
|
||||
and create a view that uses those columns, including the correct `NOTIFY
|
||||
pgrst, 'reload schema'` call to force PostgREST to pick up the schema changes.
|
||||
Without this, the API returns 400 errors for the new columns even after
|
||||
migration.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Direct troubleshooting entry** — "PostgREST not recognizing new columns,
|
||||
tables, views or functions" and "Reload/refresh postgrest schema" (400
|
||||
bad_request error) are explicitly listed in the Supabase troubleshooting
|
||||
guide. This is among the most confusing errors for new Supabase developers —
|
||||
the migration ran successfully but the API still returns errors.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Schema cache invalidation** — PostgREST caches the database schema at
|
||||
startup and reloads it only when notified. Migrations that add new objects
|
||||
must explicitly call `NOTIFY pgrst, 'reload schema'` at the end of the
|
||||
migration file for the changes to be reflected immediately in local
|
||||
development.
|
||||
- Source: https://supabase.com/docs/guides/api/rest/generating-types
|
||||
3. **Views and RLS** — Creating a view over a user-owned table requires
|
||||
understanding that RLS applies to the underlying tables, and the view itself
|
||||
should use `security_invoker = true` to preserve RLS context.
|
||||
- Source: https://supabase.com/docs/guides/database/views
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-rls-views.md` | Views need security_invoker to respect RLS | `WITH (security_invoker = true)` on view |
|
||||
| `references/db-migrations-idempotent.md` | ADD COLUMN IF NOT EXISTS; IF NOT EXISTS patterns | Idempotent column additions |
|
||||
| `references/db-rls-mandatory.md` | RLS on base tables | RLS enabled on base table |
|
||||
| `references/db-rls-performance.md` | (select auth.uid()) subselect | Subselect form in policies |
|
||||
| `references/db-schema-timestamps.md` | timestamptz for new columns | timestamptz on added columns |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- A workspace with `supabase/config.toml` and a starter migration that creates
|
||||
a basic `products` table (id, name, price) with RLS enabled but no policies.
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Our `products` table needs two new columns: `description` (text) and
|
||||
> `published_at` (timestamp). Also create a view called `public_products` that
|
||||
> shows only products where `published_at` is not null. Add a policy so any
|
||||
> authenticated user can view published products. Put changes in a new
|
||||
> migration file in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | new migration file exists | A second `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | ADD COLUMN IF NOT EXISTS for description | `ADD COLUMN IF NOT EXISTS description` | idempotency |
|
||||
| 3 | ADD COLUMN IF NOT EXISTS for published_at | `ADD COLUMN IF NOT EXISTS published_at` | idempotency |
|
||||
| 4 | published_at uses timestamptz | `published_at timestamptz` not plain `timestamp` | correctness |
|
||||
| 5 | view created | `CREATE OR REPLACE VIEW public_products` or similar | correctness |
|
||||
| 6 | view uses security_invoker | `security_invoker = true` on the view | security |
|
||||
| 7 | SELECT policy on products for authenticated | A FOR SELECT policy on products with TO authenticated | security |
|
||||
| 8 | NOTIFY pgrst reload present | `NOTIFY pgrst` in the migration | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill add columns correctly
|
||||
but miss `IF NOT EXISTS`, use plain `timestamp`, forget `security_invoker`
|
||||
on the view, and almost never include the `NOTIFY pgrst` call.
|
||||
2. **Skill value:** The NOTIFY pattern and security_invoker requirement are
|
||||
non-obvious details that the reference files teach explicitly.
|
||||
3. **Testability:** `NOTIFY pgrst` is a unique string that either appears or
|
||||
doesn't; `security_invoker` is similarly specific.
|
||||
4. **Realism:** Iterative schema evolution (adding columns to existing tables)
|
||||
is the most common database task after initial setup, and the PostgREST
|
||||
cache invalidation issue is a universal source of confusion.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~40% of assertions expected to pass (columns added and view
|
||||
created, but no IF NOT EXISTS, wrong timestamp type, no NOTIFY, no
|
||||
security_invoker)
|
||||
- With skill: ~88% of assertions expected to pass
|
||||
- **pass_threshold:** 7
|
||||
85
packages/evals/scenarios/rls-update-needs-select.md
Normal file
85
packages/evals/scenarios/rls-update-needs-select.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# Scenario: rls-update-needs-select
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must write a migration for an `orders` table where users can view and
|
||||
update only their own orders. The classic trap is writing an UPDATE policy
|
||||
without a matching SELECT policy — causing UPDATE to silently affect zero rows
|
||||
because RLS cannot find any rows to update.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **"Why is my UPDATE returning empty data?"** — The Supabase troubleshooting
|
||||
guide lists "Why is my select returning an empty data array and I have data
|
||||
in the table?" which is the same root symptom. UPDATE with no SELECT policy
|
||||
silently returns `{data: [], count: 0}` with no error, making it extremely
|
||||
hard to diagnose.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Documented RLS behavior** — The official RLS docs state that UPDATE
|
||||
requires a SELECT policy to identify which rows are accessible for
|
||||
modification. This is non-obvious and contradicts most developers'
|
||||
expectations from SQL semantics.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
|
||||
3. **WITH CHECK requirement** — An UPDATE policy also needs a `WITH CHECK`
|
||||
clause to prevent users from updating a row to a state that would no longer
|
||||
be visible to them (e.g., changing their own `user_id`). Missing this allows
|
||||
data ownership hijacking.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-rls-common-mistakes.md` | UPDATE needs a SELECT policy; WITH CHECK clause | Separate SELECT and UPDATE policies, WITH CHECK |
|
||||
| `references/db-rls-policy-types.md` | USING vs WITH CHECK semantics | WITH CHECK on UPDATE policy |
|
||||
| `references/db-rls-performance.md` | (select auth.uid()) subquery caching | Subselect form in all USING/WITH CHECK |
|
||||
| `references/db-rls-mandatory.md` | Enable RLS, TO authenticated | Full mandatory boilerplate |
|
||||
| `references/db-schema-timestamps.md` | timestamptz for time columns | timestamptz not timestamp |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Create a migration for an `orders` table. Each order has a `status` (text),
|
||||
> `total` (numeric), and `created_at`. Orders belong to users. Users should be
|
||||
> able to view their own orders and update the status of their own orders.
|
||||
> Put the migration in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | creates orders table | SQL contains `CREATE TABLE` and `orders` | correctness |
|
||||
| 3 | RLS enabled | `ALTER TABLE orders ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 4 | has SELECT policy | A `FOR SELECT` policy exists on orders | correctness |
|
||||
| 5 | has UPDATE policy with WITH CHECK | A `FOR UPDATE` policy with `WITH CHECK` clause exists | correctness |
|
||||
| 6 | all policies TO authenticated | Every `CREATE POLICY` has `TO authenticated` | security |
|
||||
| 7 | uses (select auth.uid()) | Subselect form in policy USING clauses | performance |
|
||||
| 8 | uses timestamptz not timestamp | `created_at timestamptz` not plain `timestamp` | correctness |
|
||||
| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Without the skill, agents write only an UPDATE
|
||||
policy (or a single ALL policy), skip WITH CHECK, and use bare `auth.uid()`
|
||||
calls. The result is a migration that looks complete but breaks silently.
|
||||
2. **Skill value:** `db-rls-common-mistakes.md` explicitly covers this
|
||||
UPDATE-needs-SELECT behavior with working examples.
|
||||
3. **Testability:** The presence of both `FOR SELECT` and `FOR UPDATE` with
|
||||
`WITH CHECK` is directly detectable via regex on the SQL.
|
||||
4. **Realism:** "My UPDATE isn't working, returns empty" is among the most
|
||||
common questions from developers new to RLS in the Supabase community.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~40% of assertions expected to pass (table and RLS likely,
|
||||
but wrong policy structure)
|
||||
- With skill: ~92% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
85
packages/evals/scenarios/rls-user-metadata-role-check.md
Normal file
85
packages/evals/scenarios/rls-user-metadata-role-check.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# Scenario: rls-user-metadata-role-check
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must write a migration for a `documents` table where admin users can
|
||||
read all documents and regular users can only read their own. The dangerous
|
||||
trap is checking `user_metadata` for the admin role — users can write to their
|
||||
own `user_metadata`, so this check is bypassable. The correct pattern uses
|
||||
`app_metadata`.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Explicit troubleshooting + security entry** — The Supabase troubleshooting
|
||||
guide covers "Database API 42501 errors" related to auth claims and RLS.
|
||||
Using user_metadata for authorization is one of the most dangerous patterns,
|
||||
documented as a common mistake in the Supabase RLS guides.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Privilege escalation vulnerability** — Any authenticated user can call
|
||||
`supabase.auth.updateUser({ data: { role: 'admin' } })` to set their own
|
||||
`user_metadata`. An RLS policy checking `user_metadata->>'role' = 'admin'`
|
||||
gives every user admin access to all documents.
|
||||
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
|
||||
3. **app_metadata is server-only** — `app_metadata` can only be set via the
|
||||
Admin API or auth hooks, making it safe for authorization. This distinction
|
||||
is taught in the skill but frequently missed by developers.
|
||||
- Source: https://supabase.com/docs/guides/auth/managing-user-data
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-rls-common-mistakes.md` | app_metadata not user_metadata for authorization | `auth.jwt() -> 'app_metadata' ->> 'role'` |
|
||||
| `references/db-rls-policy-types.md` | PERMISSIVE policies combine with OR; multiple policies for different roles | Separate owner and admin policies |
|
||||
| `references/db-rls-performance.md` | (select auth.uid()) subquery; (select auth.jwt()) caching | Subselect form for JWT lookups |
|
||||
| `references/db-rls-mandatory.md` | RLS enabled, TO authenticated | Full boilerplate |
|
||||
| `references/db-schema-auth-fk.md` | FK to auth.users with CASCADE | Correct user linkage |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml` (no migrations)
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Create a migration for a `documents` table. Each document has a `title`
|
||||
> (text), `content` (text), and an owner. Regular users can only see their own
|
||||
> documents. Admin users (identified by a role field in their JWT) should be
|
||||
> able to see all documents. Put the migration in `supabase/migrations/`.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | migration file exists | At least one `.sql` file in `supabase/migrations/` | structure |
|
||||
| 2 | creates documents table | SQL contains `CREATE TABLE` and `documents` | correctness |
|
||||
| 3 | RLS enabled | `ALTER TABLE documents ENABLE ROW LEVEL SECURITY` | security |
|
||||
| 4 | uses app_metadata not user_metadata | JWT role check uses `app_metadata` not `user_metadata` | security |
|
||||
| 5 | no user_metadata role check | `user_metadata` does not appear in policy USING clauses | security |
|
||||
| 6 | two separate policies or one covering both | At least one SELECT policy for owner and one for admin role | correctness |
|
||||
| 7 | TO authenticated on all policies | `TO authenticated` in every policy | security |
|
||||
| 8 | (select auth.uid()) subselect form | Subselect form used not bare auth.uid() | performance |
|
||||
| 9 | FK to auth.users with CASCADE | `REFERENCES auth.users ... ON DELETE CASCADE` | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill almost universally
|
||||
reach for `user_metadata` when asked about "a role field in their JWT" —
|
||||
it is the more discoverable but dangerous field. Only the skill explicitly
|
||||
flags this as an authorization anti-pattern.
|
||||
2. **Skill value:** `db-rls-common-mistakes.md` section 2 directly addresses
|
||||
this with the exact `app_metadata` pattern.
|
||||
3. **Testability:** Checking for `app_metadata` presence and `user_metadata`
|
||||
absence in policy USING clauses is a precise regex assertion.
|
||||
4. **Realism:** Role-based access in a multi-tenant app is one of the most
|
||||
common RLS patterns requested, and the metadata confusion is universal.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~30% of assertions expected to pass (table and RLS likely,
|
||||
but user_metadata used, subselect missing)
|
||||
- With skill: ~90% of assertions expected to pass
|
||||
- **pass_threshold:** 8
|
||||
86
packages/evals/scenarios/service-role-edge-function.md
Normal file
86
packages/evals/scenarios/service-role-edge-function.md
Normal file
@@ -0,0 +1,86 @@
|
||||
# Scenario: service-role-edge-function
|
||||
|
||||
## Summary
|
||||
|
||||
The agent must create a simple Edge Function that performs an admin operation
|
||||
(listing all users' records) using the service role key server-side, while
|
||||
a companion migration shows the table uses the anon key for browser access.
|
||||
The trap is accidentally exposing the service role key or using it in
|
||||
client-facing code.
|
||||
|
||||
## Real-World Justification
|
||||
|
||||
Why this is a common and important workflow:
|
||||
|
||||
1. **Dedicated troubleshooting entry** — The Supabase troubleshooting guide
|
||||
contains "Why is my service role key client getting RLS errors or not
|
||||
returning data?" — developers incorrectly use the service role key in
|
||||
contexts where it should not be used, or use the anon key where service role
|
||||
is needed.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
2. **Most dangerous Supabase mistake** — Exposing the service role key in
|
||||
browser JavaScript bypasses all RLS and gives every visitor full database
|
||||
access. This appears in multiple Supabase blog posts and community warnings.
|
||||
- Source: https://supabase.com/docs/guides/api/api-keys
|
||||
3. **Environment variable leakage** — The troubleshooting guide warns about
|
||||
"Inspecting edge function environment variables" as a debugging topic.
|
||||
Developers must use `Deno.env.get()` not hardcoded keys, and never use
|
||||
`NEXT_PUBLIC_` prefix for the service role key.
|
||||
- Source: https://supabase.com/docs/guides/troubleshooting
|
||||
|
||||
## Skill References Exercised
|
||||
|
||||
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||
|---|---|---|
|
||||
| `references/db-security-service-role.md` | Never expose service role key in browser, use env vars | `Deno.env.get('SUPABASE_SERVICE_ROLE_KEY')` in edge function |
|
||||
| `references/edge-fun-quickstart.md` | Edge function file structure and exports | Correct `index.ts` in `supabase/functions/` |
|
||||
| `references/edge-db-supabase-client.md` | Creating supabase client in edge functions | `createClient` with service role for admin ops |
|
||||
| `references/edge-pat-cors.md` | CORS headers for browser requests | CORS on the response |
|
||||
| `references/edge-pat-error-handling.md` | Error responses | Proper error handling |
|
||||
|
||||
## Workspace Setup
|
||||
|
||||
- Empty workspace with a pre-initialized `supabase/config.toml`
|
||||
- A migration creating a `reports` table already exists in `supabase/migrations/`
|
||||
|
||||
## Agent Task (PROMPT.md draft)
|
||||
|
||||
> Create an Edge Function called `admin-reports` that returns all rows from
|
||||
> the `reports` table, bypassing RLS (it's an admin-only endpoint). The
|
||||
> function should be in `supabase/functions/admin-reports/index.ts`. Use
|
||||
> environment variables for any keys — do not hardcode them.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
| # | Test Name | What It Checks | Quality Dimension |
|
||||
|---|-----------|----------------|-------------------|
|
||||
| 1 | edge function file exists | `supabase/functions/admin-reports/index.ts` exists | structure |
|
||||
| 2 | uses Deno.env.get for service key | `Deno.env.get` used to retrieve the service role key | security |
|
||||
| 3 | no hardcoded service role key | No JWT-like string literal (`eyJ`) as the service role value | security |
|
||||
| 4 | createClient called with service role | `createClient` receives the service role env var as second arg | correctness |
|
||||
| 5 | service role key not NEXT_PUBLIC prefixed | No `NEXT_PUBLIC_` prefix on service role variable name | security |
|
||||
| 6 | CORS headers present | `Access-Control-Allow-Origin` in response headers | correctness |
|
||||
| 7 | returns JSON response | `Response` with JSON body and content-type | correctness |
|
||||
|
||||
## Reasoning
|
||||
|
||||
1. **Baseline differentiator:** Agents without the skill sometimes hardcode a
|
||||
placeholder key string, forget CORS, or use the wrong env variable name
|
||||
pattern.
|
||||
2. **Skill value:** `db-security-service-role.md` is explicit about env var
|
||||
naming rules and the `NEXT_PUBLIC_` anti-pattern. `edge-fun-quickstart.md`
|
||||
teaches the Deno.env.get pattern.
|
||||
3. **Testability:** Checking for `eyJ` hardcoded strings and `NEXT_PUBLIC_`
|
||||
prefixes are reliable negative assertions. `Deno.env.get` is a positive
|
||||
string check.
|
||||
4. **Realism:** Admin Edge Functions that bypass RLS are an extremely common
|
||||
pattern for dashboards and data exports.
|
||||
|
||||
## Difficulty
|
||||
|
||||
**Rating:** EASY
|
||||
|
||||
- Without skill: ~50% of assertions expected to pass (file exists, createClient
|
||||
present, but key handling likely wrong)
|
||||
- With skill: ~93% of assertions expected to pass
|
||||
- **pass_threshold:** 6
|
||||
@@ -141,4 +141,5 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~30-45% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- **pass_threshold:** 14
|
||||
@@ -100,8 +100,9 @@ specific quality signal:
|
||||
| 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance |
|
||||
| 13 | uses timestamptz | No plain `timestamp` for time columns | correctness |
|
||||
| 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency |
|
||||
| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
|
||||
| 16 | overall quality score | At least 10/14 best-practice signals present | overall |
|
||||
| 15 | stable or immutable on helper function | Helper function marked STABLE or IMMUTABLE for performance | performance |
|
||||
| 16 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
|
||||
| 17 | overall quality score | At least 11/15 best-practice signals present | overall |
|
||||
|
||||
## Reasoning
|
||||
|
||||
@@ -136,4 +137,5 @@ Step-by-step reasoning for why this scenario is well-designed:
|
||||
**Rating:** MEDIUM
|
||||
|
||||
- Without skill: ~35-50% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- With skill: ~85-95% of assertions expected to pass
|
||||
- **pass_threshold:** 13
|
||||
21
packages/evals/src/eval-types.ts
Normal file
21
packages/evals/src/eval-types.ts
Normal file
@@ -0,0 +1,21 @@
|
||||
/**
 * A single assertion to run against the agent's workspace output.
 *
 * Used by EVAL.ts files to declare what the agent's work should produce.
 * The runner executes these in-process (no test framework required).
 */
export interface EvalAssertion {
  /** Human-readable name shown in Braintrust and local output */
  name: string;
  /** Return true = pass; returning false or throwing = fail. May be async. */
  check: () => boolean | Promise<boolean>;
  /** Timeout in ms for async checks (default: no timeout) */
  timeout?: number;
}
|
||||
|
||||
/** Result of running a single EvalAssertion */
export interface AssertionResult {
  /** Name copied verbatim from the assertion that produced this result */
  name: string;
  /** True when the check returned a truthy value without throwing or timing out */
  passed: boolean;
  /** Stringified error when the check threw or timed out; absent on success */
  error?: string;
}
|
||||
@@ -1,11 +1,8 @@
|
||||
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { join, resolve } from "node:path";
|
||||
import type { AssertionResult, EvalAssertion } from "./eval-types.js";
|
||||
import { runAgent } from "./runner/agent.js";
|
||||
import {
|
||||
initBraintrustLogger,
|
||||
logScenarioToLogger,
|
||||
uploadToBraintrust,
|
||||
} from "./runner/braintrust.js";
|
||||
import { uploadToBraintrust } from "./runner/braintrust.js";
|
||||
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
||||
import { preflight } from "./runner/preflight.js";
|
||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||
@@ -22,7 +19,6 @@ import {
|
||||
startSupabase,
|
||||
stopSupabase,
|
||||
} from "./runner/supabase-setup.js";
|
||||
import { runTests } from "./runner/test.js";
|
||||
import {
|
||||
buildTranscriptSummary,
|
||||
type TranscriptSummary,
|
||||
@@ -92,6 +88,40 @@ function getPassThreshold(scenarioId: string): number | null {
|
||||
return match ? Number.parseInt(match[1], 10) : null;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// In-process assertion runner (replaces vitest subprocess)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async function runAssertions(
|
||||
assertions: EvalAssertion[],
|
||||
): Promise<AssertionResult[]> {
|
||||
return Promise.all(
|
||||
assertions.map(async (a) => {
|
||||
try {
|
||||
let result: boolean;
|
||||
if (a.timeout) {
|
||||
const timeoutPromise = new Promise<never>((_, reject) =>
|
||||
setTimeout(
|
||||
() =>
|
||||
reject(new Error(`Assertion timed out after ${a.timeout}ms`)),
|
||||
a.timeout,
|
||||
),
|
||||
);
|
||||
result = await Promise.race([
|
||||
Promise.resolve(a.check()),
|
||||
timeoutPromise,
|
||||
]);
|
||||
} else {
|
||||
result = await Promise.resolve(a.check());
|
||||
}
|
||||
return { name: a.name, passed: Boolean(result) };
|
||||
} catch (e) {
|
||||
return { name: a.name, passed: false, error: String(e) };
|
||||
}
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Run a single eval
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -106,18 +136,28 @@ async function runEval(
|
||||
|
||||
console.log(`\n--- ${scenario.id} (${variant}) ---`);
|
||||
|
||||
// Load assertions and expected reference files from EVAL.ts
|
||||
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
|
||||
? join(evalDir, "EVAL.tsx")
|
||||
: join(evalDir, "EVAL.ts");
|
||||
|
||||
const {
|
||||
assertions = [] as EvalAssertion[],
|
||||
expectedReferenceFiles = [] as string[],
|
||||
} = await import(evalFilePath).catch(() => ({
|
||||
assertions: [] as EvalAssertion[],
|
||||
expectedReferenceFiles: [] as string[],
|
||||
}));
|
||||
|
||||
const passThreshold = getPassThreshold(scenario.id);
|
||||
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
|
||||
|
||||
// 1. Create isolated workspace
|
||||
const { workspacePath, cleanup } = createWorkspace({
|
||||
evalDir,
|
||||
skillEnabled,
|
||||
});
|
||||
const { workspacePath, cleanup } = createWorkspace({ evalDir, skillEnabled });
|
||||
console.log(` Workspace: ${workspacePath}`);
|
||||
|
||||
try {
|
||||
// 2. Read the prompt
|
||||
const prompt = readFileSync(join(evalDir, "PROMPT.md"), "utf-8").trim();
|
||||
|
||||
// 3. Run the agent
|
||||
// 2. Run the agent
|
||||
console.log(` Running agent (${model})...`);
|
||||
const startedAt = Date.now();
|
||||
const agentResult = await runAgent({
|
||||
@@ -132,54 +172,48 @@ async function runEval(
|
||||
` Agent finished in ${(agentResult.duration / 1000).toFixed(1)}s`,
|
||||
);
|
||||
|
||||
// 4. Run the hidden tests
|
||||
const evalFilePath = existsSync(join(evalDir, "EVAL.tsx"))
|
||||
? join(evalDir, "EVAL.tsx")
|
||||
: join(evalDir, "EVAL.ts");
|
||||
|
||||
const passThreshold = getPassThreshold(scenario.id);
|
||||
|
||||
console.log(" Running tests...");
|
||||
const testResult = await runTests({
|
||||
workspacePath,
|
||||
evalFilePath,
|
||||
passThreshold: passThreshold ?? undefined,
|
||||
// 3. Run assertions in-process from the workspace directory so that
|
||||
// eval-utils.ts helpers resolve paths relative to the workspace.
|
||||
console.log(" Running assertions...");
|
||||
const prevCwd = process.cwd();
|
||||
process.chdir(workspacePath);
|
||||
const assertionResults = await runAssertions(assertions).finally(() => {
|
||||
process.chdir(prevCwd);
|
||||
});
|
||||
const passedCount = assertionResults.filter((a) => a.passed).length;
|
||||
const totalCount = assertionResults.length;
|
||||
|
||||
const passed = passThreshold
|
||||
? totalCount > 0 && passedCount >= passThreshold
|
||||
: totalCount > 0 && passedCount === totalCount;
|
||||
|
||||
const pct =
|
||||
testResult.totalCount > 0
|
||||
? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1)
|
||||
: "0.0";
|
||||
totalCount > 0 ? ((passedCount / totalCount) * 100).toFixed(1) : "0.0";
|
||||
const thresholdInfo = passThreshold
|
||||
? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%`
|
||||
? `, threshold: ${((passThreshold / totalCount) * 100).toFixed(0)}%`
|
||||
: "";
|
||||
console.log(
|
||||
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`,
|
||||
` Assertions: ${passedCount}/${totalCount} passed (${pct}%${thresholdInfo})`,
|
||||
);
|
||||
|
||||
// 5. Collect modified files
|
||||
// 4. Collect modified files
|
||||
const filesModified = listModifiedFiles(workspacePath, evalDir);
|
||||
|
||||
// 6. Build transcript summary
|
||||
// 5. Build transcript summary
|
||||
const summary = buildTranscriptSummary(agentResult.events);
|
||||
|
||||
// 7. Load expectedReferenceFiles from EVAL.ts (if declared)
|
||||
const { expectedReferenceFiles = [] } = await import(evalFilePath).catch(
|
||||
() => ({ expectedReferenceFiles: [] as string[] }),
|
||||
);
|
||||
|
||||
// 8. Run scorers
|
||||
// 6. Run scorers
|
||||
const skillScore = skillUsageScorer(summary, skillName);
|
||||
const refScore = referenceFilesUsageScorer(summary, expectedReferenceFiles);
|
||||
const assertScore = assertionsPassedScorer({
|
||||
testsPassed: testResult.passedCount,
|
||||
testsTotal: testResult.totalCount,
|
||||
status: testResult.passed ? "passed" : "failed",
|
||||
testsPassed: passedCount,
|
||||
testsTotal: totalCount,
|
||||
status: passed ? "passed" : "failed",
|
||||
} as EvalRunResult);
|
||||
const finalScore = finalResultScorer({
|
||||
status: testResult.passed ? "passed" : "failed",
|
||||
testsPassed: testResult.passedCount,
|
||||
testsTotal: testResult.totalCount,
|
||||
status: passed ? "passed" : "failed",
|
||||
testsPassed: passedCount,
|
||||
testsTotal: totalCount,
|
||||
passThreshold: passThreshold ?? undefined,
|
||||
} as EvalRunResult);
|
||||
|
||||
@@ -188,18 +222,17 @@ async function runEval(
|
||||
agent: "claude-code",
|
||||
model,
|
||||
skillEnabled,
|
||||
status: testResult.passed ? "passed" : "failed",
|
||||
status: passed ? "passed" : "failed",
|
||||
duration: agentResult.duration,
|
||||
testOutput: testResult.output,
|
||||
agentOutput: agentResult.output,
|
||||
testsPassed: testResult.passedCount,
|
||||
testsTotal: testResult.totalCount,
|
||||
testsPassed: passedCount,
|
||||
testsTotal: totalCount,
|
||||
passThreshold: passThreshold ?? undefined,
|
||||
assertionResults,
|
||||
filesModified,
|
||||
toolCallCount: summary.toolCalls.length,
|
||||
costUsd: summary.totalCostUsd ?? undefined,
|
||||
prompt,
|
||||
individualTests: testResult.individualTests,
|
||||
startedAt,
|
||||
durationApiMs: summary.totalDurationApiMs,
|
||||
totalInputTokens: summary.totalInputTokens,
|
||||
@@ -225,7 +258,7 @@ async function runEval(
|
||||
saveRunArtifacts({
|
||||
resultDir,
|
||||
rawTranscript: agentResult.rawTranscript,
|
||||
testOutput: testResult.output,
|
||||
assertionResults,
|
||||
result,
|
||||
transcriptSummary: summary,
|
||||
});
|
||||
@@ -241,7 +274,6 @@ async function runEval(
|
||||
skillEnabled,
|
||||
status: "error",
|
||||
duration: 0,
|
||||
testOutput: "",
|
||||
agentOutput: "",
|
||||
testsPassed: 0,
|
||||
testsTotal: 0,
|
||||
@@ -281,7 +313,7 @@ async function main() {
|
||||
startSupabase();
|
||||
const keys = getKeys();
|
||||
|
||||
// Inject keys into process.env so EVAL.ts tests can connect to the real DB.
|
||||
// Inject keys into process.env so assertions can connect to the real DB.
|
||||
process.env.SUPABASE_URL = keys.apiUrl;
|
||||
process.env.SUPABASE_ANON_KEY = keys.anonKey;
|
||||
process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey;
|
||||
@@ -291,7 +323,6 @@ async function main() {
|
||||
const transcripts = new Map<string, TranscriptSummary>();
|
||||
|
||||
const braintrustUpload = process.env.BRAINTRUST_UPLOAD === "true";
|
||||
const logger = braintrustUpload ? initBraintrustLogger() : undefined;
|
||||
|
||||
try {
|
||||
for (const scenario of scenarios) {
|
||||
@@ -304,15 +335,9 @@ async function main() {
|
||||
if (transcript) {
|
||||
transcripts.set(result.scenario, transcript);
|
||||
}
|
||||
|
||||
// Log immediately after each scenario for real-time visibility.
|
||||
if (logger) {
|
||||
logScenarioToLogger(logger, result, transcript);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
stopSupabase();
|
||||
await logger?.flush();
|
||||
}
|
||||
|
||||
// Use the results dir from the first result (all share the same timestamp)
|
||||
|
||||
@@ -70,7 +70,7 @@ export function logScenarioToLogger(
|
||||
status: r.status,
|
||||
agentOutput: r.agentOutput,
|
||||
filesModified: r.filesModified,
|
||||
testOutput: r.testOutput,
|
||||
assertionResults: r.assertionResults,
|
||||
},
|
||||
expected: { testsTotal: r.testsTotal },
|
||||
scores,
|
||||
@@ -106,7 +106,7 @@ export function logScenarioToLogger(
|
||||
status: r.status,
|
||||
agentOutput: r.agentOutput,
|
||||
filesModified: r.filesModified,
|
||||
testOutput: r.testOutput,
|
||||
assertionResults: r.assertionResults,
|
||||
},
|
||||
expected: { testsTotal: r.testsTotal },
|
||||
scores,
|
||||
@@ -121,7 +121,7 @@ export function logScenarioToLogger(
|
||||
*
|
||||
* Each EvalRunResult becomes a row in the experiment with:
|
||||
* - input: scenario ID, prompt content, skillEnabled flag
|
||||
* - output: status, agent output, files modified, test output
|
||||
* - output: status, agent output, files modified, assertion results
|
||||
* - expected: total tests, pass threshold
|
||||
* - scores: skill_usage, reference_files_usage, assertions_passed, final_result
|
||||
* - metadata: agent, model, skillEnabled, test counts, tool calls, context window, output tokens, model usage, errors, cost
|
||||
@@ -172,7 +172,7 @@ export async function uploadToBraintrust(
|
||||
status: r.status,
|
||||
agentOutput: r.agentOutput,
|
||||
filesModified: r.filesModified,
|
||||
testOutput: r.testOutput,
|
||||
assertionResults: r.assertionResults,
|
||||
};
|
||||
|
||||
const expected = {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { mkdirSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import type { AssertionResult } from "../eval-types.js";
|
||||
import type { EvalRunResult } from "../types.js";
|
||||
import type { TranscriptSummary } from "./transcript.js";
|
||||
|
||||
@@ -32,7 +33,7 @@ export function createResultDir(
|
||||
export function saveRunArtifacts(opts: {
|
||||
resultDir: string;
|
||||
rawTranscript: string;
|
||||
testOutput: string;
|
||||
assertionResults: AssertionResult[];
|
||||
result: EvalRunResult;
|
||||
transcriptSummary: TranscriptSummary;
|
||||
}): void {
|
||||
@@ -43,8 +44,8 @@ export function saveRunArtifacts(opts: {
|
||||
);
|
||||
|
||||
writeFileSync(
|
||||
join(opts.resultDir, "test-output.txt"),
|
||||
opts.testOutput,
|
||||
join(opts.resultDir, "assertions.json"),
|
||||
JSON.stringify(opts.assertionResults, null, 2),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ export function referenceFilesUsageScorer(
|
||||
}
|
||||
|
||||
/**
|
||||
* assertionsPassedScorer — ratio of vitest assertions passed vs total.
|
||||
* assertionsPassedScorer — ratio of assertions passed vs total.
|
||||
*/
|
||||
export function assertionsPassedScorer(result: EvalRunResult): ScoreResult {
|
||||
const score =
|
||||
|
||||
@@ -1,143 +0,0 @@
|
||||
import { execFile } from "node:child_process";
|
||||
import { copyFileSync, existsSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { promisify } from "node:util";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
const exec = promisify(execFile);
|
||||
|
||||
export interface TestResult {
|
||||
passed: boolean;
|
||||
output: string;
|
||||
/** Number of tests that passed */
|
||||
passedCount: number;
|
||||
/** Total number of tests */
|
||||
totalCount: number;
|
||||
/** Per-test pass/fail extracted from vitest verbose output */
|
||||
individualTests: Record<string, boolean>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the hidden EVAL.ts tests against the agent's workspace.
|
||||
*
|
||||
* 1. Copy EVAL.ts into the workspace (agent is done, safe to expose)
|
||||
* 2. Run vitest against it
|
||||
* 3. Parse the output for pass/fail
|
||||
*/
|
||||
export async function runTests(opts: {
|
||||
workspacePath: string;
|
||||
evalFilePath: string;
|
||||
passThreshold?: number;
|
||||
}): Promise<TestResult> {
|
||||
// Copy the hidden test file into the workspace
|
||||
const evalFileName = opts.evalFilePath.endsWith(".tsx")
|
||||
? "EVAL.tsx"
|
||||
: "EVAL.ts";
|
||||
const destPath = join(opts.workspacePath, evalFileName);
|
||||
copyFileSync(opts.evalFilePath, destPath);
|
||||
|
||||
// Copy shared eval-utils.ts if it exists alongside the eval scenarios
|
||||
const evalUtilsSrc = join(
|
||||
dirname(dirname(opts.evalFilePath)),
|
||||
"eval-utils.ts",
|
||||
);
|
||||
if (existsSync(evalUtilsSrc)) {
|
||||
copyFileSync(evalUtilsSrc, join(opts.workspacePath, "eval-utils.ts"));
|
||||
}
|
||||
|
||||
// Write a minimal vitest config that overrides the default include pattern
|
||||
// so EVAL.ts (without .test. or .spec.) is picked up.
|
||||
const vitestConfigPath = join(opts.workspacePath, "vitest.config.mjs");
|
||||
if (!existsSync(vitestConfigPath)) {
|
||||
// Alias ../eval-utils.ts → ./eval-utils.ts so the import resolves in
|
||||
// the flat workspace (source tree has EVAL.ts one level deeper).
|
||||
const evalUtilsDest = join(opts.workspacePath, "eval-utils.ts");
|
||||
const aliasBlock = existsSync(evalUtilsDest)
|
||||
? `resolve: { alias: { "../eval-utils.ts": "./eval-utils.ts" } },`
|
||||
: "";
|
||||
writeFileSync(
|
||||
vitestConfigPath,
|
||||
`export default { ${aliasBlock} test: { include: ["EVAL.{ts,tsx}"] } };\n`,
|
||||
);
|
||||
}
|
||||
|
||||
// Use the vitest binary from the evals package (always available)
|
||||
const evalsVitest = join(
|
||||
__dirname,
|
||||
"..",
|
||||
"..",
|
||||
"node_modules",
|
||||
".bin",
|
||||
"vitest",
|
||||
);
|
||||
const vitestBin = join(opts.workspacePath, "node_modules", ".bin", "vitest");
|
||||
const cmd = existsSync(vitestBin) ? vitestBin : evalsVitest;
|
||||
const args = ["run", evalFileName, "--reporter=verbose", "--no-color"];
|
||||
|
||||
try {
|
||||
const { stdout, stderr } = await exec(cmd, args, {
|
||||
cwd: opts.workspacePath,
|
||||
timeout: 60_000,
|
||||
env: { ...process.env },
|
||||
maxBuffer: 5 * 1024 * 1024,
|
||||
});
|
||||
|
||||
const output = `${stdout}\n${stderr}`;
|
||||
return parseTestOutput(output, opts.passThreshold);
|
||||
} catch (error) {
|
||||
const err = error as Error & { stdout?: string; stderr?: string };
|
||||
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
|
||||
return parseTestOutput(output, opts.passThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract per-test pass/fail from vitest verbose output.
|
||||
*
|
||||
* Vitest verbose format:
|
||||
* ✓ EVAL.ts > test name here 0ms → passed
|
||||
* × EVAL.ts > test name here 2ms → failed
|
||||
*/
|
||||
function parseIndividualTests(output: string): Record<string, boolean> {
|
||||
const results: Record<string, boolean> = {};
|
||||
const re = /[✓×]\s+EVAL\.tsx?\s+>\s+(.+?)\s+\d+ms/g;
|
||||
for (const match of output.matchAll(re)) {
|
||||
const testName = match[1].trim();
|
||||
const didPass = output[match.index] === "✓";
|
||||
results[testName] = didPass;
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
function parseTestOutput(output: string, passThreshold?: number): TestResult {
|
||||
// Parse vitest output for pass/fail counts
|
||||
// Vitest formats:
|
||||
// All passing: "Tests N passed (N)"
|
||||
// Mixed: "Tests N failed | M passed (T)"
|
||||
// All failing: "Tests N failed (N)"
|
||||
const mixedOrPassing = output.match(
|
||||
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
|
||||
);
|
||||
const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/);
|
||||
|
||||
let passedCount = 0;
|
||||
let totalCount = 0;
|
||||
|
||||
if (mixedOrPassing) {
|
||||
passedCount = Number.parseInt(mixedOrPassing[2], 10);
|
||||
totalCount = Number.parseInt(mixedOrPassing[3], 10);
|
||||
} else if (allFailing) {
|
||||
passedCount = 0;
|
||||
totalCount = Number.parseInt(allFailing[2], 10);
|
||||
}
|
||||
|
||||
const passed = passThreshold
|
||||
? totalCount > 0 && passedCount >= passThreshold
|
||||
: totalCount > 0 && passedCount === totalCount;
|
||||
const individualTests = parseIndividualTests(output);
|
||||
|
||||
return { passed, output, passedCount, totalCount, individualTests };
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
import type { AssertionResult } from "./eval-types.js";
|
||||
|
||||
export interface EvalScenario {
|
||||
/** Directory name under evals/ */
|
||||
id: string;
|
||||
@@ -23,14 +25,17 @@ export interface EvalRunResult {
|
||||
skillEnabled: boolean;
|
||||
status: "passed" | "failed" | "error";
|
||||
duration: number;
|
||||
testOutput: string;
|
||||
/** Raw test runner output (for debugging) */
|
||||
testOutput?: string;
|
||||
agentOutput: string;
|
||||
/** Number of vitest tests that passed */
|
||||
/** Number of assertions that passed */
|
||||
testsPassed: number;
|
||||
/** Total number of vitest tests */
|
||||
/** Total number of assertions */
|
||||
testsTotal: number;
|
||||
/** Minimum tests required to pass (from scenario config) */
|
||||
passThreshold?: number;
|
||||
/** Per-assertion pass/fail results */
|
||||
assertionResults?: AssertionResult[];
|
||||
/** Files the agent created or modified in the workspace */
|
||||
filesModified: string[];
|
||||
error?: string;
|
||||
@@ -42,8 +47,6 @@ export interface EvalRunResult {
|
||||
costUsd?: number;
|
||||
/** The PROMPT.md content sent to the agent */
|
||||
prompt?: string;
|
||||
/** Per-test pass/fail results from vitest */
|
||||
individualTests?: Record<string, boolean>;
|
||||
/** Epoch ms when the agent run started (for Braintrust span timing) */
|
||||
startedAt?: number;
|
||||
/** API-only latency in ms (excludes local processing overhead) */
|
||||
|
||||
Reference in New Issue
Block a user