mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
more two scenarios and claude code cli is now a dependency
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -22,3 +22,6 @@ dist/
|
|||||||
# Generated skills in any dot directory
|
# Generated skills in any dot directory
|
||||||
.*/skills/
|
.*/skills/
|
||||||
.claude/
|
.claude/
|
||||||
|
|
||||||
|
# Eval results (local debugging artifacts)
|
||||||
|
packages/evals/results/
|
||||||
|
|||||||
@@ -20,6 +20,12 @@ hidden tests check the result. Binary pass/fail.
|
|||||||
The agent is **Claude Code** invoked via `claude -p` (print mode). It operates
|
The agent is **Claude Code** invoked via `claude -p` (print mode). It operates
|
||||||
on a real filesystem in a temp directory and can read/write files freely.
|
on a real filesystem in a temp directory and can read/write files freely.
|
||||||
|
|
||||||
|
**Important**: MCP servers are disabled via `--strict-mcp-config` with an empty
|
||||||
|
config. This ensures the agent uses only local tools (Bash, Edit, Write, Read,
|
||||||
|
Glob, Grep) and cannot access remote services like Supabase MCP or Neon. All
|
||||||
|
work must happen on the local filesystem — e.g., creating migration files in
|
||||||
|
`supabase/migrations/`, not applying them to a remote project.
|
||||||
|
|
||||||
## Eval Structure
|
## Eval Structure
|
||||||
|
|
||||||
Each eval lives in `evals/{scenario-name}/`:
|
Each eval lives in `evals/{scenario-name}/`:
|
||||||
|
|||||||
252
packages/evals/evals/storage-rls-user-folders/EVAL.ts
Normal file
252
packages/evals/evals/storage-rls-user-folders/EVAL.ts
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { expect, test } from "vitest";
|
||||||
|
|
||||||
|
const supabaseDir = join(process.cwd(), "supabase");
|
||||||
|
const migrationsDir = join(supabaseDir, "migrations");
|
||||||
|
|
||||||
|
/**
 * Find all .sql migration files (agent may create one or more).
 *
 * @returns Paths of every entry in `migrationsDir` whose name ends in `.sql`,
 *   sorted by file name; an empty array when the directory does not exist.
 */
function findMigrationFiles(): string[] {
  if (!existsSync(migrationsDir)) return [];
  return (
    readdirSync(migrationsDir)
      .filter((f) => f.endsWith(".sql"))
      // readdirSync does not promise any ordering; sort so the result (and the
      // SQL concatenated from it) is the same on every platform.
      .sort()
      .map((f) => join(migrationsDir, f))
  );
}
|
||||||
|
|
||||||
|
/**
 * Load every migration file and return the contents joined by newlines.
 *
 * @throws {Error} when findMigrationFiles() reports no files.
 */
function getMigrationSQL(): string {
  const migrationPaths = findMigrationFiles();
  if (migrationPaths.length === 0) {
    throw new Error("No migration file found in supabase/migrations/");
  }
  const contents: string[] = [];
  for (const migrationPath of migrationPaths) {
    contents.push(readFileSync(migrationPath, "utf-8"));
  }
  return contents.join("\n");
}
|
||||||
|
|
||||||
|
test("migration file exists", () => {
  // At least one .sql file must have been written to the migrations folder.
  const found = findMigrationFiles();
  expect(found.length).toBeGreaterThan(0);
});
|
||||||
|
|
||||||
|
test("creates avatars bucket", () => {
  const migration = getMigrationSQL().toLowerCase();
  // The migration has to mention the buckets table, the bucket id and the
  // public flag somewhere.
  for (const required of [/storage\.buckets/, /avatars/, /public/]) {
    expect(migration).toMatch(required);
  }
  // The INSERT statement that names the avatars bucket must carry a `true`
  // (the public flag).
  const insertStatement =
    /insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/.exec(migration);
  expect(insertStatement).not.toBeNull();
  if (insertStatement !== null) {
    expect(insertStatement[0]).toMatch(/true/);
  }
});
|
||||||
|
|
||||||
|
test("creates documents bucket", () => {
  const migration = getMigrationSQL().toLowerCase();
  expect(migration).toMatch(/documents/);
  // The INSERT statement that names the documents bucket must carry a `false`
  // (the bucket is not public).
  const insertStatement =
    /insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/.exec(
      migration,
    );
  expect(insertStatement).not.toBeNull();
  if (insertStatement !== null) {
    expect(insertStatement[0]).toMatch(/false/);
  }
});
|
||||||
|
|
||||||
|
test("avatars bucket has mime type restriction", () => {
  const migration = getMigrationSQL().toLowerCase();
  // The column itself, followed by each of the three image types the prompt
  // allows (JPEG, PNG, WebP).
  const expectedPatterns = [
    /allowed_mime_types/,
    /image\/jpeg/,
    /image\/png/,
    /image\/webp/,
  ];
  for (const pattern of expectedPatterns) {
    expect(migration).toMatch(pattern);
  }
});
|
||||||
|
|
||||||
|
test("avatars bucket has file size limit", () => {
  const sql = getMigrationSQL().toLowerCase();
  // The bucket definition must set file_size_limit at all.
  expect(sql).toMatch(/file_size_limit/);
  // Accept 2 MiB written as raw bytes (2097152), as a size string (2MB, 2MiB)
  // or as arithmetic (2 * 1024 * 1024, in either operand order).
  // Word boundaries keep unrelated text from satisfying the check: an
  // unanchored /2097152/ also matches 20971520 (20 MiB), and /2\s*m/ matches
  // any "2" followed by an "m" (for example "52mb").
  const hasNumericLimit = /\b2097152\b/.test(sql);
  const hasStringLimit = /\b2\s*mi?b\b/.test(sql);
  const hasCalcLimit =
    /\b2\s*\*\s*1024\s*\*\s*1024\b/.test(sql) ||
    /\b1024\s*\*\s*1024\s*\*\s*2\b/.test(sql);
  expect(hasNumericLimit || hasStringLimit || hasCalcLimit).toBe(true);
});
|
||||||
|
|
||||||
|
test("storage policy uses foldername or path for user isolation", () => {
  const sql = getMigrationSQL().toLowerCase();
  // Preferred form: storage.foldername(name) compared with the user id.
  const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
  // Also accept storage.foldername(<any argument>) wrapped in parentheses, or
  // a direct comparison that mentions `name` and auth.uid() on the same line.
  const usesPathMatch =
    /\(\s*storage\.foldername\s*\(/.test(sql) ||
    /\bname\b.*auth\.uid\(\)/.test(sql);
  expect(usesFoldername || usesPathMatch).toBe(true);
  // The uuid must be cast to text for comparison with the folder name.
  // The optional `)` lets the subselect form `(select auth.uid())::text`
  // pass as well as the bare `auth.uid()::text`.
  expect(sql).toMatch(/auth\.uid\(\)\s*\)?\s*::\s*text/);
});
|
||||||
|
|
||||||
|
test("storage policy uses TO authenticated", () => {
  // Lower-cased once here, so the per-policy checks below need no further
  // case handling.
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/g) ?? [];
  const storagePolicies = policyBlocks.filter((p) =>
    p.includes("storage.objects"),
  );
  // At least one storage policy must be scoped TO authenticated. `to public`
  // is deliberately NOT accepted here: a migration whose storage policies are
  // all public must not pass a test named "uses TO authenticated".
  const hasAuthenticatedPolicy = storagePolicies.some((p) =>
    /to\s+authenticated/.test(p),
  );
  expect(hasAuthenticatedPolicy).toBe(true);
  // Every upload (FOR INSERT) policy must be TO authenticated.
  const insertPolicies = storagePolicies.filter((p) => /for\s+insert/.test(p));
  for (const policy of insertPolicies) {
    expect(policy).toMatch(/to\s+authenticated/);
  }
});
|
||||||
|
|
||||||
|
test("public read policy for avatars", () => {
  const migration = getMigrationSQL().toLowerCase();
  const statements = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  // Keep only SELECT policies on storage.objects that mention the avatars bucket.
  const avatarSelectPolicies = statements.filter((stmt) => {
    const text = stmt.toLowerCase();
    return (
      text.includes("storage.objects") &&
      /for\s+select/.test(text) &&
      text.includes("avatars")
    );
  });
  expect(avatarSelectPolicies.length).toBeGreaterThan(0);
  // At least one of them must be granted TO public or TO anon.
  const openToEveryone = avatarSelectPolicies.some((stmt) => {
    const text = stmt.toLowerCase();
    return /to\s+public/.test(text) || /to\s+anon/.test(text);
  });
  expect(openToEveryone).toBe(true);
});
|
||||||
|
|
||||||
|
test("documents bucket is fully private", () => {
  const migration = getMigrationSQL().toLowerCase();
  const statements = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  // Policies on storage.objects that mention the documents bucket.
  const documentPolicies = statements.filter((stmt) => {
    const text = stmt.toLowerCase();
    return text.includes("storage.objects") && text.includes("documents");
  });
  expect(documentPolicies.length).toBeGreaterThan(0);
  // First pass: no policy may be granted to public or anon.
  documentPolicies.forEach((stmt) => {
    expect(stmt).not.toMatch(/to\s+public/);
    expect(stmt).not.toMatch(/to\s+anon/);
  });
  // Second pass: every policy must be scoped to authenticated.
  documentPolicies.forEach((stmt) => {
    expect(stmt).toMatch(/to\s+authenticated/);
  });
});
|
||||||
|
|
||||||
|
test("creates file_metadata table", () => {
  const migration = getMigrationSQL().toLowerCase();
  // A CREATE TABLE statement and the table name must both appear.
  for (const pattern of [/create\s+table/, /file_metadata/]) {
    expect(migration).toMatch(pattern);
  }
});
|
||||||
|
|
||||||
|
test("file_metadata has FK to auth.users with CASCADE", () => {
  const migration = getMigrationSQL().toLowerCase();
  // Checked against the whole migration text, not only the file_metadata
  // CREATE TABLE statement.
  const expectedPatterns = [/references\s+auth\.users/, /on\s+delete\s+cascade/];
  for (const pattern of expectedPatterns) {
    expect(migration).toMatch(pattern);
  }
});
|
||||||
|
|
||||||
|
test("RLS enabled on file_metadata", () => {
  const sql = getMigrationSQL().toLowerCase();
  // `[^;]*` (rather than `.*`) matches across line breaks, so a statement
  // formatted over several lines is accepted, while still keeping the match
  // inside a single ALTER TABLE statement.
  expect(sql).toMatch(
    /alter\s+table[^;]*file_metadata[^;]*enable\s+row\s+level\s+security/,
  );
});
|
||||||
|
|
||||||
|
test("file_metadata policies use (select auth.uid())", () => {
  // Lower-case the SQL so that policies written as AUTH.UID() are inspected
  // too; a case-sensitive includes("auth.uid()") silently skips them.
  const sql = getMigrationSQL().toLowerCase();
  // Policies whose statement text mentions file_metadata.
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/g) ?? [];
  const metadataPolicies = policyBlocks.filter((p) =>
    p.includes("file_metadata"),
  );
  // Without this the loop below would pass vacuously when the migration
  // defines no file_metadata policy at all.
  expect(metadataPolicies.length).toBeGreaterThan(0);
  // Each policy that uses auth.uid() should use the subselect form.
  for (const policy of metadataPolicies) {
    if (policy.includes("auth.uid()")) {
      expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/);
    }
  }
});
|
||||||
|
|
||||||
|
test("uses timestamptz for time columns", () => {
  const migration = getMigrationSQL().toLowerCase();
  // Only enforce the rule when the migration defines a time-related column.
  const timeColumns = ["created_at", "updated_at", "uploaded_at"];
  const definesTimeColumn = timeColumns.some((column) =>
    migration.includes(column),
  );
  if (!definesTimeColumn) return;
  // "timestamp" that is NOT followed by "tz" or "with time zone".
  expect(migration).not.toMatch(
    /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/,
  );
});
|
||||||
|
|
||||||
|
test("index on file_metadata user_id", () => {
  const sql = getMigrationSQL().toLowerCase();
  // Inspect each CREATE [UNIQUE] INDEX statement on its own. Checking for
  // "create index", "file_metadata" and "user_id" anywhere in the migration
  // passes even when the only index is on some other table or column.
  const indexBlocks = sql.match(/create\s+(?:unique\s+)?index[\s\S]*?;/g) ?? [];
  const userIdIndexes = indexBlocks.filter(
    (idx) => idx.includes("file_metadata") && idx.includes("user_id"),
  );
  expect(userIdIndexes.length).toBeGreaterThan(0);
});
|
||||||
|
|
||||||
|
test("idempotent DDL", () => {
  // At least one statement in the migration must use IF NOT EXISTS.
  const migration = getMigrationSQL().toLowerCase();
  const idempotencyGuard = /if\s+not\s+exists/;
  expect(migration).toMatch(idempotencyGuard);
});
|
||||||
|
|
||||||
|
test("overall quality score", () => {
  const sql = getMigrationSQL().toLowerCase();
  // A high-quality migration should contain most of these best-practice
  // signals. Each pattern is tested against the whole migration text.
  const signals = [
    // 1. Avatars bucket is inserted into storage.buckets
    /insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
    // 2. Documents bucket is inserted into storage.buckets
    /insert\s+into\s+storage\.buckets[\s\S]*?documents/,
    // 3. MIME type restriction
    /allowed_mime_types/,
    // 4. File size limit
    /file_size_limit/,
    // 5. Storage foldername helper
    /storage\.foldername/,
    // 6. auth.uid() cast to text; the optional `)` also accepts
    //    `(select auth.uid())::text`
    /auth\.uid\(\)\s*\)?\s*::\s*text/,
    // 7. TO authenticated on policies
    /to\s+authenticated/,
    // 8. Some policy granted to public or anon
    /to\s+(public|anon)/,
    // 9. Row level security enabled somewhere
    /enable\s+row\s+level\s+security/,
    // 10. ON DELETE CASCADE present
    /on\s+delete\s+cascade/,
    // 11. (select auth.uid()) subselect form, tolerating inner whitespace
    //     exactly as the dedicated test does
    /\(\s*select\s+auth\.uid\(\)\s*\)/,
    // 12. Some index is created
    /create\s+index/,
    // 13. timestamptz usage
    /timestamptz/,
    // 14. IF NOT EXISTS for idempotency
    /if\s+not\s+exists/,
    // 15. file_metadata table
    /create\s+table[\s\S]*?file_metadata/,
  ];
  const matches = signals.filter((r) => r.test(sql));
  // Require at least 11 of 15 best-practice signals
  expect(matches.length).toBeGreaterThanOrEqual(11);
});
|
||||||
12
packages/evals/evals/storage-rls-user-folders/PROMPT.md
Normal file
12
packages/evals/evals/storage-rls-user-folders/PROMPT.md
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
I need to set up file storage for my app. There are two use cases:
|
||||||
|
|
||||||
|
1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but only the owning user can upload or replace their own. Only allow image files (JPEG, PNG, WebP). Max 2MB.
|
||||||
|
|
||||||
|
2. **Documents** -- Users upload private documents that only they can access. Max 50MB. No file type restriction.
|
||||||
|
|
||||||
|
The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration that:
|
||||||
|
- Configures both storage buckets
|
||||||
|
- Adds RLS policies on `storage.objects` so each user can only access their own folder (folder name = user ID)
|
||||||
|
- Creates a `file_metadata` table to track uploaded files (file name, bucket, size, user reference) with appropriate security
|
||||||
|
|
||||||
|
Users are authenticated via Supabase Auth.
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"name": "storage-rls-user-folders",
|
||||||
|
"private": true,
|
||||||
|
"type": "module"
|
||||||
|
}
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
# For detailed configuration reference documentation, visit:
|
||||||
|
# https://supabase.com/docs/guides/local-development/cli/config
|
||||||
|
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||||
|
# working directory name when running `supabase init`.
|
||||||
|
project_id = "storage-rls-user-folders"
|
||||||
|
|
||||||
|
[api]
|
||||||
|
enabled = true
|
||||||
|
# Port to use for the API URL.
|
||||||
|
port = 54321
|
||||||
|
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||||
|
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||||
|
schemas = ["public", "graphql_public"]
|
||||||
|
# Extra schemas to add to the search_path of every request.
|
||||||
|
extra_search_path = ["public", "extensions"]
|
||||||
|
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
|
||||||
|
# for accidental or malicious requests.
|
||||||
|
max_rows = 1000
|
||||||
|
|
||||||
|
[db]
|
||||||
|
# Port to use for the local database URL.
|
||||||
|
port = 54322
|
||||||
|
# Port used by db diff command to initialize the shadow database.
|
||||||
|
shadow_port = 54320
|
||||||
|
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||||
|
# server_version;` on the remote database to check.
|
||||||
|
major_version = 17
|
||||||
|
|
||||||
|
[db.pooler]
|
||||||
|
enabled = false
|
||||||
|
# Port to use for the local connection pooler.
|
||||||
|
port = 54329
|
||||||
|
# Specifies when a server connection can be reused by other clients.
|
||||||
|
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||||
|
pool_mode = "transaction"
|
||||||
|
# How many server connections to allow per user/database pair.
|
||||||
|
default_pool_size = 20
|
||||||
|
# Maximum number of client connections allowed.
|
||||||
|
max_client_conn = 100
|
||||||
|
|
||||||
|
[storage]
|
||||||
|
enabled = true
|
||||||
|
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||||
|
file_size_limit = "50MiB"
|
||||||
|
|
||||||
|
[auth]
|
||||||
|
enabled = true
|
||||||
|
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||||
|
# in emails.
|
||||||
|
site_url = "http://127.0.0.1:3000"
|
||||||
|
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||||
|
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||||
|
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||||
|
jwt_expiry = 3600
|
||||||
|
# Allow/disallow new user signups to your project.
|
||||||
|
enable_signup = true
|
||||||
|
# Allow/disallow anonymous sign-ins to your project.
|
||||||
|
enable_anonymous_sign_ins = false
|
||||||
|
|
||||||
|
[auth.email]
|
||||||
|
# Allow/disallow new user signups via email to your project.
|
||||||
|
enable_signup = true
|
||||||
|
# If enabled, users need to confirm their email address before signing in.
|
||||||
|
enable_confirmations = false
|
||||||
201
packages/evals/evals/team-rls-security-definer/EVAL.ts
Normal file
201
packages/evals/evals/team-rls-security-definer/EVAL.ts
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
import { existsSync, readdirSync, readFileSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { expect, test } from "vitest";
|
||||||
|
|
||||||
|
const supabaseDir = join(process.cwd(), "supabase");
|
||||||
|
const migrationsDir = join(supabaseDir, "migrations");
|
||||||
|
|
||||||
|
/**
 * Find all .sql migration files (agent may create one or multiple).
 *
 * @returns Paths of every entry in `migrationsDir` whose name ends in `.sql`,
 *   sorted by file name; an empty array when the directory does not exist.
 */
function findMigrationFiles(): string[] {
  if (!existsSync(migrationsDir)) return [];
  return (
    readdirSync(migrationsDir)
      .filter((f) => f.endsWith(".sql"))
      // readdirSync does not promise any ordering; sort so the result (and the
      // SQL concatenated from it) is the same on every platform.
      .sort()
      .map((f) => join(migrationsDir, f))
  );
}
|
||||||
|
|
||||||
|
/**
 * Read every migration file and join the contents with newlines so that
 * assertions can run against a single string.
 *
 * @throws {Error} when findMigrationFiles() reports no files.
 */
function getMigrationSQL(): string {
  const migrationPaths = findMigrationFiles();
  if (migrationPaths.length === 0) {
    throw new Error("No migration file found in supabase/migrations/");
  }
  const contents: string[] = [];
  for (const migrationPath of migrationPaths) {
    contents.push(readFileSync(migrationPath, "utf-8"));
  }
  return contents.join("\n");
}
|
||||||
|
|
||||||
|
test("migration file exists", () => {
  // At least one .sql file must have been written to the migrations folder.
  const found = findMigrationFiles();
  expect(found.length).toBeGreaterThan(0);
});
|
||||||
|
|
||||||
|
test("creates organizations table", () => {
  // A CREATE TABLE must be followed (anywhere later) by the table name.
  const migration = getMigrationSQL().toLowerCase();
  const createsOrganizations = /create\s+table[\s\S]*?organizations/;
  expect(migration).toMatch(createsOrganizations);
});
|
||||||
|
|
||||||
|
test("creates memberships table", () => {
  // A CREATE TABLE must be followed (anywhere later) by the table name.
  const migration = getMigrationSQL().toLowerCase();
  const createsMemberships = /create\s+table[\s\S]*?memberships/;
  expect(migration).toMatch(createsMemberships);
});
|
||||||
|
|
||||||
|
test("creates projects table", () => {
  // A CREATE TABLE must be followed (anywhere later) by the table name.
  const migration = getMigrationSQL().toLowerCase();
  const createsProjects = /create\s+table[\s\S]*?projects/;
  expect(migration).toMatch(createsProjects);
});
|
||||||
|
|
||||||
|
test("enables RLS on all tables", () => {
  const migration = getMigrationSQL().toLowerCase();
  // One pattern per table, checked in the order organizations, memberships,
  // projects.
  const rlsPatterns = [
    /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/,
    /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/,
    /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/,
  ];
  for (const pattern of rlsPatterns) {
    expect(migration).toMatch(pattern);
  }
});
|
||||||
|
|
||||||
|
test("FK to auth.users with ON DELETE CASCADE", () => {
  const migration = getMigrationSQL().toLowerCase();
  // Both fragments are looked up anywhere in the migration text.
  const expectedPatterns = [/references\s+auth\.users/, /on\s+delete\s+cascade/];
  for (const pattern of expectedPatterns) {
    expect(migration).toMatch(pattern);
  }
});
|
||||||
|
|
||||||
|
test("org_id FK on projects", () => {
  const migration = getMigrationSQL().toLowerCase();
  // A column spelled like org_id / organization_id, later followed by a
  // REFERENCES clause that names organizations.
  const orgForeignKey =
    /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/;
  expect(migration).toMatch(orgForeignKey);
});
|
||||||
|
|
||||||
|
test("private schema created", () => {
  // A CREATE SCHEMA must be followed (anywhere later) by the word "private".
  const migration = getMigrationSQL().toLowerCase();
  const createsPrivateSchema = /create\s+schema[\s\S]*?private/;
  expect(migration).toMatch(createsPrivateSchema);
});
|
||||||
|
|
||||||
|
test("security_definer helper function", () => {
  const sql = getMigrationSQL().toLowerCase();
  // Something must live in (or be called from) the private schema.
  expect(sql).toMatch(/private\./);
  // The helper must be declared SECURITY DEFINER...
  expect(sql).toMatch(/security\s+definer/);
  // ...with an empty search_path. PostgreSQL accepts both
  // `SET search_path = ''` and `SET search_path TO ''`, so allow either.
  expect(sql).toMatch(/set\s+search_path\s*(?:=|to)\s*''/);
});
|
||||||
|
|
||||||
|
test("policies use (select auth.uid())", () => {
  // Lower-case the SQL so that policies written as AUTH.UID() are inspected
  // too; a case-sensitive includes("auth.uid()") silently skips them.
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/g) ?? [];
  expect(policyBlocks.length).toBeGreaterThan(0);
  for (const policy of policyBlocks) {
    if (policy.includes("auth.uid()")) {
      // The subselect form: (select auth.uid())
      expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/);
    }
  }
});
|
||||||
|
|
||||||
|
test("policies use TO authenticated", () => {
  const migration = getMigrationSQL().toLowerCase();
  const statements = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  // There must be policies, and every one of them must be TO authenticated.
  expect(statements.length).toBeGreaterThan(0);
  statements.forEach((stmt) => {
    expect(stmt).toMatch(/to\s+authenticated/);
  });
});
|
||||||
|
|
||||||
|
test("index on membership lookup columns", () => {
  const migration = getMigrationSQL().toLowerCase();
  expect(migration).toMatch(/create\s+index/);
  // At least one CREATE INDEX statement must mention a user or organization
  // id column.
  const lookupColumns = ["user_id", "org_id", "organization_id"];
  const indexStatements = migration.match(/create\s+index[\s\S]*?;/gi) ?? [];
  const relevantIndexes = indexStatements.filter((stmt) =>
    lookupColumns.some((column) => stmt.includes(column)),
  );
  expect(relevantIndexes.length).toBeGreaterThanOrEqual(1);
});
|
||||||
|
|
||||||
|
test("uses timestamptz", () => {
  const migration = getMigrationSQL().toLowerCase();
  // Only enforce the rule when the migration appears to define time columns.
  const timeColumnMarkers = ["created_at", "updated_at", "_at "];
  const definesTimeColumn = timeColumnMarkers.some((marker) =>
    migration.includes(marker),
  );
  if (!definesTimeColumn) return;
  // "timestamp" that is NOT followed by "tz" or "with time zone".
  expect(migration).not.toMatch(
    /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/,
  );
});
|
||||||
|
|
||||||
|
test("idempotent DDL", () => {
  // At least one statement in the migration must use IF NOT EXISTS.
  const migration = getMigrationSQL().toLowerCase();
  const idempotencyGuard = /if\s+not\s+exists/;
  expect(migration).toMatch(idempotencyGuard);
});
|
||||||
|
|
||||||
|
test("delete policy restricted to owner role", () => {
  const migration = getMigrationSQL().toLowerCase();
  const statements = migration.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  // First policy statement that mentions both "delete" and "project".
  const deletePolicy = statements.find((stmt) => {
    const text = stmt.toLowerCase();
    return text.includes("delete") && text.includes("project");
  });
  expect(deletePolicy).toBeDefined();
  // That policy has to mention an owner (or admin) role.
  expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/);
});
|
||||||
|
|
||||||
|
test("overall quality score", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  // A high-quality migration should contain most of these best-practice
  // signals. Each entry is a boolean computed against the whole migration.
  const signals = [
    // 1. RLS enabled on all three tables
    /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
      sql,
    ) &&
      /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ) &&
      /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ),
    // 2. FK to auth.users with cascade
    /references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
    // 3. Private schema created
    /create\s+schema[\s\S]*?private/.test(sql),
    // 4. security_definer with an empty search_path; PostgreSQL accepts both
    //    `SET search_path = ''` and `SET search_path TO ''`
    /security\s+definer/.test(sql) &&
      /set\s+search_path\s*(?:=|to)\s*''/.test(sql),
    // 5. Subselect auth.uid()
    /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
    // 6. TO authenticated on every policy
    policyBlocks.length > 0 &&
      policyBlocks.every((p) => /to\s+authenticated/.test(p)),
    // 7. Some index is created
    /create\s+index/.test(sql),
    // 8. timestamptz (no plain timestamp)
    !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql),
    // 9. Idempotent DDL
    /if\s+not\s+exists/.test(sql),
    // 10. Delete policy on projects mentions owner or admin
    policyBlocks.some(
      (p) =>
        p.includes("delete") &&
        p.includes("project") &&
        /owner|admin/.test(p),
    ),
    // 11. org_id FK on projects
    /org[anization_]*id[\s\S]*?references[\s\S]*?organizations/.test(sql),
    // 12. Multiple policies (three or more in total)
    policyBlocks.length >= 3,
    // 13. The word "role" appears
    /role/.test(sql),
    // 14. Something in the private schema is referenced
    /private\./.test(sql),
  ];
  const passed = signals.filter(Boolean).length;
  // Require at least 10 of the 14 signals.
  expect(passed).toBeGreaterThanOrEqual(10);
});
|
||||||
14
packages/evals/evals/team-rls-security-definer/PROMPT.md
Normal file
14
packages/evals/evals/team-rls-security-definer/PROMPT.md
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
I'm building a project management app where users can belong to multiple organizations. Each organization has projects that all members can view and edit.
|
||||||
|
|
||||||
|
The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration with:
|
||||||
|
|
||||||
|
1. An `organizations` table (name, slug)
|
||||||
|
2. A `memberships` table linking users to organizations with a role column (owner, admin, member)
|
||||||
|
3. A `projects` table (name, description, status) belonging to an organization
|
||||||
|
|
||||||
|
Set up Row Level Security so:
|
||||||
|
- Users can only see organizations they belong to
|
||||||
|
- Users can only see and manage projects in their organizations
|
||||||
|
- Only org owners can delete projects
|
||||||
|
|
||||||
|
The migration should handle the case where a user is deleted from auth.
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"name": "team-rls-security-definer",
|
||||||
|
"private": true,
|
||||||
|
"type": "module"
|
||||||
|
}
|
||||||
@@ -0,0 +1,111 @@
|
|||||||
|
# For detailed configuration reference documentation, visit:
|
||||||
|
# https://supabase.com/docs/guides/local-development/cli/config
|
||||||
|
# A string used to distinguish different Supabase projects on the same host. Defaults to the
|
||||||
|
# working directory name when running `supabase init`.
|
||||||
|
project_id = "team-rls-security-definer"
|
||||||
|
|
||||||
|
[api]
|
||||||
|
enabled = true
|
||||||
|
# Port to use for the API URL.
|
||||||
|
port = 54321
|
||||||
|
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
|
||||||
|
# endpoints. `public` and `graphql_public` schemas are included by default.
|
||||||
|
schemas = ["public", "graphql_public"]
|
||||||
|
# Extra schemas to add to the search_path of every request.
|
||||||
|
extra_search_path = ["public", "extensions"]
|
||||||
|
# The maximum number of rows returned from a view, table, or stored procedure. Limits payload size
|
||||||
|
# for accidental or malicious requests.
|
||||||
|
max_rows = 1000
|
||||||
|
|
||||||
|
[db]
|
||||||
|
# Port to use for the local database URL.
|
||||||
|
port = 54322
|
||||||
|
# Port used by db diff command to initialize the shadow database.
|
||||||
|
shadow_port = 54320
|
||||||
|
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
|
||||||
|
# server_version;` on the remote database to check.
|
||||||
|
major_version = 17
|
||||||
|
|
||||||
|
[db.pooler]
|
||||||
|
enabled = false
|
||||||
|
# Port to use for the local connection pooler.
|
||||||
|
port = 54329
|
||||||
|
# Specifies when a server connection can be reused by other clients.
|
||||||
|
# Configure one of the supported pooler modes: `transaction`, `session`.
|
||||||
|
pool_mode = "transaction"
|
||||||
|
# How many server connections to allow per user/database pair.
|
||||||
|
default_pool_size = 20
|
||||||
|
# Maximum number of client connections allowed.
|
||||||
|
max_client_conn = 100
|
||||||
|
|
||||||
|
[db.migrations]
|
||||||
|
# If disabled, migrations will be skipped during a db push or reset.
|
||||||
|
enabled = true
|
||||||
|
schema_paths = []
|
||||||
|
|
||||||
|
[db.seed]
|
||||||
|
# If enabled, seeds the database after migrations during a db reset.
|
||||||
|
enabled = true
|
||||||
|
# Specifies an ordered list of seed files to load during db reset.
|
||||||
|
sql_paths = ["./seed.sql"]
|
||||||
|
|
||||||
|
[realtime]
|
||||||
|
enabled = true
|
||||||
|
|
||||||
|
[studio]
|
||||||
|
enabled = true
|
||||||
|
# Port to use for Supabase Studio.
|
||||||
|
port = 54323
|
||||||
|
# External URL of the API server that frontend connects to.
|
||||||
|
api_url = "http://127.0.0.1"
|
||||||
|
|
||||||
|
[inbucket]
|
||||||
|
enabled = true
|
||||||
|
# Port to use for the email testing server web interface.
|
||||||
|
port = 54324
|
||||||
|
|
||||||
|
[storage]
|
||||||
|
enabled = true
|
||||||
|
# The maximum file size allowed (e.g. "5MB", "500KB").
|
||||||
|
file_size_limit = "50MiB"
|
||||||
|
|
||||||
|
[auth]
|
||||||
|
enabled = true
|
||||||
|
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
|
||||||
|
# in emails.
|
||||||
|
site_url = "http://127.0.0.1:3000"
|
||||||
|
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
|
||||||
|
additional_redirect_urls = ["https://127.0.0.1:3000"]
|
||||||
|
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
|
||||||
|
jwt_expiry = 3600
|
||||||
|
# If disabled, the refresh token will never expire.
|
||||||
|
enable_refresh_token_rotation = true
|
||||||
|
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
|
||||||
|
# Requires enable_refresh_token_rotation = true.
|
||||||
|
refresh_token_reuse_interval = 10
|
||||||
|
# Allow/disallow new user signups to your project.
|
||||||
|
enable_signup = true
|
||||||
|
# Allow/disallow anonymous sign-ins to your project.
|
||||||
|
enable_anonymous_sign_ins = false
|
||||||
|
|
||||||
|
[auth.email]
|
||||||
|
# Allow/disallow new user signups via email to your project.
|
||||||
|
enable_signup = true
|
||||||
|
# If enabled, a user will be required to confirm any email change on both the old, and new email
|
||||||
|
# addresses. If disabled, only the new email is required to confirm.
|
||||||
|
double_confirm_changes = true
|
||||||
|
# If enabled, users need to confirm their email address before signing in.
|
||||||
|
enable_confirmations = false
|
||||||
|
|
||||||
|
[edge_runtime]
|
||||||
|
enabled = true
|
||||||
|
# Configure one of the supported request policies: `oneshot`, `per_worker`.
|
||||||
|
policy = "per_worker"
|
||||||
|
# Port to attach the Chrome inspector for debugging edge functions.
|
||||||
|
inspector_port = 8083
|
||||||
|
|
||||||
|
[analytics]
|
||||||
|
enabled = true
|
||||||
|
port = 54327
|
||||||
|
# Configure one of the supported backends: `postgres`, `bigquery`.
|
||||||
|
backend = "postgres"
|
||||||
328
packages/evals/package-lock.json
generated
328
packages/evals/package-lock.json
generated
@@ -9,6 +9,7 @@
|
|||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@anthropic-ai/claude-code": "^2.1.49",
|
||||||
"braintrust": "^3.0.0"
|
"braintrust": "^3.0.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
@@ -18,6 +19,29 @@
|
|||||||
"vitest": "^3.1.0"
|
"vitest": "^3.1.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@anthropic-ai/claude-code": {
|
||||||
|
"version": "2.1.49",
|
||||||
|
"resolved": "https://registry.npmjs.org/@anthropic-ai/claude-code/-/claude-code-2.1.49.tgz",
|
||||||
|
"integrity": "sha512-PonEmTZlB5IZbBu9TmtOpGZnupU7OxOXTsJKcXE/4Ak5qp3ptN1wSBRdgKYnn6GDYhXijTXuVVwrCQU+NAgwPA==",
|
||||||
|
"license": "SEE LICENSE IN README.md",
|
||||||
|
"bin": {
|
||||||
|
"claude": "cli.js"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18.0.0"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-darwin-arm64": "^0.34.2",
|
||||||
|
"@img/sharp-darwin-x64": "^0.34.2",
|
||||||
|
"@img/sharp-linux-arm": "^0.34.2",
|
||||||
|
"@img/sharp-linux-arm64": "^0.34.2",
|
||||||
|
"@img/sharp-linux-x64": "^0.34.2",
|
||||||
|
"@img/sharp-linuxmusl-arm64": "^0.34.2",
|
||||||
|
"@img/sharp-linuxmusl-x64": "^0.34.2",
|
||||||
|
"@img/sharp-win32-arm64": "^0.34.2",
|
||||||
|
"@img/sharp-win32-x64": "^0.34.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@colors/colors": {
|
"node_modules/@colors/colors": {
|
||||||
"version": "1.5.0",
|
"version": "1.5.0",
|
||||||
"resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
|
"resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
|
||||||
@@ -444,6 +468,310 @@
|
|||||||
"node": ">=18"
|
"node": ">=18"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@img/sharp-darwin-arm64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"darwin"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-libvips-darwin-arm64": "1.2.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-darwin-x64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"darwin"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-libvips-darwin-x64": "1.2.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-libvips-darwin-arm64": {
|
||||||
|
"version": "1.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz",
|
||||||
|
"integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"darwin"
|
||||||
|
],
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-libvips-darwin-x64": {
|
||||||
|
"version": "1.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz",
|
||||||
|
"integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"darwin"
|
||||||
|
],
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-libvips-linux-arm": {
|
||||||
|
"version": "1.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz",
|
||||||
|
"integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==",
|
||||||
|
"cpu": [
|
||||||
|
"arm"
|
||||||
|
],
|
||||||
|
"license": "LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-libvips-linux-arm64": {
|
||||||
|
"version": "1.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz",
|
||||||
|
"integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-libvips-linux-x64": {
|
||||||
|
"version": "1.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz",
|
||||||
|
"integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-libvips-linuxmusl-arm64": {
|
||||||
|
"version": "1.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz",
|
||||||
|
"integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-libvips-linuxmusl-x64": {
|
||||||
|
"version": "1.2.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz",
|
||||||
|
"integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-linux-arm": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==",
|
||||||
|
"cpu": [
|
||||||
|
"arm"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-libvips-linux-arm": "1.2.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-linux-arm64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-libvips-linux-arm64": "1.2.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-linux-x64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-libvips-linux-x64": "1.2.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-linuxmusl-arm64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-libvips-linuxmusl-arm64": "1.2.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-linuxmusl-x64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
},
|
||||||
|
"optionalDependencies": {
|
||||||
|
"@img/sharp-libvips-linuxmusl-x64": "1.2.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-win32-arm64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==",
|
||||||
|
"cpu": [
|
||||||
|
"arm64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0 AND LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"win32"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/@img/sharp-win32-x64": {
|
||||||
|
"version": "0.34.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz",
|
||||||
|
"integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==",
|
||||||
|
"cpu": [
|
||||||
|
"x64"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0 AND LGPL-3.0-or-later",
|
||||||
|
"optional": true,
|
||||||
|
"os": [
|
||||||
|
"win32"
|
||||||
|
],
|
||||||
|
"engines": {
|
||||||
|
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://opencollective.com/libvips"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@jridgewell/sourcemap-codec": {
|
"node_modules/@jridgewell/sourcemap-codec": {
|
||||||
"version": "1.5.5",
|
"version": "1.5.5",
|
||||||
"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
|
"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
|
||||||
|
|||||||
@@ -10,6 +10,7 @@
|
|||||||
"eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts"
|
"eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@anthropic-ai/claude-code": "^2.1.49",
|
||||||
"braintrust": "^3.0.0"
|
"braintrust": "^3.0.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|||||||
@@ -49,3 +49,121 @@ The agent initializes a Supabase project and creates a migration file that:
|
|||||||
| index on user_id | `CREATE INDEX` on the FK column |
|
| index on user_id | `CREATE INDEX` on the FK column |
|
||||||
| IF NOT EXISTS | Idempotent migration |
|
| IF NOT EXISTS | Idempotent migration |
|
||||||
| overall quality | At least 4/5 best-practice signals present |
|
| overall quality | At least 4/5 best-practice signals present |
|
||||||
|
|
||||||
|
## Scenario 2: team-rls-security-definer
|
||||||
|
|
||||||
|
**Description:** Create a SQL migration for a team-based project management app
|
||||||
|
where users belong to organizations via a membership table. The migration must
|
||||||
|
define tables for organizations, memberships, and projects, then secure them
|
||||||
|
with RLS policies that use a `security definer` helper function in a private
|
||||||
|
schema to efficiently resolve team membership without per-row joins.
|
||||||
|
|
||||||
|
**Setup:** The workspace starts with a pre-initialized Supabase project
|
||||||
|
(`supabase/config.toml` exists, empty `supabase/migrations/` directory). The
|
||||||
|
agent creates migration files within this structure.
|
||||||
|
|
||||||
|
**Expected skill files read:**
|
||||||
|
|
||||||
|
- `SKILL.md` (skill body with reference file index)
|
||||||
|
- `references/db-rls-mandatory.md`
|
||||||
|
- `references/db-rls-policy-types.md`
|
||||||
|
- `references/db-rls-common-mistakes.md`
|
||||||
|
- `references/db-rls-performance.md`
|
||||||
|
- `references/db-security-functions.md`
|
||||||
|
- `references/db-schema-auth-fk.md`
|
||||||
|
- `references/db-schema-timestamps.md`
|
||||||
|
- `references/db-perf-indexes.md`
|
||||||
|
- `references/db-migrations-idempotent.md`
|
||||||
|
|
||||||
|
**Expected result:**
|
||||||
|
|
||||||
|
The agent creates a migration file that:
|
||||||
|
|
||||||
|
- Creates organizations, memberships, and projects tables with `timestamptz` columns
|
||||||
|
- Has `user_id` FK to `auth.users(id)` with `ON DELETE CASCADE` on memberships
|
||||||
|
- Has `org_id` FK on projects referencing organizations
|
||||||
|
- Enables RLS on all three tables
|
||||||
|
- Creates a private schema with a `security definer` helper function (`SET search_path = ''`)
|
||||||
|
- Creates RLS policies using `(select auth.uid())` with `TO authenticated`
|
||||||
|
- Creates indexes on membership lookup columns (user_id, org_id)
|
||||||
|
- Has a delete policy on projects restricted to owner role
|
||||||
|
- Uses `IF NOT EXISTS` for idempotency
|
||||||
|
|
||||||
|
**Scorer:** Binary pass/fail (16 vitest assertions)
|
||||||
|
|
||||||
|
| Test | What it checks |
|
||||||
|
| --- | --- |
|
||||||
|
| migration file exists | A `.sql` file exists in `supabase/migrations/` |
|
||||||
|
| creates organizations table | SQL contains `CREATE TABLE` for organizations |
|
||||||
|
| creates memberships table | SQL contains `CREATE TABLE` for memberships |
|
||||||
|
| creates projects table | SQL contains `CREATE TABLE` for projects |
|
||||||
|
| enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables |
|
||||||
|
| FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade |
|
||||||
|
| org_id FK on projects | projects references organizations |
|
||||||
|
| private schema created | `CREATE SCHEMA ... private` present |
|
||||||
|
| security_definer helper function | Function in private schema with `SECURITY DEFINER` and `SET search_path = ''` |
|
||||||
|
| policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() |
|
||||||
|
| policies use TO authenticated | All policies scoped to authenticated role |
|
||||||
|
| index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships |
|
||||||
|
| uses timestamptz | No plain `timestamp` for time columns |
|
||||||
|
| idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns |
|
||||||
|
| delete policy restricted to owner role | A delete policy on projects checks for owner/admin role |
|
||||||
|
| overall quality score | At least 10/14 best-practice signals present |
|
||||||
|
|
||||||
|
## Scenario 3: storage-rls-user-folders
|
||||||
|
|
||||||
|
**Description:** Create a SQL migration that sets up Supabase Storage buckets
|
||||||
|
with RLS policies for user content: an avatars bucket (public reads,
|
||||||
|
authenticated uploads restricted to user folders) and a documents bucket (fully
|
||||||
|
private, user-isolated), with file type restrictions, storage helper functions
|
||||||
|
in policies, and a file_metadata tracking table secured with RLS.
|
||||||
|
|
||||||
|
**Setup:** Pre-initialized Supabase project (`supabase/config.toml` exists)
|
||||||
|
with an empty `supabase/migrations/` directory. The agent creates migration
|
||||||
|
files within this structure.
|
||||||
|
|
||||||
|
**Expected skill files read:**
|
||||||
|
|
||||||
|
- `SKILL.md` (skill body with reference file index)
|
||||||
|
- `references/storage-access-control.md`
|
||||||
|
- `references/db-rls-mandatory.md`
|
||||||
|
- `references/db-rls-common-mistakes.md`
|
||||||
|
- `references/db-rls-performance.md`
|
||||||
|
- `references/db-schema-auth-fk.md`
|
||||||
|
- `references/db-schema-timestamps.md`
|
||||||
|
- `references/db-perf-indexes.md`
|
||||||
|
- `references/db-migrations-idempotent.md`
|
||||||
|
|
||||||
|
**Expected result:**
|
||||||
|
|
||||||
|
The agent creates a migration file that:
|
||||||
|
|
||||||
|
- Inserts avatars bucket into `storage.buckets` with `public = true`, MIME type restrictions, and file size limit
|
||||||
|
- Inserts documents bucket with `public = false`
|
||||||
|
- Creates RLS policies on `storage.objects` using `storage.foldername(name)` with `auth.uid()::text`
|
||||||
|
- Scopes upload policies `TO authenticated` and avatars SELECT policy `TO public`
|
||||||
|
- Creates `file_metadata` table with FK to `auth.users` with `ON DELETE CASCADE`
|
||||||
|
- Enables RLS on `file_metadata` with policies using `(select auth.uid())`
|
||||||
|
- Uses `timestamptz` for time columns, indexes `user_id`, and uses `IF NOT EXISTS` for idempotency
|
||||||
|
|
||||||
|
**Scorer:** Binary pass/fail (17 vitest assertions)
|
||||||
|
|
||||||
|
| Test | What it checks |
|
||||||
|
| --- | --- |
|
||||||
|
| migration file exists | A `.sql` file exists in `supabase/migrations/` |
|
||||||
|
| creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` |
|
||||||
|
| creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` |
|
||||||
|
| avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) |
|
||||||
|
| avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) |
|
||||||
|
| storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` |
|
||||||
|
| storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` |
|
||||||
|
| public read policy for avatars | A SELECT policy on storage.objects for avatars allows public/anon access |
|
||||||
|
| documents bucket is fully private | Policies for documents restrict all operations to authenticated owner |
|
||||||
|
| creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata |
|
||||||
|
| file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` |
|
||||||
|
| RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` |
|
||||||
|
| file_metadata policies use (select auth.uid()) | Subselect form in policies |
|
||||||
|
| uses timestamptz for time columns | No plain `timestamp` in file_metadata |
|
||||||
|
| index on file_metadata user_id | `CREATE INDEX` on user_id column |
|
||||||
|
| idempotent DDL | Uses `IF NOT EXISTS` patterns |
|
||||||
|
| overall quality score | At least 11/15 best-practice signals present |
|
||||||
|
|||||||
144
packages/evals/scenarios/storage-rls-user-folders.md
Normal file
144
packages/evals/scenarios/storage-rls-user-folders.md
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
# Scenario: storage-rls-user-folders
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The agent must create a SQL migration that sets up Supabase Storage buckets
|
||||||
|
with RLS policies for a user-content application. The migration must configure
|
||||||
|
an avatars bucket (public reads, authenticated uploads restricted to user
|
||||||
|
folders) and a documents bucket (fully private, user-isolated), with proper
|
||||||
|
file type restrictions, storage helper functions in policies, and a
|
||||||
|
file_metadata tracking table secured with RLS.
|
||||||
|
|
||||||
|
## Real-World Justification
|
||||||
|
|
||||||
|
Why this is a common and important workflow:
|
||||||
|
|
||||||
|
1. **Storage RLS is confusing and under-documented compared to table RLS** --
|
||||||
|
Developers consistently struggle with the distinction between public/private
|
||||||
|
buckets and the RLS policies needed on `storage.objects`. Multiple GitHub
|
||||||
|
discussions show confusion about which SDK operations map to which SQL
|
||||||
|
operations (INSERT, SELECT, UPDATE, DELETE).
|
||||||
|
- Source: https://github.com/orgs/supabase/discussions/37611
|
||||||
|
- Source: https://github.com/orgs/supabase/discussions/38700
|
||||||
|
|
||||||
|
2. **User-folder isolation is the canonical storage security pattern** -- The
|
||||||
|
official Supabase docs demonstrate folder-based isolation using
|
||||||
|
`storage.foldername(name)` and `auth.uid()::text`, but developers frequently
|
||||||
|
get the casting or array indexing wrong.
|
||||||
|
- Source: https://supabase.com/docs/guides/storage/security/access-control
|
||||||
|
|
||||||
|
3. **Missing file type restrictions lead to security vulnerabilities** --
|
||||||
|
Without `allowed_mime_types` on the bucket or extension checks in RLS
|
||||||
|
policies, users can upload executable files or oversized payloads. The
|
||||||
|
Supabase security best practices guide calls this out as a common oversight.
|
||||||
|
- Source: https://supaexplorer.com/guides/supabase-security-best-practices
|
||||||
|
- Source: https://supabase.com/docs/guides/storage/buckets/fundamentals
|
||||||
|
|
||||||
|
## Skill References Exercised
|
||||||
|
|
||||||
|
Which reference files the agent should consult and what each teaches:
|
||||||
|
|
||||||
|
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||||
|
|---|---|---|
|
||||||
|
| `references/storage-access-control.md` | Bucket visibility, RLS on storage.objects, storage helper functions, SDK-to-SQL operation mapping | User-folder policies using `storage.foldername()`, separate SELECT/INSERT policies |
|
||||||
|
| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on the file_metadata tracking table |
|
||||||
|
| `references/db-rls-common-mistakes.md` | Missing TO clause, missing SELECT policy for UPDATE | Use `TO authenticated` (or `TO public` for public reads), include SELECT policy |
|
||||||
|
| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT subquery | Use `(select auth.uid())` in both storage and table policies |
|
||||||
|
| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | file_metadata.user_id references auth.users with cascade |
|
||||||
|
| `references/db-schema-timestamps.md` | Use timestamptz not timestamp | Time columns on file_metadata use timestamptz |
|
||||||
|
| `references/db-perf-indexes.md` | Index columns used in policy lookups | Index user_id on file_metadata |
|
||||||
|
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout |
|
||||||
|
|
||||||
|
## Workspace Setup
|
||||||
|
|
||||||
|
What the workspace starts with before the agent runs:
|
||||||
|
|
||||||
|
- Pre-initialized Supabase project (`supabase/config.toml` exists)
|
||||||
|
- Empty `supabase/migrations/` directory
|
||||||
|
- The agent creates migration files within this structure
|
||||||
|
|
||||||
|
## Agent Task (PROMPT.md draft)
|
||||||
|
|
||||||
|
The prompt to give the agent. Written as a developer would ask it:
|
||||||
|
|
||||||
|
> I need to set up file storage for my app. There are two use cases:
|
||||||
|
>
|
||||||
|
> 1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but
|
||||||
|
> only the owning user can upload or replace their own. Only allow image
|
||||||
|
> files (JPEG, PNG, WebP). Max 2MB.
|
||||||
|
>
|
||||||
|
> 2. **Documents** -- Users upload private documents that only they can access.
|
||||||
|
> Max 50MB. No file type restriction.
|
||||||
|
>
|
||||||
|
> Create a SQL migration that:
|
||||||
|
> - Configures both storage buckets
|
||||||
|
> - Adds RLS policies on `storage.objects` so each user can only access their
|
||||||
|
> own folder (folder name = user ID)
|
||||||
|
> - Creates a `file_metadata` table to track uploaded files (file name, bucket,
|
||||||
|
> size, user reference) with appropriate security
|
||||||
|
>
|
||||||
|
> Users are authenticated via Supabase Auth.
|
||||||
|
|
||||||
|
## Evaluation Criteria
|
||||||
|
|
||||||
|
What vitest should assert on the agent's output. Each assertion tests a
|
||||||
|
specific quality signal:
|
||||||
|
|
||||||
|
| # | Test Name | What It Checks | Quality Dimension |
|
||||||
|
|---|-----------|----------------|-------------------|
|
||||||
|
| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
|
||||||
|
| 2 | creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` | correctness |
|
||||||
|
| 3 | creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` | correctness |
|
||||||
|
| 4 | avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) | security |
|
||||||
|
| 5 | avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) | security |
|
||||||
|
| 6 | storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` | security |
|
||||||
|
| 7 | storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` | security |
|
||||||
|
| 8 | public read policy for avatars | A SELECT policy on storage.objects for avatars bucket allows public/anon access | correctness |
|
||||||
|
| 9 | documents bucket is fully private | Policies for documents bucket restrict all operations to authenticated owner | security |
|
||||||
|
| 10 | creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata | correctness |
|
||||||
|
| 11 | file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` | correctness |
|
||||||
|
| 12 | RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` | security |
|
||||||
|
| 13 | file_metadata policies use (select auth.uid()) | Subselect form in policies | performance |
|
||||||
|
| 14 | uses timestamptz for time columns | No plain `timestamp` in file_metadata | correctness |
|
||||||
|
| 15 | index on file_metadata user_id | `CREATE INDEX` on user_id column | performance |
|
||||||
|
| 16 | idempotent DDL | Uses `IF NOT EXISTS` patterns | idempotency |
|
||||||
|
| 17 | overall quality score | At least 11/15 best-practice signals present | overall |
|
||||||
|
|
||||||
|
## Reasoning
|
||||||
|
|
||||||
|
Step-by-step reasoning for why this scenario is well-designed:
|
||||||
|
|
||||||
|
1. **Baseline differentiator:** An agent without the skill would likely: (a)
|
||||||
|
confuse public bucket visibility with unrestricted upload access, (b) write
|
||||||
|
storage policies without using `storage.foldername()` or get the array
|
||||||
|
indexing wrong, (c) forget to set `allowed_mime_types` on the bucket itself,
|
||||||
|
(d) omit the `TO authenticated` clause on storage policies, (e) use bare
|
||||||
|
`auth.uid()` instead of the subselect form, (f) skip the `::text` cast when
|
||||||
|
comparing auth.uid() to folder names. These are all Supabase-specific
|
||||||
|
patterns that require reading the skill references.
|
||||||
|
|
||||||
|
2. **Skill value:** The storage-access-control reference explicitly documents:
|
||||||
|
the public vs private bucket distinction, the `storage.foldername()` helper
|
||||||
|
function pattern, the SDK-to-SQL operation mapping, and bucket configuration
|
||||||
|
with mime types and size limits. Combined with the database security
|
||||||
|
references (RLS mandatory, common mistakes, performance), this scenario
|
||||||
|
exercises 8 reference files.
|
||||||
|
|
||||||
|
3. **Testability:** Bucket configuration (INSERT INTO storage.buckets), storage
|
||||||
|
helper function usage (storage.foldername), policy clauses (TO
|
||||||
|
authenticated, TO public), mime types, file size limits, and all table-level
|
||||||
|
patterns (RLS, FK, indexes, timestamptz) are reliably detectable via regex
|
||||||
|
on SQL text.
|
||||||
|
|
||||||
|
4. **Realism:** Nearly every Supabase application that handles user-generated
|
||||||
|
content needs avatar uploads and document storage. This is a day-one task
|
||||||
|
for any SaaS product. The GitHub discussions linked above show dozens of
|
||||||
|
developers hitting exactly these issues when setting up storage for the
|
||||||
|
first time.
|
||||||
|
|
||||||
|
## Difficulty
|
||||||
|
|
||||||
|
**Rating:** MEDIUM
|
||||||
|
|
||||||
|
- Without skill: ~30-45% of assertions expected to pass
|
||||||
|
- With skill: ~85-95% of assertions expected to pass
|
||||||
139
packages/evals/scenarios/team-rls-security-definer.md
Normal file
139
packages/evals/scenarios/team-rls-security-definer.md
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
# Scenario: team-rls-security-definer
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
The agent must create a SQL migration for a team-based project management app
|
||||||
|
where users belong to organizations via a membership table. The migration must
|
||||||
|
define tables for organizations, memberships, and projects, then secure them
|
||||||
|
with RLS policies that use a `security definer` helper function in a private
|
||||||
|
schema to efficiently resolve team membership without per-row joins.
|
||||||
|
|
||||||
|
## Real-World Justification
|
||||||
|
|
||||||
|
Why this is a common and important workflow:
|
||||||
|
|
||||||
|
1. **Multi-tenant team access is the most-asked RLS question on Supabase** --
|
||||||
|
The official Supabase GitHub has multiple high-engagement discussions about
|
||||||
|
how to write RLS policies that check team/org membership without causing
|
||||||
|
performance issues or security holes.
|
||||||
|
- Source: https://github.com/supabase/supabase/discussions/4509
|
||||||
|
- Source: https://github.com/supabase/supabase/discussions/811
|
||||||
|
|
||||||
|
2. **security_definer in public schema is a documented security anti-pattern** --
|
||||||
|
Developers frequently place security_definer functions in the public schema,
|
||||||
|
inadvertently exposing them via the PostgREST API. The Supabase docs and
|
||||||
|
community discussions explicitly warn against this.
|
||||||
|
- Source: https://github.com/supabase/supabase/discussions/3269
|
||||||
|
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
|
||||||
|
|
||||||
|
3. **RLS policy performance with joins is a top pain point** -- Naive policies
|
||||||
|
that join against a memberships table execute per-row, causing severe
|
||||||
|
performance degradation. The recommended pattern is a security_definer
|
||||||
|
function whose result is cached per statement by wrapping the call in a subselect.
|
||||||
|
- Source: https://github.com/orgs/supabase/discussions/1148
|
||||||
|
- Source: https://makerkit.dev/blog/tutorials/supabase-rls-best-practices
|
||||||
|
|
||||||
|
## Skill References Exercised
|
||||||
|
|
||||||
|
Which reference files the agent should consult and what each teaches:
|
||||||
|
|
||||||
|
| Reference File | What It Teaches | What the Agent Should Apply |
|
||||||
|
|---|---|---|
|
||||||
|
| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on organizations, memberships, and projects |
|
||||||
|
| `references/db-rls-policy-types.md` | PERMISSIVE vs RESTRICTIVE policies | Use PERMISSIVE policies for team OR owner access patterns |
|
||||||
|
| `references/db-rls-common-mistakes.md` | Missing TO clause, user_metadata pitfalls | Always use `TO authenticated` on all policies |
|
||||||
|
| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT, use security_definer for joins | Use `(select auth.uid())` and a private-schema helper function |
|
||||||
|
| `references/db-security-functions.md` | security_definer in private schema with search_path = '' | Create helper function in private schema, revoke default permissions |
|
||||||
|
| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | Reference auth.users with cascade on memberships |
|
||||||
|
| `references/db-schema-timestamps.md` | Use timestamptz not timestamp | All time columns use timestamptz |
|
||||||
|
| `references/db-perf-indexes.md` | Index columns used in RLS policies | Index user_id and org_id columns used in policy lookups |
|
||||||
|
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout the migration |
|
||||||
|
|
||||||
|
## Workspace Setup
|
||||||
|
|
||||||
|
What the workspace starts with before the agent runs:
|
||||||
|
|
||||||
|
- Pre-initialized Supabase project (`supabase/config.toml` exists)
|
||||||
|
- Empty `supabase/migrations/` directory
|
||||||
|
- The agent creates migration files within this structure
|
||||||
|
|
||||||
|
## Agent Task (PROMPT.md draft)
|
||||||
|
|
||||||
|
The prompt to give the agent. Written as a developer would ask it:
|
||||||
|
|
||||||
|
> I'm building a project management app where users can belong to multiple
|
||||||
|
> organizations. Each organization has projects that all members can view and
|
||||||
|
> edit.
|
||||||
|
>
|
||||||
|
> Create a SQL migration with:
|
||||||
|
>
|
||||||
|
> 1. An `organizations` table (name, slug)
|
||||||
|
> 2. A `memberships` table linking users to organizations with a role column
|
||||||
|
> (owner, admin, member)
|
||||||
|
> 3. A `projects` table (name, description, status) belonging to an organization
|
||||||
|
>
|
||||||
|
> Set up Row Level Security so:
|
||||||
|
> - Users can only see organizations they belong to
|
||||||
|
> - Users can only see and manage projects in their organizations
|
||||||
|
> - Only org owners can delete projects
|
||||||
|
>
|
||||||
|
> The migration should handle the case where a user is deleted from auth.
|
||||||
|
|
||||||
|
## Evaluation Criteria
|
||||||
|
|
||||||
|
What vitest should assert on the agent's output. Each assertion tests a
|
||||||
|
specific quality signal:
|
||||||
|
|
||||||
|
| # | Test Name | What It Checks | Quality Dimension |
|
||||||
|
|---|-----------|----------------|-------------------|
|
||||||
|
| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
|
||||||
|
| 2 | creates organizations table | SQL contains `CREATE TABLE` for organizations | correctness |
|
||||||
|
| 3 | creates memberships table | SQL contains `CREATE TABLE` for memberships | correctness |
|
||||||
|
| 4 | creates projects table | SQL contains `CREATE TABLE` for projects | correctness |
|
||||||
|
| 5 | enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables | security |
|
||||||
|
| 6 | FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade | correctness |
|
||||||
|
| 7 | org_id FK on projects | projects references organizations | correctness |
|
||||||
|
| 8 | private schema created | `CREATE SCHEMA ... private` present | security |
|
||||||
|
| 9 | security_definer helper function | A function in the private schema with `SECURITY DEFINER` and `SET search_path = ''` | security |
|
||||||
|
| 10 | policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() | performance |
|
||||||
|
| 11 | policies use TO authenticated | All policies scoped to authenticated role | security |
|
||||||
|
| 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance |
|
||||||
|
| 13 | uses timestamptz | No plain `timestamp` for time columns | correctness |
|
||||||
|
| 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency |
|
||||||
|
| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
|
||||||
|
| 16 | overall quality score | At least 10/14 best-practice signals present | overall |
|
||||||
|
|
||||||
|
## Reasoning
|
||||||
|
|
||||||
|
Step-by-step reasoning for why this scenario is well-designed:
|
||||||
|
|
||||||
|
1. **Baseline differentiator:** An agent without the skill would likely put the
|
||||||
|
security_definer function in the public schema, omit `SET search_path = ''`,
|
||||||
|
use bare `auth.uid()` instead of the subselect form, write inline joins in
|
||||||
|
policies instead of using a helper function, and possibly forget `TO
|
||||||
|
authenticated` on some policies. These are all patterns that require specific
|
||||||
|
knowledge of Supabase conventions.
|
||||||
|
|
||||||
|
2. **Skill value:** The skill explicitly teaches: (a) private schema for
|
||||||
|
security_definer functions, (b) `SET search_path = ''` to prevent injection,
|
||||||
|
(c) `(select auth.uid())` for per-statement caching, (d) using
|
||||||
|
security_definer functions to avoid per-row joins in policies, (e) `TO
|
||||||
|
authenticated` on every policy. This is a scenario where reading 5+ reference
|
||||||
|
files materially improves the output.
|
||||||
|
|
||||||
|
3. **Testability:** Every assertion checks for specific SQL patterns via regex.
|
||||||
|
The private schema, security_definer, search_path, subselect auth.uid(), TO
|
||||||
|
authenticated, indexes, and timestamptz are all reliably detectable in SQL
|
||||||
|
text without runtime execution.
|
||||||
|
|
||||||
|
4. **Realism:** Multi-tenant team-based access control is one of the most common
|
||||||
|
Supabase use cases. The GitHub discussions linked above have hundreds of
|
||||||
|
comments from developers working on exactly this pattern. Project management
|
||||||
|
apps (Notion, Linear, Asana clones) are a canonical example.
|
||||||
|
|
||||||
|
## Difficulty
|
||||||
|
|
||||||
|
**Rating:** MEDIUM
|
||||||
|
|
||||||
|
- Without skill: ~35-50% of assertions expected to pass
|
||||||
|
- With skill: ~85-95% of assertions expected to pass
|
||||||
@@ -2,10 +2,12 @@ import { existsSync, readdirSync, readFileSync } from "node:fs";
|
|||||||
import { join, resolve } from "node:path";
|
import { join, resolve } from "node:path";
|
||||||
import { runAgent } from "./runner/agent.js";
|
import { runAgent } from "./runner/agent.js";
|
||||||
import { uploadToBraintrust } from "./runner/braintrust.js";
|
import { uploadToBraintrust } from "./runner/braintrust.js";
|
||||||
|
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
||||||
import { preflight } from "./runner/preflight.js";
|
import { preflight } from "./runner/preflight.js";
|
||||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||||
import { createWorkspace } from "./runner/scaffold.js";
|
import { createWorkspace } from "./runner/scaffold.js";
|
||||||
import { runTests } from "./runner/test.js";
|
import { runTests } from "./runner/test.js";
|
||||||
|
import { buildTranscriptSummary } from "./runner/transcript.js";
|
||||||
import type { EvalRunResult, EvalScenario } from "./types.js";
|
import type { EvalRunResult, EvalScenario } from "./types.js";
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -19,6 +21,12 @@ const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
|
|||||||
const scenarioFilter = process.env.EVAL_SCENARIO;
|
const scenarioFilter = process.env.EVAL_SCENARIO;
|
||||||
const runBaseline = process.env.EVAL_BASELINE === "true";
|
const runBaseline = process.env.EVAL_BASELINE === "true";
|
||||||
|
|
||||||
|
// Run-level timestamp shared across all scenarios in a single invocation
|
||||||
|
const runTimestamp = new Date()
|
||||||
|
.toISOString()
|
||||||
|
.replace(/[:.]/g, "-")
|
||||||
|
.replace("Z", "");
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Discover scenarios
|
// Discover scenarios
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -58,10 +66,9 @@ async function runEval(
|
|||||||
): Promise<EvalRunResult> {
|
): Promise<EvalRunResult> {
|
||||||
const evalsDir = findEvalsDir();
|
const evalsDir = findEvalsDir();
|
||||||
const evalDir = join(evalsDir, scenario.id);
|
const evalDir = join(evalsDir, scenario.id);
|
||||||
|
const variant = skillEnabled ? "with-skill" : "baseline";
|
||||||
|
|
||||||
console.log(
|
console.log(`\n--- ${scenario.id} (${variant}) ---`);
|
||||||
`\n--- ${scenario.id} (${skillEnabled ? "with-skill" : "baseline"}) ---`,
|
|
||||||
);
|
|
||||||
|
|
||||||
// 1. Create isolated workspace
|
// 1. Create isolated workspace
|
||||||
const { workspacePath, cleanup } = createWorkspace({
|
const { workspacePath, cleanup } = createWorkspace({
|
||||||
@@ -104,7 +111,10 @@ async function runEval(
|
|||||||
// 5. Collect modified files
|
// 5. Collect modified files
|
||||||
const filesModified = listModifiedFiles(workspacePath, evalDir);
|
const filesModified = listModifiedFiles(workspacePath, evalDir);
|
||||||
|
|
||||||
return {
|
// 6. Build transcript summary
|
||||||
|
const summary = buildTranscriptSummary(agentResult.events);
|
||||||
|
|
||||||
|
const result: EvalRunResult = {
|
||||||
scenario: scenario.id,
|
scenario: scenario.id,
|
||||||
agent: "claude-code",
|
agent: "claude-code",
|
||||||
model,
|
model,
|
||||||
@@ -116,7 +126,22 @@ async function runEval(
|
|||||||
testsPassed: testResult.passedCount,
|
testsPassed: testResult.passedCount,
|
||||||
testsTotal: testResult.totalCount,
|
testsTotal: testResult.totalCount,
|
||||||
filesModified,
|
filesModified,
|
||||||
|
toolCallCount: summary.toolCalls.length,
|
||||||
|
costUsd: summary.totalCostUsd ?? undefined,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// 7. Persist results
|
||||||
|
const resultDir = createResultDir(runTimestamp, scenario.id, variant);
|
||||||
|
result.resultsDir = resultDir;
|
||||||
|
saveRunArtifacts({
|
||||||
|
resultDir,
|
||||||
|
rawTranscript: agentResult.rawTranscript,
|
||||||
|
testOutput: testResult.output,
|
||||||
|
result,
|
||||||
|
transcriptSummary: summary,
|
||||||
|
});
|
||||||
|
|
||||||
|
return result;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const err = error as Error;
|
const err = error as Error;
|
||||||
return {
|
return {
|
||||||
@@ -175,7 +200,9 @@ async function main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printSummary(results);
|
// Use the results dir from the first result (all share the same timestamp)
|
||||||
|
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
|
||||||
|
printSummary(results, resultsDir);
|
||||||
|
|
||||||
if (process.env.BRAINTRUST_UPLOAD === "true") {
|
if (process.env.BRAINTRUST_UPLOAD === "true") {
|
||||||
console.log("\nUploading to Braintrust...");
|
console.log("\nUploading to Braintrust...");
|
||||||
|
|||||||
@@ -1,13 +1,27 @@
|
|||||||
import { spawn } from "node:child_process";
|
import { spawn } from "node:child_process";
|
||||||
|
import { resolveClaudeBin } from "./preflight.js";
|
||||||
|
import {
|
||||||
|
extractFinalOutput,
|
||||||
|
parseStreamJsonOutput,
|
||||||
|
type TranscriptEvent,
|
||||||
|
} from "./transcript.js";
|
||||||
|
|
||||||
export interface AgentRunResult {
|
export interface AgentRunResult {
|
||||||
|
/** Extracted final text output (backward-compatible). */
|
||||||
output: string;
|
output: string;
|
||||||
duration: number;
|
duration: number;
|
||||||
|
/** Raw NDJSON transcript string from stream-json. */
|
||||||
|
rawTranscript: string;
|
||||||
|
/** Parsed transcript events. */
|
||||||
|
events: TranscriptEvent[];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invoke Claude Code in print mode as a subprocess.
|
* Invoke Claude Code in print mode as a subprocess.
|
||||||
*
|
*
|
||||||
|
* Uses --output-format stream-json to capture structured NDJSON events
|
||||||
|
* including tool calls, results, and reasoning steps.
|
||||||
|
*
|
||||||
* The agent operates in the workspace directory and can read/write files.
|
* The agent operates in the workspace directory and can read/write files.
|
||||||
* When the skill is installed (symlinked into workspace), Claude Code
|
* When the skill is installed (symlinked into workspace), Claude Code
|
||||||
* discovers it automatically and uses it for guidance.
|
* discovers it automatically and uses it for guidance.
|
||||||
@@ -23,14 +37,22 @@ export async function runAgent(opts: {
|
|||||||
|
|
||||||
const args = [
|
const args = [
|
||||||
"-p", // Print mode (non-interactive)
|
"-p", // Print mode (non-interactive)
|
||||||
|
"--verbose",
|
||||||
"--output-format",
|
"--output-format",
|
||||||
"text",
|
"stream-json",
|
||||||
"--model",
|
"--model",
|
||||||
opts.model,
|
opts.model,
|
||||||
"--no-session-persistence",
|
"--no-session-persistence",
|
||||||
"--dangerously-skip-permissions",
|
"--dangerously-skip-permissions",
|
||||||
"--tools",
|
"--tools",
|
||||||
"Edit,Write,Bash,Read,Glob,Grep",
|
"Edit,Write,Bash,Read,Glob,Grep",
|
||||||
|
// Disable all MCP servers so the agent uses only local filesystem tools.
|
||||||
|
// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
|
||||||
|
// leak in and the agent may apply migrations to a remote project
|
||||||
|
// instead of creating local files.
|
||||||
|
"--mcp-config",
|
||||||
|
'{"mcpServers":{}}',
|
||||||
|
"--strict-mcp-config",
|
||||||
];
|
];
|
||||||
|
|
||||||
// Disable skills for baseline runs so the agent relies on innate knowledge
|
// Disable skills for baseline runs so the agent relies on innate knowledge
|
||||||
@@ -46,8 +68,10 @@ export async function runAgent(opts: {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const claudeBin = resolveClaudeBin();
|
||||||
|
|
||||||
return new Promise<AgentRunResult>((resolve) => {
|
return new Promise<AgentRunResult>((resolve) => {
|
||||||
const child = spawn("claude", args, {
|
const child = spawn(claudeBin, args, {
|
||||||
cwd: opts.cwd,
|
cwd: opts.cwd,
|
||||||
env,
|
env,
|
||||||
stdio: ["pipe", "pipe", "pipe"],
|
stdio: ["pipe", "pipe", "pipe"],
|
||||||
@@ -73,9 +97,15 @@ export async function runAgent(opts: {
|
|||||||
|
|
||||||
child.on("close", () => {
|
child.on("close", () => {
|
||||||
clearTimeout(timer);
|
clearTimeout(timer);
|
||||||
|
const rawTranscript = stdout || stderr;
|
||||||
|
const events = parseStreamJsonOutput(rawTranscript);
|
||||||
|
const output = extractFinalOutput(events) || rawTranscript;
|
||||||
|
|
||||||
resolve({
|
resolve({
|
||||||
output: stdout || stderr,
|
output,
|
||||||
duration: Date.now() - start,
|
duration: Date.now() - start,
|
||||||
|
rawTranscript,
|
||||||
|
events,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
56
packages/evals/src/runner/persist.ts
Normal file
56
packages/evals/src/runner/persist.ts
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
import { mkdirSync, writeFileSync } from "node:fs";
|
||||||
|
import { dirname, join } from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import type { EvalRunResult } from "../types.js";
|
||||||
|
import type { TranscriptSummary } from "./transcript.js";
|
||||||
|
|
||||||
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
|
const __dirname = dirname(__filename);
|
||||||
|
|
||||||
|
/**
 * Locate the root of the evals package (`packages/evals`).
 *
 * @returns The path two directory levels above this module's directory.
 */
function evalsRoot(): string {
  // This module lives in packages/evals/src/runner, so step out of
  // `runner` and then out of `src`.
  const packageRoot = join(__dirname, "..", "..");
  return packageRoot;
}
|
||||||
|
|
||||||
|
/**
 * Ensure the output directory for one scenario run exists.
 *
 * The directory is `<evals root>/results/<runTimestamp>/<scenarioId>/<variant>`;
 * any missing parent directories are created along the way.
 *
 * @param runTimestamp - Directory name identifying the overall run.
 * @param scenarioId - Directory name identifying the scenario.
 * @param variant - Which configuration produced the run.
 * @returns The path of the (now existing) directory.
 */
export function createResultDir(
  runTimestamp: string,
  scenarioId: string,
  variant: "with-skill" | "baseline",
): string {
  const segments = ["results", runTimestamp, scenarioId, variant];
  const target = join(evalsRoot(), ...segments);
  mkdirSync(target, { recursive: true });
  return target;
}
|
||||||
|
|
||||||
|
/**
 * Write every artifact of a single eval run into its result directory.
 *
 * Three UTF-8 files are produced inside `opts.resultDir`:
 * - `transcript.jsonl`: the raw transcript text, unmodified
 * - `test-output.txt`: the captured test output
 * - `result.json`: the run result with the transcript summary attached under
 *   a `transcript` key, pretty-printed with two-space indentation
 *
 * @param opts.resultDir - Existing directory to write into.
 * @param opts.rawTranscript - Raw transcript text.
 * @param opts.testOutput - Captured test output.
 * @param opts.result - Structured result of the run.
 * @param opts.transcriptSummary - Summary stored alongside the result.
 */
export function saveRunArtifacts(opts: {
  resultDir: string;
  rawTranscript: string;
  testOutput: string;
  result: EvalRunResult;
  transcriptSummary: TranscriptSummary;
}): void {
  const { resultDir, rawTranscript, testOutput, result, transcriptSummary } =
    opts;

  const plainTextArtifacts: Array<[string, string]> = [
    ["transcript.jsonl", rawTranscript],
    ["test-output.txt", testOutput],
  ];
  for (const [fileName, contents] of plainTextArtifacts) {
    writeFileSync(join(resultDir, fileName), contents, "utf-8");
  }

  // Serialized last, after the plain-text artifacts are already on disk.
  const combined = { ...result, transcript: transcriptSummary };
  const serialized = JSON.stringify(combined, null, 2);
  writeFileSync(join(resultDir, "result.json"), serialized, "utf-8");
}
|
||||||
@@ -1,10 +1,61 @@
|
|||||||
import { execFileSync } from "node:child_process";
|
import { execFileSync } from "node:child_process";
|
||||||
|
import { existsSync } from "node:fs";
|
||||||
|
import { dirname, join } from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
|
||||||
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
|
const __dirname = dirname(__filename);
|
||||||
|
|
||||||
|
/**
 * Work out which `claude` executable to launch.
 *
 * Resolution order:
 *   1. `node_modules/.bin/claude` two directory levels above this module
 *      (the copy installed via @anthropic-ai/claude-code)
 *   2. A `claude` command reachable through PATH, probed by running
 *      `claude --version` with a 10 second timeout
 *
 * @returns The path of the local binary, or the bare command name `"claude"`
 *   when only the PATH lookup succeeds.
 * @throws {Error} With installation instructions when neither candidate works.
 */
export function resolveClaudeBin(): string {
  // packages/evals/node_modules/.bin/claude
  const bundledBin = join(
    __dirname,
    "..",
    "..",
    "node_modules",
    ".bin",
    "claude",
  );
  if (existsSync(bundledBin)) return bundledBin;

  // No local install: see whether PATH can supply the command instead.
  let availableOnPath = true;
  try {
    execFileSync("claude", ["--version"], { stdio: "ignore", timeout: 10_000 });
  } catch {
    availableOnPath = false;
  }
  if (availableOnPath) return "claude";

  const helpLines = [
    "claude CLI not found.",
    "",
    "Install it in one of these ways:",
    " npm install (uses @anthropic-ai/claude-code from package.json)",
    " npm i -g @anthropic-ai/claude-code",
    "",
    "Ensure ANTHROPIC_API_KEY is set in the environment.",
  ];
  throw new Error(helpLines.join("\n"));
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verify the host environment has everything needed before spending
|
* Verify the host environment has everything needed before spending
|
||||||
* API credits on an eval run.
|
* API credits on an eval run.
|
||||||
*
|
*
|
||||||
* Checks: Node >= 20, Docker running, claude CLI available.
|
* Checks: Node >= 20, Docker running, claude CLI available, API key set.
|
||||||
*/
|
*/
|
||||||
export function preflight(): void {
|
export function preflight(): void {
|
||||||
const errors: string[] = [];
|
const errors: string[] = [];
|
||||||
@@ -24,12 +75,16 @@ export function preflight(): void {
|
|||||||
|
|
||||||
// Claude CLI available
|
// Claude CLI available
|
||||||
try {
|
try {
|
||||||
execFileSync("claude", ["--version"], {
|
resolveClaudeBin();
|
||||||
stdio: "ignore",
|
} catch (err) {
|
||||||
timeout: 10_000,
|
errors.push((err as Error).message);
|
||||||
});
|
}
|
||||||
} catch {
|
|
||||||
errors.push("claude CLI not found on PATH");
|
// API key
|
||||||
|
if (!process.env.ANTHROPIC_API_KEY) {
|
||||||
|
errors.push(
|
||||||
|
"ANTHROPIC_API_KEY is not set. Claude Code requires this for authentication.",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (errors.length > 0) {
|
if (errors.length > 0) {
|
||||||
|
|||||||
@@ -46,7 +46,10 @@ export function listModifiedFiles(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Print a summary table of eval results. */
|
/** Print a summary table of eval results. */
|
||||||
export function printSummary(results: EvalRunResult[]): void {
|
export function printSummary(
|
||||||
|
results: EvalRunResult[],
|
||||||
|
resultsDir?: string,
|
||||||
|
): void {
|
||||||
console.log("\n=== Eval Results ===\n");
|
console.log("\n=== Eval Results ===\n");
|
||||||
|
|
||||||
for (const r of results) {
|
for (const r of results) {
|
||||||
@@ -65,4 +68,8 @@ export function printSummary(results: EvalRunResult[]): void {
|
|||||||
|
|
||||||
const passed = results.filter((r) => r.status === "passed").length;
|
const passed = results.filter((r) => r.status === "passed").length;
|
||||||
console.log(`\nTotal: ${passed}/${results.length} passed`);
|
console.log(`\nTotal: ${passed}/${results.length} passed`);
|
||||||
|
|
||||||
|
if (resultsDir) {
|
||||||
|
console.log(`\nResults saved to: ${resultsDir}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,17 +78,24 @@ export async function runTests(opts: {
|
|||||||
|
|
||||||
function parseTestOutput(output: string): TestResult {
|
function parseTestOutput(output: string): TestResult {
|
||||||
// Parse vitest output for pass/fail counts
|
// Parse vitest output for pass/fail counts
|
||||||
// Format: "Tests N passed (M)" or "Tests N failed | M passed (T)"
|
// Vitest formats:
|
||||||
const testsLine = output.match(
|
// All passing: "Tests N passed (N)"
|
||||||
|
// Mixed: "Tests N failed | M passed (T)"
|
||||||
|
// All failing: "Tests N failed (N)"
|
||||||
|
const mixedOrPassing = output.match(
|
||||||
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
|
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
|
||||||
);
|
);
|
||||||
|
const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/);
|
||||||
|
|
||||||
let passedCount = 0;
|
let passedCount = 0;
|
||||||
let totalCount = 0;
|
let totalCount = 0;
|
||||||
|
|
||||||
if (testsLine) {
|
if (mixedOrPassing) {
|
||||||
passedCount = Number.parseInt(testsLine[2], 10);
|
passedCount = Number.parseInt(mixedOrPassing[2], 10);
|
||||||
totalCount = Number.parseInt(testsLine[3], 10);
|
totalCount = Number.parseInt(mixedOrPassing[3], 10);
|
||||||
|
} else if (allFailing) {
|
||||||
|
passedCount = 0;
|
||||||
|
totalCount = Number.parseInt(allFailing[2], 10);
|
||||||
}
|
}
|
||||||
|
|
||||||
const passed = totalCount > 0 && passedCount === totalCount;
|
const passed = totalCount > 0 && passedCount === totalCount;
|
||||||
|
|||||||
154
packages/evals/src/runner/transcript.ts
Normal file
154
packages/evals/src/runner/transcript.ts
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
/**
 * One event parsed from a line of the NDJSON transcript.
 *
 * Only `type` is typed; every other field is kept as `unknown` and must be
 * narrowed by the consumer before use.
 */
export interface TranscriptEvent {
  /** Event kind; the code in this module looks for "system", "assistant", "user" and "result". */
  type: string;
  /** Any further fields carried by the event. */
  [key: string]: unknown;
}
|
||||||
|
|
||||||
|
/** Condensed record of a single tool invocation found in the transcript. */
export interface ToolCallSummary {
  /** Name of the invoked tool. */
  tool: string;
  /** Identifier used to pair the call with its result. */
  toolUseId: string;
  /** Arguments the tool was called with. */
  input: Record<string, unknown>;
  /** First ~200 chars of output for quick scanning */
  outputPreview: string;
}
|
||||||
|
|
||||||
|
/** Aggregate view of a whole transcript. */
export interface TranscriptSummary {
  /** Turn count reported by the result event (0 when absent). */
  totalTurns: number;
  /** Duration in milliseconds reported by the result event (0 when absent). */
  totalDurationMs: number;
  /** Cost in USD reported by the result event, or `null` when not reported. */
  totalCostUsd: number | null;
  /** Model named by the system init event, or `null` when not reported. */
  model: string | null;
  /** Tool invocations in the order they appear in the transcript. */
  toolCalls: ToolCallSummary[];
  /** Text of the result event (empty string when absent). */
  finalOutput: string;
}
|
||||||
|
|
||||||
|
/**
 * Parse one line of NDJSON output into a transcript event.
 *
 * @param line - A single line of text; surrounding whitespace is ignored.
 * @returns The parsed event, or `null` when the line is blank, is not valid
 *   JSON, or is valid JSON that is not an object carrying a string `type`.
 */
export function parseStreamJsonLine(line: string): TranscriptEvent | null {
  const trimmed = line.trim();
  if (!trimmed) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    return null;
  }

  // JSON.parse also succeeds on bare values such as `42`, `"text"` or `[]`.
  // Those are not events, so reject them instead of casting them blindly.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    return null;
  }
  if (typeof (parsed as { type?: unknown }).type !== "string") return null;

  return parsed as TranscriptEvent;
}
|
||||||
|
|
||||||
|
/**
 * Turn raw NDJSON text into a list of transcript events.
 *
 * The text is split on newlines and each line is handed to
 * `parseStreamJsonLine`; lines that do not yield an event are dropped.
 *
 * @param raw - Complete NDJSON text, one JSON document per line.
 * @returns The events in their original order.
 */
export function parseStreamJsonOutput(raw: string): TranscriptEvent[] {
  return raw.split("\n").flatMap((line) => {
    const parsed = parseStreamJsonLine(line);
    return parsed ? [parsed] : [];
  });
}
|
||||||
|
|
||||||
|
/**
 * Derive the final text output from parsed transcript events.
 *
 * The first `result` event whose `result` field is a string wins. Without
 * one, the text blocks of the most recent `assistant` message that has any
 * are joined with newlines.
 *
 * @param events - Parsed transcript events in chronological order.
 * @returns The final text, or an empty string when none can be found.
 */
export function extractFinalOutput(events: TranscriptEvent[]): string {
  // Preferred source: the result event.
  const resultEvent = events.find(
    (candidate) =>
      candidate.type === "result" && typeof candidate.result === "string",
  );
  if (resultEvent) return resultEvent.result as string;

  // Fallback: scan assistant messages from newest to oldest.
  for (const candidate of [...events].reverse()) {
    if (candidate.type !== "assistant") continue;

    const message = candidate.message as Record<string, unknown> | undefined;
    const blocks = message?.content;
    if (!Array.isArray(blocks)) continue;

    const pieces: string[] = [];
    for (const block of blocks as Array<Record<string, unknown>>) {
      if (block.type === "text" && typeof block.text === "string") {
        pieces.push(block.text);
      }
    }
    if (pieces.length > 0) return pieces.join("\n");
  }

  return "";
}
|
||||||
|
|
||||||
|
/**
 * Produce the short preview text stored for a tool result.
 *
 * Strings are used as-is; anything else is JSON-serialized. `JSON.stringify`
 * returns `undefined` for an `undefined` input (a result block without
 * `content`), which is mapped to an empty preview instead of crashing.
 */
function previewToolResult(content: unknown): string {
  const text =
    typeof content === "string" ? content : (JSON.stringify(content) ?? "");
  return text.slice(0, 200);
}

/**
 * Walk parsed events to build a transcript summary.
 *
 * - `system` events with subtype `init` supply the model name.
 * - `tool_use` blocks in `assistant` messages become tool call entries.
 * - `tool_result` blocks in `user` messages fill in the output preview of the
 *   first tool call with the same id; results without a matching call are
 *   ignored.
 * - `result` events supply final output, duration, cost and turn count; when
 *   several are present the last one wins.
 *
 * @param events - Parsed transcript events in chronological order.
 * @returns The summary; fields with no source event keep their defaults
 *   (empty string, 0, or `null`).
 */
export function buildTranscriptSummary(
  events: TranscriptEvent[],
): TranscriptSummary {
  const toolCalls: ToolCallSummary[] = [];
  let finalOutput = "";
  let totalDurationMs = 0;
  let totalCostUsd: number | null = null;
  let model: string | null = null;
  let totalTurns = 0;

  for (const event of events) {
    const e = event as Record<string, unknown>;

    // System init: extract model
    if (e.type === "system" && e.subtype === "init") {
      model = typeof e.model === "string" ? e.model : null;
    }

    // Assistant messages: extract tool_use blocks
    if (e.type === "assistant") {
      const msg = e.message as Record<string, unknown> | undefined;
      const content = msg?.content;
      if (Array.isArray(content)) {
        for (const block of content) {
          // `?.` tolerates null entries in the content array.
          if (block?.type === "tool_use") {
            toolCalls.push({
              tool: block.name ?? "unknown",
              toolUseId: block.id ?? "",
              input: block.input ?? {},
              outputPreview: "",
            });
          }
        }
      }
    }

    // User messages: extract tool_result blocks and match to tool calls
    if (e.type === "user") {
      const msg = e.message as Record<string, unknown> | undefined;
      const content = msg?.content;
      if (Array.isArray(content)) {
        for (const block of content) {
          if (block?.type !== "tool_result") continue;
          const matching = toolCalls.find(
            (tc) => tc.toolUseId === block.tool_use_id,
          );
          if (matching) {
            matching.outputPreview = previewToolResult(block.content);
          }
        }
      }
    }

    // Result event: final output, cost, duration, turns
    if (e.type === "result") {
      finalOutput = typeof e.result === "string" ? e.result : "";
      totalDurationMs = typeof e.duration_ms === "number" ? e.duration_ms : 0;
      totalCostUsd =
        typeof e.total_cost_usd === "number" ? e.total_cost_usd : null;
      totalTurns = typeof e.num_turns === "number" ? e.num_turns : 0;
    }
  }

  return {
    totalTurns,
    totalDurationMs,
    totalCostUsd,
    model,
    toolCalls,
    finalOutput,
  };
}
|
||||||
@@ -32,4 +32,10 @@ export interface EvalRunResult {
|
|||||||
/** Files the agent created or modified in the workspace */
|
/** Files the agent created or modified in the workspace */
|
||||||
filesModified: string[];
|
filesModified: string[];
|
||||||
error?: string;
|
error?: string;
|
||||||
|
/** Path to the persisted results directory for this run */
|
||||||
|
resultsDir?: string;
|
||||||
|
/** Number of tool calls the agent made */
|
||||||
|
toolCallCount?: number;
|
||||||
|
/** Total cost in USD (from stream-json result event) */
|
||||||
|
costUsd?: number;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user