two more scenarios, and the Claude Code CLI is now a dependency

This commit is contained in:
Pedro Rodrigues
2026-02-20 15:02:59 +00:00
parent 9a23c6b021
commit e03bc99ebb
24 changed files with 1766 additions and 21 deletions

View File

@@ -20,6 +20,12 @@ hidden tests check the result. Binary pass/fail.
The agent is **Claude Code** invoked via `claude -p` (print mode). It operates
on a real filesystem in a temp directory and can read/write files freely.
**Important**: MCP servers are disabled via `--strict-mcp-config` with an empty
config. This ensures the agent uses only local tools (Bash, Edit, Write, Read,
Glob, Grep) and cannot access remote services like Supabase MCP or Neon. All
work must happen on the local filesystem — e.g., creating migration files in
`supabase/migrations/`, not applying them to a remote project.
## Eval Structure
Each eval lives in `evals/{scenario-name}/`:

View File

@@ -0,0 +1,252 @@
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { expect, test } from "vitest";
const supabaseDir = join(process.cwd(), "supabase");
const migrationsDir = join(supabaseDir, "migrations");

/**
 * List absolute paths of every .sql file under supabase/migrations/.
 * Returns an empty array when the directory does not exist.
 */
function findMigrationFiles(): string[] {
  if (!existsSync(migrationsDir)) {
    return [];
  }
  const sqlFiles: string[] = [];
  for (const entry of readdirSync(migrationsDir)) {
    if (entry.endsWith(".sql")) {
      sqlFiles.push(join(migrationsDir, entry));
    }
  }
  return sqlFiles;
}

/**
 * Read every migration file and join their contents with newlines.
 * @throws Error when no migration file exists yet.
 */
function getMigrationSQL(): string {
  const files = findMigrationFiles();
  if (files.length === 0) {
    throw new Error("No migration file found in supabase/migrations/");
  }
  const contents: string[] = [];
  for (const file of files) {
    contents.push(readFileSync(file, "utf-8"));
  }
  return contents.join("\n");
}
test("migration file exists", () => {
expect(findMigrationFiles().length).toBeGreaterThan(0);
});
test("creates avatars bucket", () => {
const sql = getMigrationSQL().toLowerCase();
// Should insert into storage.buckets with id 'avatars' and public = true
expect(sql).toMatch(/storage\.buckets/);
expect(sql).toMatch(/avatars/);
expect(sql).toMatch(/public/);
// Verify it's marked as a public bucket (true)
const avatarsBlock = sql.match(
/insert\s+into\s+storage\.buckets[\s\S]*?avatars[\s\S]*?;/,
);
expect(avatarsBlock).not.toBeNull();
if (avatarsBlock) {
expect(avatarsBlock[0]).toMatch(/true/);
}
});
// The documents bucket must exist and be private (public = false).
test("creates documents bucket", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/documents/);
  const documentsBlock = sql.match(
    /insert\s+into\s+storage\.buckets[\s\S]*?documents[\s\S]*?;/,
  );
  expect(documentsBlock).not.toBeNull();
  if (documentsBlock !== null) {
    expect(documentsBlock[0]).toMatch(/false/);
  }
});

// Avatars must be restricted to the three allowed image MIME types.
test("avatars bucket has mime type restriction", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/allowed_mime_types/);
  for (const mimePattern of [/image\/jpeg/, /image\/png/, /image\/webp/]) {
    expect(sql).toMatch(mimePattern);
  }
});

// Avatars must carry a ~2MB size cap, in any common spelling.
test("avatars bucket has file size limit", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/file_size_limit/);
  // Accept raw bytes (2097152), a "2MB"-style string, or an arithmetic form.
  const numericForm = /2097152/.test(sql);
  const stringForm = /2\s*m/i.test(sql);
  const arithmeticForm = /2\s*\*\s*1024\s*\*\s*1024/.test(sql);
  expect(numericForm || stringForm || arithmeticForm).toBe(true);
});
// Folder-per-user isolation: policies should derive the folder segment via
// storage.foldername(name) and compare it against auth.uid() cast to text.
test("storage policy uses foldername or path for user isolation", () => {
  const sql = getMigrationSQL().toLowerCase();
  const usesFoldername = /storage\.foldername\s*\(\s*name\s*\)/.test(sql);
  // Alternatively accept a direct path comparison against the user id.
  const usesPathMatch =
    /\(\s*storage\.foldername\s*\(/.test(sql) ||
    /\bname\b.*auth\.uid\(\)/.test(sql);
  expect(usesFoldername || usesPathMatch).toBe(true);
  // auth.uid() yields a uuid; comparing it to a folder name needs ::text.
  expect(sql).toMatch(/auth\.uid\(\)\s*::\s*text/);
});
// Storage policies must name their target role explicitly via TO.
test("storage policy uses TO authenticated", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const storagePolicies = policyBlocks.filter((p) =>
    p.toLowerCase().includes("storage.objects"),
  );
  // At least one storage policy must declare an explicit role target —
  // TO authenticated or TO public (the check deliberately accepts both,
  // since read policies may legitimately target public).
  const hasAuthenticatedPolicy = storagePolicies.some((p) =>
    /to\s+(authenticated|public)/.test(p.toLowerCase()),
  );
  expect(hasAuthenticatedPolicy).toBe(true);
  // Upload (INSERT) policies specifically must be TO authenticated,
  // never public/anon.
  const insertPolicies = storagePolicies.filter((p) =>
    /for\s+insert/.test(p.toLowerCase()),
  );
  for (const insertPolicy of insertPolicies) {
    expect(insertPolicy.toLowerCase()).toMatch(/to\s+authenticated/);
  }
});
// Avatars are world-readable: a SELECT policy on storage.objects for the
// avatars bucket must grant access TO public or TO anon.
test("public read policy for avatars", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const avatarSelectPolicies = policyBlocks.filter((p) => {
    const lower = p.toLowerCase();
    return (
      lower.includes("storage.objects") &&
      /for\s+select/.test(lower) &&
      lower.includes("avatars")
    );
  });
  expect(avatarSelectPolicies.length).toBeGreaterThan(0);
  const hasPublicAccess = avatarSelectPolicies.some((p) => {
    const lower = p.toLowerCase();
    return /to\s+public/.test(lower) || /to\s+anon/.test(lower);
  });
  expect(hasPublicAccess).toBe(true);
});

// Documents are strictly private: no documents policy may grant public or
// anon access, and every one must be scoped TO authenticated.
test("documents bucket is fully private", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const documentPolicies = policyBlocks.filter((p) => {
    const lower = p.toLowerCase();
    return lower.includes("storage.objects") && lower.includes("documents");
  });
  expect(documentPolicies.length).toBeGreaterThan(0);
  // First pass: reject any public/anon grant.
  for (const policy of documentPolicies) {
    expect(policy).not.toMatch(/to\s+public/);
    expect(policy).not.toMatch(/to\s+anon/);
  }
  // Second pass: require the authenticated scope on every policy.
  for (const policy of documentPolicies) {
    expect(policy).toMatch(/to\s+authenticated/);
  }
});
// The file_metadata tracking table must be created.
test("creates file_metadata table", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+table/);
  expect(sql).toMatch(/file_metadata/);
});

// Rows must be tied to auth.users and removed when the user is deleted.
test("file_metadata has FK to auth.users with CASCADE", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/references\s+auth\.users/);
  expect(sql).toMatch(/on\s+delete\s+cascade/);
});

// RLS must be switched on for file_metadata.
test("RLS enabled on file_metadata", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(
    /alter\s+table.*file_metadata.*enable\s+row\s+level\s+security/,
  );
});

// Performance best practice: auth.uid() should appear as (select auth.uid())
// so Postgres evaluates it once per statement rather than once per row.
test("file_metadata policies use (select auth.uid())", () => {
  const rawSql = getMigrationSQL();
  const policyBlocks = rawSql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const metadataPolicies = policyBlocks.filter((p) =>
    p.toLowerCase().includes("file_metadata"),
  );
  for (const metadataPolicy of metadataPolicies) {
    if (metadataPolicy.includes("auth.uid()")) {
      expect(metadataPolicy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
    }
  }
});
// Time columns should use timestamptz, never a bare "timestamp".
test("uses timestamptz for time columns", () => {
  const sql = getMigrationSQL().toLowerCase();
  // A "timestamp" token not followed by "tz" or "with time zone" is bare.
  const plainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
  const definesTimeColumns =
    sql.includes("created_at") ||
    sql.includes("updated_at") ||
    sql.includes("uploaded_at");
  // Only enforce when the migration actually declares time columns.
  if (definesTimeColumns) {
    expect(sql).not.toMatch(plainTimestamp);
  }
});

// The user_id FK on file_metadata should be indexed for policy lookups.
test("index on file_metadata user_id", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+index/);
  expect(sql).toMatch(/file_metadata/);
  expect(sql).toMatch(/user_id/);
});

// Re-runnable DDL: the migration should guard with IF NOT EXISTS.
test("idempotent DDL", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/if\s+not\s+exists/);
});
// Aggregate rubric: the migration must exhibit at least 11 of 15
// best-practice signals to pass.
test("overall quality score", () => {
  const sql = getMigrationSQL().toLowerCase();
  const signals = [
    // 1. Avatars bucket row inserted into storage.buckets
    /insert\s+into\s+storage\.buckets[\s\S]*?avatars/,
    // 2. Documents bucket row inserted into storage.buckets
    /insert\s+into\s+storage\.buckets[\s\S]*?documents/,
    // 3. MIME type restriction configured
    /allowed_mime_types/,
    // 4. File size limit configured
    /file_size_limit/,
    // 5. storage.foldername helper used for folder isolation
    /storage\.foldername/,
    // 6. auth.uid() cast to text for folder comparison
    /auth\.uid\(\)\s*::\s*text/,
    // 7. Policies scoped TO authenticated
    /to\s+authenticated/,
    // 8. Public read access granted (TO public or TO anon)
    /to\s+(public|anon)/,
    // 9. Row level security enabled somewhere (expected on file_metadata)
    /enable\s+row\s+level\s+security/,
    // 10. FK to auth.users with cascade delete
    /on\s+delete\s+cascade/,
    // 11. (select auth.uid()) subselect form for per-statement caching
    /\(select\s+auth\.uid\(\)\)/,
    // 12. Index created (expected on user_id)
    /create\s+index/,
    // 13. timestamptz used for time columns
    /timestamptz/,
    // 14. IF NOT EXISTS guards for idempotency
    /if\s+not\s+exists/,
    // 15. file_metadata table created
    /create\s+table[\s\S]*?file_metadata/,
  ];
  const matchCount = signals.filter((signal) => signal.test(sql)).length;
  expect(matchCount).toBeGreaterThanOrEqual(11);
});

View File

@@ -0,0 +1,12 @@
I need to set up file storage for my app. There are two use cases:
1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but only the owning user can upload or replace their own. Only allow image files (JPEG, PNG, WebP). Max 2MB.
2. **Documents** -- Users upload private documents that only they can access. Max 50MB. No file type restriction.
The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration that:
- Configures both storage buckets
- Adds RLS policies on `storage.objects` so each user can only access their own folder (folder name = user ID)
- Creates a `file_metadata` table to track uploaded files (file name, bucket, size, user reference) with appropriate security
Users are authenticated via Supabase Auth.

View File

@@ -0,0 +1,5 @@
{
"name": "storage-rls-user-folders",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,64 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "storage-rls-user-folders"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned by a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false

View File

@@ -0,0 +1,201 @@
import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { expect, test } from "vitest";
const supabaseDir = join(process.cwd(), "supabase");
const migrationsDir = join(supabaseDir, "migrations");

/**
 * Collect absolute paths for all .sql files in supabase/migrations/.
 * An absent directory yields an empty array.
 */
function findMigrationFiles(): string[] {
  if (!existsSync(migrationsDir)) {
    return [];
  }
  const found: string[] = [];
  for (const name of readdirSync(migrationsDir)) {
    if (name.endsWith(".sql")) {
      found.push(join(migrationsDir, name));
    }
  }
  return found;
}

/**
 * Concatenate the contents of every migration file for assertions.
 * @throws Error when no migration file has been created.
 */
function getMigrationSQL(): string {
  const files = findMigrationFiles();
  if (files.length === 0) {
    throw new Error("No migration file found in supabase/migrations/");
  }
  const parts: string[] = [];
  for (const file of files) {
    parts.push(readFileSync(file, "utf-8"));
  }
  return parts.join("\n");
}

// The agent must have produced at least one migration file.
test("migration file exists", () => {
  const migrationFiles = findMigrationFiles();
  expect(migrationFiles.length).toBeGreaterThan(0);
});
test("creates organizations table", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?organizations/);
});
test("creates memberships table", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?memberships/);
});
test("creates projects table", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(/create\s+table[\s\S]*?projects/);
});
test("enables RLS on all tables", () => {
const sql = getMigrationSQL().toLowerCase();
expect(sql).toMatch(
/alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/,
);
expect(sql).toMatch(
/alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/,
);
expect(sql).toMatch(
/alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/,
);
});
// memberships must reference auth.users and cascade on user deletion,
// satisfying "handle the case where a user is deleted from auth".
test("FK to auth.users with ON DELETE CASCADE", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/references\s+auth\.users/);
  expect(sql).toMatch(/on\s+delete\s+cascade/);
});

// projects must carry a foreign key back to organizations.
test("org_id FK on projects", () => {
  const sql = getMigrationSQL().toLowerCase();
  // Fixed: the previous pattern used a character class ([anization_]*),
  // which matched any scramble of those letters (e.g. "organoid...id").
  // Match the two real column spellings: org_id or organization_id.
  expect(sql).toMatch(
    /org(?:anization)?_id[\s\S]*?references[\s\S]*?organizations/,
  );
});
// A dedicated private schema should exist to host helper functions.
test("private schema created", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+schema[\s\S]*?private/);
});

// Membership lookups should go through a SECURITY DEFINER helper in the
// private schema, with an empty search_path to prevent hijacking.
test("security_definer helper function", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/private\./);
  expect(sql).toMatch(/security\s+definer/);
  expect(sql).toMatch(/set\s+search_path\s*=\s*''/);
});

// Policies should wrap auth.uid() in a subselect so Postgres evaluates it
// once per statement instead of once per row.
test("policies use (select auth.uid())", () => {
  const rawSql = getMigrationSQL();
  const policyBlocks = rawSql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  expect(policyBlocks.length).toBeGreaterThan(0);
  for (const policy of policyBlocks) {
    if (policy.includes("auth.uid()")) {
      expect(policy).toMatch(/\(\s*select\s+auth\.uid\(\)\s*\)/i);
    }
  }
});

// Every policy must name its role target explicitly.
test("policies use TO authenticated", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  expect(policyBlocks.length).toBeGreaterThan(0);
  for (const policy of policyBlocks) {
    expect(policy).toMatch(/to\s+authenticated/);
  }
});
// memberships.user_id / org_id must be indexed so policy subqueries are fast.
test("index on membership lookup columns", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/create\s+index/);
  const indexBlocks = sql.match(/create\s+index[\s\S]*?;/gi) ?? [];
  const lookupIndexes = indexBlocks.filter(
    (idx) =>
      idx.includes("user_id") ||
      idx.includes("org_id") ||
      idx.includes("organization_id"),
  );
  expect(lookupIndexes.length).toBeGreaterThanOrEqual(1);
});

// Bare "timestamp" columns are rejected when time columns are declared.
test("uses timestamptz", () => {
  const sql = getMigrationSQL().toLowerCase();
  // A "timestamp" token not followed by "tz" or "with time zone" is bare.
  const plainTimestamp = /\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/;
  const hasTimeColumns =
    sql.includes("created_at") ||
    sql.includes("updated_at") ||
    sql.includes("_at ");
  if (hasTimeColumns) {
    expect(sql).not.toMatch(plainTimestamp);
  }
});

// DDL should be guarded with IF NOT EXISTS so the migration is re-runnable.
test("idempotent DDL", () => {
  const sql = getMigrationSQL().toLowerCase();
  expect(sql).toMatch(/if\s+not\s+exists/);
});
// Only org owners may delete projects: the DELETE policy on projects has
// to gate on an owner (or admin) role from memberships.
test("delete policy restricted to owner role", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  const deletePolicy = policyBlocks.find((p) => {
    const lower = p.toLowerCase();
    return lower.includes("delete") && lower.includes("project");
  });
  expect(deletePolicy).toBeDefined();
  expect(deletePolicy?.toLowerCase()).toMatch(/owner|admin/);
});
// Aggregate rubric: the migration must exhibit at least 10 of 14
// best-practice signals to pass.
test("overall quality score", () => {
  const sql = getMigrationSQL().toLowerCase();
  const policyBlocks = sql.match(/create\s+policy[\s\S]*?;/gi) ?? [];
  // Each entry is a boolean signal; most should hold for a quality migration.
  const signals = [
    // 1. RLS enabled on all three tables
    /alter\s+table[\s\S]*?organizations[\s\S]*?enable\s+row\s+level\s+security/.test(
      sql,
    ) &&
      /alter\s+table[\s\S]*?memberships[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ) &&
      /alter\s+table[\s\S]*?projects[\s\S]*?enable\s+row\s+level\s+security/.test(
        sql,
      ),
    // 2. FK to auth.users with cascade delete
    /references\s+auth\.users/.test(sql) && /on\s+delete\s+cascade/.test(sql),
    // 3. Private schema created
    /create\s+schema[\s\S]*?private/.test(sql),
    // 4. SECURITY DEFINER helper with pinned empty search_path
    /security\s+definer/.test(sql) && /set\s+search_path\s*=\s*''/.test(sql),
    // 5. Subselect form of auth.uid() for per-statement caching
    /\(\s*select\s+auth\.uid\(\)\s*\)/.test(sql),
    // 6. Every policy scoped TO authenticated
    policyBlocks.length > 0 &&
      policyBlocks.every((p) => /to\s+authenticated/.test(p)),
    // 7. Indexes on lookup columns
    /create\s+index/.test(sql),
    // 8. timestamptz only (no bare "timestamp" columns)
    !/\btimestamp\b(?!\s*tz)(?!\s+with\s+time\s+zone)/.test(sql),
    // 9. Idempotent DDL via IF NOT EXISTS
    /if\s+not\s+exists/.test(sql),
    // 10. DELETE policy on projects gated on owner/admin role
    policyBlocks.some(
      (p) =>
        p.toLowerCase().includes("delete") &&
        p.toLowerCase().includes("project") &&
        /owner|admin/.test(p.toLowerCase()),
    ),
    // 11. org FK on projects referencing organizations.
    // Fixed: the old pattern's character class ([anization_]*) matched
    // arbitrary scrambles of those letters; match the two real spellings.
    /org(?:anization)?_id[\s\S]*?references[\s\S]*?organizations/.test(sql),
    // 12. At least one policy per table
    policyBlocks.length >= 3,
    // 13. Membership role column present
    /role/.test(sql),
    // 14. Private-schema helper referenced (e.g. from policies)
    /private\./.test(sql),
  ];
  const passed = signals.filter(Boolean).length;
  expect(passed).toBeGreaterThanOrEqual(10);
});

View File

@@ -0,0 +1,14 @@
I'm building a project management app where users can belong to multiple organizations. Each organization has projects that all members can view and edit.
The Supabase project is already initialized in the `supabase/` directory. Create a SQL migration with:
1. An `organizations` table (name, slug)
2. A `memberships` table linking users to organizations with a role column (owner, admin, member)
3. A `projects` table (name, description, status) belonging to an organization
Set up Row Level Security so:
- Users can only see organizations they belong to
- Users can only see and manage projects in their organizations
- Only org owners can delete projects
The migration should handle the case where a user is deleted from auth.

View File

@@ -0,0 +1,5 @@
{
"name": "team-rls-security-definer",
"private": true,
"type": "module"
}

View File

@@ -0,0 +1,111 @@
# For detailed configuration reference documentation, visit:
# https://supabase.com/docs/guides/local-development/cli/config
# A string used to distinguish different Supabase projects on the same host. Defaults to the
# working directory name when running `supabase init`.
project_id = "team-rls-security-definer"
[api]
enabled = true
# Port to use for the API URL.
port = 54321
# Schemas to expose in your API. Tables, views and stored procedures in this schema will get API
# endpoints. `public` and `graphql_public` schemas are included by default.
schemas = ["public", "graphql_public"]
# Extra schemas to add to the search_path of every request.
extra_search_path = ["public", "extensions"]
# The maximum number of rows returned by a view, table, or stored procedure. Limits payload size
# for accidental or malicious requests.
max_rows = 1000
[db]
# Port to use for the local database URL.
port = 54322
# Port used by db diff command to initialize the shadow database.
shadow_port = 54320
# The database major version to use. This has to be the same as your remote database's. Run `SHOW
# server_version;` on the remote database to check.
major_version = 17
[db.pooler]
enabled = false
# Port to use for the local connection pooler.
port = 54329
# Specifies when a server connection can be reused by other clients.
# Configure one of the supported pooler modes: `transaction`, `session`.
pool_mode = "transaction"
# How many server connections to allow per user/database pair.
default_pool_size = 20
# Maximum number of client connections allowed.
max_client_conn = 100
[db.migrations]
# If disabled, migrations will be skipped during a db push or reset.
enabled = true
schema_paths = []
[db.seed]
# If enabled, seeds the database after migrations during a db reset.
enabled = true
# Specifies an ordered list of seed files to load during db reset.
sql_paths = ["./seed.sql"]
[realtime]
enabled = true
[studio]
enabled = true
# Port to use for Supabase Studio.
port = 54323
# External URL of the API server that frontend connects to.
api_url = "http://127.0.0.1"
[inbucket]
enabled = true
# Port to use for the email testing server web interface.
port = 54324
[storage]
enabled = true
# The maximum file size allowed (e.g. "5MB", "500KB").
file_size_limit = "50MiB"
[auth]
enabled = true
# The base URL of your website. Used as an allow-list for redirects and for constructing URLs used
# in emails.
site_url = "http://127.0.0.1:3000"
# A list of *exact* URLs that auth providers are permitted to redirect to post authentication.
additional_redirect_urls = ["https://127.0.0.1:3000"]
# How long tokens are valid for, in seconds. Defaults to 3600 (1 hour), maximum 604,800 (1 week).
jwt_expiry = 3600
# If disabled, the refresh token will never expire.
enable_refresh_token_rotation = true
# Allows refresh tokens to be reused after expiry, up to the specified interval in seconds.
# Requires enable_refresh_token_rotation = true.
refresh_token_reuse_interval = 10
# Allow/disallow new user signups to your project.
enable_signup = true
# Allow/disallow anonymous sign-ins to your project.
enable_anonymous_sign_ins = false
[auth.email]
# Allow/disallow new user signups via email to your project.
enable_signup = true
# If enabled, a user will be required to confirm any email change on both the old, and new email
# addresses. If disabled, only the new email is required to confirm.
double_confirm_changes = true
# If enabled, users need to confirm their email address before signing in.
enable_confirmations = false
[edge_runtime]
enabled = true
# Configure one of the supported request policies: `oneshot`, `per_worker`.
policy = "per_worker"
# Port to attach the Chrome inspector for debugging edge functions.
inspector_port = 8083
[analytics]
enabled = true
port = 54327
# Configure one of the supported backends: `postgres`, `bigquery`.
backend = "postgres"

View File

@@ -9,6 +9,7 @@
"version": "1.0.0",
"license": "MIT",
"dependencies": {
"@anthropic-ai/claude-code": "^2.1.49",
"braintrust": "^3.0.0"
},
"devDependencies": {
@@ -18,6 +19,29 @@
"vitest": "^3.1.0"
}
},
"node_modules/@anthropic-ai/claude-code": {
"version": "2.1.49",
"resolved": "https://registry.npmjs.org/@anthropic-ai/claude-code/-/claude-code-2.1.49.tgz",
"integrity": "sha512-PonEmTZlB5IZbBu9TmtOpGZnupU7OxOXTsJKcXE/4Ak5qp3ptN1wSBRdgKYnn6GDYhXijTXuVVwrCQU+NAgwPA==",
"license": "SEE LICENSE IN README.md",
"bin": {
"claude": "cli.js"
},
"engines": {
"node": ">=18.0.0"
},
"optionalDependencies": {
"@img/sharp-darwin-arm64": "^0.34.2",
"@img/sharp-darwin-x64": "^0.34.2",
"@img/sharp-linux-arm": "^0.34.2",
"@img/sharp-linux-arm64": "^0.34.2",
"@img/sharp-linux-x64": "^0.34.2",
"@img/sharp-linuxmusl-arm64": "^0.34.2",
"@img/sharp-linuxmusl-x64": "^0.34.2",
"@img/sharp-win32-arm64": "^0.34.2",
"@img/sharp-win32-x64": "^0.34.2"
}
},
"node_modules/@colors/colors": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
@@ -444,6 +468,310 @@
"node": ">=18"
}
},
"node_modules/@img/sharp-darwin-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz",
"integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-darwin-arm64": "1.2.4"
}
},
"node_modules/@img/sharp-darwin-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz",
"integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-darwin-x64": "1.2.4"
}
},
"node_modules/@img/sharp-libvips-darwin-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz",
"integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"darwin"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-darwin-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz",
"integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==",
"cpu": [
"x64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"darwin"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-arm": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz",
"integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==",
"cpu": [
"arm"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz",
"integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz",
"integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==",
"cpu": [
"x64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linuxmusl-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz",
"integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linuxmusl-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz",
"integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==",
"cpu": [
"x64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-linux-arm": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz",
"integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==",
"cpu": [
"arm"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-arm": "1.2.4"
}
},
"node_modules/@img/sharp-linux-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz",
"integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-arm64": "1.2.4"
}
},
"node_modules/@img/sharp-linux-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz",
"integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-x64": "1.2.4"
}
},
"node_modules/@img/sharp-linuxmusl-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz",
"integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linuxmusl-arm64": "1.2.4"
}
},
"node_modules/@img/sharp-linuxmusl-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz",
"integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==",
"cpu": [
"x64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linuxmusl-x64": "1.2.4"
}
},
"node_modules/@img/sharp-win32-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz",
"integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==",
"cpu": [
"arm64"
],
"license": "Apache-2.0 AND LGPL-3.0-or-later",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-win32-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz",
"integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==",
"cpu": [
"x64"
],
"license": "Apache-2.0 AND LGPL-3.0-or-later",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@jridgewell/sourcemap-codec": {
"version": "1.5.5",
"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",

View File

@@ -10,6 +10,7 @@
"eval:upload": "BRAINTRUST_UPLOAD=true tsx src/runner.ts"
},
"dependencies": {
"@anthropic-ai/claude-code": "^2.1.49",
"braintrust": "^3.0.0"
},
"devDependencies": {

View File

@@ -49,3 +49,121 @@ The agent initializes a Supabase project and creates a migration file that:
| index on user_id | `CREATE INDEX` on the FK column |
| IF NOT EXISTS | Idempotent migration |
| overall quality | At least 4/5 best-practice signals present |
## Scenario 2: team-rls-security-definer
**Description:** Create a SQL migration for a team-based project management app
where users belong to organizations via a membership table. The migration must
define tables for organizations, memberships, and projects, then secure them
with RLS policies that use a `security definer` helper function in a private
schema to efficiently resolve team membership without per-row joins.
**Setup:** The workspace starts with a pre-initialized Supabase project
(`supabase/config.toml` exists, empty `supabase/migrations/` directory). The
agent creates migration files within this structure.
**Expected skill files read:**
- `SKILL.md` (skill body with reference file index)
- `references/db-rls-mandatory.md`
- `references/db-rls-policy-types.md`
- `references/db-rls-common-mistakes.md`
- `references/db-rls-performance.md`
- `references/db-security-functions.md`
- `references/db-schema-auth-fk.md`
- `references/db-schema-timestamps.md`
- `references/db-perf-indexes.md`
- `references/db-migrations-idempotent.md`
**Expected result:**
The agent creates a migration file that:
- Creates organizations, memberships, and projects tables with `timestamptz` columns
- Has `user_id` FK to `auth.users(id)` with `ON DELETE CASCADE` on memberships
- Has `org_id` FK on projects referencing organizations
- Enables RLS on all three tables
- Creates a private schema with a `security definer` helper function (`SET search_path = ''`)
- Creates RLS policies using `(select auth.uid())` with `TO authenticated`
- Creates indexes on membership lookup columns (user_id, org_id)
- Has a delete policy on projects restricted to owner role
- Uses `IF NOT EXISTS` for idempotency
**Scorer:** Binary pass/fail (16 vitest assertions)
| Test | What it checks |
| --- | --- |
| migration file exists | A `.sql` file exists in `supabase/migrations/` |
| creates organizations table | SQL contains `CREATE TABLE` for organizations |
| creates memberships table | SQL contains `CREATE TABLE` for memberships |
| creates projects table | SQL contains `CREATE TABLE` for projects |
| enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables |
| FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade |
| org_id FK on projects | projects references organizations |
| private schema created | `CREATE SCHEMA ... private` present |
| security_definer helper function | Function in private schema with `SECURITY DEFINER` and `SET search_path = ''` |
| policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() |
| policies use TO authenticated | All policies scoped to authenticated role |
| index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships |
| uses timestamptz | No plain `timestamp` for time columns |
| idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns |
| delete policy restricted to owner role | A delete policy on projects checks for owner/admin role |
| overall quality score | At least 10/14 best-practice signals present |
## Scenario 3: storage-rls-user-folders
**Description:** Create a SQL migration that sets up Supabase Storage buckets
with RLS policies for user content: an avatars bucket (public reads,
authenticated uploads restricted to user folders) and a documents bucket (fully
private, user-isolated), with file type restrictions, storage helper functions
in policies, and a file_metadata tracking table secured with RLS.
**Setup:** Pre-initialized Supabase project (`supabase/config.toml` exists)
with an empty `supabase/migrations/` directory. The agent creates migration
files within this structure.
**Expected skill files read:**
- `SKILL.md` (skill body with reference file index)
- `references/storage-access-control.md`
- `references/db-rls-mandatory.md`
- `references/db-rls-common-mistakes.md`
- `references/db-rls-performance.md`
- `references/db-schema-auth-fk.md`
- `references/db-schema-timestamps.md`
- `references/db-perf-indexes.md`
- `references/db-migrations-idempotent.md`
**Expected result:**
The agent creates a migration file that:
- Inserts avatars bucket into `storage.buckets` with `public = true`, MIME type restrictions, and file size limit
- Inserts documents bucket with `public = false`
- Creates RLS policies on `storage.objects` using `storage.foldername(name)` with `auth.uid()::text`
- Scopes upload policies `TO authenticated` and avatars SELECT policy `TO public`
- Creates `file_metadata` table with FK to `auth.users` with `ON DELETE CASCADE`
- Enables RLS on `file_metadata` with policies using `(select auth.uid())`
- Uses `timestamptz` for time columns, indexes `user_id`, and `IF NOT EXISTS` for idempotency
**Scorer:** Binary pass/fail (17 vitest assertions)
| Test | What it checks |
| --- | --- |
| migration file exists | A `.sql` file exists in `supabase/migrations/` |
| creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` |
| creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` |
| avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) |
| avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) |
| storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` |
| storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` |
| public read policy for avatars | A SELECT policy on storage.objects for avatars allows public/anon access |
| documents bucket is fully private | Policies for documents restrict all operations to authenticated owner |
| creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata |
| file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` |
| RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` |
| file_metadata policies use (select auth.uid()) | Subselect form in policies |
| uses timestamptz for time columns | No plain `timestamp` in file_metadata |
| index on file_metadata user_id | `CREATE INDEX` on user_id column |
| idempotent DDL | Uses `IF NOT EXISTS` patterns |
| overall quality score | At least 11/15 best-practice signals present |

View File

@@ -0,0 +1,144 @@
# Scenario: storage-rls-user-folders
## Summary
The agent must create a SQL migration that sets up Supabase Storage buckets
with RLS policies for a user-content application. The migration must configure
an avatars bucket (public reads, authenticated uploads restricted to user
folders) and a documents bucket (fully private, user-isolated), with proper
file type restrictions, storage helper functions in policies, and a
file_metadata tracking table secured with RLS.
## Real-World Justification
Why this is a common and important workflow:
1. **Storage RLS is confusing and under-documented compared to table RLS** --
Developers consistently struggle with the distinction between public/private
buckets and the RLS policies needed on `storage.objects`. Multiple GitHub
discussions show confusion about which SDK operations map to which SQL
operations (INSERT, SELECT, UPDATE, DELETE).
- Source: https://github.com/orgs/supabase/discussions/37611
- Source: https://github.com/orgs/supabase/discussions/38700
2. **User-folder isolation is the canonical storage security pattern** -- The
official Supabase docs demonstrate folder-based isolation using
`storage.foldername(name)` and `auth.uid()::text`, but developers frequently
get the casting or array indexing wrong.
- Source: https://supabase.com/docs/guides/storage/security/access-control
3. **Missing file type restrictions leads to security vulnerabilities** --
Without `allowed_mime_types` on the bucket or extension checks in RLS
policies, users can upload executable files or oversized payloads. The
Supabase security best practices guide calls this out as a common oversight.
- Source: https://supaexplorer.com/guides/supabase-security-best-practices
- Source: https://supabase.com/docs/guides/storage/buckets/fundamentals
## Skill References Exercised
Which reference files the agent should consult and what each teaches:
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/storage-access-control.md` | Bucket visibility, RLS on storage.objects, storage helper functions, SDK-to-SQL operation mapping | User-folder policies using `storage.foldername()`, separate SELECT/INSERT policies |
| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on the file_metadata tracking table |
| `references/db-rls-common-mistakes.md` | Missing TO clause, missing SELECT policy for UPDATE | Use `TO authenticated` (or `TO public` for public reads), include SELECT policy |
| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT subquery | Use `(select auth.uid())` in both storage and table policies |
| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | file_metadata.user_id references auth.users with cascade |
| `references/db-schema-timestamps.md` | Use timestamptz not timestamp | Time columns on file_metadata use timestamptz |
| `references/db-perf-indexes.md` | Index columns used in policy lookups | Index user_id on file_metadata |
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout |
## Workspace Setup
What the workspace starts with before the agent runs:
- Pre-initialized Supabase project (`supabase/config.toml` exists)
- Empty `supabase/migrations/` directory
- The agent creates migration files within this structure
## Agent Task (PROMPT.md draft)
The prompt to give the agent. Written as a developer would ask it:
> I need to set up file storage for my app. There are two use cases:
>
> 1. **Avatars** -- Users upload a profile picture. Anyone can view avatars but
> only the owning user can upload or replace their own. Only allow image
> files (JPEG, PNG, WebP). Max 2MB.
>
> 2. **Documents** -- Users upload private documents that only they can access.
> Max 50MB. No file type restriction.
>
> Create a SQL migration that:
> - Configures both storage buckets
> - Adds RLS policies on `storage.objects` so each user can only access their
> own folder (folder name = user ID)
> - Creates a `file_metadata` table to track uploaded files (file name, bucket,
> size, user reference) with appropriate security
>
> Users are authenticated via Supabase Auth.
## Evaluation Criteria
What vitest should assert on the agent's output. Each assertion tests a
specific quality signal:
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
| 2 | creates avatars bucket | SQL inserts into `storage.buckets` with id 'avatars' and `public = true` | correctness |
| 3 | creates documents bucket | SQL inserts into `storage.buckets` with id 'documents' and `public = false` | correctness |
| 4 | avatars bucket has mime type restriction | `allowed_mime_types` includes image types (jpeg, png, webp) | security |
| 5 | avatars bucket has file size limit | `file_size_limit` set (around 2MB / 2097152 bytes) | security |
| 6 | storage policy uses foldername or path for user isolation | Policy references `storage.foldername(name)` with `auth.uid()::text` | security |
| 7 | storage policy uses TO authenticated | Storage upload/delete policies scoped to `TO authenticated` | security |
| 8 | public read policy for avatars | A SELECT policy on storage.objects for avatars bucket allows public/anon access | correctness |
| 9 | documents bucket is fully private | Policies for documents bucket restrict all operations to authenticated owner | security |
| 10 | creates file_metadata table | SQL contains `CREATE TABLE` for file_metadata | correctness |
| 11 | file_metadata has FK to auth.users with CASCADE | `REFERENCES auth.users` with `ON DELETE CASCADE` | correctness |
| 12 | RLS enabled on file_metadata | `ALTER TABLE file_metadata ENABLE ROW LEVEL SECURITY` | security |
| 13 | file_metadata policies use (select auth.uid()) | Subselect form in policies | performance |
| 14 | uses timestamptz for time columns | No plain `timestamp` in file_metadata | correctness |
| 15 | index on file_metadata user_id | `CREATE INDEX` on user_id column | performance |
| 16 | idempotent DDL | Uses `IF NOT EXISTS` patterns | idempotency |
| 17 | overall quality score | At least 11/15 best-practice signals present | overall |
## Reasoning
Step-by-step reasoning for why this scenario is well-designed:
1. **Baseline differentiator:** An agent without the skill would likely: (a)
confuse public bucket visibility with unrestricted upload access, (b) write
storage policies without using `storage.foldername()` or get the array
indexing wrong, (c) forget to set `allowed_mime_types` on the bucket itself,
(d) omit the `TO authenticated` clause on storage policies, (e) use bare
`auth.uid()` instead of the subselect form, (f) skip the `::text` cast when
comparing auth.uid() to folder names. These are all Supabase-specific
patterns that require reading the skill references.
2. **Skill value:** The storage-access-control reference explicitly documents:
the public vs private bucket distinction, the `storage.foldername()` helper
function pattern, the SDK-to-SQL operation mapping, and bucket configuration
with mime types and size limits. Combined with the database security
references (RLS mandatory, common mistakes, performance), this scenario
exercises 8 reference files.
3. **Testability:** Bucket configuration (INSERT INTO storage.buckets), storage
helper function usage (storage.foldername), policy clauses (TO
authenticated, TO public), mime types, file size limits, and all table-level
patterns (RLS, FK, indexes, timestamptz) are reliably detectable via regex
on SQL text.
4. **Realism:** Nearly every Supabase application that handles user-generated
content needs avatar uploads and document storage. This is a day-one task
for any SaaS product. The GitHub discussions linked above show dozens of
developers hitting exactly these issues when setting up storage for the
first time.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~30-45% of assertions expected to pass
- With skill: ~85-95% of assertions expected to pass

View File

@@ -0,0 +1,139 @@
# Scenario: team-rls-security-definer
## Summary
The agent must create a SQL migration for a team-based project management app
where users belong to organizations via a membership table. The migration must
define tables for organizations, memberships, and projects, then secure them
with RLS policies that use a `security definer` helper function in a private
schema to efficiently resolve team membership without per-row joins.
## Real-World Justification
Why this is a common and important workflow:
1. **Multi-tenant team access is the most-asked RLS question on Supabase** --
The official Supabase GitHub has multiple high-engagement discussions about
how to write RLS policies that check team/org membership without causing
performance issues or security holes.
- Source: https://github.com/supabase/supabase/discussions/4509
- Source: https://github.com/supabase/supabase/discussions/811
2. **security_definer in public schema is a documented security anti-pattern** --
Developers frequently place security_definer functions in the public schema,
inadvertently exposing them via the PostgREST API. The Supabase docs and
community discussions explicitly warn against this.
- Source: https://github.com/supabase/supabase/discussions/3269
- Source: https://supabase.com/docs/guides/database/postgres/row-level-security
3. **RLS policy performance with joins is a top pain point** -- Naive policies
that join against a memberships table execute per-row, causing severe
performance degradation. The recommended pattern is a security_definer
function that caches results via subselect.
- Source: https://github.com/orgs/supabase/discussions/1148
- Source: https://makerkit.dev/blog/tutorials/supabase-rls-best-practices
## Skill References Exercised
Which reference files the agent should consult and what each teaches:
| Reference File | What It Teaches | What the Agent Should Apply |
|---|---|---|
| `references/db-rls-mandatory.md` | RLS must be enabled on all public tables | Enable RLS on organizations, memberships, and projects |
| `references/db-rls-policy-types.md` | PERMISSIVE vs RESTRICTIVE policies | Use PERMISSIVE policies for team OR owner access patterns |
| `references/db-rls-common-mistakes.md` | Missing TO clause, user_metadata pitfalls | Always use `TO authenticated` on all policies |
| `references/db-rls-performance.md` | Wrap auth.uid() in SELECT, use security_definer for joins | Use `(select auth.uid())` and a private-schema helper function |
| `references/db-security-functions.md` | security_definer in private schema with search_path = '' | Create helper function in private schema, revoke default permissions |
| `references/db-schema-auth-fk.md` | FK to auth.users with ON DELETE CASCADE | Reference auth.users with cascade on memberships |
| `references/db-schema-timestamps.md` | Use timestamptz not timestamp | All time columns use timestamptz |
| `references/db-perf-indexes.md` | Index columns used in RLS policies | Index user_id and org_id columns used in policy lookups |
| `references/db-migrations-idempotent.md` | IF NOT EXISTS for safe reruns | Idempotent DDL throughout the migration |
## Workspace Setup
What the workspace starts with before the agent runs:
- Pre-initialized Supabase project (`supabase/config.toml` exists)
- Empty `supabase/migrations/` directory
- The agent creates migration files within this structure
## Agent Task (PROMPT.md draft)
The prompt to give the agent. Written as a developer would ask it:
> I'm building a project management app where users can belong to multiple
> organizations. Each organization has projects that all members can view and
> edit.
>
> Create a SQL migration with:
>
> 1. An `organizations` table (name, slug)
> 2. A `memberships` table linking users to organizations with a role column
> (owner, admin, member)
> 3. A `projects` table (name, description, status) belonging to an organization
>
> Set up Row Level Security so:
> - Users can only see organizations they belong to
> - Users can only see and manage projects in their organizations
> - Only org owners can delete projects
>
> The migration should handle the case where a user is deleted from auth.
## Evaluation Criteria
What vitest should assert on the agent's output. Each assertion tests a
specific quality signal:
| # | Test Name | What It Checks | Quality Dimension |
|---|-----------|----------------|-------------------|
| 1 | migration file exists | A `.sql` file exists in `supabase/migrations/` | structure |
| 2 | creates organizations table | SQL contains `CREATE TABLE` for organizations | correctness |
| 3 | creates memberships table | SQL contains `CREATE TABLE` for memberships | correctness |
| 4 | creates projects table | SQL contains `CREATE TABLE` for projects | correctness |
| 5 | enables RLS on all tables | `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` for all three tables | security |
| 6 | FK to auth.users with ON DELETE CASCADE | memberships references `auth.users` with cascade | correctness |
| 7 | org_id FK on projects | projects references organizations | correctness |
| 8 | private schema created | `CREATE SCHEMA ... private` present | security |
| 9 | security_definer helper function | A function in the private schema with `SECURITY DEFINER` and `SET search_path = ''` | security |
| 10 | policies use (select auth.uid()) | Subselect form in all policies referencing auth.uid() | performance |
| 11 | policies use TO authenticated | All policies scoped to authenticated role | security |
| 12 | index on membership lookup columns | `CREATE INDEX` on user_id and/or org_id in memberships | performance |
| 13 | uses timestamptz | No plain `timestamp` for time columns | correctness |
| 14 | idempotent DDL | Uses `IF NOT EXISTS` or `DROP ... IF EXISTS` patterns | idempotency |
| 15 | delete policy restricted to owner role | A delete policy on projects checks for owner/admin role | security |
| 16 | overall quality score | At least 10/14 best-practice signals present | overall |
## Reasoning
Step-by-step reasoning for why this scenario is well-designed:
1. **Baseline differentiator:** An agent without the skill would likely put the
security_definer function in the public schema, omit `SET search_path = ''`,
use bare `auth.uid()` instead of the subselect form, write inline joins in
policies instead of using a helper function, and possibly forget `TO
authenticated` on some policies. These are all patterns that require specific
knowledge of Supabase conventions.
2. **Skill value:** The skill explicitly teaches: (a) private schema for
security_definer functions, (b) `SET search_path = ''` to prevent injection,
(c) `(select auth.uid())` for per-statement caching, (d) using
security_definer functions to avoid per-row joins in policies, (e) `TO
authenticated` on every policy. This is a scenario where reading 5+ reference
files materially improves the output.
3. **Testability:** Every assertion checks for specific SQL patterns via regex.
The private schema, security_definer, search_path, subselect auth.uid(), TO
authenticated, indexes, and timestamptz are all reliably detectable in SQL
text without runtime execution.
4. **Realism:** Multi-tenant team-based access control is one of the most common
Supabase use cases. The GitHub discussions linked above have hundreds of
comments from developers working on exactly this pattern. Project management
apps (Notion, Linear, Asana clones) are a canonical example.
## Difficulty
**Rating:** MEDIUM
- Without skill: ~35-50% of assertions expected to pass
- With skill: ~85-95% of assertions expected to pass

View File

@@ -2,10 +2,12 @@ import { existsSync, readdirSync, readFileSync } from "node:fs";
import { join, resolve } from "node:path";
import { runAgent } from "./runner/agent.js";
import { uploadToBraintrust } from "./runner/braintrust.js";
import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js";
import { createWorkspace } from "./runner/scaffold.js";
import { runTests } from "./runner/test.js";
import { buildTranscriptSummary } from "./runner/transcript.js";
import type { EvalRunResult, EvalScenario } from "./types.js";
// ---------------------------------------------------------------------------
@@ -19,6 +21,12 @@ const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
const scenarioFilter = process.env.EVAL_SCENARIO;
const runBaseline = process.env.EVAL_BASELINE === "true";
// Run-level timestamp shared across all scenarios in a single invocation
const runTimestamp = new Date()
.toISOString()
.replace(/[:.]/g, "-")
.replace("Z", "");
// ---------------------------------------------------------------------------
// Discover scenarios
// ---------------------------------------------------------------------------
@@ -58,10 +66,9 @@ async function runEval(
): Promise<EvalRunResult> {
const evalsDir = findEvalsDir();
const evalDir = join(evalsDir, scenario.id);
const variant = skillEnabled ? "with-skill" : "baseline";
console.log(
`\n--- ${scenario.id} (${skillEnabled ? "with-skill" : "baseline"}) ---`,
);
console.log(`\n--- ${scenario.id} (${variant}) ---`);
// 1. Create isolated workspace
const { workspacePath, cleanup } = createWorkspace({
@@ -104,7 +111,10 @@ async function runEval(
// 5. Collect modified files
const filesModified = listModifiedFiles(workspacePath, evalDir);
return {
// 6. Build transcript summary
const summary = buildTranscriptSummary(agentResult.events);
const result: EvalRunResult = {
scenario: scenario.id,
agent: "claude-code",
model,
@@ -116,7 +126,22 @@ async function runEval(
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
filesModified,
toolCallCount: summary.toolCalls.length,
costUsd: summary.totalCostUsd ?? undefined,
};
// 7. Persist results
const resultDir = createResultDir(runTimestamp, scenario.id, variant);
result.resultsDir = resultDir;
saveRunArtifacts({
resultDir,
rawTranscript: agentResult.rawTranscript,
testOutput: testResult.output,
result,
transcriptSummary: summary,
});
return result;
} catch (error) {
const err = error as Error;
return {
@@ -175,7 +200,9 @@ async function main() {
}
}
printSummary(results);
// Use the results dir from the first result (all share the same timestamp)
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;
printSummary(results, resultsDir);
if (process.env.BRAINTRUST_UPLOAD === "true") {
console.log("\nUploading to Braintrust...");

View File

@@ -1,13 +1,27 @@
import { spawn } from "node:child_process";
import { resolveClaudeBin } from "./preflight.js";
import {
extractFinalOutput,
parseStreamJsonOutput,
type TranscriptEvent,
} from "./transcript.js";
export interface AgentRunResult {
  /** Extracted final text output (backward-compatible). */
  output: string;
  /** Wall-clock runtime of the agent subprocess, in milliseconds. */
  duration: number;
  /** Raw NDJSON transcript string from stream-json. */
  rawTranscript: string;
  /** Parsed transcript events. */
  events: TranscriptEvent[];
}
/**
* Invoke Claude Code in print mode as a subprocess.
*
* Uses --output-format stream-json to capture structured NDJSON events
* including tool calls, results, and reasoning steps.
*
* The agent operates in the workspace directory and can read/write files.
* When the skill is installed (symlinked into workspace), Claude Code
* discovers it automatically and uses it for guidance.
@@ -23,14 +37,22 @@ export async function runAgent(opts: {
const args = [
"-p", // Print mode (non-interactive)
"--verbose",
"--output-format",
"text",
"stream-json",
"--model",
opts.model,
"--no-session-persistence",
"--dangerously-skip-permissions",
"--tools",
"Edit,Write,Bash,Read,Glob,Grep",
// Disable all MCP servers so the agent uses only local filesystem tools.
// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
// leak in and the agent may apply migrations to a remote project
// instead of creating local files.
"--mcp-config",
'{"mcpServers":{}}',
"--strict-mcp-config",
];
// Disable skills for baseline runs so the agent relies on innate knowledge
@@ -46,8 +68,10 @@ export async function runAgent(opts: {
}
}
const claudeBin = resolveClaudeBin();
return new Promise<AgentRunResult>((resolve) => {
const child = spawn("claude", args, {
const child = spawn(claudeBin, args, {
cwd: opts.cwd,
env,
stdio: ["pipe", "pipe", "pipe"],
@@ -73,9 +97,15 @@ export async function runAgent(opts: {
child.on("close", () => {
clearTimeout(timer);
const rawTranscript = stdout || stderr;
const events = parseStreamJsonOutput(rawTranscript);
const output = extractFinalOutput(events) || rawTranscript;
resolve({
output: stdout || stderr,
output,
duration: Date.now() - start,
rawTranscript,
events,
});
});
});

View File

@@ -0,0 +1,56 @@
import { mkdirSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import type { EvalRunResult } from "../types.js";
import type { TranscriptSummary } from "./transcript.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/** Absolute path of the evals package root (packages/evals). */
function evalsRoot(): string {
  // This module lives in packages/evals/src/runner, so the package
  // root is exactly two directories up from here.
  const srcDir = join(__dirname, "..");
  return join(srcDir, "..");
}
/**
 * Create (if needed) the directory holding artifacts for one scenario run.
 *
 * Layout: results/<runTimestamp>/<scenarioId>/<variant>
 *
 * @returns the absolute path of the created directory.
 */
export function createResultDir(
  runTimestamp: string,
  scenarioId: string,
  variant: "with-skill" | "baseline",
): string {
  const root = evalsRoot();
  const target = join(root, "results", runTimestamp, scenarioId, variant);
  // recursive: true makes repeated calls for the same run a no-op.
  mkdirSync(target, { recursive: true });
  return target;
}
/**
 * Persist every artifact produced by a single eval run into resultDir:
 * the raw agent transcript, the captured test output, and a structured
 * result.json that embeds the parsed transcript summary.
 */
export function saveRunArtifacts(opts: {
  resultDir: string;
  rawTranscript: string;
  testOutput: string;
  result: EvalRunResult;
  transcriptSummary: TranscriptSummary;
}): void {
  // Small helper so each artifact is written with identical options.
  const write = (fileName: string, contents: string): void => {
    writeFileSync(join(opts.resultDir, fileName), contents, "utf-8");
  };

  // Raw NDJSON stream exactly as emitted by the agent.
  write("transcript.jsonl", opts.rawTranscript);
  // Vitest output captured for this scenario.
  write("test-output.txt", opts.testOutput);
  // Structured run result with the transcript summary attached.
  write(
    "result.json",
    JSON.stringify(
      { ...opts.result, transcript: opts.transcriptSummary },
      null,
      2,
    ),
  );
}

View File

@@ -1,10 +1,61 @@
import { execFileSync } from "node:child_process";
import { existsSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Resolve the `claude` binary path.
 *
 * Resolution order:
 * 1. The package-local node_modules/.bin/claude (installed via
 *    @anthropic-ai/claude-code in package.json).
 * 2. A global `claude` found on PATH.
 *
 * @throws Error with actionable install instructions when neither exists.
 */
export function resolveClaudeBin(): string {
  // This module lives in packages/evals/src/runner; the package's
  // node_modules/.bin directory is two levels up.
  const localBin = join(
    __dirname,
    "..",
    "..",
    "node_modules",
    ".bin",
    "claude",
  );
  if (existsSync(localBin)) {
    return localBin;
  }

  // No local install — probe PATH by asking the CLI for its version.
  try {
    execFileSync("claude", ["--version"], {
      stdio: "ignore",
      timeout: 10_000,
    });
    return "claude";
  } catch {
    const message = [
      "claude CLI not found.",
      "",
      "Install it in one of these ways:",
      " npm install (uses @anthropic-ai/claude-code from package.json)",
      " npm i -g @anthropic-ai/claude-code",
      "",
      "Ensure ANTHROPIC_API_KEY is set in the environment.",
    ].join("\n");
    throw new Error(message);
  }
}
/**
* Verify the host environment has everything needed before spending
* API credits on an eval run.
*
* Checks: Node >= 20, Docker running, claude CLI available.
* Checks: Node >= 20, Docker running, claude CLI available, API key set.
*/
export function preflight(): void {
const errors: string[] = [];
@@ -24,12 +75,16 @@ export function preflight(): void {
// Claude CLI available
try {
execFileSync("claude", ["--version"], {
stdio: "ignore",
timeout: 10_000,
});
} catch {
errors.push("claude CLI not found on PATH");
resolveClaudeBin();
} catch (err) {
errors.push((err as Error).message);
}
// API key
if (!process.env.ANTHROPIC_API_KEY) {
errors.push(
"ANTHROPIC_API_KEY is not set. Claude Code requires this for authentication.",
);
}
if (errors.length > 0) {

View File

@@ -46,7 +46,10 @@ export function listModifiedFiles(
}
/** Print a summary table of eval results. */
export function printSummary(results: EvalRunResult[]): void {
export function printSummary(
results: EvalRunResult[],
resultsDir?: string,
): void {
console.log("\n=== Eval Results ===\n");
for (const r of results) {
@@ -65,4 +68,8 @@ export function printSummary(results: EvalRunResult[]): void {
const passed = results.filter((r) => r.status === "passed").length;
console.log(`\nTotal: ${passed}/${results.length} passed`);
if (resultsDir) {
console.log(`\nResults saved to: ${resultsDir}`);
}
}

View File

@@ -78,17 +78,24 @@ export async function runTests(opts: {
function parseTestOutput(output: string): TestResult {
// Parse vitest output for pass/fail counts
// Format: "Tests N passed (M)" or "Tests N failed | M passed (T)"
const testsLine = output.match(
// Vitest formats:
// All passing: "Tests N passed (N)"
// Mixed: "Tests N failed | M passed (T)"
// All failing: "Tests N failed (N)"
const mixedOrPassing = output.match(
/Tests\s+(?:(\d+)\s+failed\s+\|\s+)?(\d+)\s+passed\s+\((\d+)\)/,
);
const allFailing = output.match(/Tests\s+(\d+)\s+failed\s+\((\d+)\)/);
let passedCount = 0;
let totalCount = 0;
if (testsLine) {
passedCount = Number.parseInt(testsLine[2], 10);
totalCount = Number.parseInt(testsLine[3], 10);
if (mixedOrPassing) {
passedCount = Number.parseInt(mixedOrPassing[2], 10);
totalCount = Number.parseInt(mixedOrPassing[3], 10);
} else if (allFailing) {
passedCount = 0;
totalCount = Number.parseInt(allFailing[2], 10);
}
const passed = totalCount > 0 && passedCount === totalCount;

View File

@@ -0,0 +1,154 @@
/**
 * One parsed event from the agent's NDJSON (stream-json) output.
 *
 * Only `type` is guaranteed; the remaining payload varies with the event
 * type (the functions below handle "system", "assistant", "user" and
 * "result" events).
 */
export interface TranscriptEvent {
  /** Event discriminator, e.g. "system", "assistant", "user", "result". */
  type: string;
  // Event-type-specific payload; fields are unknown until narrowed.
  [key: string]: unknown;
}
/** One tool invocation extracted from an assistant "tool_use" block. */
export interface ToolCallSummary {
  /** Tool name as reported by the agent ("unknown" when absent). */
  tool: string;
  /** The tool_use block id; used to match the later tool_result block. */
  toolUseId: string;
  /** Raw input object passed to the tool (empty object when absent). */
  input: Record<string, unknown>;
  /** First ~200 chars of output for quick scanning */
  outputPreview: string;
}
/** Aggregate view of one agent run, derived from its stream-json events. */
export interface TranscriptSummary {
  /** Turn count from the "result" event (0 when not reported). */
  totalTurns: number;
  /** Run duration in milliseconds from the "result" event (0 when not reported). */
  totalDurationMs: number;
  /** Total cost in USD, or null when the "result" event does not report it. */
  totalCostUsd: number | null;
  /** Model id from the "system"/"init" event, or null when not reported. */
  model: string | null;
  /** Every tool call the agent made, in order of occurrence. */
  toolCalls: ToolCallSummary[];
  /** Final text output (the "result" event's result string, if any). */
  finalOutput: string;
}
/**
 * Parse a single NDJSON line into a transcript event.
 *
 * @param line - One raw line of stream-json output.
 * @returns The parsed event, or null when the line is blank or not valid JSON.
 */
export function parseStreamJsonLine(line: string): TranscriptEvent | null {
  const candidate = line.trim();
  if (candidate.length === 0) {
    return null;
  }
  let parsed: TranscriptEvent;
  try {
    parsed = JSON.parse(candidate) as TranscriptEvent;
  } catch {
    // Malformed JSON (e.g. interleaved log noise) is silently dropped.
    return null;
  }
  return parsed;
}
/**
 * Parse raw NDJSON stdout into an array of events.
 *
 * Blank lines and lines that are not valid JSON are skipped, as are lines
 * whose parsed value is falsy (e.g. `null`, `0`, `""`).
 *
 * @param raw - The full stdout of a stream-json run.
 * @returns All successfully parsed events, in input order.
 */
export function parseStreamJsonOutput(raw: string): TranscriptEvent[] {
  return raw.split("\n").flatMap((line) => {
    const candidate = line.trim();
    if (candidate.length === 0) return [];
    let parsed: TranscriptEvent | null;
    try {
      parsed = JSON.parse(candidate) as TranscriptEvent;
    } catch {
      parsed = null;
    }
    // Keep only truthy parsed values, matching the original filter.
    return parsed ? [parsed] : [];
  });
}
/**
 * Extract the final text output from parsed events (for backward compat).
 *
 * Prefers the string `result` field of a "result" event; otherwise falls
 * back to the concatenated text blocks of the most recent assistant
 * message that has any. Returns "" when neither exists.
 */
export function extractFinalOutput(events: TranscriptEvent[]): string {
  // First choice: a "result" event carrying a string result.
  for (const event of events) {
    if (event.type !== "result") continue;
    const value = (event as Record<string, unknown>).result;
    if (typeof value === "string") return value;
  }
  // Fallback: walk assistant messages newest-first and join their text blocks.
  for (const event of [...events].reverse()) {
    if (event.type !== "assistant") continue;
    const message = (event as Record<string, unknown>).message as
      | Record<string, unknown>
      | undefined;
    const blocks = message?.content;
    if (!Array.isArray(blocks)) continue;
    const texts: string[] = [];
    for (const block of blocks) {
      if (block.type === "text" && typeof block.text === "string") {
        texts.push(block.text as string);
      }
    }
    if (texts.length > 0) return texts.join("\n");
  }
  return "";
}
/**
 * Walk parsed events to build a transcript summary.
 *
 * - "system"/"init" supplies the model id.
 * - "assistant" messages contribute tool_use blocks (one ToolCallSummary each).
 * - "user" messages contribute tool_result blocks, matched back to their
 *   tool call by `tool_use_id`; the first ~200 chars become the preview.
 * - "result" supplies final output, duration, cost, and turn count.
 */
export function buildTranscriptSummary(
  events: TranscriptEvent[],
): TranscriptSummary {
  const calls: ToolCallSummary[] = [];
  let summaryText = "";
  let durationMs = 0;
  let costUsd: number | null = null;
  let modelName: string | null = null;
  let turns = 0;

  for (const raw of events) {
    const record = raw as Record<string, unknown>;
    switch (record.type) {
      case "system": {
        // Only the init event carries the model id.
        if (record.subtype === "init") {
          modelName = typeof record.model === "string" ? record.model : null;
        }
        break;
      }
      case "assistant": {
        const content = (record.message as Record<string, unknown> | undefined)
          ?.content;
        if (!Array.isArray(content)) break;
        for (const block of content) {
          if (block.type !== "tool_use") continue;
          calls.push({
            tool: block.name ?? "unknown",
            toolUseId: block.id ?? "",
            input: block.input ?? {},
            outputPreview: "",
          });
        }
        break;
      }
      case "user": {
        const content = (record.message as Record<string, unknown> | undefined)
          ?.content;
        if (!Array.isArray(content)) break;
        for (const block of content) {
          if (block.type !== "tool_result") continue;
          const call = calls.find((c) => c.toolUseId === block.tool_use_id);
          if (!call) continue;
          const text =
            typeof block.content === "string"
              ? block.content
              : JSON.stringify(block.content);
          call.outputPreview = text.slice(0, 200);
        }
        break;
      }
      case "result": {
        summaryText = typeof record.result === "string" ? record.result : "";
        durationMs =
          typeof record.duration_ms === "number" ? record.duration_ms : 0;
        costUsd =
          typeof record.total_cost_usd === "number"
            ? record.total_cost_usd
            : null;
        turns = typeof record.num_turns === "number" ? record.num_turns : 0;
        break;
      }
      default:
        break;
    }
  }

  return {
    totalTurns: turns,
    totalDurationMs: durationMs,
    totalCostUsd: costUsd,
    model: modelName,
    toolCalls: calls,
    finalOutput: summaryText,
  };
}

View File

@@ -32,4 +32,10 @@ export interface EvalRunResult {
/** Files the agent created or modified in the workspace */
filesModified: string[];
error?: string;
/** Path to the persisted results directory for this run */
resultsDir?: string;
/** Number of tool calls the agent made */
toolCallCount?: number;
/** Total cost in USD (from stream-json result event) */
costUsd?: number;
}