feat(evals): replace mock CLIs with real Supabase instance per eval run

Start a shared local Supabase stack once before all scenarios and reset
the database (drop/recreate public schema + clear migration history) between
each run. This lets agents apply migrations via `supabase db push` against a
real Postgres instance instead of mock shell scripts.

- Add supabase-setup.ts: startSupabase / stopSupabase / resetDB / getKeys
- Update runner.ts to start/stop Supabase and inject keys into process.env
- Update agent.ts to point MCP config at the local Supabase HTTP endpoint
- Update preflight.ts to check supabase CLI availability and Docker socket
- Update scaffold.ts to seed workspace with supabase/config.toml
- Add passThreshold support (test.ts / results.ts / types.ts) for partial pass
- Delete mock shell scripts (mocks/docker, mocks/psql, mocks/supabase)
- Update Dockerfile/docker-compose to mount Docker socket for supabase CLI

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Pedro Rodrigues
2026-02-25 14:39:54 +00:00
parent 2da5cae2ac
commit 9b08864e94
14 changed files with 277 additions and 249 deletions

View File

@@ -29,13 +29,33 @@ RUN npm --prefix packages/skills-build run build
# ---------- Stage 2: runtime ----------
FROM node:22-slim
RUN apt-get update && apt-get install -y --no-install-recommends git && \
rm -rf /var/lib/apt/lists/*
# Install Docker CLI and curl (needed for supabase CLI install)
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
curl \
ca-certificates \
docker.io \
&& rm -rf /var/lib/apt/lists/*
# Install supabase CLI binary (pinned version)
ARG SUPABASE_CLI_VERSION=2.67.1
RUN ARCH=$(dpkg --print-architecture) && \
case "$ARCH" in \
amd64) SUPABASE_ARCH="linux_amd64" ;; \
arm64) SUPABASE_ARCH="linux_arm64" ;; \
*) echo "Unsupported arch: $ARCH" && exit 1 ;; \
esac && \
curl -fsSL "https://github.com/supabase/cli/releases/download/v${SUPABASE_CLI_VERSION}/supabase_${SUPABASE_ARCH}.tar.gz" \
| tar xz -C /usr/local/bin supabase && \
chmod +x /usr/local/bin/supabase
WORKDIR /app
# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root
# node:22-slim already ships with user "node" (uid=1000, gid=1000)
# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root.
# Add node user to the docker group so it can reach the mounted Docker socket.
# DOCKER_GID must match the host's docker group GID (default 999 on most Linux systems).
ARG DOCKER_GID=999
RUN groupadd -f -g ${DOCKER_GID} docker && usermod -aG docker node
# Copy built artifacts from builder
COPY --from=builder /app/package.json /app/package-lock.json ./
@@ -44,12 +64,6 @@ COPY --from=builder /app/skills/ skills/
COPY --from=builder /app/packages/skills-build/ packages/skills-build/
COPY --from=builder /app/packages/evals/ packages/evals/
# Install mock scripts
COPY packages/evals/mocks/supabase /usr/local/bin/supabase
COPY packages/evals/mocks/docker /usr/local/bin/docker
COPY packages/evals/mocks/psql /usr/local/bin/psql
RUN chmod +x /usr/local/bin/supabase /usr/local/bin/docker /usr/local/bin/psql
# Install entrypoint
COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

View File

@@ -3,6 +3,10 @@ services:
build:
context: ../..
dockerfile: packages/evals/Dockerfile
args:
# Match the host's docker group GID so the node user can reach the socket.
# Override with: DOCKER_GID=$(getent group docker | cut -d: -f3) docker compose up
DOCKER_GID: "${DOCKER_GID:-999}"
environment:
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
- EVAL_MODEL=${EVAL_MODEL:-}
@@ -15,3 +19,5 @@ services:
- EVAL_RESULTS_DIR=/app/results
volumes:
- ./results:/app/results
# Mount the host Docker socket so the supabase CLI can manage containers.
- /var/run/docker.sock:/var/run/docker.sock

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env bash
# Entrypoint for the eval Docker container.
# Validates environment, adds mocks to PATH, then runs the given command.
# Validates environment, then runs the given command.
set -euo pipefail
export IN_DOCKER=true
@@ -12,13 +12,11 @@ if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
exit 1
fi
# Prepend mocks directory to PATH so mock supabase/docker/psql are found first
export PATH="/app/packages/evals/mocks:${PATH}"
echo "=== Eval Environment ==="
echo " Node: $(node --version)"
echo " Claude: $(claude --version 2>/dev/null || echo 'n/a')"
echo " Docker: mock"
echo " Supabase: $(supabase --version 2>/dev/null || echo 'n/a')"
echo " Docker: $(docker --version 2>/dev/null || echo 'n/a')"
echo " Model: ${EVAL_MODEL:-default}"
echo " Scenario: ${EVAL_SCENARIO:-all}"
echo "========================"

View File

@@ -1,27 +0,0 @@
#!/usr/bin/env bash
# Mock Docker CLI for eval environments.
# Recognizes a handful of subcommands and always exits 0 so agent
# scripts that probe Docker never fail.
set -euo pipefail

subcommand="${1:-}"
shift || true

if [[ "$subcommand" == "ps" ]]; then
# Empty container listing: header row only.
echo "CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES"
elif [[ "$subcommand" == "exec" ]]; then
# Discard leading flags; the container name and command are ignored.
while [[ "${1:-}" == -* ]]; do shift || true; done
elif [[ "$subcommand" == "info" ]]; then
echo "Server Version: 24.0.0 (mock)"
elif [[ "$subcommand" == "compose" ]]; then
echo "docker compose: ok"
fi
# Any other subcommand falls through and succeeds silently.

View File

@@ -1,15 +0,0 @@
#!/usr/bin/env bash
# Mock psql for eval environments: consumes any arguments and reports
# an empty result set.
set -euo pipefail

# Scan the argument list; a -c flag (inline SQL command) gets a
# "(0 rows)" reply, mimicking a SELECT that matched nothing.
while (( $# > 0 )); do
if [[ "$1" == "-c" ]]; then
echo "(0 rows)"
exit 0
fi
shift
done

# No inline command given: succeed silently.
exit 0

View File

@@ -1,161 +0,0 @@
#!/usr/bin/env bash
# Mock Supabase CLI for eval environments.
# Returns realistic output so the agent doesn't retry, and creates real
# migration files when asked.
set -euo pipefail
# First positional argument selects the subcommand; the remaining args
# (after shift) are that subcommand's own arguments.
CMD="${1:-}"
shift || true
case "$CMD" in
init)
# Scaffold the standard project layout plus a minimal config.toml.
mkdir -p supabase/migrations supabase/functions
cat > supabase/config.toml << 'TOML'
[project]
id = "mock-project-ref"
[api]
enabled = true
port = 54321
schemas = ["public", "graphql_public"]
[db]
port = 54322
major_version = 15
[studio]
enabled = true
port = 54323
TOML
echo "Finished supabase init."
;;
start)
# Reproduce the banner a real `supabase start` prints: local URLs plus
# the well-known demo JWTs / S3 credentials (safe to publish — they only
# work against a local demo stack).
echo "Applying migration 00000000000000_init.sql..."
echo "Started supabase local development setup."
echo ""
echo " API URL: http://127.0.0.1:54321"
echo " GraphQL URL: http://127.0.0.1:54321/graphql/v1"
echo " S3 Storage URL: http://127.0.0.1:54321/storage/v1/s3"
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo " Studio URL: http://127.0.0.1:54323"
echo " Inbucket URL: http://127.0.0.1:54324"
echo " JWT secret: super-secret-jwt-token-with-at-least-32-characters-long"
echo " anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
echo "service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
echo " S3 Access Key: 625729a08b95bf1b7ff351a663f3a23c"
echo " S3 Secret Key: 850181e4652dd023b7a98c58ae0d2d34bd487ee0cc3254aed6eda37307425907"
echo " S3 Region: local"
;;
stop)
echo "Stopped supabase local development setup."
;;
status)
# `status -o env` prints machine-readable KEY=VALUE pairs; any other
# invocation prints the human-readable table.
if [[ "${1:-}" == "-o" && "${2:-}" == "env" ]]; then
echo "ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
echo "SERVICE_ROLE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
echo "API_URL=http://127.0.0.1:54321"
echo "DB_URL=postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo "STUDIO_URL=http://127.0.0.1:54323"
else
echo " API URL: http://127.0.0.1:54321"
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo " Studio URL: http://127.0.0.1:54323"
echo " DB: running"
echo " Auth: running"
echo " REST: running"
echo " Realtime: running"
echo " Storage: running"
fi
;;
migration)
# Nested dispatch on the migration subcommand.
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
new)
# Create a real (empty) migration file using the same
# <UTC-timestamp>_<name>.sql naming the real CLI uses.
NAME="${1:-migration}"
TIMESTAMP=$(date -u +"%Y%m%d%H%M%S")
mkdir -p supabase/migrations
MIGRATION_FILE="supabase/migrations/${TIMESTAMP}_${NAME}.sql"
touch "$MIGRATION_FILE"
echo "Created new migration at $MIGRATION_FILE"
;;
list)
echo "No migrations found."
;;
*)
echo "supabase migration $SUBCMD: ok"
;;
esac
;;
db)
# db push/reset/diff all succeed with canned output; no database exists.
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
push)
echo "Applying unapplied migrations..."
echo "Applied migration(s) successfully."
;;
reset)
echo "Resetting local database..."
echo "Database reset successfully."
;;
diff)
echo "No schema changes detected."
;;
*)
echo "supabase db $SUBCMD: ok"
;;
esac
;;
functions)
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
new)
# Create a real starter edge function on disk, like the real CLI.
FUNC_NAME="${1:-my-function}"
mkdir -p "supabase/functions/$FUNC_NAME"
cat > "supabase/functions/$FUNC_NAME/index.ts" << 'TS'
import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
serve(async (req) => {
return new Response(JSON.stringify({ message: "Hello from Edge Functions!" }), {
headers: { "Content-Type": "application/json" },
})
})
TS
echo "Created new Function at supabase/functions/$FUNC_NAME"
;;
serve)
echo "Serving functions on http://127.0.0.1:54321/functions/v1/<function-name>"
;;
deploy)
echo "Deployed function successfully."
;;
*)
echo "supabase functions $SUBCMD: ok"
;;
esac
;;
gen)
echo "Generated types successfully."
;;
link)
echo "Linked project: mock-project-ref"
;;
login)
echo "Already logged in."
;;
*)
# Unknown subcommands acknowledge and succeed so the agent never retries.
echo "supabase $CMD: ok"
;;
esac

View File

@@ -6,6 +6,12 @@ import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js";
import { createWorkspace } from "./runner/scaffold.js";
import {
getKeys,
resetDB,
startSupabase,
stopSupabase,
} from "./runner/supabase-setup.js";
import { runTests } from "./runner/test.js";
import {
buildTranscriptSummary,
@@ -60,6 +66,20 @@ function discoverScenarios(): EvalScenario[] {
}));
}
// ---------------------------------------------------------------------------
// Scenario threshold
// ---------------------------------------------------------------------------
/**
 * Read the optional `**pass_threshold:** N` marker from a scenario's
 * markdown file.
 *
 * @param scenarioId - Scenario identifier; resolved to `scenarios/<id>.md`.
 * @returns The parsed integer threshold, or null when the scenario file
 *   is missing or carries no marker.
 */
function getPassThreshold(scenarioId: string): number | null {
  const scenarioPath = join(
    findEvalsDir(),
    "..",
    "scenarios",
    `${scenarioId}.md`,
  );
  if (!existsSync(scenarioPath)) {
    return null;
  }
  const markerMatch = /\*\*pass_threshold:\*\*\s*(\d+)/.exec(
    readFileSync(scenarioPath, "utf-8"),
  );
  return markerMatch === null ? null : Number.parseInt(markerMatch[1], 10);
}
// ---------------------------------------------------------------------------
// Run a single eval
// ---------------------------------------------------------------------------
@@ -103,13 +123,24 @@ async function runEval(
? join(evalDir, "EVAL.tsx")
: join(evalDir, "EVAL.ts");
const passThreshold = getPassThreshold(scenario.id);
console.log(" Running tests...");
const testResult = await runTests({
workspacePath,
evalFilePath,
passThreshold: passThreshold ?? undefined,
});
const pct =
testResult.totalCount > 0
? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1)
: "0.0";
const thresholdInfo = passThreshold
? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%`
: "";
console.log(
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`,
);
// 5. Collect modified files
@@ -129,6 +160,7 @@ async function runEval(
agentOutput: agentResult.output,
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
passThreshold: passThreshold ?? undefined,
filesModified,
toolCallCount: summary.toolCalls.length,
costUsd: summary.totalCostUsd ?? undefined,
@@ -194,16 +226,34 @@ async function main() {
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
// Start the shared Supabase instance once for all scenarios.
startSupabase();
const keys = getKeys();
// Inject keys into process.env so EVAL.ts tests can connect to the real DB.
process.env.SUPABASE_URL = keys.apiUrl;
process.env.SUPABASE_ANON_KEY = keys.anonKey;
process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey;
process.env.SUPABASE_DB_URL = keys.dbUrl;
const results: EvalRunResult[] = [];
const transcripts = new Map<string, TranscriptSummary>();
try {
for (const scenario of scenarios) {
// Reset the database before each scenario for a clean slate.
console.log(`\n Resetting DB for ${scenario.id}...`);
resetDB(keys.dbUrl);
const { result, transcript } = await runEval(scenario, skillEnabled);
results.push(result);
if (transcript) {
transcripts.set(result.scenario, transcript);
}
}
} finally {
stopSupabase();
}
// Use the results dir from the first result (all share the same timestamp)
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;

View File

@@ -22,9 +22,10 @@ export interface AgentRunResult {
* Uses --output-format stream-json to capture structured NDJSON events
* including tool calls, results, and reasoning steps.
*
* The agent operates in the workspace directory and can read/write files.
* When skills are installed (via the `skills` CLI), Claude Code
* discovers them automatically and uses them for guidance.
* The agent operates in the workspace directory and can read/write files,
* and has access to the local Supabase MCP server so it can apply migrations
* and query the real database. --strict-mcp-config ensures only the local
* Supabase instance is reachable — no host MCP servers leak in.
*/
export async function runAgent(opts: {
cwd: string;
@@ -35,6 +36,18 @@ export async function runAgent(opts: {
}): Promise<AgentRunResult> {
const start = Date.now();
// Point the agent's MCP config at the shared local Supabase instance.
// --strict-mcp-config ensures host .mcp.json is ignored entirely.
const supabaseUrl = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321";
const mcpConfig = JSON.stringify({
mcpServers: {
supabase: {
type: "http",
url: `${supabaseUrl}/mcp`,
},
},
});
const args = [
"-p", // Print mode (non-interactive)
"--verbose",
@@ -46,12 +59,8 @@ export async function runAgent(opts: {
"--dangerously-skip-permissions",
"--tools",
"Edit,Write,Bash,Read,Glob,Grep",
// Disable all MCP servers so the agent uses only local filesystem tools.
// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
// leak in and the agent may apply migrations to a remote project
// instead of creating local files.
"--mcp-config",
'{"mcpServers":{}}',
mcpConfig,
"--strict-mcp-config",
];

View File

@@ -66,7 +66,7 @@ export function resolveClaudeBin(): string {
* Verify the host environment has everything needed before spending
* API credits on an eval run.
*
* Checks: Node >= 20, Docker running, claude CLI available, API key set.
* Checks: Node >= 20, Docker running, supabase CLI available, claude CLI available, API key set.
*/
export function preflight(): void {
const errors: string[] = [];
@@ -77,13 +77,28 @@ export function preflight(): void {
errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
}
// Docker daemon running (skip when inside the eval container — mocks handle it)
if (!isRunningInDocker()) {
// Docker daemon must be running — needed by the supabase CLI to manage containers.
// Required whether running locally or inside the eval container (socket-mounted).
try {
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
} catch {
errors.push("Docker is not running (required by supabase CLI)");
errors.push(
isRunningInDocker()
? "Docker daemon not reachable inside container. Mount the socket: -v /var/run/docker.sock:/var/run/docker.sock"
: "Docker is not running (required by supabase CLI)",
);
}
// Supabase CLI available
try {
execFileSync("supabase", ["--version"], {
stdio: "ignore",
timeout: 10_000,
});
} catch {
errors.push(
"supabase CLI not found. Install it: https://supabase.com/docs/guides/cli/getting-started",
);
}
// Claude CLI available

View File

@@ -56,8 +56,16 @@ export function printSummary(
for (const r of results) {
const icon = r.status === "passed" ? "PASS" : "FAIL";
const skill = r.skillEnabled ? "with-skill" : "baseline";
const pct =
r.testsTotal > 0
? ((r.testsPassed / r.testsTotal) * 100).toFixed(1)
: "0.0";
const thresholdInfo =
r.passThreshold && r.testsTotal > 0
? `, threshold: ${((r.passThreshold / r.testsTotal) * 100).toFixed(0)}%`
: "";
console.log(
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s`,
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s | ${pct}% (${r.testsPassed}/${r.testsTotal}${thresholdInfo})`,
);
if (r.filesModified.length > 0) {
console.log(` Files: ${r.filesModified.join(", ")}`);

View File

@@ -1,8 +1,16 @@
import { execFileSync } from "node:child_process";
import { cpSync, existsSync, mkdtempSync, readdirSync, rmSync } from "node:fs";
import {
cpSync,
existsSync,
mkdirSync,
mkdtempSync,
readdirSync,
rmSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { dirname, join, resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
@@ -54,6 +62,16 @@ export function createWorkspace(opts: {
cpSync(src, dest, { recursive: true });
}
// Seed the workspace with the eval project's supabase/config.toml so the
// agent can run `supabase db push` against the shared local instance without
// needing to run `supabase init` or `supabase start` first.
const projectConfigSrc = join(EVAL_PROJECT_DIR, "supabase", "config.toml");
if (existsSync(projectConfigSrc)) {
const destSupabaseDir = join(workspacePath, "supabase");
mkdirSync(join(destSupabaseDir, "migrations"), { recursive: true });
cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
}
// Install skills into the workspace via the `skills` CLI
if (opts.skillEnabled) {
const skillsDir = join(repoRoot, "skills");

View File

@@ -0,0 +1,108 @@
import { execFileSync } from "node:child_process";
import { dirname, resolve } from "node:path";
import { fileURLToPath } from "node:url";
// Resolve this module's own location (ESM provides no __dirname builtin).
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Directory that contains the eval Supabase project (supabase/config.toml).
 * The runner starts the shared Supabase instance from here.
 * Agent workspaces get a copy of supabase/config.toml so they can
 * connect to the same running instance via `supabase db push`.
 */
export const EVAL_PROJECT_DIR = resolve(__dirname, "..", "..", "project");

/** Connection details for the running local Supabase stack. */
export interface SupabaseKeys {
  // Base URL of the HTTP API (e.g. http://127.0.0.1:54321).
  apiUrl: string;
  // Direct Postgres connection string.
  dbUrl: string;
  // JWT for the anon role.
  anonKey: string;
  // JWT for the service_role — privileged; local use only.
  serviceRoleKey: string;
}
/**
 * Bring up the local Supabase stack for the eval project.
 * Safe to call when already running: the CLI reports that and exits 0.
 */
export function startSupabase(): void {
  console.log(" Starting Supabase...");
  const cliArgs = ["start", "--exclude", "studio,imgproxy,mailpit"];
  // Generous timeout: the first run may need to pull container images.
  const fiveMinutesMs = 5 * 60 * 1000;
  execFileSync("supabase", cliArgs, {
    cwd: EVAL_PROJECT_DIR,
    stdio: "inherit",
    timeout: fiveMinutesMs,
  });
}
// SQL that clears all user-created objects and migration history between scenarios.
// Avoids `supabase db reset` which restarts containers and triggers flaky health checks.
// NOTE(review): sent as a single psql -c command — presumably the statements run
// as one batch; confirm transactional behavior if statements are added here.
const RESET_SQL = `
-- Drop and recreate public schema (removes all user tables/views/functions)
DROP SCHEMA public CASCADE;
CREATE SCHEMA public;
GRANT ALL ON SCHEMA public TO postgres;
GRANT ALL ON SCHEMA public TO anon;
GRANT ALL ON SCHEMA public TO authenticated;
GRANT ALL ON SCHEMA public TO service_role;
-- Clear migration history so the next agent's db push starts from a clean slate
DROP SCHEMA IF EXISTS supabase_migrations CASCADE;
-- Notify PostgREST to reload its schema cache
NOTIFY pgrst, 'reload schema';
`.trim();
/**
 * Wipe the database back to a clean state between scenarios.
 *
 * Runs RESET_SQL directly through psql rather than `supabase db reset`,
 * sidestepping the container-restart cycle and its flaky health checks.
 * Drops the public schema (all user tables) and the migration history so
 * `supabase db push` in agent workspaces always starts fresh.
 *
 * @param dbUrl - Postgres connection string of the running instance.
 */
export function resetDB(dbUrl: string): void {
  const psqlArgs = [dbUrl, "--no-psqlrc", "-c", RESET_SQL];
  execFileSync("psql", psqlArgs, { stdio: "inherit", timeout: 30_000 });
}
/**
 * Tear down every Supabase container belonging to the eval project.
 * Invoked once, after the full scenario list has run.
 */
export function stopSupabase(): void {
  console.log(" Stopping Supabase...");
  const stopArgs = ["stop", "--no-backup"];
  const oneMinuteMs = 60_000;
  execFileSync("supabase", stopArgs, {
    cwd: EVAL_PROJECT_DIR,
    stdio: "inherit",
    timeout: oneMinuteMs,
  });
}
/**
 * Read the running instance's API URL and JWT keys via `supabase status`.
 *
 * Returns values that the runner injects into process.env so EVAL.ts
 * tests can connect to the real database.
 *
 * @returns API/DB URLs (with local defaults) and the two JWTs.
 * @throws Error when the CLI output is not valid JSON, is not an object,
 *   or omits either JWT.
 */
export function getKeys(): SupabaseKeys {
  const raw = execFileSync("supabase", ["status", "--output", "json"], {
    cwd: EVAL_PROJECT_DIR,
    timeout: 30 * 1000,
  }).toString();

  // `JSON.parse` is untyped; validate the shape instead of blindly asserting
  // it, so a CLI output-format change fails loudly here rather than
  // propagating bad values downstream.
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw new Error(`supabase status did not return valid JSON:\n${raw}`);
  }
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error(`supabase status returned non-object JSON:\n${raw}`);
  }
  const status = parsed as Record<string, unknown>;
  // Read a string field, treating any non-string value as absent.
  const field = (key: string): string | undefined => {
    const value = status[key];
    return typeof value === "string" ? value : undefined;
  };

  const apiUrl = field("API_URL") ?? "http://127.0.0.1:54321";
  const dbUrl =
    field("DB_URL") ??
    "postgresql://postgres:postgres@127.0.0.1:54322/postgres";
  const anonKey = field("ANON_KEY") ?? "";
  const serviceRoleKey = field("SERVICE_ROLE_KEY") ?? "";

  if (!anonKey || !serviceRoleKey) {
    throw new Error(
      `supabase status returned missing keys. Raw output:\n${raw}`,
    );
  }
  return { apiUrl, dbUrl, anonKey, serviceRoleKey };
}

View File

@@ -30,6 +30,7 @@ export interface TestResult {
export async function runTests(opts: {
workspacePath: string;
evalFilePath: string;
passThreshold?: number;
}): Promise<TestResult> {
// Copy the hidden test file into the workspace
const evalFileName = opts.evalFilePath.endsWith(".tsx")
@@ -85,11 +86,11 @@ export async function runTests(opts: {
});
const output = `${stdout}\n${stderr}`;
return parseTestOutput(output);
return parseTestOutput(output, opts.passThreshold);
} catch (error) {
const err = error as Error & { stdout?: string; stderr?: string };
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
return parseTestOutput(output);
return parseTestOutput(output, opts.passThreshold);
}
}
@@ -111,7 +112,7 @@ function parseIndividualTests(output: string): Record<string, boolean> {
return results;
}
function parseTestOutput(output: string): TestResult {
function parseTestOutput(output: string, passThreshold?: number): TestResult {
// Parse vitest output for pass/fail counts
// Vitest formats:
// All passing: "Tests N passed (N)"
@@ -133,7 +134,9 @@ function parseTestOutput(output: string): TestResult {
totalCount = Number.parseInt(allFailing[2], 10);
}
const passed = totalCount > 0 && passedCount === totalCount;
const passed = passThreshold
? totalCount > 0 && passedCount >= passThreshold
: totalCount > 0 && passedCount === totalCount;
const individualTests = parseIndividualTests(output);
return { passed, output, passedCount, totalCount, individualTests };

View File

@@ -29,6 +29,8 @@ export interface EvalRunResult {
testsPassed: number;
/** Total number of vitest tests */
testsTotal: number;
/** Minimum tests required to pass (from scenario config) */
passThreshold?: number;
/** Files the agent created or modified in the workspace */
filesModified: string[];
error?: string;