feat(evals): replace mock CLIs with real Supabase instance per eval run

Start a shared local Supabase stack once before all scenarios and reset the database (drop/recreate public schema + clear migration history) between each run. This lets agents apply migrations via `supabase db push` against a real Postgres instance instead of mock shell scripts. - Add supabase-setup.ts: startSupabase / stopSupabase / resetDB / getKeys - Update runner.ts to start/stop Supabase and inject keys into process.env - Update agent.ts to point MCP config at the local Supabase HTTP endpoint - Update preflight.ts to check supabase CLI availability and Docker socket - Update scaffold.ts to seed workspace with supabase/config.toml - Add passThreshold support (test.ts / results.ts / types.ts) for partial pass - Delete mock shell scripts (mocks/docker, mocks/psql, mocks/supabase) - Update Dockerfile/docker-compose to mount Docker socket for supabase CLI Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 10:09:26 +08:00 · 2026-02-25 14:39:54 +00:00
parent 2da5cae2ac
commit 9b08864e94
14 changed files with 277 additions and 249 deletions
--- a/packages/evals/Dockerfile
+++ b/packages/evals/Dockerfile
@@ -29,13 +29,33 @@ RUN npm --prefix packages/skills-build run build
 # ---------- Stage 2: runtime ----------
 FROM node:22-slim
-RUN apt-get update && apt-get install -y --no-install-recommends git && \
+# Install Docker CLI and curl (needed for supabase CLI install)
-    rm -rf /var/lib/apt/lists/*
+# postgresql-client provides `psql`, which the runner's resetDB() shells out to
+# between scenarios; with the mock psql script removed it must be a real binary.
+RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    curl \
    ca-certificates \
    docker.io \
    postgresql-client \
    && rm -rf /var/lib/apt/lists/*
 # Install supabase CLI binary (pinned version)
 ARG SUPABASE_CLI_VERSION=2.67.1
 RUN ARCH=$(dpkg --print-architecture) && \
    case "$ARCH" in \
      amd64) SUPABASE_ARCH="linux_amd64" ;; \
      arm64) SUPABASE_ARCH="linux_arm64" ;; \
      *) echo "Unsupported arch: $ARCH" && exit 1 ;; \
    esac && \
    curl -fsSL "https://github.com/supabase/cli/releases/download/v${SUPABASE_CLI_VERSION}/supabase_${SUPABASE_ARCH}.tar.gz" \
      | tar xz -C /usr/local/bin supabase && \
    chmod +x /usr/local/bin/supabase
 WORKDIR /app
-# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root
+# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root.
-# node:22-slim already ships with user "node" (uid=1000, gid=1000)
+# Add node user to the docker group so it can reach the mounted Docker socket.
 # DOCKER_GID must match the host's docker group GID (default 999 on most Linux systems).
 ARG DOCKER_GID=999
 # The docker.io package may already have created a "docker" group with a
 # system-assigned GID. `groupadd -f` would then exit 0 WITHOUT applying -g,
 # silently ignoring DOCKER_GID. Re-number the group if it exists, create it
 # otherwise; -o tolerates a GID that is already taken by another group.
 RUN if getent group docker >/dev/null; then \
       groupmod -o -g "${DOCKER_GID}" docker; \
     else \
       groupadd -o -g "${DOCKER_GID}" docker; \
     fi && \
     usermod -aG docker node
 # Copy built artifacts from builder
 COPY --from=builder /app/package.json /app/package-lock.json ./
@@ -44,12 +64,6 @@ COPY --from=builder /app/skills/ skills/
 COPY --from=builder /app/packages/skills-build/ packages/skills-build/
 COPY --from=builder /app/packages/evals/ packages/evals/
 # Install mock scripts
 COPY packages/evals/mocks/supabase /usr/local/bin/supabase
 COPY packages/evals/mocks/docker /usr/local/bin/docker
 COPY packages/evals/mocks/psql /usr/local/bin/psql
 RUN chmod +x /usr/local/bin/supabase /usr/local/bin/docker /usr/local/bin/psql
 # Install entrypoint
 COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
 RUN chmod +x /usr/local/bin/docker-entrypoint.sh
--- a/packages/evals/docker-compose.yml
+++ b/packages/evals/docker-compose.yml
@@ -3,6 +3,10 @@ services:
    build:
      context: ../..
      dockerfile: packages/evals/Dockerfile
      args:
        # Match the host's docker group GID so the node user can reach the socket.
        # Override with: DOCKER_GID=$(getent group docker | cut -d: -f3) docker compose up
        DOCKER_GID: "${DOCKER_GID:-999}"
    environment:
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
      - EVAL_MODEL=${EVAL_MODEL:-}
@@ -15,3 +19,5 @@ services:
      - EVAL_RESULTS_DIR=/app/results
    volumes:
      - ./results:/app/results
      # Mount the host Docker socket so the supabase CLI can manage containers.
      - /var/run/docker.sock:/var/run/docker.sock
--- a/packages/evals/docker-entrypoint.sh
+++ b/packages/evals/docker-entrypoint.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 # Entrypoint for the eval Docker container.
-# Validates environment, adds mocks to PATH, then runs the given command.
+# Validates environment, then runs the given command.
 set -euo pipefail
 export IN_DOCKER=true
@@ -12,14 +12,12 @@ if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
  exit 1
 fi
 # Prepend mocks directory to PATH so mock supabase/docker/psql are found first
 export PATH="/app/packages/evals/mocks:${PATH}"
 echo "=== Eval Environment ==="
-echo "  Node:    $(node --version)"
+echo "  Node:     $(node --version)"
-echo "  Claude:  $(claude --version 2>/dev/null || echo 'n/a')"
+echo "  Claude:   $(claude --version 2>/dev/null || echo 'n/a')"
-echo "  Docker:  mock"
+echo "  Supabase: $(supabase --version 2>/dev/null || echo 'n/a')"
-echo "  Model:   ${EVAL_MODEL:-default}"
+echo "  Docker:   $(docker --version 2>/dev/null || echo 'n/a')"
 echo "  Model:    ${EVAL_MODEL:-default}"
 echo "  Scenario: ${EVAL_SCENARIO:-all}"
 echo "========================"
--- a/packages/evals/mocks/docker
+++ b/packages/evals/mocks/docker
@@ -1,27 +0,0 @@
 #!/usr/bin/env bash
 # Mock Docker CLI for eval environments.
 # Returns success for common commands the agent may invoke.
 set -euo pipefail
 CMD="${1:-}"
 shift || true
 case "$CMD" in
  ps)
    echo "CONTAINER ID   IMAGE   COMMAND   CREATED   STATUS   PORTS   NAMES"
    ;;
  exec)
    # Consume flags until we hit something that isn't a flag
    while [[ "${1:-}" == -* ]]; do shift || true; done
    # Remaining args are container + command — just succeed silently
    ;;
  info)
    echo "Server Version: 24.0.0 (mock)"
    ;;
  compose)
    echo "docker compose: ok"
    ;;
  *)
    # Default: succeed silently
    ;;
 esac
--- a/packages/evals/mocks/psql
+++ b/packages/evals/mocks/psql
@@ -1,15 +0,0 @@
 #!/usr/bin/env bash
 # Mock psql for eval environments.
 # Accepts any arguments and returns an empty result set.
 set -euo pipefail
 # If -c is used (inline command), print column headers for a SELECT
 for arg in "$@"; do
  if [[ "$arg" == "-c" ]]; then
    echo "(0 rows)"
    exit 0
  fi
 done
 # Default: succeed silently
 exit 0
--- a/packages/evals/mocks/supabase
+++ b/packages/evals/mocks/supabase
@@ -1,161 +0,0 @@
 #!/usr/bin/env bash
 # Mock Supabase CLI for eval environments.
 # Returns realistic output so the agent doesn't retry, and creates real
 # migration files when asked.
 set -euo pipefail
 CMD="${1:-}"
 shift || true
 case "$CMD" in
  init)
    mkdir -p supabase/migrations supabase/functions
    cat > supabase/config.toml << 'TOML'
 [project]
 id = "mock-project-ref"
 [api]
 enabled = true
 port = 54321
 schemas = ["public", "graphql_public"]
 [db]
 port = 54322
 major_version = 15
 [studio]
 enabled = true
 port = 54323
 TOML
    echo "Finished supabase init."
    ;;
  start)
    echo "Applying migration 00000000000000_init.sql..."
    echo "Started supabase local development setup."
    echo ""
    echo "         API URL: http://127.0.0.1:54321"
    echo "     GraphQL URL: http://127.0.0.1:54321/graphql/v1"
    echo "  S3 Storage URL: http://127.0.0.1:54321/storage/v1/s3"
    echo "          DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
    echo "      Studio URL: http://127.0.0.1:54323"
    echo "    Inbucket URL: http://127.0.0.1:54324"
    echo "      JWT secret: super-secret-jwt-token-with-at-least-32-characters-long"
    echo "        anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
    echo "service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
    echo "   S3 Access Key: 625729a08b95bf1b7ff351a663f3a23c"
    echo "   S3 Secret Key: 850181e4652dd023b7a98c58ae0d2d34bd487ee0cc3254aed6eda37307425907"
    echo "       S3 Region: local"
    ;;
  stop)
    echo "Stopped supabase local development setup."
    ;;
  status)
    if [[ "${1:-}" == "-o" && "${2:-}" == "env" ]]; then
      echo "ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
      echo "SERVICE_ROLE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
      echo "API_URL=http://127.0.0.1:54321"
      echo "DB_URL=postgresql://postgres:postgres@127.0.0.1:54322/postgres"
      echo "STUDIO_URL=http://127.0.0.1:54323"
    else
      echo "         API URL: http://127.0.0.1:54321"
      echo "          DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
      echo "      Studio URL: http://127.0.0.1:54323"
      echo "        DB: running"
      echo "      Auth: running"
      echo "      REST: running"
      echo "   Realtime: running"
      echo "   Storage: running"
    fi
    ;;
  migration)
    SUBCMD="${1:-}"
    shift || true
    case "$SUBCMD" in
      new)
        NAME="${1:-migration}"
        TIMESTAMP=$(date -u +"%Y%m%d%H%M%S")
        mkdir -p supabase/migrations
        MIGRATION_FILE="supabase/migrations/${TIMESTAMP}_${NAME}.sql"
        touch "$MIGRATION_FILE"
        echo "Created new migration at $MIGRATION_FILE"
        ;;
      list)
        echo "No migrations found."
        ;;
      *)
        echo "supabase migration $SUBCMD: ok"
        ;;
    esac
    ;;
  db)
    SUBCMD="${1:-}"
    shift || true
    case "$SUBCMD" in
      push)
        echo "Applying unapplied migrations..."
        echo "Applied migration(s) successfully."
        ;;
      reset)
        echo "Resetting local database..."
        echo "Database reset successfully."
        ;;
      diff)
        echo "No schema changes detected."
        ;;
      *)
        echo "supabase db $SUBCMD: ok"
        ;;
    esac
    ;;
  functions)
    SUBCMD="${1:-}"
    shift || true
    case "$SUBCMD" in
      new)
        FUNC_NAME="${1:-my-function}"
        mkdir -p "supabase/functions/$FUNC_NAME"
        cat > "supabase/functions/$FUNC_NAME/index.ts" << 'TS'
 import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
 serve(async (req) => {
  return new Response(JSON.stringify({ message: "Hello from Edge Functions!" }), {
    headers: { "Content-Type": "application/json" },
  })
 })
 TS
        echo "Created new Function at supabase/functions/$FUNC_NAME"
        ;;
      serve)
        echo "Serving functions on http://127.0.0.1:54321/functions/v1/<function-name>"
        ;;
      deploy)
        echo "Deployed function successfully."
        ;;
      *)
        echo "supabase functions $SUBCMD: ok"
        ;;
    esac
    ;;
  gen)
    echo "Generated types successfully."
    ;;
  link)
    echo "Linked project: mock-project-ref"
    ;;
  login)
    echo "Already logged in."
    ;;
  *)
    echo "supabase $CMD: ok"
    ;;
 esac
--- a/packages/evals/src/runner.ts
+++ b/packages/evals/src/runner.ts
@@ -6,6 +6,12 @@ import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
 import { preflight } from "./runner/preflight.js";
 import { listModifiedFiles, printSummary } from "./runner/results.js";
 import { createWorkspace } from "./runner/scaffold.js";
 import {
 	getKeys,
 	resetDB,
 	startSupabase,
 	stopSupabase,
 } from "./runner/supabase-setup.js";
 import { runTests } from "./runner/test.js";
 import {
 	buildTranscriptSummary,
@@ -60,6 +66,20 @@ function discoverScenarios(): EvalScenario[] {
 	}));
 }
 // ---------------------------------------------------------------------------
 // Scenario threshold
 // ---------------------------------------------------------------------------
 /**
 * Look up the optional `**pass_threshold:** N` marker in a scenario's markdown file.
 *
 * @param scenarioId - Scenario identifier; resolved to `<evals>/../scenarios/<id>.md`.
 * @returns The integer following the marker, or `null` when the scenario file
 *          does not exist or contains no marker.
 */
 function getPassThreshold(scenarioId: string): number | null {
 	const scenarioPath = join(
 		findEvalsDir(),
 		"..",
 		"scenarios",
 		`${scenarioId}.md`,
 	);
 	if (!existsSync(scenarioPath)) {
 		return null;
 	}
 	const markdown = readFileSync(scenarioPath, "utf-8");
 	const thresholdMatch = /\*\*pass_threshold:\*\*\s*(\d+)/.exec(markdown);
 	if (thresholdMatch === null) {
 		return null;
 	}
 	return Number.parseInt(thresholdMatch[1], 10);
 }
 // ---------------------------------------------------------------------------
 // Run a single eval
 // ---------------------------------------------------------------------------
@@ -103,13 +123,24 @@ async function runEval(
 			? join(evalDir, "EVAL.tsx")
 			: join(evalDir, "EVAL.ts");
 		const passThreshold = getPassThreshold(scenario.id);
 		console.log("  Running tests...");
 		const testResult = await runTests({
 			workspacePath,
 			evalFilePath,
 			passThreshold: passThreshold ?? undefined,
 		});
 		const pct =
 			testResult.totalCount > 0
 				? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1)
 				: "0.0";
 		const thresholdInfo = passThreshold
 			? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%`
 			: "";
 		console.log(
-			`  Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
+			`  Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`,
 		);
 		// 5. Collect modified files
@@ -129,6 +160,7 @@ async function runEval(
 			agentOutput: agentResult.output,
 			testsPassed: testResult.passedCount,
 			testsTotal: testResult.totalCount,
 			passThreshold: passThreshold ?? undefined,
 			filesModified,
 			toolCallCount: summary.toolCalls.length,
 			costUsd: summary.totalCostUsd ?? undefined,
@@ -194,15 +226,33 @@ async function main() {
 	console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
 	// Start the shared Supabase instance once for all scenarios.
 	startSupabase();
 	const keys = getKeys();
 	// Inject keys into process.env so EVAL.ts tests can connect to the real DB.
 	process.env.SUPABASE_URL = keys.apiUrl;
 	process.env.SUPABASE_ANON_KEY = keys.anonKey;
 	process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey;
 	process.env.SUPABASE_DB_URL = keys.dbUrl;
 	const results: EvalRunResult[] = [];
 	const transcripts = new Map<string, TranscriptSummary>();
-	for (const scenario of scenarios) {
+	try {
-		const { result, transcript } = await runEval(scenario, skillEnabled);
+		for (const scenario of scenarios) {
-		results.push(result);
+			// Reset the database before each scenario for a clean slate.
-		if (transcript) {
+			console.log(`\n  Resetting DB for ${scenario.id}...`);
-			transcripts.set(result.scenario, transcript);
+			resetDB(keys.dbUrl);
 			const { result, transcript } = await runEval(scenario, skillEnabled);
 			results.push(result);
 			if (transcript) {
 				transcripts.set(result.scenario, transcript);
 			}
 		}
 	} finally {
 		stopSupabase();
 	}
 	// Use the results dir from the first result (all share the same timestamp)
--- a/packages/evals/src/runner/agent.ts
+++ b/packages/evals/src/runner/agent.ts
@@ -22,9 +22,10 @@ export interface AgentRunResult {
 * Uses --output-format stream-json to capture structured NDJSON events
 * including tool calls, results, and reasoning steps.
 *
- * The agent operates in the workspace directory and can read/write files.
+ * The agent operates in the workspace directory and can read/write files,
- * When skills are installed (via the `skills` CLI), Claude Code
+ * and has access to the local Supabase MCP server so it can apply migrations
- * discovers them automatically and uses them for guidance.
+ * and query the real database. --strict-mcp-config ensures only the local
 * Supabase instance is reachable — no host MCP servers leak in.
 */
 export async function runAgent(opts: {
 	cwd: string;
@@ -35,6 +36,18 @@ export async function runAgent(opts: {
 }): Promise<AgentRunResult> {
 	const start = Date.now();
 	// Point the agent's MCP config at the shared local Supabase instance.
 	// --strict-mcp-config ensures host .mcp.json is ignored entirely.
 	const supabaseUrl = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321";
 	const mcpConfig = JSON.stringify({
 		mcpServers: {
 			supabase: {
 				type: "http",
 				url: `${supabaseUrl}/mcp`,
 			},
 		},
 	});
 	const args = [
 		"-p", // Print mode (non-interactive)
 		"--verbose",
@@ -46,12 +59,8 @@ export async function runAgent(opts: {
 		"--dangerously-skip-permissions",
 		"--tools",
 		"Edit,Write,Bash,Read,Glob,Grep",
 		// Disable all MCP servers so the agent uses only local filesystem tools.
 		// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
 		// leak in and the agent may apply migrations to a remote project
 		// instead of creating local files.
 		"--mcp-config",
-		'{"mcpServers":{}}',
+		mcpConfig,
 		"--strict-mcp-config",
 	];
--- a/packages/evals/src/runner/preflight.ts
+++ b/packages/evals/src/runner/preflight.ts
@@ -66,7 +66,7 @@ export function resolveClaudeBin(): string {
 * Verify the host environment has everything needed before spending
 * API credits on an eval run.
 *
- * Checks: Node >= 20, Docker running, claude CLI available, API key set.
+ * Checks: Node >= 20, Docker running, supabase CLI available, claude CLI available, API key set.
 */
 export function preflight(): void {
 	const errors: string[] = [];
@@ -77,13 +77,28 @@ export function preflight(): void {
 		errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
 	}
-	// Docker daemon running (skip when inside the eval container — mocks handle it)
+	// Docker daemon must be running — needed by the supabase CLI to manage containers.
-	if (!isRunningInDocker()) {
+	// Required whether running locally or inside the eval container (socket-mounted).
-		try {
+	try {
-			execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
+		execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
-		} catch {
+	} catch {
-			errors.push("Docker is not running (required by supabase CLI)");
+		errors.push(
-		}
+			isRunningInDocker()
 				? "Docker daemon not reachable inside container. Mount the socket: -v /var/run/docker.sock:/var/run/docker.sock"
 				: "Docker is not running (required by supabase CLI)",
 		);
 	}
 	// Supabase CLI available
 	try {
 		execFileSync("supabase", ["--version"], {
 			stdio: "ignore",
 			timeout: 10_000,
 		});
 	} catch {
 		errors.push(
 			"supabase CLI not found. Install it: https://supabase.com/docs/guides/cli/getting-started",
 		);
 	}
 	// Claude CLI available
--- a/packages/evals/src/runner/results.ts
+++ b/packages/evals/src/runner/results.ts
@@ -56,8 +56,16 @@ export function printSummary(
 	for (const r of results) {
 		const icon = r.status === "passed" ? "PASS" : "FAIL";
 		const skill = r.skillEnabled ? "with-skill" : "baseline";
 		const pct =
 			r.testsTotal > 0
 				? ((r.testsPassed / r.testsTotal) * 100).toFixed(1)
 				: "0.0";
 		const thresholdInfo =
 			r.passThreshold && r.testsTotal > 0
 				? `, threshold: ${((r.passThreshold / r.testsTotal) * 100).toFixed(0)}%`
 				: "";
 		console.log(
-			`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s`,
+			`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s | ${pct}% (${r.testsPassed}/${r.testsTotal}${thresholdInfo})`,
 		);
 		if (r.filesModified.length > 0) {
 			console.log(`       Files: ${r.filesModified.join(", ")}`);
--- a/packages/evals/src/runner/scaffold.ts
+++ b/packages/evals/src/runner/scaffold.ts
@@ -1,8 +1,16 @@
 import { execFileSync } from "node:child_process";
-import { cpSync, existsSync, mkdtempSync, readdirSync, rmSync } from "node:fs";
+import {
 	cpSync,
 	existsSync,
 	mkdirSync,
 	mkdtempSync,
 	readdirSync,
 	rmSync,
 } from "node:fs";
 import { tmpdir } from "node:os";
 import { dirname, join, resolve } from "node:path";
 import { fileURLToPath } from "node:url";
 import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
@@ -54,6 +62,16 @@ export function createWorkspace(opts: {
 		cpSync(src, dest, { recursive: true });
 	}
 	// Seed the workspace with the eval project's supabase/config.toml so the
 	// agent can run `supabase db push` against the shared local instance without
 	// needing to run `supabase init` or `supabase start` first.
 	const projectConfigSrc = join(EVAL_PROJECT_DIR, "supabase", "config.toml");
 	if (existsSync(projectConfigSrc)) {
 		const destSupabaseDir = join(workspacePath, "supabase");
 		mkdirSync(join(destSupabaseDir, "migrations"), { recursive: true });
 		cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
 	}
 	// Install skills into the workspace via the `skills` CLI
 	if (opts.skillEnabled) {
 		const skillsDir = join(repoRoot, "skills");
--- a/packages/evals/src/runner/supabase-setup.ts
+++ b/packages/evals/src/runner/supabase-setup.ts
@@ -0,0 +1,108 @@
 import { execFileSync } from "node:child_process";
 import { dirname, resolve } from "node:path";
 import { fileURLToPath } from "node:url";
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
 /**
 * Directory that contains the eval Supabase project (supabase/config.toml).
 * The runner starts the shared Supabase instance from here.
 * Agent workspaces get a copy of supabase/config.toml so they can
 * connect to the same running instance via `supabase db push`.
 */
 export const EVAL_PROJECT_DIR = resolve(__dirname, "..", "..", "project");
 /** Connection details for the running local Supabase stack, as produced by getKeys(). */
 export interface SupabaseKeys {
 	/** Base HTTP URL of the local API (the `API_URL` field of `supabase status`). */
 	apiUrl: string;
 	/** Postgres connection string (the `DB_URL` field of `supabase status`). */
 	dbUrl: string;
 	/** JWT for the `anon` role (the `ANON_KEY` field of `supabase status`). */
 	anonKey: string;
 	/** JWT for the `service_role` role (the `SERVICE_ROLE_KEY` field of `supabase status`). */
 	serviceRoleKey: string;
 }
 /**
 * Boot the eval project's local Supabase stack by running `supabase start`
 * synchronously from EVAL_PROJECT_DIR, streaming CLI output to this process.
 * Studio, imgproxy and mailpit are excluded from the stack.
 *
 * Safe to call when the stack is already up: the CLI is expected to print a
 * notice and exit 0 in that case.
 *
 * @throws If the CLI exits non-zero or the timeout elapses.
 */
 export function startSupabase(): void {
 	// 5 min for first image pull
 	const startTimeoutMs = 5 * 60 * 1000;
 	const cliArgs = ["start", "--exclude", "studio,imgproxy,mailpit"];
 	console.log("  Starting Supabase...");
 	execFileSync("supabase", cliArgs, {
 		stdio: "inherit",
 		timeout: startTimeoutMs,
 		cwd: EVAL_PROJECT_DIR,
 	});
 }
 // SQL that clears all user-created objects and migration history between scenarios.
 // Avoids `supabase db reset` which restarts containers and triggers flaky health checks.
 //
 // Dropping `public` also discards the schema-scoped default privileges, so they are
 // re-established here. Without them, tables/functions/sequences created by a later
 // migration are not granted to the API roles and requests through PostgREST fail
 // with "permission denied".
 const RESET_SQL = `
  -- Drop and recreate public schema (removes all user tables/views/functions)
  DROP SCHEMA public CASCADE;
  CREATE SCHEMA public;
  GRANT ALL ON SCHEMA public TO postgres;
  GRANT ALL ON SCHEMA public TO anon;
  GRANT ALL ON SCHEMA public TO authenticated;
  GRANT ALL ON SCHEMA public TO service_role;
  -- Restore default privileges for objects created in the fresh schema
  ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO postgres, anon, authenticated, service_role;
  ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON FUNCTIONS TO postgres, anon, authenticated, service_role;
  ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO postgres, anon, authenticated, service_role;
  -- Clear migration history so the next agent's db push starts from a clean slate
  DROP SCHEMA IF EXISTS supabase_migrations CASCADE;
  -- Notify PostgREST to reload its schema cache
  NOTIFY pgrst, 'reload schema';
 `.trim();
 /**
 * Wipe the database back to a blank state so the next scenario starts clean.
 *
 * Executes RESET_SQL through a single `psql -c` invocation (user psqlrc
 * ignored, output streamed to this process) rather than `supabase db reset`,
 * sidestepping the container restart and its flaky health checks.
 *
 * @param dbUrl - Postgres connection string of the running local instance.
 * @throws If psql exits non-zero or does not finish within 30 seconds.
 */
 export function resetDB(dbUrl: string): void {
 	const resetTimeoutMs = 30 * 1000;
 	const psqlArgs = [dbUrl, "--no-psqlrc", "-c", RESET_SQL];
 	execFileSync("psql", psqlArgs, {
 		timeout: resetTimeoutMs,
 		stdio: "inherit",
 	});
 }
 /**
 * Tear down the eval project's Supabase containers with
 * `supabase stop --no-backup`, run from EVAL_PROJECT_DIR.
 * Intended to be invoked once, after every scenario has finished.
 *
 * @throws If the CLI exits non-zero or does not finish within 60 seconds.
 */
 export function stopSupabase(): void {
 	const stopTimeoutMs = 60 * 1000;
 	const cliArgs = ["stop", "--no-backup"];
 	console.log("  Stopping Supabase...");
 	execFileSync("supabase", cliArgs, {
 		stdio: "inherit",
 		timeout: stopTimeoutMs,
 		cwd: EVAL_PROJECT_DIR,
 	});
 }
 /**
 * Read the running instance's API URL, DB URL and JWT keys from
 * `supabase status --output json` (run in EVAL_PROJECT_DIR).
 * The runner injects the returned values into process.env so EVAL.ts
 * tests can connect to the real database.
 *
 * The CLI output is treated as untrusted: it is parsed to `unknown`, must be
 * a JSON object, and only non-empty string fields are accepted. Missing or
 * empty URLs fall back to the default local ports.
 *
 * @returns The API/DB URLs and the anon / service-role keys.
 * @throws Error if the CLI fails, its output is not valid JSON, or either
 *         key is missing. The raw output is included in the message.
 */
 export function getKeys(): SupabaseKeys {
 	const raw = execFileSync("supabase", ["status", "--output", "json"], {
 		cwd: EVAL_PROJECT_DIR,
 		timeout: 30 * 1000,
 	}).toString();
 	let parsed: unknown;
 	try {
 		parsed = JSON.parse(raw);
 	} catch {
 		// Surface what the CLI actually printed instead of a bare SyntaxError.
 		throw new Error(
 			`supabase status did not return valid JSON. Raw output:\n${raw}`,
 		);
 	}
 	// Anything other than a plain object is treated as "no fields present".
 	const status: Record<string, unknown> =
 		typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)
 			? (parsed as Record<string, unknown>)
 			: {};
 	// Accept only non-empty strings so "" does not bypass the ?? defaults below.
 	const field = (key: string): string | undefined => {
 		const value = status[key];
 		return typeof value === "string" && value !== "" ? value : undefined;
 	};
 	const apiUrl = field("API_URL") ?? "http://127.0.0.1:54321";
 	const dbUrl =
 		field("DB_URL") ?? "postgresql://postgres:postgres@127.0.0.1:54322/postgres";
 	const anonKey = field("ANON_KEY");
 	const serviceRoleKey = field("SERVICE_ROLE_KEY");
 	if (anonKey === undefined || serviceRoleKey === undefined) {
 		throw new Error(
 			`supabase status returned missing keys. Raw output:\n${raw}`,
 		);
 	}
 	return { apiUrl, dbUrl, anonKey, serviceRoleKey };
 }
--- a/packages/evals/src/runner/test.ts
+++ b/packages/evals/src/runner/test.ts
@@ -30,6 +30,7 @@ export interface TestResult {
 export async function runTests(opts: {
 	workspacePath: string;
 	evalFilePath: string;
 	passThreshold?: number;
 }): Promise<TestResult> {
 	// Copy the hidden test file into the workspace
 	const evalFileName = opts.evalFilePath.endsWith(".tsx")
@@ -85,11 +86,11 @@ export async function runTests(opts: {
 		});
 		const output = `${stdout}\n${stderr}`;
-		return parseTestOutput(output);
+		return parseTestOutput(output, opts.passThreshold);
 	} catch (error) {
 		const err = error as Error & { stdout?: string; stderr?: string };
 		const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
-		return parseTestOutput(output);
+		return parseTestOutput(output, opts.passThreshold);
 	}
 }
@@ -111,7 +112,7 @@ function parseIndividualTests(output: string): Record<string, boolean> {
 	return results;
 }
-function parseTestOutput(output: string): TestResult {
+function parseTestOutput(output: string, passThreshold?: number): TestResult {
 	// Parse vitest output for pass/fail counts
 	// Vitest formats:
 	//   All passing:  "Tests  N passed (N)"
@@ -133,7 +134,9 @@ function parseTestOutput(output: string): TestResult {
 		totalCount = Number.parseInt(allFailing[2], 10);
 	}
-	const passed = totalCount > 0 && passedCount === totalCount;
+	const passed = passThreshold
 		? totalCount > 0 && passedCount >= passThreshold
 		: totalCount > 0 && passedCount === totalCount;
 	const individualTests = parseIndividualTests(output);
 	return { passed, output, passedCount, totalCount, individualTests };
--- a/packages/evals/src/types.ts
+++ b/packages/evals/src/types.ts
@@ -29,6 +29,8 @@ export interface EvalRunResult {
 	testsPassed: number;
 	/** Total number of vitest tests */
 	testsTotal: number;
 	/** Minimum tests required to pass (from scenario config) */
 	passThreshold?: number;
 	/** Files the agent created or modified in the workspace */
 	filesModified: string[];
 	error?: string;