feat(evals): replace mock CLIs with real Supabase instance per eval run

Start a shared local Supabase stack once before all scenarios and reset
the database (drop/recreate public schema + clear migration history) between
each run. This lets agents apply migrations via `supabase db push` against a
real Postgres instance instead of mock shell scripts.

- Add supabase-setup.ts: startSupabase / stopSupabase / resetDB / getKeys
- Update runner.ts to start/stop Supabase and inject keys into process.env
- Update agent.ts to point MCP config at the local Supabase HTTP endpoint
- Update preflight.ts to check supabase CLI availability and Docker socket
- Update scaffold.ts to seed workspace with supabase/config.toml
- Add passThreshold support (test.ts / results.ts / types.ts) for partial pass
- Delete mock shell scripts (mocks/docker, mocks/psql, mocks/supabase)
- Update Dockerfile/docker-compose to mount Docker socket for supabase CLI

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Pedro Rodrigues
2026-02-25 14:39:54 +00:00
parent 2da5cae2ac
commit 9b08864e94
14 changed files with 277 additions and 249 deletions

View File

@@ -29,13 +29,33 @@ RUN npm --prefix packages/skills-build run build
# ---------- Stage 2: runtime ----------
FROM node:22-slim
RUN apt-get update && apt-get install -y --no-install-recommends git && \
rm -rf /var/lib/apt/lists/*
# Install Docker CLI and curl (needed for supabase CLI install)
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
curl \
ca-certificates \
docker.io \
&& rm -rf /var/lib/apt/lists/*
# Install supabase CLI binary (pinned version)
ARG SUPABASE_CLI_VERSION=2.67.1
RUN ARCH=$(dpkg --print-architecture) && \
case "$ARCH" in \
amd64) SUPABASE_ARCH="linux_amd64" ;; \
arm64) SUPABASE_ARCH="linux_arm64" ;; \
*) echo "Unsupported arch: $ARCH" && exit 1 ;; \
esac && \
curl -fsSL "https://github.com/supabase/cli/releases/download/v${SUPABASE_CLI_VERSION}/supabase_${SUPABASE_ARCH}.tar.gz" \
| tar xz -C /usr/local/bin supabase && \
chmod +x /usr/local/bin/supabase
WORKDIR /app
# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root
# node:22-slim already ships with user "node" (uid=1000, gid=1000)
# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root.
# Add node user to the docker group so it can reach the mounted Docker socket.
# DOCKER_GID must match the host's docker group GID (default 999 on most Linux systems).
ARG DOCKER_GID=999
RUN groupadd -f -g ${DOCKER_GID} docker && usermod -aG docker node
# Copy built artifacts from builder
COPY --from=builder /app/package.json /app/package-lock.json ./
@@ -44,12 +64,6 @@ COPY --from=builder /app/skills/ skills/
COPY --from=builder /app/packages/skills-build/ packages/skills-build/
COPY --from=builder /app/packages/evals/ packages/evals/
# Install mock scripts
COPY packages/evals/mocks/supabase /usr/local/bin/supabase
COPY packages/evals/mocks/docker /usr/local/bin/docker
COPY packages/evals/mocks/psql /usr/local/bin/psql
RUN chmod +x /usr/local/bin/supabase /usr/local/bin/docker /usr/local/bin/psql
# Install entrypoint
COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

View File

@@ -3,6 +3,10 @@ services:
build:
context: ../..
dockerfile: packages/evals/Dockerfile
args:
# Match the host's docker group GID so the node user can reach the socket.
# Override with: DOCKER_GID=$(getent group docker | cut -d: -f3) docker compose up
DOCKER_GID: "${DOCKER_GID:-999}"
environment:
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
- EVAL_MODEL=${EVAL_MODEL:-}
@@ -15,3 +19,5 @@ services:
- EVAL_RESULTS_DIR=/app/results
volumes:
- ./results:/app/results
# Mount the host Docker socket so the supabase CLI can manage containers.
- /var/run/docker.sock:/var/run/docker.sock

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env bash
# Entrypoint for the eval Docker container.
# Validates environment, adds mocks to PATH, then runs the given command.
# Validates environment, then runs the given command.
set -euo pipefail
export IN_DOCKER=true
@@ -12,13 +12,11 @@ if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
exit 1
fi
# Prepend mocks directory to PATH so mock supabase/docker/psql are found first
export PATH="/app/packages/evals/mocks:${PATH}"
echo "=== Eval Environment ==="
echo " Node: $(node --version)"
echo " Claude: $(claude --version 2>/dev/null || echo 'n/a')"
echo " Docker: mock"
echo " Supabase: $(supabase --version 2>/dev/null || echo 'n/a')"
echo " Docker: $(docker --version 2>/dev/null || echo 'n/a')"
echo " Model: ${EVAL_MODEL:-default}"
echo " Scenario: ${EVAL_SCENARIO:-all}"
echo "========================"

View File

@@ -1,27 +0,0 @@
#!/usr/bin/env bash
# Mock Docker CLI for eval environments.
# Recognizes a handful of subcommands and always exits 0 so agent
# scripts that probe Docker never fail.
set -euo pipefail

subcommand="${1:-}"
shift || true

if [[ "$subcommand" == "ps" ]]; then
# Empty container listing: header row only.
echo "CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES"
elif [[ "$subcommand" == "exec" ]]; then
# Discard leading flags; the container name and command are ignored.
while [[ "${1:-}" == -* ]]; do shift || true; done
elif [[ "$subcommand" == "info" ]]; then
echo "Server Version: 24.0.0 (mock)"
elif [[ "$subcommand" == "compose" ]]; then
echo "docker compose: ok"
fi
# Any other subcommand falls through and succeeds silently.

View File

@@ -1,15 +0,0 @@
#!/usr/bin/env bash
# Mock psql for eval environments: consumes any arguments and reports
# an empty result set.
set -euo pipefail

# Scan the argument list; a -c flag (inline SQL command) gets a
# "(0 rows)" reply, mimicking a SELECT that matched nothing.
while (( $# > 0 )); do
if [[ "$1" == "-c" ]]; then
echo "(0 rows)"
exit 0
fi
shift
done

# No inline command given: succeed silently.
exit 0

View File

@@ -1,161 +0,0 @@
#!/usr/bin/env bash
# Mock Supabase CLI for eval environments.
# Returns realistic output so the agent doesn't retry, and creates real
# migration files when asked.
set -euo pipefail
# First positional argument selects the subcommand; the remaining args
# (after shift) are that subcommand's own arguments.
CMD="${1:-}"
shift || true
case "$CMD" in
init)
# Scaffold the standard project layout plus a minimal config.toml.
mkdir -p supabase/migrations supabase/functions
cat > supabase/config.toml << 'TOML'
[project]
id = "mock-project-ref"
[api]
enabled = true
port = 54321
schemas = ["public", "graphql_public"]
[db]
port = 54322
major_version = 15
[studio]
enabled = true
port = 54323
TOML
echo "Finished supabase init."
;;
start)
# Reproduce the banner a real `supabase start` prints: local URLs plus
# the well-known demo JWTs / S3 credentials (safe to publish — they only
# work against a local demo stack).
echo "Applying migration 00000000000000_init.sql..."
echo "Started supabase local development setup."
echo ""
echo " API URL: http://127.0.0.1:54321"
echo " GraphQL URL: http://127.0.0.1:54321/graphql/v1"
echo " S3 Storage URL: http://127.0.0.1:54321/storage/v1/s3"
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo " Studio URL: http://127.0.0.1:54323"
echo " Inbucket URL: http://127.0.0.1:54324"
echo " JWT secret: super-secret-jwt-token-with-at-least-32-characters-long"
echo " anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
echo "service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
echo " S3 Access Key: 625729a08b95bf1b7ff351a663f3a23c"
echo " S3 Secret Key: 850181e4652dd023b7a98c58ae0d2d34bd487ee0cc3254aed6eda37307425907"
echo " S3 Region: local"
;;
stop)
echo "Stopped supabase local development setup."
;;
status)
# `status -o env` prints machine-readable KEY=VALUE pairs; any other
# invocation prints the human-readable table.
if [[ "${1:-}" == "-o" && "${2:-}" == "env" ]]; then
echo "ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
echo "SERVICE_ROLE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
echo "API_URL=http://127.0.0.1:54321"
echo "DB_URL=postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo "STUDIO_URL=http://127.0.0.1:54323"
else
echo " API URL: http://127.0.0.1:54321"
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo " Studio URL: http://127.0.0.1:54323"
echo " DB: running"
echo " Auth: running"
echo " REST: running"
echo " Realtime: running"
echo " Storage: running"
fi
;;
migration)
# Nested dispatch on the migration subcommand.
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
new)
# Create a real (empty) migration file using the same
# <UTC-timestamp>_<name>.sql naming the real CLI uses.
NAME="${1:-migration}"
TIMESTAMP=$(date -u +"%Y%m%d%H%M%S")
mkdir -p supabase/migrations
MIGRATION_FILE="supabase/migrations/${TIMESTAMP}_${NAME}.sql"
touch "$MIGRATION_FILE"
echo "Created new migration at $MIGRATION_FILE"
;;
list)
echo "No migrations found."
;;
*)
echo "supabase migration $SUBCMD: ok"
;;
esac
;;
db)
# db push/reset/diff all succeed with canned output; no database exists.
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
push)
echo "Applying unapplied migrations..."
echo "Applied migration(s) successfully."
;;
reset)
echo "Resetting local database..."
echo "Database reset successfully."
;;
diff)
echo "No schema changes detected."
;;
*)
echo "supabase db $SUBCMD: ok"
;;
esac
;;
functions)
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
new)
# Create a real starter edge function on disk, like the real CLI.
FUNC_NAME="${1:-my-function}"
mkdir -p "supabase/functions/$FUNC_NAME"
cat > "supabase/functions/$FUNC_NAME/index.ts" << 'TS'
import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
serve(async (req) => {
return new Response(JSON.stringify({ message: "Hello from Edge Functions!" }), {
headers: { "Content-Type": "application/json" },
})
})
TS
echo "Created new Function at supabase/functions/$FUNC_NAME"
;;
serve)
echo "Serving functions on http://127.0.0.1:54321/functions/v1/<function-name>"
;;
deploy)
echo "Deployed function successfully."
;;
*)
echo "supabase functions $SUBCMD: ok"
;;
esac
;;
gen)
echo "Generated types successfully."
;;
link)
echo "Linked project: mock-project-ref"
;;
login)
echo "Already logged in."
;;
*)
# Unknown subcommands acknowledge and succeed so the agent never retries.
echo "supabase $CMD: ok"
;;
esac

View File

@@ -6,6 +6,12 @@ import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
import { preflight } from "./runner/preflight.js";
import { listModifiedFiles, printSummary } from "./runner/results.js";
import { createWorkspace } from "./runner/scaffold.js";
import {
getKeys,
resetDB,
startSupabase,
stopSupabase,
} from "./runner/supabase-setup.js";
import { runTests } from "./runner/test.js";
import {
buildTranscriptSummary,
@@ -60,6 +66,20 @@ function discoverScenarios(): EvalScenario[] {
}));
}
// ---------------------------------------------------------------------------
// Scenario threshold
// ---------------------------------------------------------------------------
/**
 * Read the optional `**pass_threshold:** N` marker from a scenario's
 * markdown file.
 *
 * @param scenarioId - Scenario identifier; resolved to `scenarios/<id>.md`.
 * @returns The parsed integer threshold, or null when the scenario file
 *   is missing or carries no marker.
 */
function getPassThreshold(scenarioId: string): number | null {
  const scenarioPath = join(
    findEvalsDir(),
    "..",
    "scenarios",
    `${scenarioId}.md`,
  );
  if (!existsSync(scenarioPath)) {
    return null;
  }
  const markerMatch = /\*\*pass_threshold:\*\*\s*(\d+)/.exec(
    readFileSync(scenarioPath, "utf-8"),
  );
  return markerMatch === null ? null : Number.parseInt(markerMatch[1], 10);
}
// ---------------------------------------------------------------------------
// Run a single eval
// ---------------------------------------------------------------------------
@@ -103,13 +123,24 @@ async function runEval(
? join(evalDir, "EVAL.tsx")
: join(evalDir, "EVAL.ts");
const passThreshold = getPassThreshold(scenario.id);
console.log(" Running tests...");
const testResult = await runTests({
workspacePath,
evalFilePath,
passThreshold: passThreshold ?? undefined,
});
const pct =
testResult.totalCount > 0
? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1)
: "0.0";
const thresholdInfo = passThreshold
? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%`
: "";
console.log(
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`,
);
// 5. Collect modified files
@@ -129,6 +160,7 @@ async function runEval(
agentOutput: agentResult.output,
testsPassed: testResult.passedCount,
testsTotal: testResult.totalCount,
passThreshold: passThreshold ?? undefined,
filesModified,
toolCallCount: summary.toolCalls.length,
costUsd: summary.totalCostUsd ?? undefined,
@@ -194,16 +226,34 @@ async function main() {
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
// Start the shared Supabase instance once for all scenarios.
startSupabase();
const keys = getKeys();
// Inject keys into process.env so EVAL.ts tests can connect to the real DB.
process.env.SUPABASE_URL = keys.apiUrl;
process.env.SUPABASE_ANON_KEY = keys.anonKey;
process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey;
process.env.SUPABASE_DB_URL = keys.dbUrl;
const results: EvalRunResult[] = [];
const transcripts = new Map<string, TranscriptSummary>();
try {
for (const scenario of scenarios) {
// Reset the database before each scenario for a clean slate.
console.log(`\n Resetting DB for ${scenario.id}...`);
resetDB(keys.dbUrl);
const { result, transcript } = await runEval(scenario, skillEnabled);
results.push(result);
if (transcript) {
transcripts.set(result.scenario, transcript);
}
}
} finally {
stopSupabase();
}
// Use the results dir from the first result (all share the same timestamp)
const resultsDir = results.find((r) => r.resultsDir)?.resultsDir;

View File

@@ -22,9 +22,10 @@ export interface AgentRunResult {
* Uses --output-format stream-json to capture structured NDJSON events
* including tool calls, results, and reasoning steps.
*
* The agent operates in the workspace directory and can read/write files.
* When skills are installed (via the `skills` CLI), Claude Code
* discovers them automatically and uses them for guidance.
* The agent operates in the workspace directory and can read/write files,
* and has access to the local Supabase MCP server so it can apply migrations
* and query the real database. --strict-mcp-config ensures only the local
* Supabase instance is reachable — no host MCP servers leak in.
*/
export async function runAgent(opts: {
cwd: string;
@@ -35,6 +36,18 @@ export async function runAgent(opts: {
}): Promise<AgentRunResult> {
const start = Date.now();
// Point the agent's MCP config at the shared local Supabase instance.
// --strict-mcp-config ensures host .mcp.json is ignored entirely.
const supabaseUrl = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321";
const mcpConfig = JSON.stringify({
mcpServers: {
supabase: {
type: "http",
url: `${supabaseUrl}/mcp`,
},
},
});
const args = [
"-p", // Print mode (non-interactive)
"--verbose",
@@ -46,12 +59,8 @@ export async function runAgent(opts: {
"--dangerously-skip-permissions",
"--tools",
"Edit,Write,Bash,Read,Glob,Grep",
// Disable all MCP servers so the agent uses only local filesystem tools.
// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
// leak in and the agent may apply migrations to a remote project
// instead of creating local files.
"--mcp-config",
'{"mcpServers":{}}',
mcpConfig,
"--strict-mcp-config",
];

View File

@@ -66,7 +66,7 @@ export function resolveClaudeBin(): string {
* Verify the host environment has everything needed before spending
* API credits on an eval run.
*
* Checks: Node >= 20, Docker running, claude CLI available, API key set.
* Checks: Node >= 20, Docker running, supabase CLI available, claude CLI available, API key set.
*/
export function preflight(): void {
const errors: string[] = [];
@@ -77,13 +77,28 @@ export function preflight(): void {
errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
}
// Docker daemon running (skip when inside the eval container — mocks handle it)
if (!isRunningInDocker()) {
// Docker daemon must be running — needed by the supabase CLI to manage containers.
// Required whether running locally or inside the eval container (socket-mounted).
try {
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
} catch {
errors.push("Docker is not running (required by supabase CLI)");
errors.push(
isRunningInDocker()
? "Docker daemon not reachable inside container. Mount the socket: -v /var/run/docker.sock:/var/run/docker.sock"
: "Docker is not running (required by supabase CLI)",
);
}
// Supabase CLI available
try {
execFileSync("supabase", ["--version"], {
stdio: "ignore",
timeout: 10_000,
});
} catch {
errors.push(
"supabase CLI not found. Install it: https://supabase.com/docs/guides/cli/getting-started",
);
}
// Claude CLI available

View File

@@ -56,8 +56,16 @@ export function printSummary(
for (const r of results) {
const icon = r.status === "passed" ? "PASS" : "FAIL";
const skill = r.skillEnabled ? "with-skill" : "baseline";
const pct =
r.testsTotal > 0
? ((r.testsPassed / r.testsTotal) * 100).toFixed(1)
: "0.0";
const thresholdInfo =
r.passThreshold && r.testsTotal > 0
? `, threshold: ${((r.passThreshold / r.testsTotal) * 100).toFixed(0)}%`
: "";
console.log(
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s`,
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s | ${pct}% (${r.testsPassed}/${r.testsTotal}${thresholdInfo})`,
);
if (r.filesModified.length > 0) {
console.log(` Files: ${r.filesModified.join(", ")}`);

View File

@@ -1,8 +1,16 @@
import { execFileSync } from "node:child_process";
import { cpSync, existsSync, mkdtempSync, readdirSync, rmSync } from "node:fs";
import {
cpSync,
existsSync,
mkdirSync,
mkdtempSync,
readdirSync,
rmSync,
} from "node:fs";
import { tmpdir } from "node:os";
import { dirname, join, resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
@@ -54,6 +62,16 @@ export function createWorkspace(opts: {
cpSync(src, dest, { recursive: true });
}
// Seed the workspace with the eval project's supabase/config.toml so the
// agent can run `supabase db push` against the shared local instance without
// needing to run `supabase init` or `supabase start` first.
const projectConfigSrc = join(EVAL_PROJECT_DIR, "supabase", "config.toml");
if (existsSync(projectConfigSrc)) {
const destSupabaseDir = join(workspacePath, "supabase");
mkdirSync(join(destSupabaseDir, "migrations"), { recursive: true });
cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
}
// Install skills into the workspace via the `skills` CLI
if (opts.skillEnabled) {
const skillsDir = join(repoRoot, "skills");

View File

@@ -0,0 +1,108 @@
import { execFileSync } from "node:child_process";
import { dirname, resolve } from "node:path";
import { fileURLToPath } from "node:url";
// Resolve this module's own location (ESM provides no __dirname builtin).
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Directory that contains the eval Supabase project (supabase/config.toml).
 * The runner starts the shared Supabase instance from here.
 * Agent workspaces get a copy of supabase/config.toml so they can
 * connect to the same running instance via `supabase db push`.
 */
export const EVAL_PROJECT_DIR = resolve(__dirname, "..", "..", "project");

/** Connection details for the running local Supabase stack. */
export interface SupabaseKeys {
  // Base URL of the HTTP API (e.g. http://127.0.0.1:54321).
  apiUrl: string;
  // Direct Postgres connection string.
  dbUrl: string;
  // JWT for the anon role.
  anonKey: string;
  // JWT for the service_role — privileged; local use only.
  serviceRoleKey: string;
}
/**
 * Bring up the local Supabase stack for the eval project.
 * Safe to call when already running: the CLI reports that and exits 0.
 */
export function startSupabase(): void {
  console.log(" Starting Supabase...");
  const cliArgs = ["start", "--exclude", "studio,imgproxy,mailpit"];
  // Generous timeout: the first run may need to pull container images.
  const fiveMinutesMs = 5 * 60 * 1000;
  execFileSync("supabase", cliArgs, {
    cwd: EVAL_PROJECT_DIR,
    stdio: "inherit",
    timeout: fiveMinutesMs,
  });
}
// SQL that clears all user-created objects and migration history between scenarios.
// Avoids `supabase db reset` which restarts containers and triggers flaky health checks.
// NOTE(review): sent as a single psql -c command — presumably the statements run
// as one batch; confirm transactional behavior if statements are added here.
const RESET_SQL = `
-- Drop and recreate public schema (removes all user tables/views/functions)
DROP SCHEMA public CASCADE;
CREATE SCHEMA public;
GRANT ALL ON SCHEMA public TO postgres;
GRANT ALL ON SCHEMA public TO anon;
GRANT ALL ON SCHEMA public TO authenticated;
GRANT ALL ON SCHEMA public TO service_role;
-- Clear migration history so the next agent's db push starts from a clean slate
DROP SCHEMA IF EXISTS supabase_migrations CASCADE;
-- Notify PostgREST to reload its schema cache
NOTIFY pgrst, 'reload schema';
`.trim();
/**
 * Wipe the database back to a clean state between scenarios.
 *
 * Runs RESET_SQL directly through psql rather than `supabase db reset`,
 * sidestepping the container-restart cycle and its flaky health checks.
 * Drops the public schema (all user tables) and the migration history so
 * `supabase db push` in agent workspaces always starts fresh.
 *
 * @param dbUrl - Postgres connection string of the running instance.
 */
export function resetDB(dbUrl: string): void {
  const psqlArgs = [dbUrl, "--no-psqlrc", "-c", RESET_SQL];
  execFileSync("psql", psqlArgs, { stdio: "inherit", timeout: 30_000 });
}
/**
 * Tear down every Supabase container belonging to the eval project.
 * Invoked once, after the full scenario list has run.
 */
export function stopSupabase(): void {
  console.log(" Stopping Supabase...");
  const stopArgs = ["stop", "--no-backup"];
  const oneMinuteMs = 60_000;
  execFileSync("supabase", stopArgs, {
    cwd: EVAL_PROJECT_DIR,
    stdio: "inherit",
    timeout: oneMinuteMs,
  });
}
/**
 * Read the running instance's API URL and JWT keys via `supabase status`.
 *
 * Returns values that the runner injects into process.env so EVAL.ts
 * tests can connect to the real database.
 *
 * @returns API/DB URLs (with local defaults) and the two JWTs.
 * @throws Error when the CLI output is not valid JSON, is not an object,
 *   or omits either JWT.
 */
export function getKeys(): SupabaseKeys {
  const raw = execFileSync("supabase", ["status", "--output", "json"], {
    cwd: EVAL_PROJECT_DIR,
    timeout: 30 * 1000,
  }).toString();

  // `JSON.parse` is untyped; validate the shape instead of blindly asserting
  // it, so a CLI output-format change fails loudly here rather than
  // propagating bad values downstream.
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw new Error(`supabase status did not return valid JSON:\n${raw}`);
  }
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error(`supabase status returned non-object JSON:\n${raw}`);
  }
  const status = parsed as Record<string, unknown>;
  // Read a string field, treating any non-string value as absent.
  const field = (key: string): string | undefined => {
    const value = status[key];
    return typeof value === "string" ? value : undefined;
  };

  const apiUrl = field("API_URL") ?? "http://127.0.0.1:54321";
  const dbUrl =
    field("DB_URL") ??
    "postgresql://postgres:postgres@127.0.0.1:54322/postgres";
  const anonKey = field("ANON_KEY") ?? "";
  const serviceRoleKey = field("SERVICE_ROLE_KEY") ?? "";

  if (!anonKey || !serviceRoleKey) {
    throw new Error(
      `supabase status returned missing keys. Raw output:\n${raw}`,
    );
  }
  return { apiUrl, dbUrl, anonKey, serviceRoleKey };
}

View File

@@ -30,6 +30,7 @@ export interface TestResult {
export async function runTests(opts: {
workspacePath: string;
evalFilePath: string;
passThreshold?: number;
}): Promise<TestResult> {
// Copy the hidden test file into the workspace
const evalFileName = opts.evalFilePath.endsWith(".tsx")
@@ -85,11 +86,11 @@ export async function runTests(opts: {
});
const output = `${stdout}\n${stderr}`;
return parseTestOutput(output);
return parseTestOutput(output, opts.passThreshold);
} catch (error) {
const err = error as Error & { stdout?: string; stderr?: string };
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
return parseTestOutput(output);
return parseTestOutput(output, opts.passThreshold);
}
}
@@ -111,7 +112,7 @@ function parseIndividualTests(output: string): Record<string, boolean> {
return results;
}
function parseTestOutput(output: string): TestResult {
function parseTestOutput(output: string, passThreshold?: number): TestResult {
// Parse vitest output for pass/fail counts
// Vitest formats:
// All passing: "Tests N passed (N)"
@@ -133,7 +134,9 @@ function parseTestOutput(output: string): TestResult {
totalCount = Number.parseInt(allFailing[2], 10);
}
const passed = totalCount > 0 && passedCount === totalCount;
const passed = passThreshold
? totalCount > 0 && passedCount >= passThreshold
: totalCount > 0 && passedCount === totalCount;
const individualTests = parseIndividualTests(output);
return { passed, output, passedCount, totalCount, individualTests };

View File

@@ -29,6 +29,8 @@ export interface EvalRunResult {
testsPassed: number;
/** Total number of vitest tests */
testsTotal: number;
/** Minimum tests required to pass (from scenario config) */
passThreshold?: number;
/** Files the agent created or modified in the workspace */
filesModified: string[];
error?: string;