mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
feat(evals): replace mock CLIs with real Supabase instance per eval run
Start a shared local Supabase stack once before all scenarios and reset the database (drop/recreate public schema + clear migration history) between each run. This lets agents apply migrations via `supabase db push` against a real Postgres instance instead of mock shell scripts.

- Add supabase-setup.ts: startSupabase / stopSupabase / resetDB / getKeys
- Update runner.ts to start/stop Supabase and inject keys into process.env
- Update agent.ts to point MCP config at the local Supabase HTTP endpoint
- Update preflight.ts to check supabase CLI availability and Docker socket
- Update scaffold.ts to seed workspace with supabase/config.toml
- Add passThreshold support (test.ts / results.ts / types.ts) for partial pass
- Delete mock shell scripts (mocks/docker, mocks/psql, mocks/supabase)
- Update Dockerfile/docker-compose to mount Docker socket for supabase CLI

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -29,13 +29,33 @@ RUN npm --prefix packages/skills-build run build
|
|||||||
# ---------- Stage 2: runtime ----------
|
# ---------- Stage 2: runtime ----------
|
||||||
FROM node:22-slim
|
FROM node:22-slim
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends git && \
|
# Install git, the Docker CLI (used by the supabase CLI to manage containers), and curl + CA certificates (needed to download the supabase CLI)
|
||||||
rm -rf /var/lib/apt/lists/*
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
git \
|
||||||
|
curl \
|
||||||
|
ca-certificates \
|
||||||
|
docker.io \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install supabase CLI binary (pinned version)
|
||||||
|
ARG SUPABASE_CLI_VERSION=2.67.1
|
||||||
|
RUN ARCH=$(dpkg --print-architecture) && \
|
||||||
|
case "$ARCH" in \
|
||||||
|
amd64) SUPABASE_ARCH="linux_amd64" ;; \
|
||||||
|
arm64) SUPABASE_ARCH="linux_arm64" ;; \
|
||||||
|
*) echo "Unsupported arch: $ARCH" && exit 1 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL "https://github.com/supabase/cli/releases/download/v${SUPABASE_CLI_VERSION}/supabase_${SUPABASE_ARCH}.tar.gz" \
|
||||||
|
| tar xz -C /usr/local/bin supabase && \
|
||||||
|
chmod +x /usr/local/bin/supabase
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root
|
# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root.
|
||||||
# node:22-slim already ships with user "node" (uid=1000, gid=1000)
|
# Add node user to the docker group so it can reach the mounted Docker socket.
|
||||||
|
# DOCKER_GID must match the host's docker group GID (default 999 on most Linux systems).
|
||||||
|
ARG DOCKER_GID=999
|
||||||
|
RUN groupadd -f -g ${DOCKER_GID} docker && usermod -aG docker node
|
||||||
|
|
||||||
# Copy built artifacts from builder
|
# Copy built artifacts from builder
|
||||||
COPY --from=builder /app/package.json /app/package-lock.json ./
|
COPY --from=builder /app/package.json /app/package-lock.json ./
|
||||||
@@ -44,12 +64,6 @@ COPY --from=builder /app/skills/ skills/
|
|||||||
COPY --from=builder /app/packages/skills-build/ packages/skills-build/
|
COPY --from=builder /app/packages/skills-build/ packages/skills-build/
|
||||||
COPY --from=builder /app/packages/evals/ packages/evals/
|
COPY --from=builder /app/packages/evals/ packages/evals/
|
||||||
|
|
||||||
# Install mock scripts
|
|
||||||
COPY packages/evals/mocks/supabase /usr/local/bin/supabase
|
|
||||||
COPY packages/evals/mocks/docker /usr/local/bin/docker
|
|
||||||
COPY packages/evals/mocks/psql /usr/local/bin/psql
|
|
||||||
RUN chmod +x /usr/local/bin/supabase /usr/local/bin/docker /usr/local/bin/psql
|
|
||||||
|
|
||||||
# Install entrypoint
|
# Install entrypoint
|
||||||
COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
|
||||||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
||||||
|
|||||||
@@ -3,6 +3,10 @@ services:
|
|||||||
build:
|
build:
|
||||||
context: ../..
|
context: ../..
|
||||||
dockerfile: packages/evals/Dockerfile
|
dockerfile: packages/evals/Dockerfile
|
||||||
|
args:
|
||||||
|
# Match the host's docker group GID so the node user can reach the socket.
|
||||||
|
# Override with: DOCKER_GID=$(getent group docker | cut -d: -f3) docker compose up
|
||||||
|
DOCKER_GID: "${DOCKER_GID:-999}"
|
||||||
environment:
|
environment:
|
||||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
|
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
|
||||||
- EVAL_MODEL=${EVAL_MODEL:-}
|
- EVAL_MODEL=${EVAL_MODEL:-}
|
||||||
@@ -15,3 +19,5 @@ services:
|
|||||||
- EVAL_RESULTS_DIR=/app/results
|
- EVAL_RESULTS_DIR=/app/results
|
||||||
volumes:
|
volumes:
|
||||||
- ./results:/app/results
|
- ./results:/app/results
|
||||||
|
# Mount the host Docker socket so the supabase CLI can manage containers.
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# Entrypoint for the eval Docker container.
|
# Entrypoint for the eval Docker container.
|
||||||
# Validates environment, adds mocks to PATH, then runs the given command.
|
# Validates environment, then runs the given command.
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
export IN_DOCKER=true
|
export IN_DOCKER=true
|
||||||
@@ -12,14 +12,12 @@ if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Prepend mocks directory to PATH so mock supabase/docker/psql are found first
|
|
||||||
export PATH="/app/packages/evals/mocks:${PATH}"
|
|
||||||
|
|
||||||
echo "=== Eval Environment ==="
|
echo "=== Eval Environment ==="
|
||||||
echo " Node: $(node --version)"
|
echo " Node: $(node --version)"
|
||||||
echo " Claude: $(claude --version 2>/dev/null || echo 'n/a')"
|
echo " Claude: $(claude --version 2>/dev/null || echo 'n/a')"
|
||||||
echo " Docker: mock"
|
echo " Supabase: $(supabase --version 2>/dev/null || echo 'n/a')"
|
||||||
echo " Model: ${EVAL_MODEL:-default}"
|
echo " Docker: $(docker --version 2>/dev/null || echo 'n/a')"
|
||||||
|
echo " Model: ${EVAL_MODEL:-default}"
|
||||||
echo " Scenario: ${EVAL_SCENARIO:-all}"
|
echo " Scenario: ${EVAL_SCENARIO:-all}"
|
||||||
echo "========================"
|
echo "========================"
|
||||||
|
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Mock Docker CLI for eval environments.
|
|
||||||
# Returns success for common commands the agent may invoke.
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
CMD="${1:-}"
|
|
||||||
shift || true
|
|
||||||
|
|
||||||
case "$CMD" in
|
|
||||||
ps)
|
|
||||||
echo "CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES"
|
|
||||||
;;
|
|
||||||
exec)
|
|
||||||
# Consume flags until we hit something that isn't a flag
|
|
||||||
while [[ "${1:-}" == -* ]]; do shift || true; done
|
|
||||||
# Remaining args are container + command — just succeed silently
|
|
||||||
;;
|
|
||||||
info)
|
|
||||||
echo "Server Version: 24.0.0 (mock)"
|
|
||||||
;;
|
|
||||||
compose)
|
|
||||||
echo "docker compose: ok"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
# Default: succeed silently
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Mock psql for eval environments.
|
|
||||||
# Accepts any arguments and returns an empty result set.
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# If -c is used (inline command), print column headers for a SELECT
|
|
||||||
for arg in "$@"; do
|
|
||||||
if [[ "$arg" == "-c" ]]; then
|
|
||||||
echo "(0 rows)"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# Default: succeed silently
|
|
||||||
exit 0
|
|
||||||
@@ -1,161 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Mock Supabase CLI for eval environments.
|
|
||||||
# Returns realistic output so the agent doesn't retry, and creates real
|
|
||||||
# migration files when asked.
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
CMD="${1:-}"
|
|
||||||
shift || true
|
|
||||||
|
|
||||||
case "$CMD" in
|
|
||||||
init)
|
|
||||||
mkdir -p supabase/migrations supabase/functions
|
|
||||||
cat > supabase/config.toml << 'TOML'
|
|
||||||
[project]
|
|
||||||
id = "mock-project-ref"
|
|
||||||
|
|
||||||
[api]
|
|
||||||
enabled = true
|
|
||||||
port = 54321
|
|
||||||
schemas = ["public", "graphql_public"]
|
|
||||||
|
|
||||||
[db]
|
|
||||||
port = 54322
|
|
||||||
major_version = 15
|
|
||||||
|
|
||||||
[studio]
|
|
||||||
enabled = true
|
|
||||||
port = 54323
|
|
||||||
TOML
|
|
||||||
echo "Finished supabase init."
|
|
||||||
;;
|
|
||||||
|
|
||||||
start)
|
|
||||||
echo "Applying migration 00000000000000_init.sql..."
|
|
||||||
echo "Started supabase local development setup."
|
|
||||||
echo ""
|
|
||||||
echo " API URL: http://127.0.0.1:54321"
|
|
||||||
echo " GraphQL URL: http://127.0.0.1:54321/graphql/v1"
|
|
||||||
echo " S3 Storage URL: http://127.0.0.1:54321/storage/v1/s3"
|
|
||||||
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
|
|
||||||
echo " Studio URL: http://127.0.0.1:54323"
|
|
||||||
echo " Inbucket URL: http://127.0.0.1:54324"
|
|
||||||
echo " JWT secret: super-secret-jwt-token-with-at-least-32-characters-long"
|
|
||||||
echo " anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
|
|
||||||
echo "service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
|
|
||||||
echo " S3 Access Key: 625729a08b95bf1b7ff351a663f3a23c"
|
|
||||||
echo " S3 Secret Key: 850181e4652dd023b7a98c58ae0d2d34bd487ee0cc3254aed6eda37307425907"
|
|
||||||
echo " S3 Region: local"
|
|
||||||
;;
|
|
||||||
|
|
||||||
stop)
|
|
||||||
echo "Stopped supabase local development setup."
|
|
||||||
;;
|
|
||||||
|
|
||||||
status)
|
|
||||||
if [[ "${1:-}" == "-o" && "${2:-}" == "env" ]]; then
|
|
||||||
echo "ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
|
|
||||||
echo "SERVICE_ROLE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
|
|
||||||
echo "API_URL=http://127.0.0.1:54321"
|
|
||||||
echo "DB_URL=postgresql://postgres:postgres@127.0.0.1:54322/postgres"
|
|
||||||
echo "STUDIO_URL=http://127.0.0.1:54323"
|
|
||||||
else
|
|
||||||
echo " API URL: http://127.0.0.1:54321"
|
|
||||||
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
|
|
||||||
echo " Studio URL: http://127.0.0.1:54323"
|
|
||||||
echo " DB: running"
|
|
||||||
echo " Auth: running"
|
|
||||||
echo " REST: running"
|
|
||||||
echo " Realtime: running"
|
|
||||||
echo " Storage: running"
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
|
|
||||||
migration)
|
|
||||||
SUBCMD="${1:-}"
|
|
||||||
shift || true
|
|
||||||
case "$SUBCMD" in
|
|
||||||
new)
|
|
||||||
NAME="${1:-migration}"
|
|
||||||
TIMESTAMP=$(date -u +"%Y%m%d%H%M%S")
|
|
||||||
mkdir -p supabase/migrations
|
|
||||||
MIGRATION_FILE="supabase/migrations/${TIMESTAMP}_${NAME}.sql"
|
|
||||||
touch "$MIGRATION_FILE"
|
|
||||||
echo "Created new migration at $MIGRATION_FILE"
|
|
||||||
;;
|
|
||||||
list)
|
|
||||||
echo "No migrations found."
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "supabase migration $SUBCMD: ok"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
;;
|
|
||||||
|
|
||||||
db)
|
|
||||||
SUBCMD="${1:-}"
|
|
||||||
shift || true
|
|
||||||
case "$SUBCMD" in
|
|
||||||
push)
|
|
||||||
echo "Applying unapplied migrations..."
|
|
||||||
echo "Applied migration(s) successfully."
|
|
||||||
;;
|
|
||||||
reset)
|
|
||||||
echo "Resetting local database..."
|
|
||||||
echo "Database reset successfully."
|
|
||||||
;;
|
|
||||||
diff)
|
|
||||||
echo "No schema changes detected."
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "supabase db $SUBCMD: ok"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
;;
|
|
||||||
|
|
||||||
functions)
|
|
||||||
SUBCMD="${1:-}"
|
|
||||||
shift || true
|
|
||||||
case "$SUBCMD" in
|
|
||||||
new)
|
|
||||||
FUNC_NAME="${1:-my-function}"
|
|
||||||
mkdir -p "supabase/functions/$FUNC_NAME"
|
|
||||||
cat > "supabase/functions/$FUNC_NAME/index.ts" << 'TS'
|
|
||||||
import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
|
|
||||||
|
|
||||||
serve(async (req) => {
|
|
||||||
return new Response(JSON.stringify({ message: "Hello from Edge Functions!" }), {
|
|
||||||
headers: { "Content-Type": "application/json" },
|
|
||||||
})
|
|
||||||
})
|
|
||||||
TS
|
|
||||||
echo "Created new Function at supabase/functions/$FUNC_NAME"
|
|
||||||
;;
|
|
||||||
serve)
|
|
||||||
echo "Serving functions on http://127.0.0.1:54321/functions/v1/<function-name>"
|
|
||||||
;;
|
|
||||||
deploy)
|
|
||||||
echo "Deployed function successfully."
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "supabase functions $SUBCMD: ok"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
;;
|
|
||||||
|
|
||||||
gen)
|
|
||||||
echo "Generated types successfully."
|
|
||||||
;;
|
|
||||||
|
|
||||||
link)
|
|
||||||
echo "Linked project: mock-project-ref"
|
|
||||||
;;
|
|
||||||
|
|
||||||
login)
|
|
||||||
echo "Already logged in."
|
|
||||||
;;
|
|
||||||
|
|
||||||
*)
|
|
||||||
echo "supabase $CMD: ok"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
@@ -6,6 +6,12 @@ import { createResultDir, saveRunArtifacts } from "./runner/persist.js";
|
|||||||
import { preflight } from "./runner/preflight.js";
|
import { preflight } from "./runner/preflight.js";
|
||||||
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
import { listModifiedFiles, printSummary } from "./runner/results.js";
|
||||||
import { createWorkspace } from "./runner/scaffold.js";
|
import { createWorkspace } from "./runner/scaffold.js";
|
||||||
|
import {
|
||||||
|
getKeys,
|
||||||
|
resetDB,
|
||||||
|
startSupabase,
|
||||||
|
stopSupabase,
|
||||||
|
} from "./runner/supabase-setup.js";
|
||||||
import { runTests } from "./runner/test.js";
|
import { runTests } from "./runner/test.js";
|
||||||
import {
|
import {
|
||||||
buildTranscriptSummary,
|
buildTranscriptSummary,
|
||||||
@@ -60,6 +66,20 @@ function discoverScenarios(): EvalScenario[] {
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Scenario threshold
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
function getPassThreshold(scenarioId: string): number | null {
|
||||||
|
const scenariosDir = join(findEvalsDir(), "..", "scenarios");
|
||||||
|
const scenarioFile = join(scenariosDir, `${scenarioId}.md`);
|
||||||
|
if (!existsSync(scenarioFile)) return null;
|
||||||
|
|
||||||
|
const content = readFileSync(scenarioFile, "utf-8");
|
||||||
|
const match = content.match(/\*\*pass_threshold:\*\*\s*(\d+)/);
|
||||||
|
return match ? Number.parseInt(match[1], 10) : null;
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Run a single eval
|
// Run a single eval
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -103,13 +123,24 @@ async function runEval(
|
|||||||
? join(evalDir, "EVAL.tsx")
|
? join(evalDir, "EVAL.tsx")
|
||||||
: join(evalDir, "EVAL.ts");
|
: join(evalDir, "EVAL.ts");
|
||||||
|
|
||||||
|
const passThreshold = getPassThreshold(scenario.id);
|
||||||
|
|
||||||
console.log(" Running tests...");
|
console.log(" Running tests...");
|
||||||
const testResult = await runTests({
|
const testResult = await runTests({
|
||||||
workspacePath,
|
workspacePath,
|
||||||
evalFilePath,
|
evalFilePath,
|
||||||
|
passThreshold: passThreshold ?? undefined,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const pct =
|
||||||
|
testResult.totalCount > 0
|
||||||
|
? ((testResult.passedCount / testResult.totalCount) * 100).toFixed(1)
|
||||||
|
: "0.0";
|
||||||
|
const thresholdInfo = passThreshold
|
||||||
|
? `, threshold: ${((passThreshold / testResult.totalCount) * 100).toFixed(0)}%`
|
||||||
|
: "";
|
||||||
console.log(
|
console.log(
|
||||||
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed`,
|
` Tests: ${testResult.passedCount}/${testResult.totalCount} passed (${pct}%${thresholdInfo})`,
|
||||||
);
|
);
|
||||||
|
|
||||||
// 5. Collect modified files
|
// 5. Collect modified files
|
||||||
@@ -129,6 +160,7 @@ async function runEval(
|
|||||||
agentOutput: agentResult.output,
|
agentOutput: agentResult.output,
|
||||||
testsPassed: testResult.passedCount,
|
testsPassed: testResult.passedCount,
|
||||||
testsTotal: testResult.totalCount,
|
testsTotal: testResult.totalCount,
|
||||||
|
passThreshold: passThreshold ?? undefined,
|
||||||
filesModified,
|
filesModified,
|
||||||
toolCallCount: summary.toolCalls.length,
|
toolCallCount: summary.toolCalls.length,
|
||||||
costUsd: summary.totalCostUsd ?? undefined,
|
costUsd: summary.totalCostUsd ?? undefined,
|
||||||
@@ -194,15 +226,33 @@ async function main() {
|
|||||||
|
|
||||||
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
|
console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
|
||||||
|
|
||||||
|
// Start the shared Supabase instance once for all scenarios.
|
||||||
|
startSupabase();
|
||||||
|
const keys = getKeys();
|
||||||
|
|
||||||
|
// Inject keys into process.env so EVAL.ts tests can connect to the real DB.
|
||||||
|
process.env.SUPABASE_URL = keys.apiUrl;
|
||||||
|
process.env.SUPABASE_ANON_KEY = keys.anonKey;
|
||||||
|
process.env.SUPABASE_SERVICE_ROLE_KEY = keys.serviceRoleKey;
|
||||||
|
process.env.SUPABASE_DB_URL = keys.dbUrl;
|
||||||
|
|
||||||
const results: EvalRunResult[] = [];
|
const results: EvalRunResult[] = [];
|
||||||
const transcripts = new Map<string, TranscriptSummary>();
|
const transcripts = new Map<string, TranscriptSummary>();
|
||||||
|
|
||||||
for (const scenario of scenarios) {
|
try {
|
||||||
const { result, transcript } = await runEval(scenario, skillEnabled);
|
for (const scenario of scenarios) {
|
||||||
results.push(result);
|
// Reset the database before each scenario for a clean slate.
|
||||||
if (transcript) {
|
console.log(`\n Resetting DB for ${scenario.id}...`);
|
||||||
transcripts.set(result.scenario, transcript);
|
resetDB(keys.dbUrl);
|
||||||
|
|
||||||
|
const { result, transcript } = await runEval(scenario, skillEnabled);
|
||||||
|
results.push(result);
|
||||||
|
if (transcript) {
|
||||||
|
transcripts.set(result.scenario, transcript);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} finally {
|
||||||
|
stopSupabase();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use the results dir from the first result (all share the same timestamp)
|
// Use the results dir from the first result (all share the same timestamp)
|
||||||
|
|||||||
@@ -22,9 +22,10 @@ export interface AgentRunResult {
|
|||||||
* Uses --output-format stream-json to capture structured NDJSON events
|
* Uses --output-format stream-json to capture structured NDJSON events
|
||||||
* including tool calls, results, and reasoning steps.
|
* including tool calls, results, and reasoning steps.
|
||||||
*
|
*
|
||||||
* The agent operates in the workspace directory and can read/write files.
|
* The agent operates in the workspace directory and can read/write files,
|
||||||
* When skills are installed (via the `skills` CLI), Claude Code
|
* and has access to the local Supabase MCP server so it can apply migrations
|
||||||
* discovers them automatically and uses them for guidance.
|
* and query the real database. --strict-mcp-config ensures only the local
|
||||||
|
* Supabase instance is reachable — no host MCP servers leak in.
|
||||||
*/
|
*/
|
||||||
export async function runAgent(opts: {
|
export async function runAgent(opts: {
|
||||||
cwd: string;
|
cwd: string;
|
||||||
@@ -35,6 +36,18 @@ export async function runAgent(opts: {
|
|||||||
}): Promise<AgentRunResult> {
|
}): Promise<AgentRunResult> {
|
||||||
const start = Date.now();
|
const start = Date.now();
|
||||||
|
|
||||||
|
// Point the agent's MCP config at the shared local Supabase instance.
|
||||||
|
// --strict-mcp-config ensures host .mcp.json is ignored entirely.
|
||||||
|
const supabaseUrl = process.env.SUPABASE_URL ?? "http://127.0.0.1:54321";
|
||||||
|
const mcpConfig = JSON.stringify({
|
||||||
|
mcpServers: {
|
||||||
|
supabase: {
|
||||||
|
type: "http",
|
||||||
|
url: `${supabaseUrl}/mcp`,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
const args = [
|
const args = [
|
||||||
"-p", // Print mode (non-interactive)
|
"-p", // Print mode (non-interactive)
|
||||||
"--verbose",
|
"--verbose",
|
||||||
@@ -46,12 +59,8 @@ export async function runAgent(opts: {
|
|||||||
"--dangerously-skip-permissions",
|
"--dangerously-skip-permissions",
|
||||||
"--tools",
|
"--tools",
|
||||||
"Edit,Write,Bash,Read,Glob,Grep",
|
"Edit,Write,Bash,Read,Glob,Grep",
|
||||||
// Disable all MCP servers so the agent uses only local filesystem tools.
|
|
||||||
// Without this, MCP tools from the parent env (e.g. Supabase, Neon)
|
|
||||||
// leak in and the agent may apply migrations to a remote project
|
|
||||||
// instead of creating local files.
|
|
||||||
"--mcp-config",
|
"--mcp-config",
|
||||||
'{"mcpServers":{}}',
|
mcpConfig,
|
||||||
"--strict-mcp-config",
|
"--strict-mcp-config",
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ export function resolveClaudeBin(): string {
|
|||||||
* Verify the host environment has everything needed before spending
|
* Verify the host environment has everything needed before spending
|
||||||
* API credits on an eval run.
|
* API credits on an eval run.
|
||||||
*
|
*
|
||||||
* Checks: Node >= 20, Docker running, claude CLI available, API key set.
|
* Checks: Node >= 20, Docker running, supabase CLI available, claude CLI available, API key set.
|
||||||
*/
|
*/
|
||||||
export function preflight(): void {
|
export function preflight(): void {
|
||||||
const errors: string[] = [];
|
const errors: string[] = [];
|
||||||
@@ -77,13 +77,28 @@ export function preflight(): void {
|
|||||||
errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
|
errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Docker daemon running (skip when inside the eval container — mocks handle it)
|
// Docker daemon must be running — needed by the supabase CLI to manage containers.
|
||||||
if (!isRunningInDocker()) {
|
// Required whether running locally or inside the eval container (socket-mounted).
|
||||||
try {
|
try {
|
||||||
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
|
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
|
||||||
} catch {
|
} catch {
|
||||||
errors.push("Docker is not running (required by supabase CLI)");
|
errors.push(
|
||||||
}
|
isRunningInDocker()
|
||||||
|
? "Docker daemon not reachable inside container. Mount the socket: -v /var/run/docker.sock:/var/run/docker.sock"
|
||||||
|
: "Docker is not running (required by supabase CLI)",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Supabase CLI available
|
||||||
|
try {
|
||||||
|
execFileSync("supabase", ["--version"], {
|
||||||
|
stdio: "ignore",
|
||||||
|
timeout: 10_000,
|
||||||
|
});
|
||||||
|
} catch {
|
||||||
|
errors.push(
|
||||||
|
"supabase CLI not found. Install it: https://supabase.com/docs/guides/cli/getting-started",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Claude CLI available
|
// Claude CLI available
|
||||||
|
|||||||
@@ -56,8 +56,16 @@ export function printSummary(
|
|||||||
for (const r of results) {
|
for (const r of results) {
|
||||||
const icon = r.status === "passed" ? "PASS" : "FAIL";
|
const icon = r.status === "passed" ? "PASS" : "FAIL";
|
||||||
const skill = r.skillEnabled ? "with-skill" : "baseline";
|
const skill = r.skillEnabled ? "with-skill" : "baseline";
|
||||||
|
const pct =
|
||||||
|
r.testsTotal > 0
|
||||||
|
? ((r.testsPassed / r.testsTotal) * 100).toFixed(1)
|
||||||
|
: "0.0";
|
||||||
|
const thresholdInfo =
|
||||||
|
r.passThreshold && r.testsTotal > 0
|
||||||
|
? `, threshold: ${((r.passThreshold / r.testsTotal) * 100).toFixed(0)}%`
|
||||||
|
: "";
|
||||||
console.log(
|
console.log(
|
||||||
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s`,
|
`[${icon}] ${r.scenario} | ${r.model} | ${skill} | ${(r.duration / 1000).toFixed(1)}s | ${pct}% (${r.testsPassed}/${r.testsTotal}${thresholdInfo})`,
|
||||||
);
|
);
|
||||||
if (r.filesModified.length > 0) {
|
if (r.filesModified.length > 0) {
|
||||||
console.log(` Files: ${r.filesModified.join(", ")}`);
|
console.log(` Files: ${r.filesModified.join(", ")}`);
|
||||||
|
|||||||
@@ -1,8 +1,16 @@
|
|||||||
import { execFileSync } from "node:child_process";
|
import { execFileSync } from "node:child_process";
|
||||||
import { cpSync, existsSync, mkdtempSync, readdirSync, rmSync } from "node:fs";
|
import {
|
||||||
|
cpSync,
|
||||||
|
existsSync,
|
||||||
|
mkdirSync,
|
||||||
|
mkdtempSync,
|
||||||
|
readdirSync,
|
||||||
|
rmSync,
|
||||||
|
} from "node:fs";
|
||||||
import { tmpdir } from "node:os";
|
import { tmpdir } from "node:os";
|
||||||
import { dirname, join, resolve } from "node:path";
|
import { dirname, join, resolve } from "node:path";
|
||||||
import { fileURLToPath } from "node:url";
|
import { fileURLToPath } from "node:url";
|
||||||
|
import { EVAL_PROJECT_DIR } from "./supabase-setup.js";
|
||||||
|
|
||||||
const __filename = fileURLToPath(import.meta.url);
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
const __dirname = dirname(__filename);
|
const __dirname = dirname(__filename);
|
||||||
@@ -54,6 +62,16 @@ export function createWorkspace(opts: {
|
|||||||
cpSync(src, dest, { recursive: true });
|
cpSync(src, dest, { recursive: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Seed the workspace with the eval project's supabase/config.toml so the
|
||||||
|
// agent can run `supabase db push` against the shared local instance without
|
||||||
|
// needing to run `supabase init` or `supabase start` first.
|
||||||
|
const projectConfigSrc = join(EVAL_PROJECT_DIR, "supabase", "config.toml");
|
||||||
|
if (existsSync(projectConfigSrc)) {
|
||||||
|
const destSupabaseDir = join(workspacePath, "supabase");
|
||||||
|
mkdirSync(join(destSupabaseDir, "migrations"), { recursive: true });
|
||||||
|
cpSync(projectConfigSrc, join(destSupabaseDir, "config.toml"));
|
||||||
|
}
|
||||||
|
|
||||||
// Install skills into the workspace via the `skills` CLI
|
// Install skills into the workspace via the `skills` CLI
|
||||||
if (opts.skillEnabled) {
|
if (opts.skillEnabled) {
|
||||||
const skillsDir = join(repoRoot, "skills");
|
const skillsDir = join(repoRoot, "skills");
|
||||||
|
|||||||
108
packages/evals/src/runner/supabase-setup.ts
Normal file
108
packages/evals/src/runner/supabase-setup.ts
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
import { execFileSync } from "node:child_process";
|
||||||
|
import { dirname, resolve } from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
|
||||||
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
|
const __dirname = dirname(__filename);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Directory that contains the eval Supabase project (supabase/config.toml).
|
||||||
|
* The runner starts the shared Supabase instance from here.
|
||||||
|
* Agent workspaces get a copy of supabase/config.toml so they can
|
||||||
|
* connect to the same running instance via `supabase db push`.
|
||||||
|
*/
|
||||||
|
export const EVAL_PROJECT_DIR = resolve(__dirname, "..", "..", "project");
|
||||||
|
|
||||||
|
export interface SupabaseKeys {
|
||||||
|
apiUrl: string;
|
||||||
|
dbUrl: string;
|
||||||
|
anonKey: string;
|
||||||
|
serviceRoleKey: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Start the local Supabase stack for the eval project.
|
||||||
|
* Idempotent: if already running, the CLI prints a message and exits 0.
|
||||||
|
*/
|
||||||
|
export function startSupabase(): void {
|
||||||
|
console.log(" Starting Supabase...");
|
||||||
|
execFileSync("supabase", ["start", "--exclude", "studio,imgproxy,mailpit"], {
|
||||||
|
cwd: EVAL_PROJECT_DIR,
|
||||||
|
stdio: "inherit",
|
||||||
|
timeout: 5 * 60 * 1000, // 5 min for first image pull
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// SQL that clears everything in the public schema and the migration history between scenarios.
|
||||||
|
// Avoids `supabase db reset` which restarts containers and triggers flaky health checks.
|
||||||
|
const RESET_SQL = `
|
||||||
|
-- Drop and recreate public schema (removes all user tables/views/functions)
|
||||||
|
DROP SCHEMA public CASCADE;
|
||||||
|
CREATE SCHEMA public;
|
||||||
|
GRANT ALL ON SCHEMA public TO postgres;
|
||||||
|
GRANT ALL ON SCHEMA public TO anon;
|
||||||
|
GRANT ALL ON SCHEMA public TO authenticated;
|
||||||
|
GRANT ALL ON SCHEMA public TO service_role;
|
||||||
|
|
||||||
|
-- Clear migration history so the next agent's db push starts from a clean slate
|
||||||
|
DROP SCHEMA IF EXISTS supabase_migrations CASCADE;
|
||||||
|
|
||||||
|
-- Notify PostgREST to reload its schema cache
|
||||||
|
NOTIFY pgrst, 'reload schema';
|
||||||
|
`.trim();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset the database to a clean state between scenarios.
|
||||||
|
*
|
||||||
|
* Uses direct SQL via psql instead of `supabase db reset` to avoid the
|
||||||
|
* container-restart cycle and its flaky health checks. This drops the
|
||||||
|
* public schema (all user tables) and clears the migration history so
|
||||||
|
* `supabase db push` in agent workspaces always starts fresh.
|
||||||
|
*/
|
||||||
|
export function resetDB(dbUrl: string): void {
|
||||||
|
execFileSync("psql", [dbUrl, "--no-psqlrc", "-c", RESET_SQL], {
|
||||||
|
stdio: "inherit",
|
||||||
|
timeout: 30 * 1000,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stop all Supabase containers for the eval project.
|
||||||
|
* Called once after all scenarios complete.
|
||||||
|
*/
|
||||||
|
export function stopSupabase(): void {
|
||||||
|
console.log(" Stopping Supabase...");
|
||||||
|
execFileSync("supabase", ["stop", "--no-backup"], {
|
||||||
|
cwd: EVAL_PROJECT_DIR,
|
||||||
|
stdio: "inherit",
|
||||||
|
timeout: 60 * 1000,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the running instance's API URL, database URL, and JWT keys.
|
||||||
|
* Returns values that the runner injects into process.env so EVAL.ts
|
||||||
|
* tests can connect to the real database.
|
||||||
|
*/
|
||||||
|
export function getKeys(): SupabaseKeys {
|
||||||
|
const raw = execFileSync("supabase", ["status", "--output", "json"], {
|
||||||
|
cwd: EVAL_PROJECT_DIR,
|
||||||
|
timeout: 30 * 1000,
|
||||||
|
}).toString();
|
||||||
|
|
||||||
|
const status = JSON.parse(raw) as Record<string, string>;
|
||||||
|
|
||||||
|
const apiUrl = status.API_URL ?? "http://127.0.0.1:54321";
|
||||||
|
const dbUrl =
|
||||||
|
status.DB_URL ?? "postgresql://postgres:postgres@127.0.0.1:54322/postgres";
|
||||||
|
const anonKey = status.ANON_KEY ?? "";
|
||||||
|
const serviceRoleKey = status.SERVICE_ROLE_KEY ?? "";
|
||||||
|
|
||||||
|
if (!anonKey || !serviceRoleKey) {
|
||||||
|
throw new Error(
|
||||||
|
`supabase status returned missing keys. Raw output:\n${raw}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return { apiUrl, dbUrl, anonKey, serviceRoleKey };
|
||||||
|
}
|
||||||
@@ -30,6 +30,7 @@ export interface TestResult {
|
|||||||
export async function runTests(opts: {
|
export async function runTests(opts: {
|
||||||
workspacePath: string;
|
workspacePath: string;
|
||||||
evalFilePath: string;
|
evalFilePath: string;
|
||||||
|
passThreshold?: number;
|
||||||
}): Promise<TestResult> {
|
}): Promise<TestResult> {
|
||||||
// Copy the hidden test file into the workspace
|
// Copy the hidden test file into the workspace
|
||||||
const evalFileName = opts.evalFilePath.endsWith(".tsx")
|
const evalFileName = opts.evalFilePath.endsWith(".tsx")
|
||||||
@@ -85,11 +86,11 @@ export async function runTests(opts: {
|
|||||||
});
|
});
|
||||||
|
|
||||||
const output = `${stdout}\n${stderr}`;
|
const output = `${stdout}\n${stderr}`;
|
||||||
return parseTestOutput(output);
|
return parseTestOutput(output, opts.passThreshold);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
const err = error as Error & { stdout?: string; stderr?: string };
|
const err = error as Error & { stdout?: string; stderr?: string };
|
||||||
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
|
const output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
|
||||||
return parseTestOutput(output);
|
return parseTestOutput(output, opts.passThreshold);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,7 +112,7 @@ function parseIndividualTests(output: string): Record<string, boolean> {
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
function parseTestOutput(output: string): TestResult {
|
function parseTestOutput(output: string, passThreshold?: number): TestResult {
|
||||||
// Parse vitest output for pass/fail counts
|
// Parse vitest output for pass/fail counts
|
||||||
// Vitest formats:
|
// Vitest formats:
|
||||||
// All passing: "Tests N passed (N)"
|
// All passing: "Tests N passed (N)"
|
||||||
@@ -133,7 +134,9 @@ function parseTestOutput(output: string): TestResult {
|
|||||||
totalCount = Number.parseInt(allFailing[2], 10);
|
totalCount = Number.parseInt(allFailing[2], 10);
|
||||||
}
|
}
|
||||||
|
|
||||||
const passed = totalCount > 0 && passedCount === totalCount;
|
const passed = passThreshold
|
||||||
|
? totalCount > 0 && passedCount >= passThreshold
|
||||||
|
: totalCount > 0 && passedCount === totalCount;
|
||||||
const individualTests = parseIndividualTests(output);
|
const individualTests = parseIndividualTests(output);
|
||||||
|
|
||||||
return { passed, output, passedCount, totalCount, individualTests };
|
return { passed, output, passedCount, totalCount, individualTests };
|
||||||
|
|||||||
@@ -29,6 +29,8 @@ export interface EvalRunResult {
|
|||||||
testsPassed: number;
|
testsPassed: number;
|
||||||
/** Total number of vitest tests */
|
/** Total number of vitest tests */
|
||||||
testsTotal: number;
|
testsTotal: number;
|
||||||
|
/** Minimum tests required to pass (from scenario config) */
|
||||||
|
passThreshold?: number;
|
||||||
/** Files the agent created or modified in the workspace */
|
/** Files the agent created or modified in the workspace */
|
||||||
filesModified: string[];
|
filesModified: string[];
|
||||||
error?: string;
|
error?: string;
|
||||||
|
|||||||
Reference in New Issue
Block a user