containerize eval environment with Docker and mock CLIs

Host now only needs Docker + ANTHROPIC_API_KEY to run evals. Adds
multi-stage Dockerfile, mock supabase/docker/psql scripts, entrypoint,
docker-compose for local use, and switches CI to Docker-based execution.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pedro Rodrigues
2026-02-23 19:22:47 +00:00
parent 93a49374de
commit 3c3d1f55ca
11 changed files with 414 additions and 20 deletions

9
.dockerignore Normal file
View File

@@ -0,0 +1,9 @@
.git
.env
.env.*
node_modules
packages/evals/results
packages/evals/node_modules
packages/skills-build/node_modules
reports
*.log

View File

@@ -34,16 +34,34 @@ jobs:
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: jdx/mise-action@v3 - name: Set up Docker Buildx
with: uses: docker/setup-buildx-action@v3
install: true
- name: Install dependencies - name: Build eval image
run: npm install && npm --prefix packages/evals install uses: docker/build-push-action@v6
with:
context: .
file: packages/evals/Dockerfile
tags: supabase-evals:ci
load: true
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Run Evals - name: Run Evals
uses: braintrustdata/eval-action@v1 run: |
docker run --rm \
-e ANTHROPIC_API_KEY \
-e BRAINTRUST_PROJECT_ID \
-e BRAINTRUST_API_KEY=${{ secrets.BRAINTRUST_API_KEY }} \
-e BRAINTRUST_UPLOAD=true \
-e EVAL_RESULTS_DIR=/app/results \
-v "${{ github.workspace }}/results:/app/results" \
supabase-evals:ci
- name: Upload results
if: always()
uses: actions/upload-artifact@v4
with: with:
api_key: ${{ secrets.BRAINTRUST_API_KEY }} name: eval-results
runtime: node path: results/
root: packages/evals if-no-files-found: ignore

View File

@@ -54,3 +54,38 @@ sources = ["packages/evals/src/**", "packages/evals/evals/**"]
description = "Run workflow evals and upload to Braintrust" description = "Run workflow evals and upload to Braintrust"
run = "npm --prefix packages/evals run eval:upload" run = "npm --prefix packages/evals run eval:upload"
sources = ["packages/evals/src/**", "packages/evals/evals/**"] sources = ["packages/evals/src/**", "packages/evals/evals/**"]
# ── Docker eval tasks ────────────────────────────────────────────────
# Builds the eval image from packages/evals/Dockerfile, using the current
# directory (".") as the build context, and tags it supabase-evals:local.
# The other eval:docker:* tasks depend on this task and run that tag.
[tasks."eval:docker:build"]
description = "Build the eval Docker image"
run = "docker build -t supabase-evals:local -f packages/evals/Dockerfile ."
# Runs the eval suite inside the supabase-evals:local image (rebuilt first via
# eval:docker:build).
#   * A bare `-e NAME` forwards NAME from the calling shell into the container.
#   * EVAL_RESULTS_DIR is pinned to /app/results, which is bind-mounted to
#     packages/evals/results on the host so results outlive the --rm container.
#   * The host results directory is created before `docker run`: when a
#     bind-mount source is missing the Docker daemon creates it itself, owned by
#     root, and the image's non-root `node` user then cannot write into it.
[tasks."eval:docker"]
description = "Run evals in Docker"
depends = ["eval:docker:build"]
run = """
mkdir -p "$(pwd)/packages/evals/results"
docker run --rm \
  -e ANTHROPIC_API_KEY \
  -e EVAL_MODEL \
  -e EVAL_SCENARIO \
  -e EVAL_BASELINE \
  -e EVAL_SKILL \
  -e BRAINTRUST_UPLOAD \
  -e BRAINTRUST_API_KEY \
  -e BRAINTRUST_PROJECT_ID \
  -e EVAL_RESULTS_DIR=/app/results \
  -v "$(pwd)/packages/evals/results:/app/results" \
  supabase-evals:local
"""
# Opens an interactive bash shell in the supabase-evals:local image for
# debugging (rebuilt first via eval:docker:build).
#   * The image entrypoint still runs before bash, so ANTHROPIC_API_KEY must be
#     set in the calling shell or the container exits immediately.
#   * EVAL_RESULTS_DIR is set to the mounted /app/results so that evals started
#     by hand from this shell write to the host-visible directory instead of the
#     in-container default, which would be discarded with the --rm container.
#   * The host results directory is created up front so Docker does not create
#     it root-owned, which the non-root `node` user could not write to.
[tasks."eval:docker:shell"]
description = "Open a debug shell in the eval container"
depends = ["eval:docker:build"]
run = """
mkdir -p "$(pwd)/packages/evals/results"
docker run --rm -it \
  -e ANTHROPIC_API_KEY \
  -e IN_DOCKER=true \
  -e EVAL_RESULTS_DIR=/app/results \
  -v "$(pwd)/packages/evals/results:/app/results" \
  supabase-evals:local /bin/bash
"""

69
packages/evals/Dockerfile Normal file
View File

@@ -0,0 +1,69 @@
# Eval runner image. Stage 1 installs dependencies and builds the skills;
# stage 2 assembles the runtime, installs the mock supabase/docker/psql CLIs
# into /usr/local/bin, and runs the eval suite as the non-root "node" user.
# All COPY paths assume the repository root is the build context.

# ---------- Stage 1: builder ----------
FROM node:22-slim AS builder
RUN apt-get update && apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy root package files first (layer caching)
COPY package.json package-lock.json ./
# Copy workspace package files
COPY packages/skills-build/package.json packages/skills-build/
COPY packages/evals/package.json packages/evals/
# Install all dependencies
RUN npm install && \
    npm --prefix packages/skills-build install && \
    npm --prefix packages/evals install
# Copy source code
COPY skills/ skills/
COPY packages/skills-build/ packages/skills-build/
COPY packages/evals/ packages/evals/
# Build skills (generates AGENTS.md / CLAUDE.md files)
RUN npm --prefix packages/skills-build run build

# ---------- Stage 2: runtime ----------
FROM node:22-slim
RUN apt-get update && apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root
# node:22-slim already ships with user "node" (uid=1000, gid=1000)
# Copy built artifacts from builder
COPY --from=builder /app/package.json /app/package-lock.json ./
COPY --from=builder /app/node_modules/ node_modules/
COPY --from=builder /app/skills/ skills/
COPY --from=builder /app/packages/skills-build/ packages/skills-build/
COPY --from=builder /app/packages/evals/ packages/evals/
# Install mock scripts
COPY packages/evals/mocks/supabase /usr/local/bin/supabase
COPY packages/evals/mocks/docker /usr/local/bin/docker
COPY packages/evals/mocks/psql /usr/local/bin/psql
RUN chmod +x /usr/local/bin/supabase /usr/local/bin/docker /usr/local/bin/psql
# Install entrypoint
COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
# Create both results locations, writable by the node user:
#   /app/packages/evals/results  - used when EVAL_RESULTS_DIR is not set
#   /app/results                 - the EVAL_RESULTS_DIR value used by the provided
#                                  docker run / compose invocations
# /app itself is root-owned, so without this the node user could not create
# /app/results when the container runs without a bind mount over that path.
RUN mkdir -p /app/packages/evals/results /app/results && \
    chown -R node:node /app/packages/evals/results /app/results
# Ensure node user owns tmp and home for Claude Code
RUN mkdir -p /tmp && chmod 1777 /tmp && chown -R node:node /home/node
USER node
ENV IN_DOCKER=true
ENV NODE_ENV=production
ENTRYPOINT ["docker-entrypoint.sh"]
CMD ["npm", "--prefix", "packages/evals", "run", "eval"]

View File

@@ -0,0 +1,17 @@
# Local Compose definition for the eval runner.
# The build context is the repository root (two levels up from this file) so
# the Dockerfile can copy the root package files, skills/ and both packages.
services:
  evals:
    build:
      context: ../..
      dockerfile: packages/evals/Dockerfile
    # Values are interpolated from the environment Compose runs in. The ":-"
    # forms fall back to an empty string when the variable is unset.
    environment:
      ANTHROPIC_API_KEY: "${ANTHROPIC_API_KEY}"
      EVAL_MODEL: "${EVAL_MODEL:-}"
      EVAL_SCENARIO: "${EVAL_SCENARIO:-}"
      EVAL_BASELINE: "${EVAL_BASELINE:-}"
      EVAL_SKILL: "${EVAL_SKILL:-}"
      BRAINTRUST_UPLOAD: "${BRAINTRUST_UPLOAD:-}"
      BRAINTRUST_API_KEY: "${BRAINTRUST_API_KEY:-}"
      BRAINTRUST_PROJECT_ID: "${BRAINTRUST_PROJECT_ID:-}"
      # Results are written to the mounted path below.
      EVAL_RESULTS_DIR: /app/results
    volumes:
      # ./results is relative to this compose file.
      - ./results:/app/results

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Container entrypoint for the eval image.
#   1. Marks the environment as containerised (IN_DOCKER=true).
#   2. Aborts with exit status 1 when ANTHROPIC_API_KEY is missing or empty.
#   3. Puts the mock CLI directory at the front of PATH.
#   4. Prints a short environment banner, then replaces itself with the
#      command given as arguments.
set -euo pipefail

export IN_DOCKER=true

# Guard clause: nothing can run without an API key.
[[ -n "${ANTHROPIC_API_KEY:-}" ]] || {
  echo "ERROR: ANTHROPIC_API_KEY is not set." >&2
  echo "Pass it via: docker run -e ANTHROPIC_API_KEY=sk-ant-... ..." >&2
  exit 1
}

# Mock supabase/docker/psql must win over anything else on PATH.
PATH="/app/packages/evals/mocks:${PATH}"
export PATH

# Banner. The command substitutions stay inside the printf arguments so a
# failing version probe never aborts the script under `set -e`.
printf '%s\n' "=== Eval Environment ==="
printf ' Node: %s\n' "$(node --version)"
printf ' Claude: %s\n' "$(claude --version 2>/dev/null || echo 'n/a')"
printf '%s\n' " Docker: mock"
printf ' Model: %s\n' "${EVAL_MODEL:-default}"
printf ' Scenario: %s\n' "${EVAL_SCENARIO:-all}"
printf '%s\n' "========================"

# Hand over to the requested command, keeping the same PID.
exec "$@"

27
packages/evals/mocks/docker Executable file
View File

@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Stand-in for the Docker CLI inside eval environments.
# Every invocation exits 0. `ps`, `info` and `compose` print a canned line;
# everything else (including `exec`) prints nothing.
set -euo pipefail

subcommand="${1:-}"
# Drop the subcommand from the argument list when one was given.
if (( $# > 0 )); then
  shift
fi

case "$subcommand" in
  info)
    echo "Server Version: 24.0.0 (mock)"
    ;;
  ps)
    # Header row only: no containers are ever reported.
    echo "CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES"
    ;;
  compose)
    echo "docker compose: ok"
    ;;
  exec)
    # Skip leading option-style arguments. What remains (container name and
    # command) is ignored and the call succeeds silently.
    while (( $# > 0 )) && [[ "$1" == -* ]]; do
      shift
    done
    ;;
  *)
    # Any other subcommand, or none at all: succeed without output.
    ;;
esac

15
packages/evals/mocks/psql Executable file
View File

@@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Mock psql for eval environments.
# Accepts any arguments, never connects to a database, and always exits 0.
# When an inline SQL command is supplied it prints the footer of an empty
# result set ("(0 rows)"); otherwise it prints nothing.
set -euo pipefail

for arg in "$@"; do
  case "$arg" in
    # All spellings of psql's inline-command option:
    #   -c 'sql'    -c'sql'    --command 'sql'    --command='sql'
    -c* | --command | --command=*)
      echo "(0 rows)"
      exit 0
      ;;
  esac
done

# No inline command (e.g. -f file or an interactive call): succeed silently.
exit 0

161
packages/evals/mocks/supabase Executable file
View File

@@ -0,0 +1,161 @@
#!/usr/bin/env bash
# Mock Supabase CLI for eval environments.
# Returns realistic output so the agent doesn't retry, and creates real
# migration files when asked.
#
# Usage: supabase <command> [subcommand] [args...]
# Nothing here talks to a real Supabase stack: every branch only echoes canned
# text, and `init`, `migration new` and `functions new` additionally create
# files under ./supabase relative to the current working directory.
# Unrecognised commands and subcommands print "<...>: ok".
set -euo pipefail
# First argument selects the command; the rest stay in "$@" for the branch.
CMD="${1:-}"
# `shift` fails when there are no arguments; `|| true` keeps `set -e` from aborting.
shift || true
case "$CMD" in
# init: scaffold supabase/{migrations,functions} and write a fixed config.toml.
init)
mkdir -p supabase/migrations supabase/functions
cat > supabase/config.toml << 'TOML'
[project]
id = "mock-project-ref"
[api]
enabled = true
port = 54321
schemas = ["public", "graphql_public"]
[db]
port = 54322
major_version = 15
[studio]
enabled = true
port = 54323
TOML
echo "Finished supabase init."
;;
# start: print a fixed set of local URLs and keys; nothing is actually started.
start)
echo "Applying migration 00000000000000_init.sql..."
echo "Started supabase local development setup."
echo ""
echo " API URL: http://127.0.0.1:54321"
echo " GraphQL URL: http://127.0.0.1:54321/graphql/v1"
echo " S3 Storage URL: http://127.0.0.1:54321/storage/v1/s3"
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo " Studio URL: http://127.0.0.1:54323"
echo " Inbucket URL: http://127.0.0.1:54324"
echo " JWT secret: super-secret-jwt-token-with-at-least-32-characters-long"
echo " anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
echo "service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
echo " S3 Access Key: 625729a08b95bf1b7ff351a663f3a23c"
echo " S3 Secret Key: 850181e4652dd023b7a98c58ae0d2d34bd487ee0cc3254aed6eda37307425907"
echo " S3 Region: local"
;;
stop)
echo "Stopped supabase local development setup."
;;
# status: KEY=value lines when called exactly as `status -o env` (the two flags
# must be the first two arguments); otherwise a human-readable summary.
status)
if [[ "${1:-}" == "-o" && "${2:-}" == "env" ]]; then
echo "ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
echo "SERVICE_ROLE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
echo "API_URL=http://127.0.0.1:54321"
echo "DB_URL=postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo "STUDIO_URL=http://127.0.0.1:54323"
else
echo " API URL: http://127.0.0.1:54321"
echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
echo " Studio URL: http://127.0.0.1:54323"
echo " DB: running"
echo " Auth: running"
echo " REST: running"
echo " Realtime: running"
echo " Storage: running"
fi
;;
# migration <new|list|...>
migration)
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
# new [name]: create an empty supabase/migrations/<UTC timestamp>_<name>.sql;
# the name falls back to "migration" when omitted.
new)
NAME="${1:-migration}"
TIMESTAMP=$(date -u +"%Y%m%d%H%M%S")
mkdir -p supabase/migrations
MIGRATION_FILE="supabase/migrations/${TIMESTAMP}_${NAME}.sql"
touch "$MIGRATION_FILE"
echo "Created new migration at $MIGRATION_FILE"
;;
# NOTE(review): always reports no migrations, even when files created by
# `migration new` exist on disk — confirm this is intended.
list)
echo "No migrations found."
;;
*)
echo "supabase migration $SUBCMD: ok"
;;
esac
;;
# db <push|reset|diff|...>: canned success messages only.
db)
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
push)
echo "Applying unapplied migrations..."
echo "Applied migration(s) successfully."
;;
reset)
echo "Resetting local database..."
echo "Database reset successfully."
;;
diff)
echo "No schema changes detected."
;;
*)
echo "supabase db $SUBCMD: ok"
;;
esac
;;
# functions <new|serve|deploy|...>
functions)
SUBCMD="${1:-}"
shift || true
case "$SUBCMD" in
# new [name]: write a starter index.ts under supabase/functions/<name>/;
# the name falls back to "my-function" when omitted.
new)
FUNC_NAME="${1:-my-function}"
mkdir -p "supabase/functions/$FUNC_NAME"
cat > "supabase/functions/$FUNC_NAME/index.ts" << 'TS'
import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
serve(async (req) => {
return new Response(JSON.stringify({ message: "Hello from Edge Functions!" }), {
headers: { "Content-Type": "application/json" },
})
})
TS
echo "Created new Function at supabase/functions/$FUNC_NAME"
;;
# serve: prints the URL and returns immediately; no server is started.
serve)
echo "Serving functions on http://127.0.0.1:54321/functions/v1/<function-name>"
;;
deploy)
echo "Deployed function successfully."
;;
*)
echo "supabase functions $SUBCMD: ok"
;;
esac
;;
gen)
echo "Generated types successfully."
;;
link)
echo "Linked project: mock-project-ref"
;;
login)
echo "Already logged in."
;;
# Fallback: any other command (including none) reports success.
*)
echo "supabase $CMD: ok"
;;
esac

View File

@@ -7,10 +7,14 @@ import type { TranscriptSummary } from "./transcript.js";
const __filename = fileURLToPath(import.meta.url); const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename); const __dirname = dirname(__filename);
/** Resolve the evals package root (packages/evals). */ /** Resolve the base directory for storing results.
function evalsRoot(): string { * Supports EVAL_RESULTS_DIR override for Docker volume mounts. */
// __dirname is packages/evals/src/runner function resultsBase(): string {
return join(__dirname, "..", ".."); if (process.env.EVAL_RESULTS_DIR) {
return process.env.EVAL_RESULTS_DIR;
}
// Default: packages/evals/results (__dirname is packages/evals/src/runner)
return join(__dirname, "..", "..", "results");
} }
/** Create the results directory for a single scenario run. Returns the path. */ /** Create the results directory for a single scenario run. Returns the path. */
@@ -19,7 +23,7 @@ export function createResultDir(
scenarioId: string, scenarioId: string,
variant: "with-skill" | "baseline", variant: "with-skill" | "baseline",
): string { ): string {
const dir = join(evalsRoot(), "results", runTimestamp, scenarioId, variant); const dir = join(resultsBase(), runTimestamp, scenarioId, variant);
mkdirSync(dir, { recursive: true }); mkdirSync(dir, { recursive: true });
return dir; return dir;
} }

View File

@@ -1,8 +1,19 @@
import { execFileSync } from "node:child_process"; import { execFileSync } from "node:child_process";
import { existsSync } from "node:fs"; import { accessSync, constants, existsSync } from "node:fs";
import { dirname, join } from "node:path"; import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url"; import { fileURLToPath } from "node:url";
/**
 * Report whether this process appears to be running inside a Docker container.
 *
 * @returns `true` when the `IN_DOCKER` environment variable is exactly
 * `"true"`, or when the marker file `/.dockerenv` exists; `false` otherwise.
 *
 * NOTE(review): the marker-file check is not specific to the eval image.
 * Presumably it matches any Docker container; confirm that is intended.
 */
export function isRunningInDocker(): boolean {
  const flaggedByEnv = process.env.IN_DOCKER === "true";
  // existsSync never throws, so a missing or unreadable marker simply yields false.
  return flaggedByEnv || existsSync("/.dockerenv");
}
const __filename = fileURLToPath(import.meta.url); const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename); const __dirname = dirname(__filename);
@@ -66,12 +77,14 @@ export function preflight(): void {
errors.push(`Node.js >= 20 required (found ${process.versions.node})`); errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
} }
// Docker daemon running // Docker daemon running (skip when inside the eval container — mocks handle it)
if (!isRunningInDocker()) {
try { try {
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 }); execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
} catch { } catch {
errors.push("Docker is not running (required by supabase CLI)"); errors.push("Docker is not running (required by supabase CLI)");
} }
}
// Claude CLI available // Claude CLI available
try { try {