From 3c3d1f55ca02586a689d7b1ef07e98586218a590 Mon Sep 17 00:00:00 2001
From: Pedro Rodrigues
Date: Mon, 23 Feb 2026 19:22:47 +0000
Subject: [PATCH] containerize eval environment with Docker and mock CLIs

Host now only needs Docker + ANTHROPIC_API_KEY to run evals. Adds
multi-stage Dockerfile, mock supabase/docker/psql scripts, entrypoint,
docker-compose for local use, and switches CI to Docker-based execution.

Co-Authored-By: Claude Opus 4.6
---
 .dockerignore                          |   9 ++
 .github/workflows/evals.yml            |  42 ++++--
 mise.toml                              |  35 ++++++
 packages/evals/Dockerfile              |  69 +++++++++++
 packages/evals/docker-compose.yml      |  17 +++
 packages/evals/docker-entrypoint.sh    |  26 ++++
 packages/evals/mocks/docker            |  27 +++++
 packages/evals/mocks/psql              |  15 +++
 packages/evals/mocks/supabase          | 161 +++++++++++++++++++++++++
 packages/evals/src/runner/persist.ts   |  14 ++-
 packages/evals/src/runner/preflight.ts |  21 +++-
 11 files changed, 417 insertions(+), 19 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 packages/evals/Dockerfile
 create mode 100644 packages/evals/docker-compose.yml
 create mode 100755 packages/evals/docker-entrypoint.sh
 create mode 100755 packages/evals/mocks/docker
 create mode 100755 packages/evals/mocks/psql
 create mode 100755 packages/evals/mocks/supabase

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..f6e6159
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,9 @@
+.git
+.env
+.env.*
+node_modules
+packages/evals/results
+packages/evals/node_modules
+packages/skills-build/node_modules
+reports
+*.log
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index aab65cf..ea1d3b2 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -34,16 +34,40 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - uses: jdx/mise-action@v3
-        with:
-          install: true
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
 
-      - name: Install dependencies
-        run: npm install && npm --prefix packages/evals install
+      - name: Build eval image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: packages/evals/Dockerfile
+          tags: supabase-evals:ci
+          load: true
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
 
       - name: Run Evals
-        uses: braintrustdata/eval-action@v1
+        env:
+          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+        run: |
+          # Pre-create the bind-mount source and make it world-writable: the
+          # container runs as the non-root "node" user, not as the runner user.
+          mkdir -p "$GITHUB_WORKSPACE/results"
+          chmod 0777 "$GITHUB_WORKSPACE/results"
+          docker run --rm \
+            -e ANTHROPIC_API_KEY \
+            -e BRAINTRUST_PROJECT_ID \
+            -e BRAINTRUST_API_KEY \
+            -e BRAINTRUST_UPLOAD=true \
+            -e EVAL_RESULTS_DIR=/app/results \
+            -v "$GITHUB_WORKSPACE/results:/app/results" \
+            supabase-evals:ci
+
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
         with:
-          api_key: ${{ secrets.BRAINTRUST_API_KEY }}
-          runtime: node
-          root: packages/evals
+          name: eval-results
+          path: results/
+          if-no-files-found: ignore
diff --git a/mise.toml b/mise.toml
index 65b415d..be6fc15 100644
--- a/mise.toml
+++ b/mise.toml
@@ -54,3 +54,38 @@ sources = ["packages/evals/src/**", "packages/evals/evals/**"]
 description = "Run workflow evals and upload to Braintrust"
 run = "npm --prefix packages/evals run eval:upload"
 sources = ["packages/evals/src/**", "packages/evals/evals/**"]
+
+# ── Docker eval tasks ────────────────────────────────────────────────
+
+[tasks."eval:docker:build"]
+description = "Build the eval Docker image"
+run = "docker build -t supabase-evals:local -f packages/evals/Dockerfile ."
+
+[tasks."eval:docker"]
+description = "Run evals in Docker"
+depends = ["eval:docker:build"]
+run = """
+docker run --rm \
+  -e ANTHROPIC_API_KEY \
+  -e EVAL_MODEL \
+  -e EVAL_SCENARIO \
+  -e EVAL_BASELINE \
+  -e EVAL_SKILL \
+  -e BRAINTRUST_UPLOAD \
+  -e BRAINTRUST_API_KEY \
+  -e BRAINTRUST_PROJECT_ID \
+  -e EVAL_RESULTS_DIR=/app/results \
+  -v "$(pwd)/packages/evals/results:/app/results" \
+  supabase-evals:local
+"""
+
+[tasks."eval:docker:shell"]
+description = "Open a debug shell in the eval container"
+depends = ["eval:docker:build"]
+run = """
+docker run --rm -it \
+  -e ANTHROPIC_API_KEY \
+  -e IN_DOCKER=true \
+  -v "$(pwd)/packages/evals/results:/app/results" \
+  supabase-evals:local /bin/bash
+"""
diff --git a/packages/evals/Dockerfile b/packages/evals/Dockerfile
new file mode 100644
index 0000000..c6c31ec
--- /dev/null
+++ b/packages/evals/Dockerfile
@@ -0,0 +1,69 @@
+# ---------- Stage 1: builder ----------
+FROM node:22-slim AS builder
+
+RUN apt-get update && apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy root package files first (layer caching)
+COPY package.json package-lock.json ./
+
+# Copy workspace package files
+COPY packages/skills-build/package.json packages/skills-build/
+COPY packages/evals/package.json packages/evals/
+
+# Install all dependencies
+RUN npm install && \
+    npm --prefix packages/skills-build install && \
+    npm --prefix packages/evals install
+
+# Copy source code
+COPY skills/ skills/
+COPY packages/skills-build/ packages/skills-build/
+COPY packages/evals/ packages/evals/
+
+# Build skills (generates AGENTS.md / CLAUDE.md files)
+RUN npm --prefix packages/skills-build run build
+
+# ---------- Stage 2: runtime ----------
+FROM node:22-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root
+# node:22-slim already ships with user "node" (uid=1000, gid=1000)
+
+# Copy built artifacts from builder
+COPY --from=builder /app/package.json /app/package-lock.json ./
+COPY --from=builder /app/node_modules/ node_modules/
+COPY --from=builder /app/skills/ skills/
+COPY --from=builder /app/packages/skills-build/ packages/skills-build/
+COPY --from=builder /app/packages/evals/ packages/evals/
+
+# Install mock scripts
+COPY packages/evals/mocks/supabase /usr/local/bin/supabase
+COPY packages/evals/mocks/docker /usr/local/bin/docker
+COPY packages/evals/mocks/psql /usr/local/bin/psql
+RUN chmod +x /usr/local/bin/supabase /usr/local/bin/docker /usr/local/bin/psql
+
+# Install entrypoint
+COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+
+# Create results dirs writable by node user (in-tree default + /app/results used via EVAL_RESULTS_DIR)
+RUN mkdir -p /app/packages/evals/results /app/results && chown -R node:node /app/packages/evals/results /app/results
+
+# Ensure node user owns tmp and home for Claude Code
+RUN mkdir -p /tmp && chmod 1777 /tmp && chown -R node:node /home/node
+
+USER node
+
+ENV IN_DOCKER=true
+ENV NODE_ENV=production
+
+ENTRYPOINT ["docker-entrypoint.sh"]
+CMD ["npm", "--prefix", "packages/evals", "run", "eval"]
diff --git a/packages/evals/docker-compose.yml b/packages/evals/docker-compose.yml
new file mode 100644
index 0000000..5d4340d
--- /dev/null
+++ b/packages/evals/docker-compose.yml
@@ -0,0 +1,17 @@
+services:
+  evals:
+    build:
+      context: ../..
+      dockerfile: packages/evals/Dockerfile
+    environment:
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
+      - EVAL_MODEL=${EVAL_MODEL:-}
+      - EVAL_SCENARIO=${EVAL_SCENARIO:-}
+      - EVAL_BASELINE=${EVAL_BASELINE:-}
+      - EVAL_SKILL=${EVAL_SKILL:-}
+      - BRAINTRUST_UPLOAD=${BRAINTRUST_UPLOAD:-}
+      - BRAINTRUST_API_KEY=${BRAINTRUST_API_KEY:-}
+      - BRAINTRUST_PROJECT_ID=${BRAINTRUST_PROJECT_ID:-}
+      - EVAL_RESULTS_DIR=/app/results
+    volumes:
+      - ./results:/app/results
diff --git a/packages/evals/docker-entrypoint.sh b/packages/evals/docker-entrypoint.sh
new file mode 100755
index 0000000..0441963
--- /dev/null
+++ b/packages/evals/docker-entrypoint.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Entrypoint for the eval Docker container.
+# Validates environment, adds mocks to PATH, then runs the given command.
+set -euo pipefail
+
+export IN_DOCKER=true
+
+# Validate required env
+if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
+  echo "ERROR: ANTHROPIC_API_KEY is not set." >&2
+  echo "Pass it via: docker run -e ANTHROPIC_API_KEY=sk-ant-... ..." >&2
+  exit 1
+fi
+
+# Prepend mocks directory to PATH so mock supabase/docker/psql are found first
+export PATH="/app/packages/evals/mocks:${PATH}"
+
+echo "=== Eval Environment ==="
+echo " Node: $(node --version)"
+echo " Claude: $(claude --version 2>/dev/null || echo 'n/a')"
+echo " Docker: mock"
+echo " Model: ${EVAL_MODEL:-default}"
+echo " Scenario: ${EVAL_SCENARIO:-all}"
+echo "========================"
+
+exec "$@"
diff --git a/packages/evals/mocks/docker b/packages/evals/mocks/docker
new file mode 100755
index 0000000..5efc136
--- /dev/null
+++ b/packages/evals/mocks/docker
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Mock Docker CLI for eval environments.
+# Returns success for common commands the agent may invoke.
+set -euo pipefail
+
+CMD="${1:-}"
+shift || true
+
+case "$CMD" in
+  ps)
+    echo "CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES"
+    ;;
+  exec)
+    # Consume flags until we hit something that isn't a flag
+    while [[ "${1:-}" == -* ]]; do shift || true; done
+    # Remaining args are container + command — just succeed silently
+    ;;
+  info)
+    echo "Server Version: 24.0.0 (mock)"
+    ;;
+  compose)
+    echo "docker compose: ok"
+    ;;
+  *)
+    # Default: succeed silently
+    ;;
+esac
diff --git a/packages/evals/mocks/psql b/packages/evals/mocks/psql
new file mode 100755
index 0000000..bb99c91
--- /dev/null
+++ b/packages/evals/mocks/psql
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Mock psql for eval environments.
+# Accepts any arguments and returns an empty result set.
+set -euo pipefail
+
+# If -c is used (inline command), print psql's empty-result footer
+for arg in "$@"; do
+  if [[ "$arg" == "-c" ]]; then
+    echo "(0 rows)"
+    exit 0
+  fi
+done
+
+# Default: succeed silently
+exit 0
diff --git a/packages/evals/mocks/supabase b/packages/evals/mocks/supabase
new file mode 100755
index 0000000..e98298e
--- /dev/null
+++ b/packages/evals/mocks/supabase
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+# Mock Supabase CLI for eval environments.
+# Returns realistic output so the agent doesn't retry, and creates real
+# migration files when asked.
+set -euo pipefail
+
+CMD="${1:-}"
+shift || true
+
+case "$CMD" in
+  init)
+    mkdir -p supabase/migrations supabase/functions
+    cat > supabase/config.toml << 'TOML'
+[project]
+id = "mock-project-ref"
+
+[api]
+enabled = true
+port = 54321
+schemas = ["public", "graphql_public"]
+
+[db]
+port = 54322
+major_version = 15
+
+[studio]
+enabled = true
+port = 54323
+TOML
+    echo "Finished supabase init."
+    ;;
+
+  start)
+    echo "Applying migration 00000000000000_init.sql..."
+    echo "Started supabase local development setup."
+    echo ""
+    echo " API URL: http://127.0.0.1:54321"
+    echo " GraphQL URL: http://127.0.0.1:54321/graphql/v1"
+    echo " S3 Storage URL: http://127.0.0.1:54321/storage/v1/s3"
+    echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
+    echo " Studio URL: http://127.0.0.1:54323"
+    echo " Inbucket URL: http://127.0.0.1:54324"
+    echo " JWT secret: super-secret-jwt-token-with-at-least-32-characters-long"
+    echo " anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
+    echo "service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
+    echo " S3 Access Key: 625729a08b95bf1b7ff351a663f3a23c"
+    echo " S3 Secret Key: 850181e4652dd023b7a98c58ae0d2d34bd487ee0cc3254aed6eda37307425907"
+    echo " S3 Region: local"
+    ;;
+
+  stop)
+    echo "Stopped supabase local development setup."
+    ;;
+
+  status)
+    if [[ "${1:-}" == "-o" && "${2:-}" == "env" ]]; then
+      echo "ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
+      echo "SERVICE_ROLE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"
+      echo "API_URL=http://127.0.0.1:54321"
+      echo "DB_URL=postgresql://postgres:postgres@127.0.0.1:54322/postgres"
+      echo "STUDIO_URL=http://127.0.0.1:54323"
+    else
+      echo " API URL: http://127.0.0.1:54321"
+      echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
+      echo " Studio URL: http://127.0.0.1:54323"
+      echo " DB: running"
+      echo " Auth: running"
+      echo " REST: running"
+      echo " Realtime: running"
+      echo " Storage: running"
+    fi
+    ;;
+
+  migration)
+    SUBCMD="${1:-}"
+    shift || true
+    case "$SUBCMD" in
+      new)
+        NAME="${1:-migration}"
+        TIMESTAMP=$(date -u +"%Y%m%d%H%M%S")
+        mkdir -p supabase/migrations
+        MIGRATION_FILE="supabase/migrations/${TIMESTAMP}_${NAME}.sql"
+        touch "$MIGRATION_FILE"
+        echo "Created new migration at $MIGRATION_FILE"
+        ;;
+      list)
+        echo "No migrations found."
+        ;;
+      *)
+        echo "supabase migration $SUBCMD: ok"
+        ;;
+    esac
+    ;;
+
+  db)
+    SUBCMD="${1:-}"
+    shift || true
+    case "$SUBCMD" in
+      push)
+        echo "Applying unapplied migrations..."
+        echo "Applied migration(s) successfully."
+        ;;
+      reset)
+        echo "Resetting local database..."
+        echo "Database reset successfully."
+        ;;
+      diff)
+        echo "No schema changes detected."
+        ;;
+      *)
+        echo "supabase db $SUBCMD: ok"
+        ;;
+    esac
+    ;;
+
+  functions)
+    SUBCMD="${1:-}"
+    shift || true
+    case "$SUBCMD" in
+      new)
+        FUNC_NAME="${1:-my-function}"
+        mkdir -p "supabase/functions/$FUNC_NAME"
+        cat > "supabase/functions/$FUNC_NAME/index.ts" << 'TS'
+import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
+
+serve(async (req) => {
+  return new Response(JSON.stringify({ message: "Hello from Edge Functions!" }), {
+    headers: { "Content-Type": "application/json" },
+  })
+})
+TS
+        echo "Created new Function at supabase/functions/$FUNC_NAME"
+        ;;
+      serve)
+        echo "Serving functions on http://127.0.0.1:54321/functions/v1/"
+        ;;
+      deploy)
+        echo "Deployed function successfully."
+        ;;
+      *)
+        echo "supabase functions $SUBCMD: ok"
+        ;;
+    esac
+    ;;
+
+  gen)
+    echo "Generated types successfully."
+    ;;
+
+  link)
+    echo "Linked project: mock-project-ref"
+    ;;
+
+  login)
+    echo "Already logged in."
+    ;;
+
+  *)
+    echo "supabase $CMD: ok"
+    ;;
+esac
diff --git a/packages/evals/src/runner/persist.ts b/packages/evals/src/runner/persist.ts
index fb4ea7f..6694efb 100644
--- a/packages/evals/src/runner/persist.ts
+++ b/packages/evals/src/runner/persist.ts
@@ -7,10 +7,14 @@ import type { TranscriptSummary } from "./transcript.js";
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
 
-/** Resolve the evals package root (packages/evals). */
-function evalsRoot(): string {
-	// __dirname is packages/evals/src/runner
-	return join(__dirname, "..", "..");
+/** Resolve the base directory for storing results.
+ * Supports EVAL_RESULTS_DIR override for Docker volume mounts. */
+function resultsBase(): string {
+	if (process.env.EVAL_RESULTS_DIR) {
+		return process.env.EVAL_RESULTS_DIR;
+	}
+	// Default: packages/evals/results (__dirname is packages/evals/src/runner)
+	return join(__dirname, "..", "..", "results");
 }
 
 /** Create the results directory for a single scenario run. Returns the path. */
@@ -19,7 +23,7 @@ export function createResultDir(
 	scenarioId: string,
 	variant: "with-skill" | "baseline",
 ): string {
-	const dir = join(evalsRoot(), "results", runTimestamp, scenarioId, variant);
+	const dir = join(resultsBase(), runTimestamp, scenarioId, variant);
 	mkdirSync(dir, { recursive: true });
 	return dir;
 }
diff --git a/packages/evals/src/runner/preflight.ts b/packages/evals/src/runner/preflight.ts
index 2c7d8eb..1c3a2a6 100644
--- a/packages/evals/src/runner/preflight.ts
+++ b/packages/evals/src/runner/preflight.ts
@@ -1,8 +1,17 @@
 import { execFileSync } from "node:child_process";
 import { existsSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath } from "node:url";
 
+/**
+ * Detect if we're running inside the eval Docker container.
+ * True when IN_DOCKER is exactly "true"; otherwise falls back to checking
+ * whether the /.dockerenv marker file exists.
+ */
+export function isRunningInDocker(): boolean {
+	return process.env.IN_DOCKER === "true" || existsSync("/.dockerenv");
+}
+
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = dirname(__filename);
 
@@ -66,11 +75,13 @@ export function preflight(): void {
 		errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
 	}
 
-	// Docker daemon running
-	try {
-		execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
-	} catch {
-		errors.push("Docker is not running (required by supabase CLI)");
+	// Docker daemon running (skip when inside the eval container — mocks handle it)
+	if (!isRunningInDocker()) {
+		try {
+			execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
+		} catch {
+			errors.push("Docker is not running (required by supabase CLI)");
+		}
 	}
 
 	// Claude CLI available