mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
containerize eval environment with Docker and mock CLIs
Host now only needs Docker + ANTHROPIC_API_KEY to run evals. Adds multi-stage Dockerfile, mock supabase/docker/psql scripts, entrypoint, docker-compose for local use, and switches CI to Docker-based execution. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
69
packages/evals/Dockerfile
Normal file
69
packages/evals/Dockerfile
Normal file
@@ -0,0 +1,69 @@
|
||||
# Eval image for packages/evals.
#
# Stage 1 installs dependencies and builds the skills; stage 2 is the runtime
# image that carries the built tree plus mock supabase/docker/psql CLIs and
# runs as the non-root "node" user.
#
# Build from the repository root, e.g.:
#   docker build -f packages/evals/Dockerfile .

# ---------- Stage 1: builder ----------
FROM node:22-slim AS builder

RUN apt-get update && apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy root package files first (layer caching)
COPY package.json package-lock.json ./

# Copy workspace package files
COPY packages/skills-build/package.json packages/skills-build/
COPY packages/evals/package.json packages/evals/

# Install all dependencies
RUN npm install && \
    npm --prefix packages/skills-build install && \
    npm --prefix packages/evals install

# Copy source code
COPY skills/ skills/
COPY packages/skills-build/ packages/skills-build/
COPY packages/evals/ packages/evals/

# Build skills (generates AGENTS.md / CLAUDE.md files)
RUN npm --prefix packages/skills-build run build

# ---------- Stage 2: runtime ----------
FROM node:22-slim

RUN apt-get update && apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Use the existing node user (UID 1000) — Claude Code refuses --dangerously-skip-permissions as root
# node:22-slim already ships with user "node" (uid=1000, gid=1000)

# Copy built artifacts from builder
COPY --from=builder /app/package.json /app/package-lock.json ./
COPY --from=builder /app/node_modules/ node_modules/
COPY --from=builder /app/skills/ skills/
COPY --from=builder /app/packages/skills-build/ packages/skills-build/
COPY --from=builder /app/packages/evals/ packages/evals/

# Install mock scripts
COPY packages/evals/mocks/supabase /usr/local/bin/supabase
COPY packages/evals/mocks/docker /usr/local/bin/docker
COPY packages/evals/mocks/psql /usr/local/bin/psql
RUN chmod +x /usr/local/bin/supabase /usr/local/bin/docker /usr/local/bin/psql

# Install entrypoint
COPY packages/evals/docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

# Create results directories writable by node user.
# /app/packages/evals/results is the runner's default location; /app/results is
# the EVAL_RESULTS_DIR that docker-compose.yml sets, so it must also be writable
# when no host directory is mounted over it.
RUN mkdir -p /app/packages/evals/results /app/results && \
    chown -R node:node /app/packages/evals/results /app/results

# Ensure node user owns tmp and home for Claude Code
RUN mkdir -p /tmp && chmod 1777 /tmp && chown -R node:node /home/node

USER node

ENV IN_DOCKER=true
ENV NODE_ENV=production

ENTRYPOINT ["docker-entrypoint.sh"]
CMD ["npm", "--prefix", "packages/evals", "run", "eval"]
|
||||
17
packages/evals/docker-compose.yml
Normal file
17
packages/evals/docker-compose.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Local runner for the eval container.
# Usage (from packages/evals): ANTHROPIC_API_KEY=... docker compose up --build
services:
  evals:
    build:
      # Paths are relative to this file: the context is the repository root so
      # the Dockerfile can COPY skills/ and packages/.
      context: ../..
      dockerfile: packages/evals/Dockerfile
    environment:
      # Required; the container entrypoint exits with an error when it is empty.
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
      # Optional settings default to empty when unset on the host.
      - EVAL_MODEL=${EVAL_MODEL:-}
      - EVAL_SCENARIO=${EVAL_SCENARIO:-}
      - EVAL_BASELINE=${EVAL_BASELINE:-}
      - EVAL_SKILL=${EVAL_SKILL:-}
      - BRAINTRUST_UPLOAD=${BRAINTRUST_UPLOAD:-}
      - BRAINTRUST_API_KEY=${BRAINTRUST_API_KEY:-}
      - BRAINTRUST_PROJECT_ID=${BRAINTRUST_PROJECT_ID:-}
      # Results are written to the mounted volume below.
      - EVAL_RESULTS_DIR=/app/results
    volumes:
      - ./results:/app/results
|
||||
26
packages/evals/docker-entrypoint.sh
Executable file
26
packages/evals/docker-entrypoint.sh
Executable file
@@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
# Entrypoint for the eval Docker container.
# Validates environment, adds mocks to PATH, then runs the given command.
#
# Exits 1 (with a message on stderr) when ANTHROPIC_API_KEY is unset or empty.
set -euo pipefail

export IN_DOCKER=true

# Validate required env
if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
  echo "ERROR: ANTHROPIC_API_KEY is not set." >&2
  echo "Pass it via: docker run -e ANTHROPIC_API_KEY=sk-ant-... ..." >&2
  exit 1
fi

# Prepend mocks directory to PATH so mock supabase/docker/psql are found first
export PATH="/app/packages/evals/mocks:${PATH}"

# Print a short banner describing the environment the evals will run in.
echo "=== Eval Environment ==="
echo " Node: $(node --version)"
# `claude` may not be on PATH; fall back to 'n/a' instead of failing under set -e.
echo " Claude: $(claude --version 2>/dev/null || echo 'n/a')"
echo " Docker: mock"
echo " Model: ${EVAL_MODEL:-default}"
echo " Scenario: ${EVAL_SCENARIO:-all}"
echo "========================"

# Replace this shell with the requested command (the image CMD by default).
exec "$@"
|
||||
27
packages/evals/mocks/docker
Executable file
27
packages/evals/mocks/docker
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
# Mock Docker CLI for eval environments.
# Returns success for common commands the agent may invoke.
# Nothing is actually executed; every invocation exits 0.
set -euo pipefail

# First argument is the docker subcommand (empty when none was given).
CMD="${1:-}"
shift || true

case "$CMD" in
  ps)
    # Header row only, i.e. no containers are listed.
    echo "CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES"
    ;;
  exec)
    # Consume flags until we hit something that isn't a flag
    while [[ "${1:-}" == -* ]]; do shift || true; done
    # Remaining args are container + command — just succeed silently
    ;;
  info)
    echo "Server Version: 24.0.0 (mock)"
    ;;
  compose)
    echo "docker compose: ok"
    ;;
  *)
    # Default: succeed silently
    ;;
esac
|
||||
15
packages/evals/mocks/psql
Executable file
15
packages/evals/mocks/psql
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash
# Mock psql for eval environments.
# Accepts any arguments, never connects to a database, and always exits 0.
set -euo pipefail

# If an inline command is supplied (-c, or the long forms --command /
# --command=...), print the "(0 rows)" footer of an empty result set.
for arg in "$@"; do
  case "$arg" in
    -c | --command | --command=*)
      echo "(0 rows)"
      exit 0
      ;;
  esac
done

# Default: succeed silently
exit 0
|
||||
161
packages/evals/mocks/supabase
Executable file
161
packages/evals/mocks/supabase
Executable file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env bash
# Mock Supabase CLI for eval environments.
# Returns realistic output so the agent doesn't retry, and creates real
# migration files when asked.
#
# Files are created relative to the current working directory:
#   init           -> supabase/config.toml, supabase/migrations/, supabase/functions/
#   migration new  -> supabase/migrations/<UTC timestamp>_<name>.sql (empty)
#   functions new  -> supabase/functions/<name>/index.ts
# Every other command only prints a canned message and exits 0.
set -euo pipefail

# Demo JWTs printed by both `start` and `status -o env`.
ANON_KEY="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0"
SERVICE_ROLE_KEY="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU"

# First argument is the top-level command (empty when none was given).
CMD="${1:-}"
shift || true

case "$CMD" in
  init)
    mkdir -p supabase/migrations supabase/functions
    # Quoted delimiter: the heredoc body is written verbatim, no expansion.
    cat > supabase/config.toml << 'TOML'
[project]
id = "mock-project-ref"

[api]
enabled = true
port = 54321
schemas = ["public", "graphql_public"]

[db]
port = 54322
major_version = 15

[studio]
enabled = true
port = 54323
TOML
    echo "Finished supabase init."
    ;;

  start)
    echo "Applying migration 00000000000000_init.sql..."
    echo "Started supabase local development setup."
    echo ""
    echo " API URL: http://127.0.0.1:54321"
    echo " GraphQL URL: http://127.0.0.1:54321/graphql/v1"
    echo " S3 Storage URL: http://127.0.0.1:54321/storage/v1/s3"
    echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
    echo " Studio URL: http://127.0.0.1:54323"
    echo " Inbucket URL: http://127.0.0.1:54324"
    echo " JWT secret: super-secret-jwt-token-with-at-least-32-characters-long"
    echo " anon key: ${ANON_KEY}"
    echo "service_role key: ${SERVICE_ROLE_KEY}"
    echo " S3 Access Key: 625729a08b95bf1b7ff351a663f3a23c"
    echo " S3 Secret Key: 850181e4652dd023b7a98c58ae0d2d34bd487ee0cc3254aed6eda37307425907"
    echo " S3 Region: local"
    ;;

  stop)
    echo "Stopped supabase local development setup."
    ;;

  status)
    # Only the exact leading form `status -o env` selects the env-style output.
    if [[ "${1:-}" == "-o" && "${2:-}" == "env" ]]; then
      echo "ANON_KEY=${ANON_KEY}"
      echo "SERVICE_ROLE_KEY=${SERVICE_ROLE_KEY}"
      echo "API_URL=http://127.0.0.1:54321"
      echo "DB_URL=postgresql://postgres:postgres@127.0.0.1:54322/postgres"
      echo "STUDIO_URL=http://127.0.0.1:54323"
    else
      echo " API URL: http://127.0.0.1:54321"
      echo " DB URL: postgresql://postgres:postgres@127.0.0.1:54322/postgres"
      echo " Studio URL: http://127.0.0.1:54323"
      echo " DB: running"
      echo " Auth: running"
      echo " REST: running"
      echo " Realtime: running"
      echo " Storage: running"
    fi
    ;;

  migration)
    SUBCMD="${1:-}"
    shift || true
    case "$SUBCMD" in
      new)
        # Name defaults to "migration" when none is given.
        NAME="${1:-migration}"
        TIMESTAMP=$(date -u +"%Y%m%d%H%M%S")
        mkdir -p supabase/migrations
        MIGRATION_FILE="supabase/migrations/${TIMESTAMP}_${NAME}.sql"
        touch "$MIGRATION_FILE"
        echo "Created new migration at $MIGRATION_FILE"
        ;;
      list)
        echo "No migrations found."
        ;;
      *)
        echo "supabase migration $SUBCMD: ok"
        ;;
    esac
    ;;

  db)
    SUBCMD="${1:-}"
    shift || true
    case "$SUBCMD" in
      push)
        echo "Applying unapplied migrations..."
        echo "Applied migration(s) successfully."
        ;;
      reset)
        echo "Resetting local database..."
        echo "Database reset successfully."
        ;;
      diff)
        echo "No schema changes detected."
        ;;
      *)
        echo "supabase db $SUBCMD: ok"
        ;;
    esac
    ;;

  functions)
    SUBCMD="${1:-}"
    shift || true
    case "$SUBCMD" in
      new)
        # Name defaults to "my-function" when none is given.
        FUNC_NAME="${1:-my-function}"
        mkdir -p "supabase/functions/$FUNC_NAME"
        # Quoted delimiter: the template is written verbatim, no expansion.
        cat > "supabase/functions/$FUNC_NAME/index.ts" << 'TS'
import { serve } from "https://deno.land/std@0.168.0/http/server.ts"

serve(async (req) => {
  return new Response(JSON.stringify({ message: "Hello from Edge Functions!" }), {
    headers: { "Content-Type": "application/json" },
  })
})
TS
        echo "Created new Function at supabase/functions/$FUNC_NAME"
        ;;
      serve)
        echo "Serving functions on http://127.0.0.1:54321/functions/v1/<function-name>"
        ;;
      deploy)
        echo "Deployed function successfully."
        ;;
      *)
        echo "supabase functions $SUBCMD: ok"
        ;;
    esac
    ;;

  gen)
    echo "Generated types successfully."
    ;;

  link)
    echo "Linked project: mock-project-ref"
    ;;

  login)
    echo "Already logged in."
    ;;

  *)
    # Unknown commands still succeed so the agent does not retry.
    echo "supabase $CMD: ok"
    ;;
esac
|
||||
@@ -7,10 +7,14 @@ import type { TranscriptSummary } from "./transcript.js";
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
/** Resolve the evals package root (packages/evals). */
|
||||
function evalsRoot(): string {
|
||||
// __dirname is packages/evals/src/runner
|
||||
return join(__dirname, "..", "..");
|
||||
/**
 * Resolve the base directory for storing results.
 *
 * Supports an EVAL_RESULTS_DIR override for Docker volume mounts.
 *
 * @returns The value of EVAL_RESULTS_DIR when it is set and non-empty;
 *   otherwise the `results` directory two levels above this module's directory.
 */
function resultsBase(): string {
  const override = process.env.EVAL_RESULTS_DIR;
  if (override) {
    return override;
  }
  // Default: packages/evals/results (__dirname is packages/evals/src/runner)
  return join(__dirname, "..", "..", "results");
}
|
||||
|
||||
/** Create the results directory for a single scenario run. Returns the path. */
|
||||
@@ -19,7 +23,7 @@ export function createResultDir(
|
||||
scenarioId: string,
|
||||
variant: "with-skill" | "baseline",
|
||||
): string {
|
||||
const dir = join(evalsRoot(), "results", runTimestamp, scenarioId, variant);
|
||||
const dir = join(resultsBase(), runTimestamp, scenarioId, variant);
|
||||
mkdirSync(dir, { recursive: true });
|
||||
return dir;
|
||||
}
|
||||
|
||||
@@ -1,8 +1,19 @@
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { existsSync } from "node:fs";
|
||||
import { accessSync, constants, existsSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
/**
 * Detect if we're running inside the eval Docker container.
 *
 * @returns `true` when the IN_DOCKER environment variable is exactly `"true"`,
 *   or when `/.dockerenv` is accessible; `false` otherwise.
 */
export function isRunningInDocker(): boolean {
  if (process.env.IN_DOCKER === "true") return true;
  try {
    // accessSync throws when the marker file is missing.
    accessSync("/.dockerenv", constants.F_OK);
    return true;
  } catch {
    return false;
  }
}
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
@@ -66,11 +77,13 @@ export function preflight(): void {
|
||||
errors.push(`Node.js >= 20 required (found ${process.versions.node})`);
|
||||
}
|
||||
|
||||
// Docker daemon running
|
||||
try {
|
||||
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
|
||||
} catch {
|
||||
errors.push("Docker is not running (required by supabase CLI)");
|
||||
// Docker daemon running (skip when inside the eval container — mocks handle it)
|
||||
if (!isRunningInDocker()) {
|
||||
try {
|
||||
execFileSync("docker", ["info"], { stdio: "ignore", timeout: 10_000 });
|
||||
} catch {
|
||||
errors.push("Docker is not running (required by supabase CLI)");
|
||||
}
|
||||
}
|
||||
|
||||
// Claude CLI available
|
||||
|
||||
Reference in New Issue
Block a user