containerize eval environment with Docker and mock CLIs

Host now only needs Docker + ANTHROPIC_API_KEY to run evals. Adds
multi-stage Dockerfile, mock supabase/docker/psql scripts, entrypoint,
docker-compose for local use, and switches CI to Docker-based execution.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pedro Rodrigues
2026-02-23 19:22:47 +00:00
parent 93a49374de
commit 3c3d1f55ca
11 changed files with 414 additions and 20 deletions

View File

@@ -54,3 +54,38 @@ sources = ["packages/evals/src/**", "packages/evals/evals/**"]
description = "Run workflow evals and upload to Braintrust"
run = "npm --prefix packages/evals run eval:upload"
sources = ["packages/evals/src/**", "packages/evals/evals/**"]
# ── Docker eval tasks ────────────────────────────────────────────────
# Builds the image that the other eval:docker tasks run. The build context is
# the current directory (`.`) and the Dockerfile is read from packages/evals/;
# the result is tagged supabase-evals:local.
[tasks."eval:docker:build"]
description = "Build the eval Docker image"
run = "docker build -t supabase-evals:local -f packages/evals/Dockerfile ."

# Runs the supabase-evals:local image in a throwaway container (`--rm`) after
# the build task has completed.
[tasks."eval:docker"]
description = "Run evals in Docker"
depends = ["eval:docker:build"]
# Each trailing backslash below is a TOML line-ending backslash: the parser
# drops the newline and the next line's leading whitespace, so the value is a
# single `docker run` command line.
#
# A bare `-e NAME` carries no value of its own; only EVAL_RESULTS_DIR is set
# explicitly, to the container path where the host's packages/evals/results
# directory is bind-mounted.
run = """
docker run --rm \
  -e ANTHROPIC_API_KEY \
  -e EVAL_MODEL \
  -e EVAL_SCENARIO \
  -e EVAL_BASELINE \
  -e EVAL_SKILL \
  -e BRAINTRUST_UPLOAD \
  -e BRAINTRUST_API_KEY \
  -e BRAINTRUST_PROJECT_ID \
  -e EVAL_RESULTS_DIR=/app/results \
  -v "$(pwd)/packages/evals/results:/app/results" \
  supabase-evals:local
"""

# Starts an interactive (`-it`) throwaway container from supabase-evals:local
# with /bin/bash given as the command, after the build task has completed.
[tasks."eval:docker:shell"]
description = "Open a debug shell in the eval container"
depends = ["eval:docker:build"]
# Trailing backslashes are TOML line-ending backslashes, so the value is one
# command line. IN_DOCKER is set to the literal "true", and the host's
# packages/evals/results directory is bind-mounted at /app/results.
run = """
docker run --rm -it \
  -e ANTHROPIC_API_KEY \
  -e IN_DOCKER=true \
  -v "$(pwd)/packages/evals/results:/app/results" \
  supabase-evals:local /bin/bash
"""