Files
supabase-postgres-best-prac…/packages/evals/src/runner/test.ts
Pedro Rodrigues 9b08864e94 feat(evals): replace mock CLIs with real Supabase instance per eval run
Start a shared local Supabase stack once before all scenarios and reset
the database (drop/recreate public schema + clear migration history) between
each run. This lets agents apply migrations via `supabase db push` against a
real Postgres instance instead of mock shell scripts.

- Add supabase-setup.ts: startSupabase / stopSupabase / resetDB / getKeys
- Update runner.ts to start/stop Supabase and inject keys into process.env
- Update agent.ts to point MCP config at the local Supabase HTTP endpoint
- Update preflight.ts to check supabase CLI availability and Docker socket
- Update scaffold.ts to seed workspace with supabase/config.toml
- Add passThreshold support (test.ts / results.ts / types.ts) for partial pass
- Delete mock shell scripts (mocks/docker, mocks/psql, mocks/supabase)
- Update Dockerfile/docker-compose to mount Docker socket for supabase CLI

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-25 14:39:54 +00:00

144 lines
4.5 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { execFile } from "node:child_process";
import { copyFileSync, existsSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { promisify } from "node:util";
// This file is an ES module (it reads import.meta.url), where Node does not
// provide __filename/__dirname, so both are derived from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Promise-returning wrapper around execFile so the child process can be awaited.
const exec = promisify(execFile);
/** Outcome of one eval test run, as parsed from vitest's textual output. */
export interface TestResult {
/** Overall verdict, computed by the parser from the counts below and the optional pass threshold */
passed: boolean;
/** Raw text the result was parsed from — normally vitest's stdout followed by its stderr */
output: string;
/** Number of tests that passed */
passedCount: number;
/** Total number of tests */
totalCount: number;
/** Per-test pass/fail extracted from vitest verbose output, keyed by test name */
individualTests: Record<string, boolean>;
}
/**
 * Run the hidden EVAL.ts tests against the agent's workspace.
 *
 * 1. Copy EVAL.ts (or EVAL.tsx) into the workspace (agent is done, safe to
 *    expose), plus the shared eval-utils.ts when one exists two directories
 *    above the eval file.
 * 2. Write a minimal vitest.config.mjs unless the workspace already has one.
 * 3. Run vitest against the eval file, preferring the workspace's own vitest
 *    binary and falling back to the one installed with the evals package.
 * 4. Parse the output for pass/fail.
 *
 * A failing, timed-out or unspawnable vitest process does not reject: whatever
 * output was captured is parsed instead. When nothing at all was captured,
 * the error message is used as the output so the failure is still explainable.
 * Errors from the synchronous file copies/writes are not caught here.
 *
 * @param opts.workspacePath Directory the agent worked in; tests run with this as cwd.
 * @param opts.evalFilePath Path of the hidden eval file to copy in.
 * @param opts.passThreshold Optional pass threshold, forwarded to the output parser.
 * @returns The parsed test result.
 */
export async function runTests(opts: {
  workspacePath: string;
  evalFilePath: string;
  passThreshold?: number;
}): Promise<TestResult> {
  // Copy the hidden test file into the workspace
  const evalFileName = opts.evalFilePath.endsWith(".tsx")
    ? "EVAL.tsx"
    : "EVAL.ts";
  const destPath = join(opts.workspacePath, evalFileName);
  copyFileSync(opts.evalFilePath, destPath);

  // Copy shared eval-utils.ts if it exists alongside the eval scenarios
  const evalUtilsSrc = join(
    dirname(dirname(opts.evalFilePath)),
    "eval-utils.ts",
  );
  const evalUtilsDest = join(opts.workspacePath, "eval-utils.ts");
  if (existsSync(evalUtilsSrc)) {
    copyFileSync(evalUtilsSrc, evalUtilsDest);
  }

  // Write a minimal vitest config that overrides the default include pattern
  // so EVAL.ts (without .test. or .spec.) is picked up.
  const vitestConfigPath = join(opts.workspacePath, "vitest.config.mjs");
  if (!existsSync(vitestConfigPath)) {
    // Alias ../eval-utils.ts → ./eval-utils.ts so the import resolves in
    // the flat workspace (source tree has EVAL.ts one level deeper).
    const aliasBlock = existsSync(evalUtilsDest)
      ? `resolve: { alias: { "../eval-utils.ts": "./eval-utils.ts" } },`
      : "";
    writeFileSync(
      vitestConfigPath,
      `export default { ${aliasBlock} test: { include: ["EVAL.{ts,tsx}"] } };\n`,
    );
  }

  // Use the vitest binary from the evals package (always available) unless
  // the workspace ships its own.
  const evalsVitest = join(
    __dirname,
    "..",
    "..",
    "node_modules",
    ".bin",
    "vitest",
  );
  const vitestBin = join(opts.workspacePath, "node_modules", ".bin", "vitest");
  const cmd = existsSync(vitestBin) ? vitestBin : evalsVitest;
  const args = ["run", evalFileName, "--reporter=verbose", "--no-color"];

  let output: string;
  try {
    const { stdout, stderr } = await exec(cmd, args, {
      cwd: opts.workspacePath,
      timeout: 60_000,
      env: { ...process.env },
      maxBuffer: 5 * 1024 * 1024,
    });
    output = `${stdout}\n${stderr}`;
  } catch (error) {
    // A non-zero exit (failing tests) lands here too; the rejection carries
    // whatever the child printed before it ended.
    const err = error as Error & { stdout?: string; stderr?: string };
    output = `${err.stdout ?? ""}\n${err.stderr ?? ""}`;
    if (output.trim() === "") {
      // Nothing was printed (e.g. the binary could not be spawned or was
      // killed early): keep the reason instead of returning a blank output.
      output = error instanceof Error ? error.message : String(error);
    }
  }
  return parseTestOutput(output, opts.passThreshold);
}
/**
 * Extract per-test pass/fail from vitest verbose output.
 *
 * Vitest verbose format:
 *   ✓ EVAL.ts > test name here 0ms → passed
 *   × EVAL.ts > test name here 2ms → failed
 *
 * @param output Raw vitest output text.
 * @returns Map from trimmed test name to whether it passed. If the same name
 *   appears more than once, the last occurrence wins.
 */
function parseIndividualTests(output: string): Record<string, boolean> {
  const results: Record<string, boolean> = {};
  // Group 1 is the status glyph, group 2 the test name (lazy, so it stops at
  // the first whitespace-separated "<digits>ms").
  const re = /([✓×])\s+EVAL\.tsx?\s+>\s+(.+?)\s+\d+ms/g;
  for (const match of output.matchAll(re)) {
    const [, glyph, rawName] = match;
    results[rawName.trim()] = glyph === "✓";
  }
  return results;
}

/**
 * Turn raw vitest output into a TestResult.
 *
 * Counts come from vitest's "Tests ..." summary line; when no summary line is
 * found both counts are 0 and the run is reported as failed.
 *
 * @param output Raw vitest output text (stored unchanged on the result).
 * @param passThreshold Minimum number of passing tests required. When omitted,
 *   every test must pass. An explicit 0 is honoured as a threshold of zero.
 * @returns The verdict, the counts, and the per-test breakdown.
 */
function parseTestOutput(output: string, passThreshold?: number): TestResult {
  // Vitest summary formats:
  //   All passing: "Tests N passed (N)"
  //   Mixed:       "Tests N failed | M passed (T)"
  //   All failing: "Tests N failed (N)"
  // Any of them may carry "| K skipped" and/or "| K todo" before the total,
  // so those trailing segments are tolerated.
  const mixedOrPassing = output.match(
    /Tests\s+(?:\d+\s+failed\s+\|\s+)?(\d+)\s+passed(?:\s+\|\s+\d+\s+(?:skipped|todo))*\s+\((\d+)\)/,
  );
  const allFailing = output.match(
    /Tests\s+\d+\s+failed(?:\s+\|\s+\d+\s+(?:skipped|todo))*\s+\((\d+)\)/,
  );

  let passedCount = 0;
  let totalCount = 0;
  if (mixedOrPassing) {
    passedCount = Number.parseInt(mixedOrPassing[1], 10);
    totalCount = Number.parseInt(mixedOrPassing[2], 10);
  } else if (allFailing) {
    totalCount = Number.parseInt(allFailing[1], 10);
  }

  // Compare against undefined rather than relying on truthiness, so that a
  // threshold of 0 is not mistaken for "no threshold".
  const meetsBar =
    passThreshold === undefined
      ? passedCount === totalCount
      : passedCount >= passThreshold;
  // A run in which no tests were counted never passes.
  const passed = totalCount > 0 && meetsBar;

  const individualTests = parseIndividualTests(output);
  return { passed, output, passedCount, totalCount, individualTests };
}