From baf94b04e3ff3e02890d1cf05e8478988faebcdb Mon Sep 17 00:00:00 2001 From: Pedro Rodrigues Date: Fri, 20 Feb 2026 17:41:41 +0000 Subject: [PATCH] load skills through skills CLI --- packages/evals/AGENTS.md | 35 +++++++++++++-------- packages/evals/package-lock.json | 16 +++++++++- packages/evals/package.json | 3 +- packages/evals/src/runner.ts | 16 +++------- packages/evals/src/runner/agent.ts | 4 +-- packages/evals/src/runner/results.ts | 3 +- packages/evals/src/runner/scaffold.ts | 44 +++++++++++++++++++-------- 7 files changed, 80 insertions(+), 41 deletions(-) diff --git a/packages/evals/AGENTS.md b/packages/evals/AGENTS.md index aeebdd7..622828c 100644 --- a/packages/evals/AGENTS.md +++ b/packages/evals/AGENTS.md @@ -10,7 +10,7 @@ hidden tests check the result. Binary pass/fail. ``` 1. Create temp dir with project skeleton (PROMPT.md, supabase/ dir) -2. Symlink supabase skill into workspace (or skip for baseline) +2. Install skills via `skills add` CLI (or skip for baseline) 3. Run: claude -p "prompt" --cwd /tmp/eval-xxx 4. Agent reads skill, creates migrations/code in the workspace 5. Copy hidden EVAL.ts into workspace, run vitest @@ -46,7 +46,7 @@ This prevents the agent from "teaching to the test." ## Running Evals ```bash -# Run all scenarios with Claude Sonnet 4.5 (default) +# Run all scenarios with skills (default) mise run eval # Run a specific scenario @@ -55,8 +55,14 @@ EVAL_SCENARIO=auth-rls-new-project mise run eval # Override model EVAL_MODEL=claude-opus-4-6 mise run eval -# Run with baseline comparison (with-skill vs without-skill) +# Run without skills (baseline) EVAL_BASELINE=true mise run eval + +# Install only a specific skill +EVAL_SKILL=supabase mise run eval + +# Upload results to Braintrust +mise run eval:upload ``` Or directly: @@ -65,19 +71,23 @@ Or directly: cd packages/evals npx tsx src/runner.ts -# Single scenario with baseline -EVAL_SCENARIO=auth-rls-new-project EVAL_BASELINE=true npx tsx src/runner.ts +# Single scenario, baseline mode +EVAL_BASELINE=true EVAL_SCENARIO=auth-rls-new-project npx tsx src/runner.ts ``` -## Baseline Comparison +## Baseline Mode -Set `EVAL_BASELINE=true` to run each scenario twice: +Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default, +scenarios run with skills installed via the `skills` CLI. -- **With skill**: The supabase skill is symlinked into the workspace. Claude - Code discovers it and uses reference files for guidance. -- **Baseline**: No skill available. The agent relies on innate knowledge. +To compare with-skill vs baseline, run evals twice: -Compare pass rates to measure how much the skill improves agent output. +```bash +mise run eval # with skills +EVAL_BASELINE=true mise run eval # without skills (baseline) +``` + +Compare the results to measure how much skills improve agent output. ## Adding Scenarios @@ -92,7 +102,8 @@ Compare pass rates to measure how much the skill improves agent output. ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929) EVAL_SCENARIO=... # Optional: run single scenario -EVAL_BASELINE=true # Optional: run baseline comparison +EVAL_SKILL=... # Optional: install only this skill (e.g., "supabase") +EVAL_BASELINE=true # Optional: run without skills (baseline mode) BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust ``` diff --git a/packages/evals/package-lock.json b/packages/evals/package-lock.json index fd8428c..24ee6ea 100644 --- a/packages/evals/package-lock.json +++ b/packages/evals/package-lock.json @@ -10,7 +10,8 @@ "license": "MIT", "dependencies": { "@anthropic-ai/claude-code": "^2.1.49", - "braintrust": "^3.0.0" + "braintrust": "^3.0.0", + "skills": "^1.4.0" }, "devDependencies": { "@types/node": "^20.10.0", @@ -3004,6 +3005,19 @@ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, + "node_modules/skills": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/skills/-/skills-1.4.0.tgz", + "integrity": "sha512-uZd/HvRdrcNQjb8dwXlfDfYuSVpeVlEmWgpR8BS2MWA3/bi2eaQIg4RKtSyGU7twPTapW5V4ks/n92w7nugBcQ==", + "license": "MIT", + "bin": { + "add-skill": "bin/cli.mjs", + "skills": "bin/cli.mjs" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/source-map": { "version": "0.7.6", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.6.tgz", diff --git a/packages/evals/package.json b/packages/evals/package.json index 9909434..84e9d30 100644 --- a/packages/evals/package.json +++ b/packages/evals/package.json @@ -11,7 +11,8 @@ }, "dependencies": { "@anthropic-ai/claude-code": "^2.1.49", - "braintrust": "^3.0.0" + "braintrust": "^3.0.0", + "skills": "^1.4.0" }, "devDependencies": { "@types/node": "^20.10.0", diff --git a/packages/evals/src/runner.ts b/packages/evals/src/runner.ts index 404568e..9bfd53c 100644 --- a/packages/evals/src/runner.ts +++ b/packages/evals/src/runner.ts @@ -19,7 +19,8 @@ const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL; const scenarioFilter = process.env.EVAL_SCENARIO; -const runBaseline = process.env.EVAL_BASELINE === "true"; +const isBaseline = process.env.EVAL_BASELINE === "true"; +const skillEnabled = !isBaseline; // Run-level timestamp shared across all scenarios in a single invocation const runTimestamp = new Date() @@ -172,7 +173,7 @@ async function main() { console.log("Supabase Skills Evals"); console.log(`Model: ${model}`); - console.log(`Baseline: ${runBaseline}`); + console.log(`Mode: ${isBaseline ? "baseline (no skills)" : "with skills"}`); let scenarios = discoverScenarios(); @@ -189,15 +190,8 @@ async function main() { const results: EvalRunResult[] = []; for (const scenario of scenarios) { - // Run with skill enabled - const withSkill = await runEval(scenario, true); - results.push(withSkill); - - // Optionally run baseline (no skill) - if (runBaseline) { - const baseline = await runEval(scenario, false); - results.push(baseline); - } + const result = await runEval(scenario, skillEnabled); + results.push(result); } // Use the results dir from the first result (all share the same timestamp) diff --git a/packages/evals/src/runner/agent.ts b/packages/evals/src/runner/agent.ts index 694f82f..9c42677 100644 --- a/packages/evals/src/runner/agent.ts +++ b/packages/evals/src/runner/agent.ts @@ -23,8 +23,8 @@ export interface AgentRunResult { * including tool calls, results, and reasoning steps. * * The agent operates in the workspace directory and can read/write files. - * When the skill is installed (symlinked into workspace), Claude Code - * discovers it automatically and uses it for guidance. + * When skills are installed (via the `skills` CLI), Claude Code + * discovers them automatically and uses them for guidance. */ export async function runAgent(opts: { cwd: string; diff --git a/packages/evals/src/runner/results.ts b/packages/evals/src/runner/results.ts index 1e17cfa..901ed3a 100644 --- a/packages/evals/src/runner/results.ts +++ b/packages/evals/src/runner/results.ts @@ -17,7 +17,8 @@ export function listModifiedFiles( for (const entry of entries) { if ( entry.name === "node_modules" || - entry.name === "skills" || + entry.name === ".agents" || + entry.name === ".claude" || entry.name === "EVAL.ts" || entry.name === "EVAL.tsx" ) diff --git a/packages/evals/src/runner/scaffold.ts b/packages/evals/src/runner/scaffold.ts index 0b0d940..81caeea 100644 --- a/packages/evals/src/runner/scaffold.ts +++ b/packages/evals/src/runner/scaffold.ts @@ -1,13 +1,20 @@ -import { - cpSync, - existsSync, - mkdtempSync, - readdirSync, - rmSync, - symlinkSync, -} from "node:fs"; +import { execFileSync } from "node:child_process"; +import { cpSync, existsSync, mkdtempSync, readdirSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; -import { join, resolve } from "node:path"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +/** Resolve the `skills` binary from the evals package node_modules. */ +function resolveSkillsBin(): string { + // __dirname is packages/evals/src/runner/ (or compiled equivalent) + // Walk up to packages/evals/ and into node_modules/.bin/skills + const bin = resolve(__dirname, "..", "..", "node_modules", ".bin", "skills"); + if (existsSync(bin)) return bin; + throw new Error(`skills binary not found at ${bin}. Run npm install.`); +} /** Walk up from cwd to find the repository root (contains skills/ and packages/). */ function findRepoRoot(): string { @@ -27,7 +34,7 @@ function findRepoRoot(): string { * Create an isolated workspace for an eval run. * * 1. Copy the eval directory to a temp folder (excluding EVAL.ts) - * 2. Optionally symlink the supabase skill so Claude Code can discover it + * 2. Optionally install skills via the `skills` CLI so Claude Code can discover them * * Returns the path to the workspace and a cleanup function. */ @@ -47,12 +54,23 @@ export function createWorkspace(opts: { cpSync(src, dest, { recursive: true }); } - // Make the skill available to the agent by symlinking the skills dir + // Install skills into the workspace via the `skills` CLI if (opts.skillEnabled) { const skillsDir = join(repoRoot, "skills"); if (existsSync(skillsDir)) { - const destSkills = join(workspacePath, "skills"); - symlinkSync(skillsDir, destSkills); + const skillsBin = resolveSkillsBin(); + const args = ["add", skillsDir, "-a", "claude-code", "-y"]; + + const skillFilter = process.env.EVAL_SKILL; + if (skillFilter) { + args.push("--skill", skillFilter); + } + + execFileSync(skillsBin, args, { + cwd: workspacePath, + stdio: "pipe", + timeout: 60_000, + }); } }