load skills through skills CLI

2026-03-27 10:09:26 +08:00 · 2026-02-20 17:41:41 +00:00
parent ce7eb8b28b
commit baf94b04e3
7 changed files with 80 additions and 41 deletions
--- a/packages/evals/AGENTS.md
+++ b/packages/evals/AGENTS.md
@@ -10,7 +10,7 @@ hidden tests check the result. Binary pass/fail.

 ```
 1. Create temp dir with project skeleton (PROMPT.md, supabase/ dir)
-2. Symlink supabase skill into workspace (or skip for baseline)
+2. Install skills via `skills add` CLI (or skip for baseline)
 3. Run: claude -p "prompt" --cwd /tmp/eval-xxx
 4. Agent reads skill, creates migrations/code in the workspace
 5. Copy hidden EVAL.ts into workspace, run vitest
@@ -46,7 +46,7 @@ This prevents the agent from "teaching to the test."
 ## Running Evals

 ```bash
-# Run all scenarios with Claude Sonnet 4.5 (default)
+# Run all scenarios with skills (default)
 mise run eval

 # Run a specific scenario
@@ -55,8 +55,14 @@ EVAL_SCENARIO=auth-rls-new-project mise run eval
 # Override model
 EVAL_MODEL=claude-opus-4-6 mise run eval

-# Run with baseline comparison (with-skill vs without-skill)
+# Run without skills (baseline)
 EVAL_BASELINE=true mise run eval
+
+# Install only a specific skill
+EVAL_SKILL=supabase mise run eval
+
+# Upload results to Braintrust
+mise run eval:upload
 ```

 Or directly:
@@ -65,19 +71,23 @@ Or directly:
 cd packages/evals
 npx tsx src/runner.ts

-# Single scenario with baseline
-EVAL_SCENARIO=auth-rls-new-project EVAL_BASELINE=true npx tsx src/runner.ts
+# Single scenario, baseline mode
+EVAL_BASELINE=true EVAL_SCENARIO=auth-rls-new-project npx tsx src/runner.ts
 ```

-## Baseline Comparison
+## Baseline Mode

-Set `EVAL_BASELINE=true` to run each scenario twice:
+Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default,
+scenarios run with skills installed via the `skills` CLI.

- **With skill**: The supabase skill is symlinked into the workspace. Claude
-  Code discovers it and uses reference files for guidance.
- **Baseline**: No skill available. The agent relies on innate knowledge.
+To compare with-skill vs baseline, run evals twice:

-Compare pass rates to measure how much the skill improves agent output.
+```bash
+mise run eval                        # with skills
+EVAL_BASELINE=true mise run eval     # without skills (baseline)
+```
+
+Compare the results to measure how much skills improve agent output.

 ## Adding Scenarios

@@ -92,7 +102,8 @@ Compare pass rates to measure how much the skill improves agent output.
 ANTHROPIC_API_KEY=sk-ant-...    # Required: Claude Code authentication
 EVAL_MODEL=...                  # Optional: override model (default: claude-sonnet-4-5-20250929)
 EVAL_SCENARIO=...               # Optional: run single scenario
-EVAL_BASELINE=true              # Optional: run baseline comparison
+EVAL_SKILL=...                  # Optional: install only this skill (e.g., "supabase")
+EVAL_BASELINE=true              # Optional: run without skills (baseline mode)
 BRAINTRUST_UPLOAD=true          # Optional: upload results to Braintrust
 ```

--- a/packages/evals/package-lock.json
+++ b/packages/evals/package-lock.json
@@ -10,7 +10,8 @@
 			"license": "MIT",
 			"dependencies": {
 				"@anthropic-ai/claude-code": "^2.1.49",
-				"braintrust": "^3.0.0"
+				"braintrust": "^3.0.0",
+				"skills": "^1.4.0"
 			},
 			"devDependencies": {
 				"@types/node": "^20.10.0",
@@ -3004,6 +3005,19 @@
 			"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
 			"license": "MIT"
 		},
+		"node_modules/skills": {
+			"version": "1.4.0",
+			"resolved": "https://registry.npmjs.org/skills/-/skills-1.4.0.tgz",
+			"integrity": "sha512-uZd/HvRdrcNQjb8dwXlfDfYuSVpeVlEmWgpR8BS2MWA3/bi2eaQIg4RKtSyGU7twPTapW5V4ks/n92w7nugBcQ==",
+			"license": "MIT",
+			"bin": {
+				"add-skill": "bin/cli.mjs",
+				"skills": "bin/cli.mjs"
+			},
+			"engines": {
+				"node": ">=18"
+			}
+		},
 		"node_modules/source-map": {
 			"version": "0.7.6",
 			"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.6.tgz",
--- a/packages/evals/package.json
+++ b/packages/evals/package.json
@@ -11,7 +11,8 @@
 	},
 	"dependencies": {
 		"@anthropic-ai/claude-code": "^2.1.49",
-		"braintrust": "^3.0.0"
+		"braintrust": "^3.0.0",
+		"skills": "^1.4.0"
 	},
 	"devDependencies": {
 		"@types/node": "^20.10.0",
--- a/packages/evals/src/runner.ts
+++ b/packages/evals/src/runner.ts
@@ -19,7 +19,8 @@ const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes

 const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
 const scenarioFilter = process.env.EVAL_SCENARIO;
-const runBaseline = process.env.EVAL_BASELINE === "true";
+const isBaseline = process.env.EVAL_BASELINE === "true";
+const skillEnabled = !isBaseline;

 // Run-level timestamp shared across all scenarios in a single invocation
 const runTimestamp = new Date()
@@ -172,7 +173,7 @@ async function main() {

 	console.log("Supabase Skills Evals");
 	console.log(`Model: ${model}`);
-	console.log(`Baseline: ${runBaseline}`);
+	console.log(`Mode: ${isBaseline ? "baseline (no skills)" : "with skills"}`);

 	let scenarios = discoverScenarios();

@@ -189,15 +190,8 @@ async function main() {
 	const results: EvalRunResult[] = [];

 	for (const scenario of scenarios) {
-		// Run with skill enabled
-		const withSkill = await runEval(scenario, true);
-		results.push(withSkill);
-
-		// Optionally run baseline (no skill)
-		if (runBaseline) {
-			const baseline = await runEval(scenario, false);
-			results.push(baseline);
-		}
+		const result = await runEval(scenario, skillEnabled);
+		results.push(result);
 	}

 	// Use the results dir from the first result (all share the same timestamp)
--- a/packages/evals/src/runner/agent.ts
+++ b/packages/evals/src/runner/agent.ts
@@ -23,8 +23,8 @@ export interface AgentRunResult {
 * including tool calls, results, and reasoning steps.
 *
 * The agent operates in the workspace directory and can read/write files.
- * When the skill is installed (symlinked into workspace), Claude Code
- * discovers it automatically and uses it for guidance.
+ * When skills are installed (via the `skills` CLI), Claude Code
+ * discovers them automatically and uses them for guidance.
 */
 export async function runAgent(opts: {
 	cwd: string;
--- a/packages/evals/src/runner/results.ts
+++ b/packages/evals/src/runner/results.ts
@@ -17,7 +17,8 @@ export function listModifiedFiles(
 		for (const entry of entries) {
 			if (
 				entry.name === "node_modules" ||
-				entry.name === "skills" ||
+				entry.name === ".agents" ||
+				entry.name === ".claude" ||
 				entry.name === "EVAL.ts" ||
 				entry.name === "EVAL.tsx"
 			)
--- a/packages/evals/src/runner/scaffold.ts
+++ b/packages/evals/src/runner/scaffold.ts
@@ -1,13 +1,20 @@
-import {
-	cpSync,
-	existsSync,
-	mkdtempSync,
-	readdirSync,
-	rmSync,
-	symlinkSync,
-} from "node:fs";
+import { execFileSync } from "node:child_process";
+import { cpSync, existsSync, mkdtempSync, readdirSync, rmSync } from "node:fs";
 import { tmpdir } from "node:os";
-import { join, resolve } from "node:path";
+import { dirname, join, resolve } from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+/** Resolve the path to the `skills` CLI under the evals package's node_modules/.bin; throws an Error if no file exists there. */
+function resolveSkillsBin(): string {
+	// __dirname is packages/evals/src/runner/ (or the compiled equivalent), so the two
+	// ".." segments reach packages/evals/; the binary is then node_modules/.bin/skills.
+	const bin = resolve(__dirname, "..", "..", "node_modules", ".bin", "skills");
+	if (existsSync(bin)) return bin;
+	throw new Error(`skills binary not found at ${bin}. Run npm install.`);
+}

 /** Walk up from cwd to find the repository root (contains skills/ and packages/). */
 function findRepoRoot(): string {
@@ -27,7 +34,7 @@ function findRepoRoot(): string {
 * Create an isolated workspace for an eval run.
 *
 * 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
- * 2. Optionally symlink the supabase skill so Claude Code can discover it
+ * 2. Optionally install skills via the `skills` CLI so Claude Code can discover them
 *
 * Returns the path to the workspace and a cleanup function.
 */
@@ -47,12 +54,23 @@ export function createWorkspace(opts: {
 		cpSync(src, dest, { recursive: true });
 	}

-	// Make the skill available to the agent by symlinking the skills dir
+	// Install skills into the workspace via the `skills` CLI
 	if (opts.skillEnabled) {
 		const skillsDir = join(repoRoot, "skills");
 		if (existsSync(skillsDir)) {
-			const destSkills = join(workspacePath, "skills");
-			symlinkSync(skillsDir, destSkills);
+			const skillsBin = resolveSkillsBin();
+			const args = ["add", skillsDir, "-a", "claude-code", "-y"];
+
+			const skillFilter = process.env.EVAL_SKILL;
+			if (skillFilter) {
+				args.push("--skill", skillFilter);
+			}
+
+			execFileSync(skillsBin, args, {
+				cwd: workspacePath,
+				stdio: "pipe",
+				timeout: 60_000,
+			});
 		}
 	}