mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
load skills through skills CLI
This commit is contained in:
@@ -10,7 +10,7 @@ hidden tests check the result. Binary pass/fail.
|
|||||||
|
|
||||||
```
|
```
|
||||||
1. Create temp dir with project skeleton (PROMPT.md, supabase/ dir)
|
1. Create temp dir with project skeleton (PROMPT.md, supabase/ dir)
|
||||||
2. Symlink supabase skill into workspace (or skip for baseline)
|
2. Install skills via `skills add` CLI (or skip for baseline)
|
||||||
3. Run: claude -p "prompt" --cwd /tmp/eval-xxx
|
3. Run: claude -p "prompt" --cwd /tmp/eval-xxx
|
||||||
4. Agent reads skill, creates migrations/code in the workspace
|
4. Agent reads skill, creates migrations/code in the workspace
|
||||||
5. Copy hidden EVAL.ts into workspace, run vitest
|
5. Copy hidden EVAL.ts into workspace, run vitest
|
||||||
@@ -46,7 +46,7 @@ This prevents the agent from "teaching to the test."
|
|||||||
## Running Evals
|
## Running Evals
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Run all scenarios with Claude Sonnet 4.5 (default)
|
# Run all scenarios with skills (default)
|
||||||
mise run eval
|
mise run eval
|
||||||
|
|
||||||
# Run a specific scenario
|
# Run a specific scenario
|
||||||
@@ -55,8 +55,14 @@ EVAL_SCENARIO=auth-rls-new-project mise run eval
|
|||||||
# Override model
|
# Override model
|
||||||
EVAL_MODEL=claude-opus-4-6 mise run eval
|
EVAL_MODEL=claude-opus-4-6 mise run eval
|
||||||
|
|
||||||
# Run with baseline comparison (with-skill vs without-skill)
|
# Run without skills (baseline)
|
||||||
EVAL_BASELINE=true mise run eval
|
EVAL_BASELINE=true mise run eval
|
||||||
|
|
||||||
|
# Install only a specific skill
|
||||||
|
EVAL_SKILL=supabase mise run eval
|
||||||
|
|
||||||
|
# Upload results to Braintrust
|
||||||
|
mise run eval:upload
|
||||||
```
|
```
|
||||||
|
|
||||||
Or directly:
|
Or directly:
|
||||||
@@ -65,19 +71,23 @@ Or directly:
|
|||||||
cd packages/evals
|
cd packages/evals
|
||||||
npx tsx src/runner.ts
|
npx tsx src/runner.ts
|
||||||
|
|
||||||
# Single scenario with baseline
|
# Single scenario, baseline mode
|
||||||
EVAL_SCENARIO=auth-rls-new-project EVAL_BASELINE=true npx tsx src/runner.ts
|
EVAL_BASELINE=true EVAL_SCENARIO=auth-rls-new-project npx tsx src/runner.ts
|
||||||
```
|
```
|
||||||
|
|
||||||
## Baseline Comparison
|
## Baseline Mode
|
||||||
|
|
||||||
Set `EVAL_BASELINE=true` to run each scenario twice:
|
Set `EVAL_BASELINE=true` to run scenarios **without** skills. By default,
|
||||||
|
scenarios run with skills installed via the `skills` CLI.
|
||||||
|
|
||||||
- **With skill**: The supabase skill is symlinked into the workspace. Claude
|
To compare with-skill vs baseline, run evals twice:
|
||||||
Code discovers it and uses reference files for guidance.
|
|
||||||
- **Baseline**: No skill available. The agent relies on innate knowledge.
|
|
||||||
|
|
||||||
Compare pass rates to measure how much the skill improves agent output.
|
```bash
|
||||||
|
mise run eval # with skills
|
||||||
|
EVAL_BASELINE=true mise run eval # without skills (baseline)
|
||||||
|
```
|
||||||
|
|
||||||
|
Compare the results to measure how much skills improve agent output.
|
||||||
|
|
||||||
## Adding Scenarios
|
## Adding Scenarios
|
||||||
|
|
||||||
@@ -92,7 +102,8 @@ Compare pass rates to measure how much the skill improves agent output.
|
|||||||
ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication
|
ANTHROPIC_API_KEY=sk-ant-... # Required: Claude Code authentication
|
||||||
EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929)
|
EVAL_MODEL=... # Optional: override model (default: claude-sonnet-4-5-20250929)
|
||||||
EVAL_SCENARIO=... # Optional: run single scenario
|
EVAL_SCENARIO=... # Optional: run single scenario
|
||||||
EVAL_BASELINE=true # Optional: run baseline comparison
|
EVAL_SKILL=... # Optional: install only this skill (e.g., "supabase")
|
||||||
|
EVAL_BASELINE=true # Optional: run without skills (baseline mode)
|
||||||
BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust
|
BRAINTRUST_UPLOAD=true # Optional: upload results to Braintrust
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
16
packages/evals/package-lock.json
generated
16
packages/evals/package-lock.json
generated
@@ -10,7 +10,8 @@
|
|||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@anthropic-ai/claude-code": "^2.1.49",
|
"@anthropic-ai/claude-code": "^2.1.49",
|
||||||
"braintrust": "^3.0.0"
|
"braintrust": "^3.0.0",
|
||||||
|
"skills": "^1.4.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^20.10.0",
|
"@types/node": "^20.10.0",
|
||||||
@@ -3004,6 +3005,19 @@
|
|||||||
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/skills": {
|
||||||
|
"version": "1.4.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/skills/-/skills-1.4.0.tgz",
|
||||||
|
"integrity": "sha512-uZd/HvRdrcNQjb8dwXlfDfYuSVpeVlEmWgpR8BS2MWA3/bi2eaQIg4RKtSyGU7twPTapW5V4ks/n92w7nugBcQ==",
|
||||||
|
"license": "MIT",
|
||||||
|
"bin": {
|
||||||
|
"add-skill": "bin/cli.mjs",
|
||||||
|
"skills": "bin/cli.mjs"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/source-map": {
|
"node_modules/source-map": {
|
||||||
"version": "0.7.6",
|
"version": "0.7.6",
|
||||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.6.tgz",
|
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.6.tgz",
|
||||||
|
|||||||
@@ -11,7 +11,8 @@
|
|||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@anthropic-ai/claude-code": "^2.1.49",
|
"@anthropic-ai/claude-code": "^2.1.49",
|
||||||
"braintrust": "^3.0.0"
|
"braintrust": "^3.0.0",
|
||||||
|
"skills": "^1.4.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^20.10.0",
|
"@types/node": "^20.10.0",
|
||||||
|
|||||||
@@ -19,7 +19,8 @@ const AGENT_TIMEOUT = 30 * 60 * 1000; // 30 minutes
|
|||||||
|
|
||||||
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
|
const model = process.env.EVAL_MODEL ?? DEFAULT_MODEL;
|
||||||
const scenarioFilter = process.env.EVAL_SCENARIO;
|
const scenarioFilter = process.env.EVAL_SCENARIO;
|
||||||
const runBaseline = process.env.EVAL_BASELINE === "true";
|
const isBaseline = process.env.EVAL_BASELINE === "true";
|
||||||
|
const skillEnabled = !isBaseline;
|
||||||
|
|
||||||
// Run-level timestamp shared across all scenarios in a single invocation
|
// Run-level timestamp shared across all scenarios in a single invocation
|
||||||
const runTimestamp = new Date()
|
const runTimestamp = new Date()
|
||||||
@@ -172,7 +173,7 @@ async function main() {
|
|||||||
|
|
||||||
console.log("Supabase Skills Evals");
|
console.log("Supabase Skills Evals");
|
||||||
console.log(`Model: ${model}`);
|
console.log(`Model: ${model}`);
|
||||||
console.log(`Baseline: ${runBaseline}`);
|
console.log(`Mode: ${isBaseline ? "baseline (no skills)" : "with skills"}`);
|
||||||
|
|
||||||
let scenarios = discoverScenarios();
|
let scenarios = discoverScenarios();
|
||||||
|
|
||||||
@@ -189,15 +190,8 @@ async function main() {
|
|||||||
const results: EvalRunResult[] = [];
|
const results: EvalRunResult[] = [];
|
||||||
|
|
||||||
for (const scenario of scenarios) {
|
for (const scenario of scenarios) {
|
||||||
// Run with skill enabled
|
const result = await runEval(scenario, skillEnabled);
|
||||||
const withSkill = await runEval(scenario, true);
|
results.push(result);
|
||||||
results.push(withSkill);
|
|
||||||
|
|
||||||
// Optionally run baseline (no skill)
|
|
||||||
if (runBaseline) {
|
|
||||||
const baseline = await runEval(scenario, false);
|
|
||||||
results.push(baseline);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use the results dir from the first result (all share the same timestamp)
|
// Use the results dir from the first result (all share the same timestamp)
|
||||||
|
|||||||
@@ -23,8 +23,8 @@ export interface AgentRunResult {
|
|||||||
* including tool calls, results, and reasoning steps.
|
* including tool calls, results, and reasoning steps.
|
||||||
*
|
*
|
||||||
* The agent operates in the workspace directory and can read/write files.
|
* The agent operates in the workspace directory and can read/write files.
|
||||||
* When the skill is installed (symlinked into workspace), Claude Code
|
* When skills are installed (via the `skills` CLI), Claude Code
|
||||||
* discovers it automatically and uses it for guidance.
|
* discovers them automatically and uses them for guidance.
|
||||||
*/
|
*/
|
||||||
export async function runAgent(opts: {
|
export async function runAgent(opts: {
|
||||||
cwd: string;
|
cwd: string;
|
||||||
|
|||||||
@@ -17,7 +17,8 @@ export function listModifiedFiles(
|
|||||||
for (const entry of entries) {
|
for (const entry of entries) {
|
||||||
if (
|
if (
|
||||||
entry.name === "node_modules" ||
|
entry.name === "node_modules" ||
|
||||||
entry.name === "skills" ||
|
entry.name === ".agents" ||
|
||||||
|
entry.name === ".claude" ||
|
||||||
entry.name === "EVAL.ts" ||
|
entry.name === "EVAL.ts" ||
|
||||||
entry.name === "EVAL.tsx"
|
entry.name === "EVAL.tsx"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,13 +1,20 @@
|
|||||||
import {
|
import { execFileSync } from "node:child_process";
|
||||||
cpSync,
|
import { cpSync, existsSync, mkdtempSync, readdirSync, rmSync } from "node:fs";
|
||||||
existsSync,
|
|
||||||
mkdtempSync,
|
|
||||||
readdirSync,
|
|
||||||
rmSync,
|
|
||||||
symlinkSync,
|
|
||||||
} from "node:fs";
|
|
||||||
import { tmpdir } from "node:os";
|
import { tmpdir } from "node:os";
|
||||||
import { join, resolve } from "node:path";
|
import { dirname, join, resolve } from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
|
||||||
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
|
const __dirname = dirname(__filename);
|
||||||
|
|
||||||
|
/** Resolve the `skills` binary from the evals package node_modules. */
|
||||||
|
function resolveSkillsBin(): string {
|
||||||
|
// __dirname is packages/evals/src/runner/ (or compiled equivalent)
|
||||||
|
// Walk up to packages/evals/ and into node_modules/.bin/skills
|
||||||
|
const bin = resolve(__dirname, "..", "..", "node_modules", ".bin", "skills");
|
||||||
|
if (existsSync(bin)) return bin;
|
||||||
|
throw new Error(`skills binary not found at ${bin}. Run npm install.`);
|
||||||
|
}
|
||||||
|
|
||||||
/** Walk up from cwd to find the repository root (contains skills/ and packages/). */
|
/** Walk up from cwd to find the repository root (contains skills/ and packages/). */
|
||||||
function findRepoRoot(): string {
|
function findRepoRoot(): string {
|
||||||
@@ -27,7 +34,7 @@ function findRepoRoot(): string {
|
|||||||
* Create an isolated workspace for an eval run.
|
* Create an isolated workspace for an eval run.
|
||||||
*
|
*
|
||||||
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
|
* 1. Copy the eval directory to a temp folder (excluding EVAL.ts)
|
||||||
* 2. Optionally symlink the supabase skill so Claude Code can discover it
|
* 2. Optionally install skills via the `skills` CLI so Claude Code can discover them
|
||||||
*
|
*
|
||||||
* Returns the path to the workspace and a cleanup function.
|
* Returns the path to the workspace and a cleanup function.
|
||||||
*/
|
*/
|
||||||
@@ -47,12 +54,23 @@ export function createWorkspace(opts: {
|
|||||||
cpSync(src, dest, { recursive: true });
|
cpSync(src, dest, { recursive: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make the skill available to the agent by symlinking the skills dir
|
// Install skills into the workspace via the `skills` CLI
|
||||||
if (opts.skillEnabled) {
|
if (opts.skillEnabled) {
|
||||||
const skillsDir = join(repoRoot, "skills");
|
const skillsDir = join(repoRoot, "skills");
|
||||||
if (existsSync(skillsDir)) {
|
if (existsSync(skillsDir)) {
|
||||||
const destSkills = join(workspacePath, "skills");
|
const skillsBin = resolveSkillsBin();
|
||||||
symlinkSync(skillsDir, destSkills);
|
const args = ["add", skillsDir, "-a", "claude-code", "-y"];
|
||||||
|
|
||||||
|
const skillFilter = process.env.EVAL_SKILL;
|
||||||
|
if (skillFilter) {
|
||||||
|
args.push("--skill", skillFilter);
|
||||||
|
}
|
||||||
|
|
||||||
|
execFileSync(skillsBin, args, {
|
||||||
|
cwd: workspacePath,
|
||||||
|
stdio: "pipe",
|
||||||
|
timeout: 60_000,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user