mirror of
https://github.com/supabase/agent-skills.git
synced 2026-03-27 10:09:26 +08:00
feat: transcript parsing based on vercel-labs/agent-eval
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,6 +1,7 @@
|
|||||||
node_modules/
|
node_modules/
|
||||||
dist/
|
dist/
|
||||||
*.log
|
*.log
|
||||||
|
evals/output/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
.env
|
.env
|
||||||
|
|
||||||
|
|||||||
100
evals/main.ts
100
evals/main.ts
@@ -1,63 +1,7 @@
|
|||||||
import { spawnSync } from "node:child_process";
|
import { spawnSync } from "node:child_process";
|
||||||
import { mkdtempSync, readdirSync, readFileSync } from "node:fs";
|
import { mkdirSync, readdirSync, readFileSync } from "node:fs";
|
||||||
import os from "node:os";
|
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import { z } from "zod";
|
import { parseClaudeCodeTranscript } from "./parse-transcript";
|
||||||
|
|
||||||
/**
|
|
||||||
* Transcript schemas
|
|
||||||
*/
|
|
||||||
|
|
||||||
const TextBlock = z.object({ type: z.literal("text"), text: z.string() });
|
|
||||||
|
|
||||||
const ThinkingBlock = z.object({
|
|
||||||
type: z.literal("thinking"),
|
|
||||||
thinking: z.string(),
|
|
||||||
});
|
|
||||||
|
|
||||||
const ToolUseBlock = z.object({
|
|
||||||
type: z.literal("tool_use"),
|
|
||||||
id: z.string(),
|
|
||||||
name: z.string(),
|
|
||||||
input: z.record(z.string(), z.unknown()),
|
|
||||||
});
|
|
||||||
|
|
||||||
const ToolResultBlock = z.object({
|
|
||||||
type: z.literal("tool_result"),
|
|
||||||
tool_use_id: z.string(),
|
|
||||||
content: z.unknown(),
|
|
||||||
is_error: z.boolean().optional(),
|
|
||||||
});
|
|
||||||
|
|
||||||
const ContentBlock = z.union([
|
|
||||||
ToolUseBlock,
|
|
||||||
ThinkingBlock,
|
|
||||||
TextBlock,
|
|
||||||
ToolResultBlock,
|
|
||||||
z.looseObject({ type: z.string() }), // catch-all for unknown block types
|
|
||||||
]);
|
|
||||||
|
|
||||||
const AssistantEntry = z.object({
|
|
||||||
type: z.literal("assistant"),
|
|
||||||
sessionId: z.string(),
|
|
||||||
timestamp: z.string(),
|
|
||||||
uuid: z.string(),
|
|
||||||
message: z.object({
|
|
||||||
role: z.literal("assistant"),
|
|
||||||
content: z.array(ContentBlock),
|
|
||||||
stop_reason: z.string().nullable().optional(),
|
|
||||||
}),
|
|
||||||
});
|
|
||||||
|
|
||||||
// Catch-all — user messages, queue-operations, etc.
|
|
||||||
const TranscriptLine = z.union([
|
|
||||||
AssistantEntry,
|
|
||||||
z.looseObject({ type: z.string() }),
|
|
||||||
]);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Config
|
|
||||||
*/
|
|
||||||
|
|
||||||
const apiKey = process.env.ANTHROPIC_API_KEY;
|
const apiKey = process.env.ANTHROPIC_API_KEY;
|
||||||
if (!apiKey) throw new Error("ANTHROPIC_API_KEY required");
|
if (!apiKey) throw new Error("ANTHROPIC_API_KEY required");
|
||||||
@@ -75,23 +19,23 @@ SELECT * FROM orders WHERE user_id = 123 AND status = 'pending';
|
|||||||
|
|
||||||
What indexes should I add and why?`;
|
What indexes should I add and why?`;
|
||||||
|
|
||||||
/**
|
// Mount ~/.claude/projects to capture the built-in session transcript.
|
||||||
* Run the eval
|
// Written to a fixed path so you can inspect it after the run (e.g. in VSCode).
|
||||||
*/
|
const projectsDir = path.join(repoRoot, "evals", "output");
|
||||||
|
mkdirSync(projectsDir, { recursive: true });
|
||||||
// Mount ~/.claude/projects to capture the built-in session transcript
|
|
||||||
const projectsDir = mkdtempSync(path.join(os.tmpdir(), "eval-projects-"));
|
|
||||||
|
|
||||||
const result = spawnSync(
|
const result = spawnSync(
|
||||||
"docker",
|
"docker",
|
||||||
[
|
[
|
||||||
"run",
|
"run",
|
||||||
"--rm",
|
"--rm",
|
||||||
"-e",
|
"--workdir",
|
||||||
|
"/eval",
|
||||||
|
"--env",
|
||||||
`ANTHROPIC_API_KEY=${apiKey}`,
|
`ANTHROPIC_API_KEY=${apiKey}`,
|
||||||
"-v",
|
"--volume",
|
||||||
`${skillPath}:/home/claude/.claude/skills/supabase-postgres-best-practices:ro`, // :ro = read-only snapshot
|
`${skillPath}:/home/claude/.claude/skills/supabase-postgres-best-practices:ro`, // :ro = read-only snapshot
|
||||||
"-v",
|
"--volume",
|
||||||
`${projectsDir}:/home/claude/.claude/projects`,
|
`${projectsDir}:/home/claude/.claude/projects`,
|
||||||
"evals-claude",
|
"evals-claude",
|
||||||
"claude",
|
"claude",
|
||||||
@@ -106,26 +50,14 @@ if (result.status !== 0) {
|
|||||||
throw new Error(result.stderr || `Exit code ${result.status}`);
|
throw new Error(result.stderr || `Exit code ${result.status}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// Container's working dir is /eval, which becomes `eval` in the projects path
|
||||||
* Parse the transcript
|
const transcriptDir = path.join(projectsDir, "eval");
|
||||||
*/
|
|
||||||
|
|
||||||
// Container's working dir is /, which becomes `-` in the projects path
|
|
||||||
const transcriptDir = path.join(projectsDir, "-");
|
|
||||||
const [transcriptFile] = readdirSync(transcriptDir).filter((f) =>
|
const [transcriptFile] = readdirSync(transcriptDir).filter((f) =>
|
||||||
f.endsWith(".jsonl"),
|
f.endsWith(".jsonl"),
|
||||||
);
|
);
|
||||||
|
|
||||||
// Single typed array — all transcript entries parsed and validated
|
const transcript = parseClaudeCodeTranscript(
|
||||||
const transcript = readFileSync(
|
readFileSync(path.join(transcriptDir, transcriptFile), "utf-8"),
|
||||||
path.join(transcriptDir, transcriptFile),
|
);
|
||||||
"utf-8",
|
|
||||||
)
|
|
||||||
.split("\n")
|
|
||||||
.filter(Boolean)
|
|
||||||
.flatMap((l) => {
|
|
||||||
const parsed = TranscriptLine.safeParse(JSON.parse(l));
|
|
||||||
return parsed.success ? [parsed.data] : [];
|
|
||||||
});
|
|
||||||
|
|
||||||
console.log(JSON.stringify(transcript, null, 2));
|
console.log(JSON.stringify(transcript, null, 2));
|
||||||
|
|||||||
373
evals/parse-transcript.ts
Normal file
373
evals/parse-transcript.ts
Normal file
@@ -0,0 +1,373 @@
|
|||||||
|
/**
|
||||||
|
* Parser for Claude Code transcript format.
|
||||||
|
* Claude Code stores transcripts as JSONL at ~/.claude/projects/{path}/{session}.jsonl
|
||||||
|
*
|
||||||
|
* Format reference (based on Claude Code CLI output):
|
||||||
|
* - Messages have type: "user" | "assistant"
|
||||||
|
* - Tool use appears in assistant messages with tool_use blocks
|
||||||
|
* - Tool results appear either as tool_result blocks inside user messages or as separate messages with type: "tool_result" / "tool_response"
|
||||||
|
*
|
||||||
|
* Adapted from https://github.com/vercel-labs/agent-eval
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Canonical, agent-independent tool names.
 *
 * Agent-specific tool identifiers (for example "Read", "Bash", "WebFetch")
 * are normalized to one of these values; anything unrecognized becomes
 * "unknown".
 */
export type ToolName =
  // File access
  | "file_read"
  | "file_write"
  | "file_edit"
  // Command execution
  | "shell"
  // Web access
  | "web_fetch"
  | "web_search"
  // Search / navigation
  | "glob"
  | "grep"
  | "list_dir"
  // Sub-agents and skills
  | "agent_task"
  | "skill"
  // Fallback for names with no known mapping
  | "unknown";
|
||||||
|
|
||||||
|
/**
 * One normalized event produced from a transcript line.
 *
 * A single transcript line can yield several events (for example an assistant
 * entry containing text, tool calls and thinking).
 */
export interface TranscriptEvent {
  /** Timestamp copied from the source entry, when it has one. */
  timestamp?: string;
  /** Discriminates what this event represents. */
  type: "message" | "tool_call" | "tool_result" | "thinking" | "error";
  /** Who produced the text; only meaningful for "message" events. */
  role?: "user" | "assistant" | "system";
  /** Textual payload of message, thinking and error events. */
  content?: string;
  /** Tool details; present on "tool_call" and "tool_result" events. */
  tool?: {
    /** Canonical name after normalization. */
    name: ToolName;
    /** Name exactly as reported by the agent. */
    originalName: string;
    /** Arguments passed to the tool. */
    args?: Record<string, unknown>;
    /** Whatever the tool returned. */
    result?: unknown;
    /** Execution time in milliseconds (not set by the parsing code visible here). */
    durationMs?: number;
    /** False when the result was flagged as an error. */
    success?: boolean;
  };
  /** Source data the event was derived from, kept for debugging. */
  raw?: unknown;
}
|
||||||
|
|
||||||
|
/**
 * Agent-specific tool names mapped to their canonical ToolName.
 *
 * Stored in a Map rather than an object literal so that names which happen to
 * match inherited object members ("constructor", "toString", "__proto__", ...)
 * are not mistaken for known tools. Built once instead of on every call.
 */
const TOOL_NAME_ALIASES: ReadonlyMap<string, ToolName> = new Map<
  string,
  ToolName
>([
  // File operations
  ["Read", "file_read"],
  ["read_file", "file_read"],
  ["ReadFile", "file_read"],
  ["Write", "file_write"],
  ["write_file", "file_write"],
  ["WriteFile", "file_write"],
  ["write_to_file", "file_write"],
  ["Edit", "file_edit"],
  ["edit_file", "file_edit"],
  ["EditFile", "file_edit"],
  ["str_replace_editor", "file_edit"],
  ["StrReplace", "file_edit"],
  // Shell
  ["Bash", "shell"],
  ["bash", "shell"],
  ["Shell", "shell"],
  ["shell", "shell"],
  ["execute_command", "shell"],
  ["run_command", "shell"],
  // Web
  ["WebFetch", "web_fetch"],
  ["web_fetch", "web_fetch"],
  ["fetch_url", "web_fetch"],
  ["mcp__fetch__fetch", "web_fetch"],
  ["WebSearch", "web_search"],
  ["web_search", "web_search"],
  // Search/navigation
  ["Glob", "glob"],
  ["glob", "glob"],
  ["list_files", "glob"],
  ["Grep", "grep"],
  ["grep", "grep"],
  ["search_files", "grep"],
  ["LS", "list_dir"],
  ["list_dir", "list_dir"],
  ["ListDir", "list_dir"],
  // Agent/subagent tools
  ["Task", "agent_task"],
  ["task", "agent_task"],
  // Skills
  ["Skill", "skill"],
  ["skill", "skill"],
]);

/**
 * Map an agent-specific tool name to its canonical name.
 *
 * @param name Tool name exactly as it appears in the transcript (case-sensitive).
 * @returns The canonical name, or "unknown" when the name has no mapping.
 */
function normalizeToolName(name: string): ToolName {
  return TOOL_NAME_ALIASES.get(name) ?? "unknown";
}
|
||||||
|
|
||||||
|
/**
 * Find the file path among a tool call's arguments.
 *
 * Checks `path`, `file_path`, `filename` and `file`, in that order, and
 * returns the first one holding a non-empty string. Values of any other type
 * are skipped instead of being returned under a `string` type.
 *
 * @param args Tool call arguments.
 * @returns The file path, or undefined when no candidate key holds one.
 */
function extractFilePath(args: Record<string, unknown>): string | undefined {
  for (const key of ["path", "file_path", "filename", "file"]) {
    const value = args[key];
    if (typeof value === "string" && value !== "") return value;
  }
  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Find the URL among a tool call's arguments.
 *
 * Checks `url`, `uri` and `href`, in that order, and returns the first one
 * holding a non-empty string. Values of any other type are skipped instead of
 * being returned under a `string` type.
 *
 * @param args Tool call arguments.
 * @returns The URL, or undefined when no candidate key holds one.
 */
function extractUrl(args: Record<string, unknown>): string | undefined {
  for (const key of ["url", "uri", "href"]) {
    const value = args[key];
    if (typeof value === "string" && value !== "") return value;
  }
  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Pull the shell command out of a tool call's arguments.
 *
 * `command` wins when it is a string, or an array (joined with single
 * spaces); otherwise a string `cmd` is used.
 *
 * @param args Tool call arguments.
 * @returns The command line, or undefined when none of the forms is present.
 */
function extractCommand(args: Record<string, unknown>): string | undefined {
  const { command, cmd } = args;
  if (typeof command === "string") return command;
  if (Array.isArray(command)) return command.join(" ");
  return typeof cmd === "string" ? cmd : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Locate the array of content blocks for a transcript entry.
 *
 * The array may sit directly on the entry (`data.content`) or be nested in
 * Claude Code's wrapper: { type: "assistant", message: { content: [...] } }.
 * The top-level array takes precedence.
 *
 * @param data Parsed transcript entry.
 * @returns The content array, or undefined when neither location holds one.
 */
function getContentArray(data: Record<string, unknown>): unknown[] | undefined {
  const nested = (data.message as Record<string, unknown> | null | undefined)
    ?.content;
  for (const candidate of [data.content, nested]) {
    if (Array.isArray(candidate)) return candidate;
  }
  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Read an entry's content when it is a plain string.
 *
 * Looks at `data.content` first and then at the nested `data.message.content`.
 *
 * @param data Parsed transcript entry.
 * @returns The string content (possibly empty), or undefined when neither
 *   location holds a string.
 */
function getStringContent(data: Record<string, unknown>): string | undefined {
  const direct = data.content;
  if (typeof direct === "string") return direct;

  const nested = (data.message as Record<string, unknown> | null | undefined)
    ?.content;
  return typeof nested === "string" ? nested : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Produce the text of a transcript entry.
 *
 * Sources are tried in this order:
 *  1. a non-empty string content (top-level or nested under `message`);
 *  2. the `text` of every "text" block in the content array, joined by newlines;
 *  3. a top-level string `text` field.
 *
 * @param data Parsed transcript entry.
 * @returns The text, or undefined when none of the sources applies.
 */
function extractContent(data: Record<string, unknown>): string | undefined {
  const direct = getStringContent(data);
  if (direct) return direct;

  const texts: unknown[] = [];
  for (const entry of getContentArray(data) ?? []) {
    const block = entry as Record<string, unknown>;
    if (block.type === "text") texts.push(block.text);
  }
  if (texts.length > 0) return texts.join("\n");

  return typeof data.text === "string" ? data.text : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Decode the arguments of an OpenAI-style tool call.
 *
 * Accepts either a JSON-encoded string or an already-decoded object. Invalid
 * JSON and values that are not plain objects yield undefined instead of
 * throwing, so one bad call cannot abort parsing of the whole line.
 */
function parseToolCallArguments(
  raw: unknown,
): Record<string, unknown> | undefined {
  let value = raw;
  if (typeof raw === "string") {
    try {
      value = JSON.parse(raw);
    } catch {
      return undefined;
    }
  }
  return typeof value === "object" && value !== null && !Array.isArray(value)
    ? (value as Record<string, unknown>)
    : undefined;
}

/**
 * Collect every tool invocation found in a transcript entry.
 *
 * Two shapes are recognized:
 *  - "tool_use" blocks in the content array, reported with `input`;
 *  - an OpenAI-style `tool_calls` array (top-level or under `message`),
 *    reported with `args`.
 *
 * @param data Parsed transcript entry.
 * @returns The tool invocations in the order encountered: content blocks
 *   first, then `tool_calls`. Empty when there are none.
 */
function extractToolUses(
  data: Record<string, unknown>,
): Array<{ name: string; input?: Record<string, unknown>; args?: Record<string, unknown> }> {
  const toolUses: Array<{
    name: string;
    input?: Record<string, unknown>;
    args?: Record<string, unknown>;
  }> = [];

  for (const block of getContentArray(data) ?? []) {
    // Skip entries that are not objects rather than throwing on them.
    if (typeof block !== "object" || block === null) continue;
    const b = block as Record<string, unknown>;
    if (b.type === "tool_use") {
      toolUses.push({
        name: b.name as string,
        input: b.input as Record<string, unknown> | undefined,
      });
    }
  }

  // Also handle OpenAI-style tool_calls array
  const toolCalls =
    data.tool_calls ||
    (data.message as Record<string, unknown> | null | undefined)?.tool_calls;
  if (Array.isArray(toolCalls)) {
    for (const call of toolCalls) {
      if (typeof call !== "object" || call === null) continue;
      const c = call as Record<string, unknown>;
      const func = c.function as Record<string, unknown> | undefined;
      // `function.arguments` takes precedence; otherwise fall back to the
      // call's own `arguments` / `input`.
      const rawArgs = func?.arguments ? func.arguments : c.arguments || c.input;
      toolUses.push({
        name: (func?.name || c.name) as string,
        args: parseToolCallArguments(rawArgs),
      });
    }
  }

  return toolUses;
}
|
||||||
|
|
||||||
|
/**
 * Gather the model's thinking text from a transcript entry.
 *
 * Every "thinking" block in the content array contributes its `thinking`
 * field, falling back to `text`; the pieces are joined by newlines.
 *
 * @param data Parsed transcript entry.
 * @returns The joined thinking text, or undefined when the entry has no
 *   thinking blocks.
 */
function extractThinking(data: Record<string, unknown>): string | undefined {
  const pieces: unknown[] = [];
  for (const entry of getContentArray(data) ?? []) {
    const block = entry as Record<string, unknown>;
    if (block.type === "thinking") pieces.push(block.thinking || block.text);
  }
  return pieces.length > 0 ? pieces.join("\n") : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Convert one JSONL transcript line into zero or more events.
 *
 * Dispatch is on the entry's `type` (or `role`):
 *  - user: one "tool_result" event per tool_result block, otherwise a single
 *    user message;
 *  - assistant: a message (when there is text), one "tool_call" per tool
 *    invocation, and a "thinking" event (when there are thinking blocks);
 *  - tool_result / tool_response: a single "tool_result" event;
 *  - system: a system message;
 *  - error (or any entry with an `error` field): an "error" event.
 * Entries of any other kind produce no events.
 *
 * @param line A single non-empty line of the transcript.
 * @returns The events derived from the line, possibly empty.
 * @throws SyntaxError when the line is not valid JSON, so the caller can
 *   record it instead of the line disappearing silently.
 */
function parseClaudeCodeLine(line: string): TranscriptEvent[] {
  // Deliberately outside the best-effort block below: invalid JSON is a
  // reportable problem, not something to skip quietly.
  const data = JSON.parse(line);
  const events: TranscriptEvent[] = [];

  try {
    if (data.type === "user" || data.role === "user") {
      const contentArray = getContentArray(data);
      const toolResults = contentArray?.filter(
        (block: unknown) =>
          (block as Record<string, unknown>).type === "tool_result",
      );

      if (toolResults && toolResults.length > 0) {
        for (const result of toolResults) {
          const r = result as Record<string, unknown>;
          events.push({
            timestamp: data.timestamp,
            type: "tool_result",
            tool: {
              // The result block only carries the id of the originating call,
              // not the tool's name.
              name: "unknown",
              originalName: (r.tool_use_id || "unknown") as string,
              result: r.content,
              success: !r.is_error && !r.error,
            },
            raw: r,
          });
        }
      } else {
        events.push({
          timestamp: data.timestamp,
          type: "message",
          role: "user",
          content: extractContent(data),
          raw: data,
        });
      }
    } else if (data.type === "assistant" || data.role === "assistant") {
      const content = extractContent(data);
      if (content) {
        events.push({
          timestamp: data.timestamp,
          type: "message",
          role: "assistant",
          content,
          raw: data,
        });
      }

      for (const toolUse of extractToolUses(data)) {
        events.push({
          timestamp: data.timestamp,
          type: "tool_call",
          tool: {
            name: normalizeToolName(toolUse.name),
            originalName: toolUse.name,
            args: toolUse.input || toolUse.args || {},
          },
          raw: toolUse,
        });
      }

      const thinking = extractThinking(data);
      if (thinking) {
        events.push({
          timestamp: data.timestamp,
          type: "thinking",
          content: thinking,
          raw: data,
        });
      }
    } else if (
      data.type === "tool_result" ||
      data.type === "tool_response"
    ) {
      events.push({
        timestamp: data.timestamp,
        type: "tool_result",
        tool: {
          name: "unknown",
          originalName: data.tool_use_id || "unknown",
          result: data.content || data.output || data.result,
          success: !data.is_error && !data.error,
        },
        raw: data,
      });
    } else if (data.type === "system" || data.role === "system") {
      events.push({
        timestamp: data.timestamp,
        type: "message",
        role: "system",
        content: extractContent(data),
        raw: data,
      });
    } else if (data.type === "error" || data.error) {
      events.push({
        timestamp: data.timestamp,
        type: "error",
        content:
          data.error?.message ||
          data.message ||
          JSON.stringify(data.error),
        raw: data,
      });
    }
  } catch {
    // Best effort: an entry with an unexpected shape keeps whatever events
    // were built before the failure.
  }

  return events;
}
|
||||||
|
|
||||||
|
/**
 * Parse a Claude Code JSONL transcript into a flat list of events.
 *
 * Blank lines are ignored. Each remaining line is parsed independently; when
 * parsing a line throws, the failure message is appended to `errors` and
 * processing continues with the next line. Afterwards every tool call is
 * annotated with a convenience field derived from its arguments:
 * `_extractedPath` (file tools), `_extractedUrl` (web fetch) or
 * `_extractedCommand` (shell).
 *
 * @param raw Full transcript contents, one JSON document per line.
 * @returns The events in transcript order, plus per-line failure messages.
 */
export function parseClaudeCodeTranscript(raw: string): {
  events: TranscriptEvent[];
  errors: string[];
} {
  const events: TranscriptEvent[] = [];
  const errors: string[] = [];

  for (const line of raw.split("\n")) {
    if (!line.trim()) continue;
    try {
      events.push(...parseClaudeCodeLine(line));
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      errors.push(`Failed to parse line: ${reason}`);
    }
  }

  // Post-process: surface the key argument of each tool call under a
  // predictable name.
  for (const { type, tool } of events) {
    if (type !== "tool_call" || !tool) continue;
    const args = tool.args || {};

    switch (tool.name) {
      case "file_read":
      case "file_write":
      case "file_edit": {
        const filePath = extractFilePath(args);
        if (filePath) tool.args = { ...args, _extractedPath: filePath };
        break;
      }
      case "web_fetch": {
        const url = extractUrl(args);
        if (url) tool.args = { ...args, _extractedUrl: url };
        break;
      }
      case "shell": {
        const command = extractCommand(args);
        if (command) tool.args = { ...args, _extractedCommand: command };
        break;
      }
      default:
        break;
    }
  }

  return { events, errors };
}
|
||||||
Reference in New Issue
Block a user