From cdd65c52563ba47defe1bd5cc6e58690846b0717 Mon Sep 17 00:00:00 2001 From: Matt Rossman <22670878+mattrossman@users.noreply.github.com> Date: Tue, 3 Mar 2026 17:43:26 -0500 Subject: [PATCH] feat: transcript parsing based on vercel-labs/agent-eval --- .gitignore | 1 + evals/main.ts | 100 ++-------- evals/parse-transcript.ts | 373 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 390 insertions(+), 84 deletions(-) create mode 100644 evals/parse-transcript.ts diff --git a/.gitignore b/.gitignore index 63d70a0..8aeee47 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ node_modules/ dist/ *.log +evals/output/ .DS_Store .env diff --git a/evals/main.ts b/evals/main.ts index b61ec0c..0eaf692 100644 --- a/evals/main.ts +++ b/evals/main.ts @@ -1,63 +1,7 @@ import { spawnSync } from "node:child_process"; -import { mkdtempSync, readdirSync, readFileSync } from "node:fs"; -import os from "node:os"; +import { mkdirSync, readdirSync, readFileSync } from "node:fs"; import path from "node:path"; -import { z } from "zod"; - -/** - * Transcript schemas - */ - -const TextBlock = z.object({ type: z.literal("text"), text: z.string() }); - -const ThinkingBlock = z.object({ - type: z.literal("thinking"), - thinking: z.string(), -}); - -const ToolUseBlock = z.object({ - type: z.literal("tool_use"), - id: z.string(), - name: z.string(), - input: z.record(z.string(), z.unknown()), -}); - -const ToolResultBlock = z.object({ - type: z.literal("tool_result"), - tool_use_id: z.string(), - content: z.unknown(), - is_error: z.boolean().optional(), -}); - -const ContentBlock = z.union([ - ToolUseBlock, - ThinkingBlock, - TextBlock, - ToolResultBlock, - z.looseObject({ type: z.string() }), // catch-all for unknown block types -]); - -const AssistantEntry = z.object({ - type: z.literal("assistant"), - sessionId: z.string(), - timestamp: z.string(), - uuid: z.string(), - message: z.object({ - role: z.literal("assistant"), - content: z.array(ContentBlock), - stop_reason: 
z.string().nullable().optional(), - }), -}); - -// Catch-all — user messages, queue-operations, etc. -const TranscriptLine = z.union([ - AssistantEntry, - z.looseObject({ type: z.string() }), -]); - -/** - * Config - */ +import { parseClaudeCodeTranscript } from "./parse-transcript"; const apiKey = process.env.ANTHROPIC_API_KEY; if (!apiKey) throw new Error("ANTHROPIC_API_KEY required"); @@ -75,23 +19,23 @@ SELECT * FROM orders WHERE user_id = 123 AND status = 'pending'; What indexes should I add and why?`; -/** - * Run the eval - */ - -// Mount ~/.claude/projects to capture the built-in session transcript -const projectsDir = mkdtempSync(path.join(os.tmpdir(), "eval-projects-")); +// Mount ~/.claude/projects to capture the built-in session transcript. +// Written to a fixed path so you can inspect it after the run (e.g. in VSCode). +const projectsDir = path.join(repoRoot, "evals", "output"); +mkdirSync(projectsDir, { recursive: true }); const result = spawnSync( "docker", [ "run", "--rm", - "-e", + "--workdir", + "/eval", + "--env", `ANTHROPIC_API_KEY=${apiKey}`, - "-v", + "--volume", `${skillPath}:/home/claude/.claude/skills/supabase-postgres-best-practices:ro`, // :ro = read-only snapshot - "-v", + "--volume", `${projectsDir}:/home/claude/.claude/projects`, "evals-claude", "claude", @@ -106,26 +50,14 @@ if (result.status !== 0) { throw new Error(result.stderr || `Exit code ${result.status}`); } -/** - * Parse the transcript - */ - -// Container's working dir is /, which becomes `-` in the projects path -const transcriptDir = path.join(projectsDir, "-"); +// Container's working dir is /eval, which becomes `eval` in the projects path +const transcriptDir = path.join(projectsDir, "eval"); const [transcriptFile] = readdirSync(transcriptDir).filter((f) => f.endsWith(".jsonl"), ); -// Single typed array — all transcript entries parsed and validated -const transcript = readFileSync( - path.join(transcriptDir, transcriptFile), - "utf-8", -) - .split("\n") - 
.filter(Boolean) - .flatMap((l) => { - const parsed = TranscriptLine.safeParse(JSON.parse(l)); - return parsed.success ? [parsed.data] : []; - }); +const transcript = parseClaudeCodeTranscript( + readFileSync(path.join(transcriptDir, transcriptFile), "utf-8"), +); console.log(JSON.stringify(transcript, null, 2)); diff --git a/evals/parse-transcript.ts b/evals/parse-transcript.ts new file mode 100644 index 0000000..82a006e --- /dev/null +++ b/evals/parse-transcript.ts @@ -0,0 +1,373 @@ +/** + * Parser for Claude Code transcript format. + * Claude Code stores transcripts as JSONL at ~/.claude/projects/{path}/{session}.jsonl + * + * Format reference (based on Claude Code CLI output): + * - Messages have type: "user" | "assistant" + * - Tool use appears in assistant messages with tool_use blocks + * - Tool results appear as separate messages with type: "tool_result" + * + * Adapted from https://github.com/vercel-labs/agent-eval + */ + +/** Canonical tool names. */ +export type ToolName = + | "file_read" + | "file_write" + | "file_edit" + | "shell" + | "web_fetch" + | "web_search" + | "glob" + | "grep" + | "list_dir" + | "agent_task" + | "skill" + | "unknown"; + +/** A single event in the parsed transcript. 
/* One parsed transcript event; `type` tells which of the optional fields below are filled in. */
export interface TranscriptEvent {
  /** ISO timestamp of the event */
  timestamp?: string;
  /** Event type */
  type: "message" | "tool_call" | "tool_result" | "thinking" | "error";
  /** For message events: the role */
  role?: "user" | "assistant" | "system";
  /** Text content (for messages, thinking, errors) */
  content?: string;
  /** For tool_call and tool_result events */
  tool?: {
    name: ToolName;
    originalName: string;
    args?: Record<string, unknown>;
    result?: unknown;
    durationMs?: number;
    success?: boolean;
  };
  /** Raw event data from the agent (for debugging) */
  raw?: unknown;
}

/**
 * Agent-specific tool names mapped to their canonical `ToolName`.
 *
 * Kept in a `Map` (rather than an object literal) so that lookups only ever
 * see the entries listed here: a tool that happens to be called `toString`
 * or `constructor` must not resolve to an inherited `Object.prototype` member.
 * Built once at module load instead of on every call.
 */
const TOOL_NAME_ALIASES: ReadonlyMap<string, ToolName> = new Map<
  string,
  ToolName
>([
  // File operations
  ["Read", "file_read"],
  ["read_file", "file_read"],
  ["ReadFile", "file_read"],
  ["Write", "file_write"],
  ["write_file", "file_write"],
  ["WriteFile", "file_write"],
  ["write_to_file", "file_write"],
  ["Edit", "file_edit"],
  ["edit_file", "file_edit"],
  ["EditFile", "file_edit"],
  ["str_replace_editor", "file_edit"],
  ["StrReplace", "file_edit"],
  // Shell
  ["Bash", "shell"],
  ["bash", "shell"],
  ["Shell", "shell"],
  ["shell", "shell"],
  ["execute_command", "shell"],
  ["run_command", "shell"],
  // Web
  ["WebFetch", "web_fetch"],
  ["web_fetch", "web_fetch"],
  ["fetch_url", "web_fetch"],
  ["mcp__fetch__fetch", "web_fetch"],
  ["WebSearch", "web_search"],
  ["web_search", "web_search"],
  // Search/navigation
  ["Glob", "glob"],
  ["glob", "glob"],
  ["list_files", "glob"],
  ["Grep", "grep"],
  ["grep", "grep"],
  ["search_files", "grep"],
  ["LS", "list_dir"],
  ["list_dir", "list_dir"],
  ["ListDir", "list_dir"],
  // Agent/subagent tools
  ["Task", "agent_task"],
  ["task", "agent_task"],
  // Skills
  ["Skill", "skill"],
  ["skill", "skill"],
]);

/**
 * Map a tool name as reported by the agent to its canonical `ToolName`.
 *
 * @param name - Tool name exactly as it appears in the transcript (case-sensitive).
 * @returns The canonical name, or `"unknown"` when the name is not in the alias table.
 */
function normalizeToolName(name: string): ToolName {
  return TOOL_NAME_ALIASES.get(name) ?? "unknown";
}

/**
 * Return the value of the first key in `keys` that holds a non-empty string.
 * Non-string values (numbers, objects, ...) are skipped rather than being
 * passed off as strings.
 */
function firstNonEmptyString(
  args: Record<string, unknown>,
  keys: readonly string[],
): string | undefined {
  for (const key of keys) {
    const value = args[key];
    if (typeof value === "string" && value !== "") return value;
  }
  return undefined;
}

/**
 * Pull a file path out of tool-call arguments.
 *
 * @param args - Arguments of a tool call.
 * @returns The first non-empty string found under `path`, `file_path`,
 *   `filename` or `file` (checked in that order), or `undefined`.
 */
function extractFilePath(args: Record<string, unknown>): string | undefined {
  return firstNonEmptyString(args, ["path", "file_path", "filename", "file"]);
}

/**
 * Pull a URL out of tool-call arguments.
 *
 * @param args - Arguments of a tool call.
 * @returns The first non-empty string found under `url`, `uri` or `href`
 *   (checked in that order), or `undefined`.
 */
function extractUrl(args: Record<string, unknown>): string | undefined {
  return firstNonEmptyString(args, ["url", "uri", "href"]);
}

/** Narrow an arbitrary JSON value to a plain (non-array, non-null) object. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Pull a shell command out of tool-call arguments.
 *
 * @param args - Arguments of a tool call.
 * @returns `args.command` when it is a string, its elements joined with a
 *   single space when it is an array, otherwise `args.cmd` when that is a
 *   string, otherwise `undefined`.
 */
function extractCommand(args: Record<string, unknown>): string | undefined {
  const { command, cmd } = args;
  if (typeof command === "string") return command;
  if (Array.isArray(command)) return command.join(" ");
  if (typeof cmd === "string") return cmd;
  return undefined;
}

/**
 * Get the content array from data, handling nested message format.
 * Claude Code wraps messages: { type: "assistant", message: { content: [...] } }
 *
 * @returns `data.content` if it is an array, else `data.message.content` if
 *   that is an array, else `undefined`.
 */
function getContentArray(data: Record<string, unknown>): unknown[] | undefined {
  if (Array.isArray(data.content)) return data.content;
  const message = data.message;
  if (isJsonObject(message) && Array.isArray(message.content)) {
    return message.content;
  }
  return undefined;
}

/**
 * Content blocks of a transcript entry, restricted to those that are objects.
 * `null` or primitive entries carry no `type` and can never match a block
 * kind, so they are dropped here instead of blowing up on property access.
 */
function getContentBlocks(
  data: Record<string, unknown>,
): Array<Record<string, unknown>> {
  return (getContentArray(data) ?? []).filter(isJsonObject);
}

/**
 * String-valued content of an entry: `data.content`, or `data.message.content`
 * for the nested format. Returns `undefined` when neither is a string.
 */
function getStringContent(data: Record<string, unknown>): string | undefined {
  if (typeof data.content === "string") return data.content;
  const message = data.message;
  if (isJsonObject(message) && typeof message.content === "string") {
    return message.content;
  }
  return undefined;
}

/**
 * Text of a message entry.
 *
 * Tried in order: a non-empty string content, the `text` of every `text`
 * block joined with newlines, then a top-level string `data.text`.
 */
function extractContent(data: Record<string, unknown>): string | undefined {
  const stringContent = getStringContent(data);
  if (stringContent) return stringContent;

  const textBlocks = getContentBlocks(data).filter((b) => b.type === "text");
  if (textBlocks.length > 0) {
    return textBlocks.map((b) => b.text).join("\n");
  }

  if (typeof data.text === "string") return data.text;
  return undefined;
}

/**
 * Turn the arguments of an OpenAI-style tool call into an object.
 *
 * A string is treated as JSON. If it does not parse to an object, the original
 * text is preserved under `_rawArguments` so the call is still reported
 * instead of the whole transcript line being lost. An object is returned
 * as-is; anything else yields `undefined`.
 */
function coerceToolArgs(value: unknown): Record<string, unknown> | undefined {
  if (typeof value === "string") {
    try {
      const parsed: unknown = JSON.parse(value);
      if (isJsonObject(parsed)) return parsed;
    } catch {
      // Malformed JSON: fall through and keep the raw text.
    }
    return { _rawArguments: value };
  }
  return isJsonObject(value) ? value : undefined;
}

/**
 * Collect the tool invocations contained in one transcript entry.
 *
 * Two shapes are recognised:
 * - `tool_use` blocks in the content array (reported with `input`);
 * - an OpenAI-style `tool_calls` array on the entry or on `entry.message`
 *   (reported with `args`).
 */
function extractToolUses(
  data: Record<string, unknown>,
): Array<{
  name: string;
  input?: Record<string, unknown>;
  args?: Record<string, unknown>;
}> {
  const toolUses: Array<{
    name: string;
    input?: Record<string, unknown>;
    args?: Record<string, unknown>;
  }> = [];

  for (const block of getContentBlocks(data)) {
    if (block.type === "tool_use") {
      toolUses.push({
        name: block.name as string,
        input: block.input as Record<string, unknown> | undefined,
      });
    }
  }

  // Also handle OpenAI-style tool_calls array
  const message = isJsonObject(data.message) ? data.message : undefined;
  const toolCalls = data.tool_calls || message?.tool_calls;
  if (Array.isArray(toolCalls)) {
    for (const call of toolCalls) {
      if (!isJsonObject(call)) continue;
      const func = isJsonObject(call.function) ? call.function : undefined;
      toolUses.push({
        name: (func?.name || call.name) as string,
        args: coerceToolArgs(func?.arguments || call.arguments || call.input),
      });
    }
  }

  return toolUses;
}

/**
 * Concatenate the `thinking` blocks of an entry, newline-separated.
 * Each block contributes its `thinking` field, or `text` when that is empty.
 * Returns `undefined` when the entry has no `thinking` block.
 */
function extractThinking(data: Record<string, unknown>): string | undefined {
  const thoughts = getContentBlocks(data).filter((b) => b.type === "thinking");
  if (thoughts.length === 0) return undefined;
  return thoughts.map((b) => b.thinking || b.text).join("\n");
}

/**
 * Human-readable text for an error entry: a string `error`, else
 * `error.message`, else a string `message`, else the JSON form of `error`.
 */
function describeError(data: Record<string, unknown>): string | undefined {
  const err = data.error;
  if (typeof err === "string" && err) return err;
  if (isJsonObject(err) && typeof err.message === "string" && err.message) {
    return err.message;
  }
  if (typeof data.message === "string" && data.message) return data.message;
  return JSON.stringify(err);
}

/**
 * Parse one JSONL line into zero or more transcript events.
 *
 * - `user` entries become one `tool_result` event per `tool_result` block, or
 *   a single user `message` when they contain none.
 * - `assistant` entries become, in this order: a `message` (if there is text),
 *   one `tool_call` per tool use, and a `thinking` event (if present).
 * - `tool_result` / `tool_response`, `system` and `error` entries map to one
 *   event each. Any other entry type produces no events.
 *
 * Note that `tool_result` events always have `name: "unknown"` and carry the
 * `tool_use_id` in `originalName`.
 *
 * @throws SyntaxError if `line` is not valid JSON.
 * @throws Error if the line parses to something other than a JSON object.
 *   Failures are deliberately NOT swallowed here so that the caller can
 *   report them.
 */
function parseClaudeCodeLine(line: string): TranscriptEvent[] {
  const data: unknown = JSON.parse(line);
  if (!isJsonObject(data)) {
    throw new Error("Transcript line is not a JSON object");
  }

  const timestamp = data.timestamp as string | undefined;
  const events: TranscriptEvent[] = [];

  if (data.type === "user" || data.role === "user") {
    const toolResults = getContentBlocks(data).filter(
      (b) => b.type === "tool_result",
    );

    if (toolResults.length > 0) {
      for (const r of toolResults) {
        events.push({
          timestamp,
          type: "tool_result",
          tool: {
            name: "unknown",
            originalName: (r.tool_use_id || "unknown") as string,
            result: r.content,
            success: !r.is_error && !r.error,
          },
          raw: r,
        });
      }
    } else {
      events.push({
        timestamp,
        type: "message",
        role: "user",
        content: extractContent(data),
        raw: data,
      });
    }
  } else if (data.type === "assistant" || data.role === "assistant") {
    const content = extractContent(data);
    if (content) {
      events.push({
        timestamp,
        type: "message",
        role: "assistant",
        content,
        raw: data,
      });
    }

    for (const toolUse of extractToolUses(data)) {
      events.push({
        timestamp,
        type: "tool_call",
        tool: {
          name: normalizeToolName(toolUse.name),
          originalName: toolUse.name,
          args: toolUse.input || toolUse.args || {},
        },
        raw: toolUse,
      });
    }

    const thinking = extractThinking(data);
    if (thinking) {
      events.push({
        timestamp,
        type: "thinking",
        content: thinking,
        raw: data,
      });
    }
  } else if (data.type === "tool_result" || data.type === "tool_response") {
    events.push({
      timestamp,
      type: "tool_result",
      tool: {
        name: "unknown",
        originalName: (data.tool_use_id || "unknown") as string,
        result: data.content || data.output || data.result,
        success: !data.is_error && !data.error,
      },
      raw: data,
    });
  } else if (data.type === "system" || data.role === "system") {
    events.push({
      timestamp,
      type: "message",
      role: "system",
      content: extractContent(data),
      raw: data,
    });
  } else if (data.type === "error" || data.error) {
    events.push({
      timestamp,
      type: "error",
      content: describeError(data),
      raw: data,
    });
  }

  return events;
}

/**
 * Add a convenience key to the args of a `tool_call` event, in place:
 * `_extractedPath` for file tools, `_extractedUrl` for `web_fetch`,
 * `_extractedCommand` for `shell`. Other events are left untouched.
 */
function annotateToolCall(event: TranscriptEvent): void {
  if (event.type !== "tool_call" || !event.tool) return;
  const args = event.tool.args || {};

  switch (event.tool.name) {
    case "file_read":
    case "file_write":
    case "file_edit": {
      const filePath = extractFilePath(args);
      if (filePath) event.tool.args = { ...args, _extractedPath: filePath };
      break;
    }
    case "web_fetch": {
      const url = extractUrl(args);
      if (url) event.tool.args = { ...args, _extractedUrl: url };
      break;
    }
    case "shell": {
      const command = extractCommand(args);
      if (command) event.tool.args = { ...args, _extractedCommand: command };
      break;
    }
    default:
      break;
  }
}

/**
 * Parse a Claude Code JSONL transcript into a flat list of events.
 *
 * Blank lines are ignored. A line that is not valid JSON, or is not a JSON
 * object, contributes no events and adds one message to `errors`; parsing
 * continues with the next line.
 *
 * @param raw - Full contents of the `.jsonl` transcript file.
 * @returns `events` in file order, and `errors` with one
 *   `"Failed to parse line: ..."` entry per rejected line.
 */
export function parseClaudeCodeTranscript(raw: string): {
  events: TranscriptEvent[];
  errors: string[];
} {
  const events: TranscriptEvent[] = [];
  const errors: string[] = [];

  for (const line of raw.split("\n")) {
    if (!line.trim()) continue;
    try {
      events.push(...parseClaudeCodeLine(line));
    } catch (e) {
      errors.push(
        `Failed to parse line: ${e instanceof Error ? e.message : String(e)}`,
      );
    }
  }

  // Post-process: extract metadata from tool args
  for (const event of events) {
    annotateToolCall(event);
  }

  return { events, errors };
}