feat: transcript parsing based on vercel-labs/agent-eval

2026-03-27 10:09:26 +08:00 · 2026-03-03 17:43:26 -05:00
parent 18f58ffc7c
commit cdd65c5256
3 changed files with 390 additions and 84 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 node_modules/
 dist/
 *.log
+evals/output/
 .DS_Store
 .env

--- a/evals/main.ts
+++ b/evals/main.ts
@@ -1,63 +1,7 @@
 import { spawnSync } from "node:child_process";
-import { mkdtempSync, readdirSync, readFileSync } from "node:fs";
-import os from "node:os";
+import { mkdirSync, readdirSync, readFileSync } from "node:fs";
 import path from "node:path";
-import { z } from "zod";
-
-/**
- * Transcript schemas
- */
-
-const TextBlock = z.object({ type: z.literal("text"), text: z.string() });
-
-const ThinkingBlock = z.object({
-	type: z.literal("thinking"),
-	thinking: z.string(),
-});
-
-const ToolUseBlock = z.object({
-	type: z.literal("tool_use"),
-	id: z.string(),
-	name: z.string(),
-	input: z.record(z.string(), z.unknown()),
-});
-
-const ToolResultBlock = z.object({
-	type: z.literal("tool_result"),
-	tool_use_id: z.string(),
-	content: z.unknown(),
-	is_error: z.boolean().optional(),
-});
-
-const ContentBlock = z.union([
-	ToolUseBlock,
-	ThinkingBlock,
-	TextBlock,
-	ToolResultBlock,
-	z.looseObject({ type: z.string() }), // catch-all for unknown block types
-]);
-
-const AssistantEntry = z.object({
-	type: z.literal("assistant"),
-	sessionId: z.string(),
-	timestamp: z.string(),
-	uuid: z.string(),
-	message: z.object({
-		role: z.literal("assistant"),
-		content: z.array(ContentBlock),
-		stop_reason: z.string().nullable().optional(),
-	}),
-});
-
-// Catch-all — user messages, queue-operations, etc.
-const TranscriptLine = z.union([
-	AssistantEntry,
-	z.looseObject({ type: z.string() }),
-]);
-
-/**
- * Config
- */
+import { parseClaudeCodeTranscript } from "./parse-transcript";

 const apiKey = process.env.ANTHROPIC_API_KEY;
 if (!apiKey) throw new Error("ANTHROPIC_API_KEY required");
@@ -75,23 +19,23 @@ SELECT * FROM orders WHERE user_id = 123 AND status = 'pending';

 What indexes should I add and why?`;

-/**
- * Run the eval
- */
-
-// Mount ~/.claude/projects to capture the built-in session transcript
-const projectsDir = mkdtempSync(path.join(os.tmpdir(), "eval-projects-"));
+// Host dir mounted as ~/.claude/projects to capture the session transcript; a fixed path so it
+// can be inspected after the run. NOTE(review): nothing visible here clears it between runs.
+const projectsDir = path.join(repoRoot, "evals", "output");
+mkdirSync(projectsDir, { recursive: true });

 const result = spawnSync(
 	"docker",
 	[
 		"run",
 		"--rm",
-		"-e",
+		"--workdir",
+		"/eval",
+		"--env",
 		`ANTHROPIC_API_KEY=${apiKey}`,
-		"-v",
+		"--volume",
 		`${skillPath}:/home/claude/.claude/skills/supabase-postgres-best-practices:ro`, // :ro = read-only snapshot
-		"-v",
+		"--volume",
 		`${projectsDir}:/home/claude/.claude/projects`,
 		"evals-claude",
 		"claude",
@@ -106,26 +50,14 @@ if (result.status !== 0) {
 	throw new Error(result.stderr || `Exit code ${result.status}`);
 }

-/**
- * Parse the transcript
- */
-
-// Container's working dir is /, which becomes `-` in the projects path
-const transcriptDir = path.join(projectsDir, "-");
+// The container's working dir is /eval; "/" becomes "-" in the projects path, giving `-eval`
+const transcriptDir = path.join(projectsDir, "-eval");
 const [transcriptFile] = readdirSync(transcriptDir).filter((f) =>
 	f.endsWith(".jsonl"),
 );

-// Single typed array — all transcript entries parsed and validated
-const transcript = readFileSync(
-	path.join(transcriptDir, transcriptFile),
-	"utf-8",
-)
-	.split("\n")
-	.filter(Boolean)
-	.flatMap((l) => {
-		const parsed = TranscriptLine.safeParse(JSON.parse(l));
-		return parsed.success ? [parsed.data] : [];
-	});
+// Fail with a clear message when the run left no transcript, instead of a TypeError in path.join
+if (!transcriptFile) throw new Error(`No .jsonl transcript found in ${transcriptDir}`);
+const transcript = parseClaudeCodeTranscript(readFileSync(path.join(transcriptDir, transcriptFile), "utf-8"));

 console.log(JSON.stringify(transcript, null, 2));
--- a/evals/parse-transcript.ts
+++ b/evals/parse-transcript.ts
@@ -0,0 +1,373 @@
+/**
+ * Parser for Claude Code transcript format.
+ * Claude Code stores transcripts as JSONL at ~/.claude/projects/{path}/{session}.jsonl
+ *
+ * Format reference (based on Claude Code CLI output):
+ * - Messages have type: "user" | "assistant"
+ * - Tool use appears in assistant messages with tool_use blocks
+ * - Tool results appear as tool_result blocks inside user messages, or as separate lines with type: "tool_result" | "tool_response"
+ *
+ * Adapted from https://github.com/vercel-labs/agent-eval
+ */
+
+/** Canonical tool names that agent-specific names are normalized to; "unknown" is the fallback for unmapped names. */
+export type ToolName =
+	| "file_read"
+	| "file_write"
+	| "file_edit"
+	| "shell"
+	| "web_fetch"
+	| "web_search"
+	| "glob"
+	| "grep"
+	| "list_dir"
+	| "agent_task"
+	| "skill"
+	| "unknown";
+
+/** A single event in the parsed transcript; one transcript line can yield several events. */
+export interface TranscriptEvent {
+	/** Timestamp copied as-is from the transcript line (presumably ISO 8601; not validated here) */
+	timestamp?: string;
+	/** Event type */
+	type: "message" | "tool_call" | "tool_result" | "thinking" | "error";
+	/** For message events: the role */
+	role?: "user" | "assistant" | "system";
+	/** Text content (for messages, thinking, errors) */
+	content?: string;
+	/** For tool_call and tool_result events */
+	tool?: {
+		name: ToolName; // canonical name; always "unknown" on tool_result events
+		originalName: string; // name as emitted by the agent; on tool_result events the tool_use_id
+		args?: Record<string, unknown>; // tool_call only
+		result?: unknown; // tool_result only
+		durationMs?: number;
+		success?: boolean; // tool_result only: false when the result carries is_error / error
+	};
+	/** Raw event data from the agent (for debugging) */
+	raw?: unknown;
+}
+
+/** Map an agent-specific tool name to its canonical ToolName; only the table's own keys match, anything else is "unknown". */
+function normalizeToolName(name: string): ToolName {
+	const toolMap: Record<string, ToolName> = {
+		// File operations
+		Read: "file_read",
+		read_file: "file_read",
+		ReadFile: "file_read",
+		Write: "file_write",
+		write_file: "file_write",
+		WriteFile: "file_write",
+		write_to_file: "file_write",
+		Edit: "file_edit",
+		edit_file: "file_edit",
+		EditFile: "file_edit",
+		str_replace_editor: "file_edit",
+		StrReplace: "file_edit",
+		// Shell
+		Bash: "shell",
+		bash: "shell",
+		Shell: "shell",
+		shell: "shell",
+		execute_command: "shell",
+		run_command: "shell",
+		// Web
+		WebFetch: "web_fetch",
+		web_fetch: "web_fetch",
+		fetch_url: "web_fetch",
+		mcp__fetch__fetch: "web_fetch",
+		WebSearch: "web_search",
+		web_search: "web_search",
+		// Search/navigation
+		Glob: "glob",
+		glob: "glob",
+		list_files: "glob",
+		Grep: "grep",
+		grep: "grep",
+		search_files: "grep",
+		LS: "list_dir",
+		list_dir: "list_dir",
+		ListDir: "list_dir",
+		// Agent/subagent tools
+		Task: "agent_task",
+		task: "agent_task",
+		// Skills
+		Skill: "skill",
+		skill: "skill",
+	};
+	return (Object.prototype.hasOwnProperty.call(toolMap, name) && toolMap[name]) || "unknown";
+}
+
+/** First non-empty string among args.path, file_path, filename, file (in that order); undefined if none qualifies. */
+function extractFilePath(args: Record<string, unknown>): string | undefined {
+	return [args.path, args.file_path, args.filename, args.file].find((v): v is string => typeof v === "string" && v !== "");
+}
+
+/** First non-empty string among args.url, uri, href (in that order); undefined if none qualifies. */
+function extractUrl(args: Record<string, unknown>): string | undefined {
+	return [args.url, args.uri, args.href].find((v): v is string => typeof v === "string" && v !== "");
+}
+
+/** Shell command text: `command` as a string or as an array joined with spaces, otherwise a string `cmd`. */
+function extractCommand(args: Record<string, unknown>): string | undefined {
+	const { command, cmd } = args;
+	if (Array.isArray(command)) return command.join(" ");
+	return typeof command === "string" ? command : typeof cmd === "string" ? cmd : undefined;
+}
+
+/**
+ * Locate the content-block array of a transcript entry, or undefined when there is none.
+ * Checks data.content first, then the wrapped form { type: "assistant", message: { content: [...] } }.
+ */
+function getContentArray(data: Record<string, unknown>): unknown[] | undefined {
+	const { content, message } = data;
+	if (Array.isArray(content)) return content;
+	const nested = (message as Record<string, unknown> | undefined)?.content;
+	return Array.isArray(nested) ? nested : undefined;
+}
+
+/** String content of an entry: data.content if it is a string, else data.message.content, else undefined. */
+function getStringContent(data: Record<string, unknown>): string | undefined {
+	if (typeof data.content === "string") return data.content;
+	const nested = (data.message as Record<string, unknown> | undefined)?.content;
+	return typeof nested === "string" ? nested : undefined;
+}
+
+/**
+ * Plain text of a transcript entry, tried in this order:
+ *  1. non-empty string content (top-level or under `message`)
+ *  2. the `text` fields of all "text" content blocks, joined with "\n"
+ *  3. a top-level string `text` field
+ * Returns undefined when none of these applies.
+ */
+function extractContent(data: Record<string, unknown>): string | undefined {
+	const direct = getStringContent(data);
+	if (direct) return direct;
+
+	const texts: unknown[] = [];
+	for (const block of getContentArray(data) ?? []) {
+		const b = block as Record<string, unknown>;
+		if (b.type === "text") texts.push(b.text);
+	}
+	if (texts.length > 0) return texts.join("\n");
+
+	return typeof data.text === "string" ? data.text : undefined;
+}
+
+/**
+ * Collect every tool invocation on an entry. Anthropic-style `tool_use` content blocks
+ * fill `input`; OpenAI-style `tool_calls` fill `args`, where `function.arguments` may be a
+ * JSON string or an object. Unparseable argument JSON keeps the call with `args` undefined.
+ */
+function extractToolUses(
+	data: Record<string, unknown>,
+): Array<{ name: string; input?: Record<string, unknown>; args?: Record<string, unknown> }> {
+	type Args = Record<string, unknown> | undefined;
+	const toolUses: ReturnType<typeof extractToolUses> = [];
+
+	for (const block of getContentArray(data) ?? []) {
+		const b = block as Record<string, unknown>;
+		if (b.type === "tool_use") {
+			toolUses.push({ name: b.name as string, input: b.input as Args });
+		}
+	}
+
+	// Also handle OpenAI-style tool_calls array
+	const toolCalls =
+		data.tool_calls ||
+		(data.message as Record<string, unknown>)?.tool_calls;
+	if (Array.isArray(toolCalls)) {
+		for (const call of toolCalls) {
+			const c = call as Record<string, unknown>;
+			const func = c.function as Record<string, unknown> | undefined;
+			const rawArgs = func?.arguments;
+			let args = (c.arguments || c.input) as Args;
+			if (typeof rawArgs === "string" && rawArgs) {
+				try {
+					args = JSON.parse(rawArgs) as Args;
+				} catch {
+					args = undefined; // malformed JSON: keep the call, drop its args
+				}
+			} else if (rawArgs && typeof rawArgs === "object") {
+				args = rawArgs as Args;
+			}
+			toolUses.push({ name: (func?.name || c.name) as string, args });
+		}
+	}
+
+	return toolUses;
+}
+
+/**
+ * Concatenate the reasoning text of every "thinking" content block on an entry.
+ * Each block contributes `thinking`, falling back to `text`; parts are joined
+ * with "\n". Returns undefined when the entry has no such blocks.
+ */
+function extractThinking(data: Record<string, unknown>): string | undefined {
+	const blocks = getContentArray(data);
+	if (!blocks) return undefined;
+
+	const parts: unknown[] = [];
+	for (const entry of blocks) {
+		const block = entry as Record<string, unknown>;
+		if (block.type !== "thinking") continue;
+		parts.push(block.thinking || block.text);
+	}
+
+	return parts.length > 0 ? parts.join("\n") : undefined;
+}
+
+/**
+ * Convert one JSONL transcript line into zero or more events. Throws on invalid JSON or a
+ * malformed entry so the caller can record the failure; non-object JSON values yield no events.
+ */
+function parseClaudeCodeLine(line: string): TranscriptEvent[] {
+	const events: TranscriptEvent[] = [];
+	const data = JSON.parse(line);
+	if (typeof data !== "object" || data === null) return events;
+
+	if (data.type === "user" || data.role === "user") {
+		const contentArray = getContentArray(data);
+		const toolResults = contentArray?.filter(
+			(block: unknown) =>
+				(block as Record<string, unknown>).type === "tool_result",
+		);
+		// A user entry carrying tool_result blocks is a tool response, not a typed message
+		if (toolResults && toolResults.length > 0) {
+			for (const result of toolResults) {
+				const r = result as Record<string, unknown>;
+				events.push({
+					timestamp: data.timestamp,
+					type: "tool_result",
+					tool: {
+						name: "unknown",
+						originalName: (r.tool_use_id || "unknown") as string,
+						result: r.content,
+						success: !r.is_error && !r.error,
+					},
+					raw: r,
+				});
+			}
+		} else {
+			events.push({
+				timestamp: data.timestamp,
+				type: "message",
+				role: "user",
+				content: extractContent(data),
+				raw: data,
+			});
+		}
+	} else if (data.type === "assistant" || data.role === "assistant") {
+		const content = extractContent(data);
+		if (content) {
+			events.push({
+				timestamp: data.timestamp,
+				type: "message",
+				role: "assistant",
+				content,
+				raw: data,
+			});
+		}
+
+		for (const toolUse of extractToolUses(data)) {
+			events.push({
+				timestamp: data.timestamp,
+				type: "tool_call",
+				tool: {
+					name: normalizeToolName(toolUse.name),
+					originalName: toolUse.name,
+					args: toolUse.input || toolUse.args || {},
+				},
+				raw: toolUse,
+			});
+		}
+
+		const thinking = extractThinking(data);
+		if (thinking) {
+			events.push({
+				timestamp: data.timestamp,
+				type: "thinking",
+				content: thinking,
+				raw: data,
+			});
+		}
+	} else if (
+		data.type === "tool_result" ||
+		data.type === "tool_response"
+	) {
+		events.push({
+			timestamp: data.timestamp,
+			type: "tool_result",
+			tool: {
+				name: "unknown",
+				originalName: data.tool_use_id || "unknown",
+				result: data.content || data.output || data.result,
+				success: !data.is_error && !data.error,
+			},
+			raw: data,
+		});
+	} else if (data.type === "system" || data.role === "system") {
+		events.push({
+			timestamp: data.timestamp,
+			type: "message",
+			role: "system",
+			content: extractContent(data),
+			raw: data,
+		});
+	} else if (data.type === "error" || data.error) {
+		events.push({
+			timestamp: data.timestamp,
+			type: "error",
+			content:
+				data.error?.message ||
+				data.message ||
+				JSON.stringify(data.error),
+			raw: data,
+		});
+	}
+
+	return events;
+}
+
+/**
+ * Parse a Claude Code JSONL transcript into a flat list of events.
+ * Blank lines are skipped; any line whose parsing throws is recorded in `errors` with its
+ * 1-based line number. File, fetch and shell tool calls gain an `_extracted*` arg.
+ */
+export function parseClaudeCodeTranscript(raw: string): {
+	events: TranscriptEvent[];
+	errors: string[];
+} {
+	const events: TranscriptEvent[] = [];
+	const errors: string[] = [];
+
+	raw.split("\n").forEach((line, index) => {
+		if (!line.trim()) return;
+		try {
+			events.push(...parseClaudeCodeLine(line));
+		} catch (e) {
+			const reason = e instanceof Error ? e.message : String(e);
+			errors.push(`Failed to parse line ${index + 1}: ${reason}`);
+		}
+	});
+
+	// Post-process: surface the key argument of well-known tools under a stable name
+	for (const event of events) {
+		if (event.type !== "tool_call" || !event.tool) continue;
+		const { tool } = event;
+		const args = tool.args || {};
+
+		if (tool.name === "file_read" || tool.name === "file_write" || tool.name === "file_edit") {
+			const filePath = extractFilePath(args);
+			if (filePath) tool.args = { ...args, _extractedPath: filePath };
+		} else if (tool.name === "web_fetch") {
+			const url = extractUrl(args);
+			if (url) tool.args = { ...args, _extractedUrl: url };
+		} else if (tool.name === "shell") {
+			const command = extractCommand(args);
+			if (command) tool.args = { ...args, _extractedCommand: command };
+		}
+	}
+
+	return { events, errors };
+}