From 18f58ffc7c3c6a8e3958796acce88cc51717ebfe Mon Sep 17 00:00:00 2001
From: Matt Rossman <22670878+mattrossman@users.noreply.github.com>
Date: Tue, 3 Mar 2026 16:02:05 -0500
Subject: [PATCH] feat: Zod-typed transcript parsing, drop hooks

---
Review notes (ignored by git-am):
- Fail with a clear error when the container produced no .jsonl transcript.
- Skip (and report on stderr) transcript lines that are not valid JSON
  instead of aborting the whole run.
- The main.ts "index" post-image hash predates these edits.

 evals/Dockerfile |   3 ++
 evals/main.ts    | 118 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/evals/Dockerfile b/evals/Dockerfile
index f2a0fc6..ce0a82d 100644
--- a/evals/Dockerfile
+++ b/evals/Dockerfile
@@ -1,2 +1,5 @@
 FROM node:24-slim
 RUN npm install -g @anthropic-ai/claude-code@2.1.63
+RUN useradd -m claude
+USER claude
+RUN mkdir -p /home/claude/.claude
diff --git a/evals/main.ts b/evals/main.ts
index 656d4ba..b61ec0c 100644
--- a/evals/main.ts
+++ b/evals/main.ts
@@ -1,18 +1,67 @@
-import { execSync, spawnSync } from "node:child_process";
+import { spawnSync } from "node:child_process";
+import { mkdtempSync, readdirSync, readFileSync } from "node:fs";
+import os from "node:os";
 import path from "node:path";
+import { z } from "zod";
+
+/**
+ * Transcript schemas
+ */
+
+const TextBlock = z.object({ type: z.literal("text"), text: z.string() });
+
+const ThinkingBlock = z.object({
+  type: z.literal("thinking"),
+  thinking: z.string(),
+});
+
+const ToolUseBlock = z.object({
+  type: z.literal("tool_use"),
+  id: z.string(),
+  name: z.string(),
+  input: z.record(z.string(), z.unknown()),
+});
+
+const ToolResultBlock = z.object({
+  type: z.literal("tool_result"),
+  tool_use_id: z.string(),
+  content: z.unknown(),
+  is_error: z.boolean().optional(),
+});
+
+const ContentBlock = z.union([
+  ToolUseBlock,
+  ThinkingBlock,
+  TextBlock,
+  ToolResultBlock,
+  z.looseObject({ type: z.string() }), // catch-all for unknown block types
+]);
+
+const AssistantEntry = z.object({
+  type: z.literal("assistant"),
+  sessionId: z.string(),
+  timestamp: z.string(),
+  uuid: z.string(),
+  message: z.object({
+    role: z.literal("assistant"),
+    content: z.array(ContentBlock),
+    stop_reason: z.string().nullable().optional(),
+  }),
+});
+
+// Catch-all — user messages, queue-operations, etc.
+const TranscriptLine = z.union([
+  AssistantEntry,
+  z.looseObject({ type: z.string() }),
+]);
+
+/**
+ * Config
+ */
 
 const apiKey = process.env.ANTHROPIC_API_KEY;
 if (!apiKey) throw new Error("ANTHROPIC_API_KEY required");
 
-try {
-  execSync("docker image inspect evals-claude", { stdio: "ignore" });
-} catch {
-  console.error(
-    "Docker image 'evals-claude' not found. Build it first with:\n npm run evals:build",
-  );
-  process.exit(1);
-}
-
 const repoRoot = path.resolve(__dirname, "..");
 const skillPath = path.join(
   repoRoot,
@@ -26,6 +75,13 @@ SELECT * FROM orders WHERE user_id = 123 AND status = 'pending';
 
 What indexes should I add and why?`;
 
+/**
+ * Run the eval
+ */
+
+// Mount ~/.claude/projects to capture the built-in session transcript
+const projectsDir = mkdtempSync(path.join(os.tmpdir(), "eval-projects-"));
+
 const result = spawnSync(
   "docker",
   [
@@ -34,10 +90,13 @@ const result = spawnSync(
     "-e",
     `ANTHROPIC_API_KEY=${apiKey}`,
     "-v",
-    `${skillPath}:/root/.claude/skills/supabase-postgres-best-practices:ro`, // :ro = read-only snapshot
+    `${skillPath}:/home/claude/.claude/skills/supabase-postgres-best-practices:ro`, // :ro = read-only bind mount
+    "-v",
+    `${projectsDir}:/home/claude/.claude/projects`,
     "evals-claude",
     "claude",
-    "-p",
+    "--print",
+    "--dangerously-skip-permissions",
     prompt,
   ],
   { encoding: "utf-8" },
@@ -46,4 +105,37 @@ const result = spawnSync(
 if (result.status !== 0) {
   throw new Error(result.stderr || `Exit code ${result.status}`);
 }
-console.log(result.stdout);
+
+/**
+ * Parse the transcript
+ */
+
+// Container's working dir is /, which becomes `-` in the projects path
+const transcriptDir = path.join(projectsDir, "-");
+const transcriptFile = readdirSync(transcriptDir).find((f) =>
+  f.endsWith(".jsonl"),
+);
+if (!transcriptFile) {
+  throw new Error(`No .jsonl transcript found in ${transcriptDir}`);
+}
+
+// Single typed array — lines that are not valid JSON or fail the schema are dropped
+const transcript = readFileSync(
+  path.join(transcriptDir, transcriptFile),
+  "utf-8",
+)
+  .split("\n")
+  .filter(Boolean)
+  .flatMap((l) => {
+    let json: unknown;
+    try {
+      json = JSON.parse(l);
+    } catch {
+      console.error(`Skipping malformed transcript line: ${l.slice(0, 80)}`);
+      return [];
+    }
+    const parsed = TranscriptLine.safeParse(json);
+    return parsed.success ? [parsed.data] : [];
+  });
+
+console.log(JSON.stringify(transcript, null, 2));