diff --git a/packages/evals/AGENTS.md b/packages/evals/AGENTS.md
index 622828c..bc8c4f9 100644
--- a/packages/evals/AGENTS.md
+++ b/packages/evals/AGENTS.md
@@ -45,10 +45,18 @@ This prevents the agent from "teaching to the test."
 
 ## Running Evals
 
+Eval tasks in `mise.toml` declare `sources`, so mise skips a task when none of
+its source files have changed. Pass `--force` to bypass this caching and re-run
+regardless (e.g., after changing only environment variables, or to repeat the
+same scenario):
+
 ```bash
 # Run all scenarios with skills (default)
 mise run eval
 
+# Force re-run (bypass source caching)
+mise run --force eval
+
 # Run a specific scenario
 EVAL_SCENARIO=auth-rls-new-project mise run eval
 
@@ -63,9 +71,12 @@ EVAL_SKILL=supabase mise run eval
 
 # Upload results to Braintrust
 mise run eval:upload
+
+# Force upload (bypass cache)
+mise run --force eval:upload
 ```
 
-Or directly:
+Or directly (no caching, always runs):
 
 ```bash
 cd packages/evals
@@ -99,12 +110,15 @@ Compare the results to measure how much skills improve agent output.
 ## Environment
 
 ```
-ANTHROPIC_API_KEY=sk-ant-...    # Required: Claude Code authentication
-EVAL_MODEL=...                  # Optional: override model (default: claude-sonnet-4-5-20250929)
-EVAL_SCENARIO=...               # Optional: run single scenario
-EVAL_SKILL=...                  # Optional: install only this skill (e.g., "supabase")
-EVAL_BASELINE=true              # Optional: run without skills (baseline mode)
-BRAINTRUST_UPLOAD=true          # Optional: upload results to Braintrust
+ANTHROPIC_API_KEY=sk-ant-...       # Required: Claude Code authentication
+EVAL_MODEL=...                     # Optional: override model (default: claude-sonnet-4-5-20250929)
+EVAL_SCENARIO=...                  # Optional: run single scenario
+EVAL_SKILL=...                     # Optional: install only this skill (e.g., "supabase")
+EVAL_BASELINE=true                 # Optional: run without skills (baseline mode)
+BRAINTRUST_UPLOAD=true             # Optional: upload results to Braintrust
+BRAINTRUST_API_KEY=...             # Required when BRAINTRUST_UPLOAD=true
+BRAINTRUST_PROJECT_ID=...          # Required when BRAINTRUST_UPLOAD=true
+BRAINTRUST_BASE_EXPERIMENT=...     # Optional: compare against a named experiment
 ```
 
 ## Key Files
diff --git a/packages/evals/src/runner.ts b/packages/evals/src/runner.ts
index 9bfd53c..4229cb3 100644
--- a/packages/evals/src/runner.ts
+++ b/packages/evals/src/runner.ts
@@ -7,7 +7,10 @@ import { preflight } from "./runner/preflight.js";
 import { listModifiedFiles, printSummary } from "./runner/results.js";
 import { createWorkspace } from "./runner/scaffold.js";
 import { runTests } from "./runner/test.js";
-import { buildTranscriptSummary } from "./runner/transcript.js";
+import {
+	buildTranscriptSummary,
+	type TranscriptSummary,
+} from "./runner/transcript.js";
 import type { EvalRunResult, EvalScenario } from "./types.js";
 
 // ---------------------------------------------------------------------------
@@ -64,7 +67,7 @@ function discoverScenarios(): EvalScenario[] {
 async function runEval(
 	scenario: EvalScenario,
 	skillEnabled: boolean,
-): Promise<EvalRunResult> {
+): Promise<{ result: EvalRunResult; transcript?: TranscriptSummary }> {
 	const evalsDir = findEvalsDir();
 	const evalDir = join(evalsDir, scenario.id);
 	const variant = skillEnabled ? "with-skill" : "baseline";
@@ -129,6 +132,8 @@ async function runEval(
 			filesModified,
 			toolCallCount: summary.toolCalls.length,
 			costUsd: summary.totalCostUsd ?? undefined,
+			prompt,
+			individualTests: testResult.individualTests,
 		};
 
 		// 7. Persist results
@@ -142,22 +147,24 @@ async function runEval(
 			transcriptSummary: summary,
 		});
 
-		return result;
+		return { result, transcript: summary };
 	} catch (error) {
 		const err = error as Error;
 		return {
-			scenario: scenario.id,
-			agent: "claude-code",
-			model,
-			skillEnabled,
-			status: "error",
-			duration: 0,
-			testOutput: "",
-			agentOutput: "",
-			testsPassed: 0,
-			testsTotal: 0,
-			filesModified: [],
-			error: err.message,
+			result: {
+				scenario: scenario.id,
+				agent: "claude-code",
+				model,
+				skillEnabled,
+				status: "error",
+				duration: 0,
+				testOutput: "",
+				agentOutput: "",
+				testsPassed: 0,
+				testsTotal: 0,
+				filesModified: [],
+				error: err.message,
+			},
 		};
 	} finally {
 		cleanup();
@@ -188,10 +195,14 @@ async function main() {
 	console.log(`Scenarios: ${scenarios.map((s) => s.id).join(", ")}`);
 
 	const results: EvalRunResult[] = [];
+	const transcripts = new Map<string, TranscriptSummary>();
 
 	for (const scenario of scenarios) {
-		const result = await runEval(scenario, skillEnabled);
+		const { result, transcript } = await runEval(scenario, skillEnabled);
 		results.push(result);
+		if (transcript) {
+			transcripts.set(result.scenario, transcript);
+		}
 	}
 
 	// Use the results dir from the first result (all share the same timestamp)
@@ -200,7 +211,12 @@ async function main() {
 
 	if (process.env.BRAINTRUST_UPLOAD === "true") {
 		console.log("\nUploading to Braintrust...");
-		await uploadToBraintrust(results);
+		await uploadToBraintrust(results, {
+			model,
+			skillEnabled,
+			runTimestamp,
+			transcripts,
+		});
 	}
 }
 
diff --git a/packages/evals/src/runner/braintrust.ts b/packages/evals/src/runner/braintrust.ts
index 72dddb2..c1ab27a 100644
--- a/packages/evals/src/runner/braintrust.ts
+++ b/packages/evals/src/runner/braintrust.ts
@@ -1,50 +1,126 @@
 import assert from "node:assert";
 import { init } from "braintrust";
 import type { EvalRunResult } from "../types.js";
+import type { TranscriptSummary } from "./transcript.js";
+
+/** Convert a test name to a `test_`-prefixed snake_case score key: lowercased, each run of characters outside a-z0-9 becomes one `_`, and a leading/trailing `_` is trimmed. */
+function toScoreKey(name: string): string {
+	return `test_${name
+		.toLowerCase()
+		.replace(/[^a-z0-9]+/g, "_")
+		.replace(/^_|_$/g, "")}`;
+}
 
 /**
  * Upload eval results to Braintrust as an experiment.
  *
  * Each EvalRunResult becomes a row in the experiment with:
- * - input: scenario name + config
- * - output: agent output summary
- * - scores: pass (0 or 1)
- * - metadata: model, skill toggle, duration, files modified
+ * - input: scenario ID, prompt content, skillEnabled flag
+ * - output: status, agent output, files modified, test output
+ * - expected: total tests, pass threshold
+ * - scores: pass (0|1), test_pass_rate (0-1), per-test scores
+ * - metadata: agent, model, skillEnabled, duration, test counts, tool call count, files modified, plus cost and error when present
+ * - spans: one child span per agent tool call (when transcript available)
  */
 export async function uploadToBraintrust(
 	results: EvalRunResult[],
+	opts: {
+		model: string;
+		skillEnabled: boolean;
+		runTimestamp: string;
+		transcripts: Map<string, TranscriptSummary>;
+	},
 ): Promise<void> {
 	assert(process.env.BRAINTRUST_API_KEY, "BRAINTRUST_API_KEY is not set");
 	assert(process.env.BRAINTRUST_PROJECT_ID, "BRAINTRUST_PROJECT_ID is not set");
 
+	const variant = opts.skillEnabled ? "skill" : "baseline";
 	const experiment = await init({
 		projectId: process.env.BRAINTRUST_PROJECT_ID,
+		experiment: `${opts.model}-${variant}-${opts.runTimestamp}`,
+		baseExperiment: process.env.BRAINTRUST_BASE_EXPERIMENT ?? undefined,
+		metadata: {
+			model: opts.model,
+			skillEnabled: opts.skillEnabled,
+			runTimestamp: opts.runTimestamp,
+			scenarioCount: results.length,
+		},
 	});
 
 	for (const r of results) {
-		experiment.log({
-			input: {
-				scenario: r.scenario,
-				skillEnabled: r.skillEnabled,
-			},
-			output: {
-				status: r.status,
-				filesModified: r.filesModified,
-				testOutput: r.testOutput,
-			},
-			scores: {
-				pass: r.status === "passed" ? 1 : 0,
-			},
-			metadata: {
-				agent: r.agent,
-				model: r.model,
-				skillEnabled: r.skillEnabled,
-				duration: r.duration,
-				testsPassed: r.testsPassed,
-				testsTotal: r.testsTotal,
-				...(r.error ? { error: r.error } : {}),
-			},
-		});
+		const transcript = opts.transcripts.get(r.scenario);
+
+		// Build per-test scores
+		const perTestScores: Record<string, number> = {};
+		if (r.individualTests) {
+			for (const [testName, didPass] of Object.entries(r.individualTests)) {
+				perTestScores[toScoreKey(testName)] = didPass ? 1 : 0;
+			}
+		}
+
+		const testPassRate = r.testsTotal > 0 ? r.testsPassed / r.testsTotal : 0;
+
+		const scores: Record<string, number> = {
+			pass: r.status === "passed" ? 1 : 0,
+			test_pass_rate: testPassRate,
+			...perTestScores,
+		};
+
+		const input = {
+			scenario: r.scenario,
+			prompt: r.prompt ?? "",
+			skillEnabled: r.skillEnabled,
+		};
+
+		const output = {
+			status: r.status,
+			agentOutput: r.agentOutput,
+			filesModified: r.filesModified,
+			testOutput: r.testOutput,
+		};
+
+		const expected = {
+			testsTotal: r.testsTotal,
+			passThreshold: 1.0,
+		};
+
+		const metadata: Record<string, unknown> = {
+			agent: r.agent,
+			model: r.model,
+			skillEnabled: r.skillEnabled,
+			duration: r.duration,
+			testsPassed: r.testsPassed,
+			testsTotal: r.testsTotal,
+			toolCallCount: r.toolCallCount ?? 0,
+			filesModified: r.filesModified,
+			...(r.costUsd != null ? { costUsd: r.costUsd } : {}),
+			...(r.error ? { error: r.error } : {}),
+		};
+
+		if (transcript && transcript.toolCalls.length > 0) {
+			// Use traced() to create a root span with child spans for tool calls
+			experiment.traced(
+				(span) => {
+					span.log({ input, output, expected, scores, metadata });
+
+					for (const tc of transcript.toolCalls) {
+						span.traced(
+							(childSpan) => {
+								childSpan.log({
+									input: { tool: tc.tool, args: tc.input },
+									output: { preview: tc.outputPreview },
+									metadata: { toolUseId: tc.toolUseId },
+								});
+							},
+							{ name: `tool:${tc.tool}` },
+						);
+					}
+				},
+				{ name: r.scenario },
+			);
+		} else {
+			experiment.log({ input, output, expected, scores, metadata });
+		}
 	}
 
 	const summary = await experiment.summarize();
diff --git a/packages/evals/src/runner/results.ts b/packages/evals/src/runner/results.ts
index 901ed3a..2f6df7b 100644
--- a/packages/evals/src/runner/results.ts
+++ b/packages/evals/src/runner/results.ts
@@ -1,5 +1,5 @@
-import { mkdirSync, readdirSync, statSync, writeFileSync } from "node:fs";
-import { join, resolve } from "node:path";
+import { readdirSync, statSync } from "node:fs";
+import { join } from "node:path";
 import type { EvalRunResult } from "../types.js";
 
 /**
diff --git a/packages/evals/src/runner/test.ts b/packages/evals/src/runner/test.ts
index a4e625c..2a9c4fa 100644
--- a/packages/evals/src/runner/test.ts
+++ b/packages/evals/src/runner/test.ts
@@ -16,6 +16,8 @@ export interface TestResult {
 	passedCount: number;
 	/** Total number of tests */
 	totalCount: number;
+	/** Per-test pass/fail extracted from vitest verbose output */
+	individualTests: Record<string, boolean>;
 }
 
 /**
@@ -91,6 +93,24 @@ export async function runTests(opts: {
 	}
 }
 
+/**
+ * Extract per-test pass/fail from vitest verbose output, keyed by test name.
+ *
+ * Matches text like `✓ EVAL.ts > name 0ms` (passed) and `× EVAL.ts > name 2ms`
+ * (failed); the file may be EVAL.ts or EVAL.tsx. The name is the shortest text
+ * followed by a `<digits>ms` token, and a repeated name keeps its last result.
+ */
+function parseIndividualTests(output: string): Record<string, boolean> {
+	const results: Record<string, boolean> = {};
+	const re = /[✓×]\s+EVAL\.tsx?\s+>\s+(.+?)\s+\d+ms/g;
+	for (const match of output.matchAll(re)) {
+		const testName = match[1].trim();
+		const didPass = output[match.index] === "✓"; // the match starts at the ✓/× glyph
+		results[testName] = didPass;
+	}
+	return results;
+}
+
 function parseTestOutput(output: string): TestResult {
 	// Parse vitest output for pass/fail counts
 	// Vitest formats:
@@ -114,6 +134,7 @@ function parseTestOutput(output: string): TestResult {
 	}
 
 	const passed = totalCount > 0 && passedCount === totalCount;
+	const individualTests = parseIndividualTests(output);
 
-	return { passed, output, passedCount, totalCount };
+	return { passed, output, passedCount, totalCount, individualTests };
 }
diff --git a/packages/evals/src/types.ts b/packages/evals/src/types.ts
index 4b97534..7d85e76 100644
--- a/packages/evals/src/types.ts
+++ b/packages/evals/src/types.ts
@@ -38,4 +38,8 @@ export interface EvalRunResult {
 	toolCallCount?: number;
 	/** Total cost in USD (from stream-json result event) */
 	costUsd?: number;
+	/** The PROMPT.md content sent to the agent */
+	prompt?: string;
+	/** Per-test pass/fail results from vitest */
+	individualTests?: Record<string, boolean>;
 }