fix format

This commit is contained in:
Pedro Rodrigues
2026-01-23 17:33:44 +00:00
parent 1d9f4ea441
commit b5289ff6ee
19 changed files with 616 additions and 595 deletions

View File

@@ -7,7 +7,11 @@ import {
validateSkillExists, validateSkillExists,
} from "./config.js"; } from "./config.js";
import { parseRuleFile } from "./parser.js"; import { parseRuleFile } from "./parser.js";
import { filterRulesForProfile, listProfiles, loadProfile } from "./profiles.js"; import {
filterRulesForProfile,
listProfiles,
loadProfile,
} from "./profiles.js";
import type { Metadata, Profile, Rule, Section } from "./types.js"; import type { Metadata, Profile, Rule, Section } from "./types.js";
import { validateRuleFile } from "./validate.js"; import { validateRuleFile } from "./validate.js";
@@ -118,10 +122,7 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
// Check if rules directory exists // Check if rules directory exists
if (!existsSync(paths.rulesDir)) { if (!existsSync(paths.rulesDir)) {
console.log(` No rules directory found. Generating empty AGENTS.md.`); console.log(` No rules directory found. Generating empty AGENTS.md.`);
writeFileSync( writeFileSync(outputFile, `# ${skillTitle}\n\nNo rules defined yet.\n`);
outputFile,
`# ${skillTitle}\n\nNo rules defined yet.\n`,
);
return; return;
} }
@@ -157,7 +158,9 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
let filteredRules = rules; let filteredRules = rules;
if (profile) { if (profile) {
filteredRules = filterRulesForProfile(rules, profile); filteredRules = filterRulesForProfile(rules, profile);
console.log(` Filtered to ${filteredRules.length} rules for profile "${profile.name}"`); console.log(
` Filtered to ${filteredRules.length} rules for profile "${profile.name}"`,
);
} }
// Group rules by section and assign IDs // Group rules by section and assign IDs
@@ -244,7 +247,9 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
prerequisites.push(`PostgreSQL ${rule.minVersion}+`); prerequisites.push(`PostgreSQL ${rule.minVersion}+`);
} }
if (rule.extensions && rule.extensions.length > 0) { if (rule.extensions && rule.extensions.length > 0) {
prerequisites.push(`Extension${rule.extensions.length > 1 ? "s" : ""}: ${rule.extensions.join(", ")}`); prerequisites.push(
`Extension${rule.extensions.length > 1 ? "s" : ""}: ${rule.extensions.join(", ")}`,
);
} }
if (prerequisites.length > 0) { if (prerequisites.length > 0) {
output.push(`**Prerequisites:** ${prerequisites.join(" | ")}\n`); output.push(`**Prerequisites:** ${prerequisites.join(" | ")}\n`);
@@ -302,7 +307,11 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
/** /**
* Parse CLI arguments * Parse CLI arguments
*/ */
function parseArgs(): { skill?: string; profile?: string; allProfiles: boolean } { function parseArgs(): {
skill?: string;
profile?: string;
allProfiles: boolean;
} {
const args = process.argv.slice(2); const args = process.argv.slice(2);
let skill: string | undefined; let skill: string | undefined;
let profile: string | undefined; let profile: string | undefined;

View File

@@ -251,7 +251,8 @@ export function parseRuleFile(
const examples = extractExamples(body); const examples = extractExamples(body);
const tags = frontmatter.tags?.split(",").map((t) => t.trim()) || []; const tags = frontmatter.tags?.split(",").map((t) => t.trim()) || [];
const extensions = frontmatter.extensions?.split(",").map((e) => e.trim()) || []; const extensions =
frontmatter.extensions?.split(",").map((e) => e.trim()) || [];
// Validation warnings // Validation warnings
if (!explanation || explanation.length < 20) { if (!explanation || explanation.length < 20) {

View File

@@ -5,7 +5,10 @@ import type { Profile, Rule } from "./types.js";
/** /**
* Load a profile from the profiles directory * Load a profile from the profiles directory
*/ */
export function loadProfile(profilesDir: string, profileName: string): Profile | null { export function loadProfile(
profilesDir: string,
profileName: string,
): Profile | null {
const profileFile = join(profilesDir, `${profileName}.json`); const profileFile = join(profilesDir, `${profileName}.json`);
if (!existsSync(profileFile)) { if (!existsSync(profileFile)) {
return null; return null;
@@ -54,14 +57,20 @@ function compareVersions(a: string, b: string): number {
/** /**
* Check if a rule is compatible with a profile * Check if a rule is compatible with a profile
*/ */
export function isRuleCompatibleWithProfile(rule: Rule, profile: Profile): boolean { export function isRuleCompatibleWithProfile(
rule: Rule,
profile: Profile,
): boolean {
// Check version requirement // Check version requirement
if (rule.minVersion) { if (rule.minVersion) {
if (compareVersions(rule.minVersion, profile.minVersion) > 0) { if (compareVersions(rule.minVersion, profile.minVersion) > 0) {
// Rule requires a higher version than profile supports // Rule requires a higher version than profile supports
return false; return false;
} }
if (profile.maxVersion && compareVersions(rule.minVersion, profile.maxVersion) > 0) { if (
profile.maxVersion &&
compareVersions(rule.minVersion, profile.maxVersion) > 0
) {
// Rule requires a version higher than profile's max // Rule requires a version higher than profile's max
return false; return false;
} }

View File

@@ -26,8 +26,8 @@ export interface Rule {
references?: string[]; references?: string[];
tags?: string[]; tags?: string[];
supabaseNotes?: string; supabaseNotes?: string;
minVersion?: string; // Minimum PostgreSQL version required (e.g., "11", "14") minVersion?: string; // Minimum PostgreSQL version required (e.g., "11", "14")
extensions?: string[]; // Required PostgreSQL extensions (e.g., ["pg_stat_statements"]) extensions?: string[]; // Required PostgreSQL extensions (e.g., ["pg_stat_statements"])
} }
export interface Section { export interface Section {

View File

@@ -1,18 +1,18 @@
{ {
"name": "postgres-best-practices-evals", "name": "postgres-best-practices-evals",
"version": "1.0.0", "version": "1.0.0",
"description": "Evaluation scenarios for Postgres Best Practices skill", "description": "Evaluation scenarios for Postgres Best Practices skill",
"type": "module", "type": "module",
"scripts": { "scripts": {
"eval": "vitest run", "eval": "vitest run",
"eval:watch": "vitest", "eval:watch": "vitest",
"eval:ui": "vitest --ui" "eval:ui": "vitest --ui"
}, },
"devDependencies": { "devDependencies": {
"@ai-sdk/anthropic": "^0.0.30", "@ai-sdk/anthropic": "^0.0.30",
"@types/node": "^20.0.0", "@types/node": "^20.0.0",
"ai": "^3.0.0", "ai": "^3.0.0",
"typescript": "^5.0.0", "typescript": "^5.0.0",
"vitest": "^1.0.0" "vitest": "^1.0.0"
} }
} }

View File

@@ -1,129 +1,143 @@
import { generateText } from "ai";
import { anthropic } from "@ai-sdk/anthropic";
import { readFileSync } from "node:fs"; import { readFileSync } from "node:fs";
import { join } from "node:path"; import { join } from "node:path";
import type { CriterionResult, EvalConfig, EvalResult, EvalScenario } from "./types.js"; import { anthropic } from "@ai-sdk/anthropic";
import { generateText } from "ai";
import type {
CriterionResult,
EvalConfig,
EvalResult,
EvalScenario,
} from "./types.js";
/**
 * Baseline evaluation settings.
 *
 * `agentsPath` resolves to the AGENTS.md file one directory above this
 * module. `temperature` is 0.
 */
const DEFAULT_CONFIG: EvalConfig = {
  agentsPath: join(import.meta.dirname, "..", "AGENTS.md"),
  model: "claude-sonnet-4-20250514",
  maxTokens: 2048,
  temperature: 0,
};
/**
 * Build the user prompt from a scenario.
 *
 * Optional context lines (PostgreSQL version, available extensions, free-form
 * context) come first when present, followed by the schema inside a fenced
 * `sql` block and finally the user's question.
 *
 * @param scenario - Scenario whose `input` supplies the prompt content.
 * @returns The prompt parts joined with newlines.
 */
function buildUserPrompt(scenario: EvalScenario): string {
  const { input } = scenario;
  const parts: string[] = [];

  // Add version context if specified
  if (input.postgresVersion) {
    parts.push(`PostgreSQL Version: ${input.postgresVersion}`);
  }

  // Add extensions context if specified. An explicitly empty list is reported
  // as "None installed"; an absent list adds no line at all.
  if (input.availableExtensions) {
    parts.push(
      input.availableExtensions.length === 0
        ? "Available Extensions: None installed"
        : `Available Extensions: ${input.availableExtensions.join(", ")}`,
    );
  }

  // Add additional context if provided
  if (input.context) {
    parts.push(`Context: ${input.context}`);
  }

  // Add schema
  parts.push(`\nSchema:\n\`\`\`sql\n${input.schema}\n\`\`\``);

  // Add user query
  parts.push(`\nQuestion: ${input.userQuery}`);

  return parts.join("\n");
}
/**
 * Extract rule IDs mentioned in a response.
 *
 * @param response - Free-form model output to scan.
 * @returns The distinct `<digits>.<digits>` tokens found, in order of first
 *   appearance; an empty array when there are none.
 */
function extractRuleIds(response: string): string[] {
  // Match patterns like "1.1", "2.3", etc.
  // NOTE(review): this also matches any other dotted number in the text
  // (e.g. a version such as "15.4") — confirm that is acceptable for scoring.
  const rulePattern = /\b(\d+\.\d+)\b/g;
  const matches = response.match(rulePattern) ?? [];
  return [...new Set(matches)];
}
/**
 * Evaluate the response against expected criteria.
 *
 * Term checks are case-insensitive substring matches. Rule checks compare
 * against the IDs returned by `extractRuleIds`. Results are emitted in a fixed
 * order: mustContain, mustNotContain, shouldRecommendRules,
 * shouldNotRecommendRules.
 *
 * @param scenario - Scenario whose `expectedOutput` defines the criteria.
 * @param response - Model output to check.
 * @returns One result per criterion, each with a pass flag and short evidence.
 */
function evaluateCriteria(
  scenario: EvalScenario,
  response: string,
): CriterionResult[] {
  const { expectedOutput } = scenario;
  const results: CriterionResult[] = [];
  const responseLower = response.toLowerCase();

  // Check mustContain criteria
  for (const term of expectedOutput.mustContain) {
    const found = responseLower.includes(term.toLowerCase());
    results.push({
      criterion: `Response should contain "${term}"`,
      passed: found,
      evidence: found ? "Found in response" : "Not found in response",
    });
  }

  // Check mustNotContain criteria (optional list)
  for (const term of expectedOutput.mustNotContain ?? []) {
    const found = responseLower.includes(term.toLowerCase());
    results.push({
      criterion: `Response should NOT contain "${term}"`,
      passed: !found,
      evidence: found
        ? "Found in response (should not be present)"
        : "Not found (correct)",
    });
  }

  // Check shouldRecommendRules
  const referencedRules = extractRuleIds(response);
  for (const ruleId of expectedOutput.shouldRecommendRules) {
    const found = referencedRules.includes(ruleId);
    results.push({
      criterion: `Should recommend rule ${ruleId}`,
      passed: found,
      evidence: found ? "Rule referenced" : "Rule not referenced",
    });
  }

  // Check shouldNotRecommendRules (optional list)
  for (const ruleId of expectedOutput.shouldNotRecommendRules ?? []) {
    const found = referencedRules.includes(ruleId);
    results.push({
      criterion: `Should NOT recommend rule ${ruleId}`,
      passed: !found,
      evidence: found
        ? "Rule referenced (should not be)"
        : "Rule not referenced (correct)",
    });
  }

  return results;
}
/** /**
* Run a single evaluation scenario * Run a single evaluation scenario
*/ */
export async function runEval( export async function runEval(
scenario: EvalScenario, scenario: EvalScenario,
config: Partial<EvalConfig> = {} config: Partial<EvalConfig> = {},
): Promise<EvalResult> { ): Promise<EvalResult> {
const finalConfig = { ...DEFAULT_CONFIG, ...config }; const finalConfig = { ...DEFAULT_CONFIG, ...config };
try { try {
// Load AGENTS.md // Load AGENTS.md
const agentsMd = readFileSync(finalConfig.agentsPath, "utf-8"); const agentsMd = readFileSync(finalConfig.agentsPath, "utf-8");
const systemPrompt = `You are a PostgreSQL expert assistant. Use the following knowledge base to provide accurate recommendations: const systemPrompt = `You are a PostgreSQL expert assistant. Use the following knowledge base to provide accurate recommendations:
${agentsMd} ${agentsMd}
@@ -134,59 +148,59 @@ IMPORTANT: When the user specifies a PostgreSQL version or available extensions,
When making recommendations, reference specific rule IDs (e.g., "1.1", "2.3") from the knowledge base.`; When making recommendations, reference specific rule IDs (e.g., "1.1", "2.3") from the knowledge base.`;
const userPrompt = buildUserPrompt(scenario); const userPrompt = buildUserPrompt(scenario);
const start = Date.now(); const start = Date.now();
const { text } = await generateText({ const { text } = await generateText({
model: anthropic(finalConfig.model!), model: anthropic(finalConfig.model ?? DEFAULT_CONFIG.model),
system: systemPrompt, system: systemPrompt,
prompt: userPrompt, prompt: userPrompt,
maxTokens: finalConfig.maxTokens, maxTokens: finalConfig.maxTokens,
temperature: finalConfig.temperature, temperature: finalConfig.temperature,
}); });
const latencyMs = Date.now() - start; const latencyMs = Date.now() - start;
// Evaluate the response // Evaluate the response
const criteriaResults = evaluateCriteria(scenario, text); const criteriaResults = evaluateCriteria(scenario, text);
const rulesReferenced = extractRuleIds(text); const rulesReferenced = extractRuleIds(text);
const passed = criteriaResults.every((r) => r.passed); const passed = criteriaResults.every((r) => r.passed);
return { return {
scenarioId: scenario.id, scenarioId: scenario.id,
passed, passed,
rulesReferenced, rulesReferenced,
criteriaResults, criteriaResults,
response: text, response: text,
latencyMs, latencyMs,
}; };
} catch (error) { } catch (error) {
return { return {
scenarioId: scenario.id, scenarioId: scenario.id,
passed: false, passed: false,
rulesReferenced: [], rulesReferenced: [],
criteriaResults: [], criteriaResults: [],
response: "", response: "",
latencyMs: 0, latencyMs: 0,
error: error instanceof Error ? error.message : String(error), error: error instanceof Error ? error.message : String(error),
}; };
} }
} }
/**
 * Run multiple evaluation scenarios.
 *
 * Scenarios are awaited one at a time, so each "Running eval" line is followed
 * directly by its own PASS/FAIL line.
 *
 * @param scenarios - Scenarios to execute, in order.
 * @param config - Optional overrides passed to every `runEval` call.
 * @returns One result per scenario, in input order.
 */
export async function runEvals(
  scenarios: EvalScenario[],
  config: Partial<EvalConfig> = {},
): Promise<EvalResult[]> {
  const results: EvalResult[] = [];

  for (const scenario of scenarios) {
    console.log(`Running eval: ${scenario.name}...`);
    const result = await runEval(scenario, config);
    results.push(result);
    console.log(` ${result.passed ? "PASS" : "FAIL"} (${result.latencyMs}ms)`);
  }

  return results;
}

View File

@@ -1,16 +1,16 @@
import { describe, it, expect } from "vitest"; import { describe, expect, it } from "vitest";
import { runEval } from "../runner.js"; import { runEval } from "../runner.js";
import type { EvalScenario } from "../types.js"; import type { EvalScenario } from "../types.js";
const scenario: EvalScenario = { const scenario: EvalScenario = {
id: "covering-index-suggestion", id: "covering-index-suggestion",
name: "Covering Index Suggestion", name: "Covering Index Suggestion",
description: description:
"Agent should suggest using INCLUDE clause for columns in SELECT that aren't in WHERE clause", "Agent should suggest using INCLUDE clause for columns in SELECT that aren't in WHERE clause",
category: "query-performance", category: "query-performance",
difficulty: "intermediate", difficulty: "intermediate",
input: { input: {
schema: ` schema: `
CREATE TABLE users ( CREATE TABLE users (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
email VARCHAR(255) NOT NULL, email VARCHAR(255) NOT NULL,
@@ -22,41 +22,40 @@ CREATE TABLE users (
CREATE INDEX users_email_idx ON users (email); CREATE INDEX users_email_idx ON users (email);
-- Table has 2 million rows -- Table has 2 million rows
`, `,
userQuery: `This query still does heap fetches even though we have an index on email: userQuery: `This query still does heap fetches even though we have an index on email:
SELECT email, name, department FROM users WHERE email = 'user@example.com' SELECT email, name, department FROM users WHERE email = 'user@example.com'
EXPLAIN shows "Index Scan" but not "Index Only Scan". How can I avoid the table lookup?`, EXPLAIN shows "Index Scan" but not "Index Only Scan". How can I avoid the table lookup?`,
postgresVersion: "15.4", postgresVersion: "15.4",
}, },
expectedOutput: { expectedOutput: {
shouldRecommendRules: ["1.2"], // query-covering-indexes shouldRecommendRules: ["1.2"], // query-covering-indexes
mustContain: ["include", "covering"], mustContain: ["include", "covering"],
}, },
expectedReasoning: [ expectedReasoning: [
"Identify that the query selects columns (name, department) not in the index", "Identify that the query selects columns (name, department) not in the index",
"Recognize this causes additional heap fetches after the index scan", "Recognize this causes additional heap fetches after the index scan",
"Recommend using INCLUDE clause to create a covering index", "Recommend using INCLUDE clause to create a covering index",
"Explain that this enables index-only scans", "Explain that this enables index-only scans",
], ],
}; };
describe("Covering Index Suggestion", () => {
  it("should recommend INCLUDE clause for covering index", async () => {
    const result = await runEval(scenario);
    console.log("Response:", result.response);
    console.log("Criteria results:", result.criteriaResults);

    // runEval reports failures through `error` with an empty response; assert
    // on it first so a failed call is not misreported as a missing keyword.
    expect(result.error).toBeUndefined();

    const responseLower = result.response.toLowerCase();

    // Response should mention INCLUDE keyword
    expect(responseLower).toContain("include");

    // Response should mention covering index concept
    expect(
      responseLower.includes("covering") ||
        responseLower.includes("index-only"),
    ).toBe(true);
  });
});
export { scenario };

View File

@@ -1,56 +1,54 @@
import { describe, it, expect } from "vitest"; import { describe, expect, it } from "vitest";
import { runEval } from "../runner.js"; import { runEval } from "../runner.js";
import type { EvalScenario } from "../types.js"; import type { EvalScenario } from "../types.js";
/**
 * Scenario: pg_stat_statements is listed among the available extensions, so
 * the agent is expected to recommend it for finding slow queries.
 */
const scenario: EvalScenario = {
  id: "extension-available-pg-stat-statements",
  name: "Extension Available - pg_stat_statements",
  description:
    "Agent should recommend pg_stat_statements for query monitoring when the extension is available",
  category: "extension-requirements",
  difficulty: "basic",
  input: {
    schema: `
-- Production database with various tables
CREATE TABLE users (id SERIAL PRIMARY KEY, email VARCHAR(255));
CREATE TABLE orders (id SERIAL PRIMARY KEY, user_id INT, total DECIMAL);
CREATE TABLE products (id SERIAL PRIMARY KEY, name VARCHAR(200), price DECIMAL);
`,
    userQuery:
      "Our database is slow but we don't know which queries are causing the problem. How can we identify the slowest queries?",
    postgresVersion: "15.4",
    availableExtensions: ["pg_stat_statements", "pgcrypto", "uuid-ossp"],
  },
  expectedOutput: {
    shouldRecommendRules: ["7.1"], // monitor-pg-stat-statements
    mustContain: ["pg_stat_statements"],
  },
  expectedReasoning: [
    "Recognize this is a query monitoring/performance diagnosis problem",
    "Check that pg_stat_statements is available in the extensions list",
    "Recommend enabling pg_stat_statements for query analysis",
    "Explain how to use it to find slow queries",
  ],
};
describe("Extension Available - pg_stat_statements", () => {
  it("should recommend pg_stat_statements when available", async () => {
    const result = await runEval(scenario);
    console.log("Response:", result.response);
    console.log("Criteria results:", result.criteriaResults);

    // runEval reports failures through `error` with an empty response; assert
    // on it first so a failed call is not misreported as a missing keyword.
    expect(result.error).toBeUndefined();

    const responseLower = result.response.toLowerCase();

    // Response should mention pg_stat_statements
    expect(responseLower).toContain("pg_stat_statements");

    // Should suggest enabling/using the extension
    expect(
      responseLower.includes("create extension") ||
        responseLower.includes("enable") ||
        responseLower.includes("query"),
    ).toBe(true);
  });
});
export { scenario };

View File

@@ -1,56 +1,56 @@
import { describe, it, expect } from "vitest"; import { describe, expect, it } from "vitest";
import { runEval } from "../runner.js"; import { runEval } from "../runner.js";
import type { EvalScenario } from "../types.js"; import type { EvalScenario } from "../types.js";
/**
 * Scenario: no extensions are available and none can be installed, so the
 * agent is expected to offer alternatives to pg_stat_statements.
 */
const scenario: EvalScenario = {
  id: "extension-unavailable-no-pg-stat-statements",
  name: "Extension Unavailable - No pg_stat_statements",
  description:
    "Agent should provide alternatives when pg_stat_statements is not available for query monitoring",
  category: "extension-requirements",
  difficulty: "intermediate",
  input: {
    schema: `
-- Production database with various tables
CREATE TABLE users (id SERIAL PRIMARY KEY, email VARCHAR(255));
CREATE TABLE orders (id SERIAL PRIMARY KEY, user_id INT, total DECIMAL);
CREATE TABLE products (id SERIAL PRIMARY KEY, name VARCHAR(200), price DECIMAL);
`,
    userQuery:
      "Our database is slow but we don't know which queries are causing the problem. How can we identify the slowest queries?",
    postgresVersion: "15.4",
    availableExtensions: [], // No extensions available
    context:
      "This is a managed database environment where we cannot install additional extensions.",
  },
  expectedOutput: {
    shouldRecommendRules: [], // Should not recommend pg_stat_statements rule
    shouldNotRecommendRules: ["7.1"], // monitor-pg-stat-statements
    mustContain: ["explain", "analyze"],
    mustNotContain: ["pg_stat_statements"],
  },
  expectedReasoning: [
    "Recognize that no extensions are available",
    "Check that pg_stat_statements cannot be used",
    "Avoid recommending pg_stat_statements",
    "Suggest alternative approaches like EXPLAIN ANALYZE, log_min_duration_statement, or pg_stat_activity",
  ],
};
describe("Extension Unavailable - No pg_stat_statements", () => {
  it("should suggest alternatives when pg_stat_statements is unavailable", async () => {
    const result = await runEval(scenario);
    console.log("Response:", result.response);
    console.log("Criteria results:", result.criteriaResults);

    // runEval reports failures through `error` with an empty response; assert
    // on it first so a failed call is not misreported as a missing keyword.
    expect(result.error).toBeUndefined();

    // Response should NOT primarily recommend pg_stat_statements
    // (it might mention it as unavailable, but shouldn't suggest installing it),
    // so no assertion is made here on the extension name itself.
    const responseLower = result.response.toLowerCase();

    // Should suggest EXPLAIN ANALYZE as an alternative
    expect(
      responseLower.includes("explain") && responseLower.includes("analyze"),
    ).toBe(true);
  });
});
export { scenario };

View File

@@ -1,16 +1,16 @@
import { describe, it, expect } from "vitest"; import { describe, expect, it } from "vitest";
import { runEval } from "../runner.js"; import { runEval } from "../runner.js";
import type { EvalScenario } from "../types.js"; import type { EvalScenario } from "../types.js";
const scenario: EvalScenario = { const scenario: EvalScenario = {
id: "missing-index-detection", id: "missing-index-detection",
name: "Missing Index Detection", name: "Missing Index Detection",
description: description:
"Agent should identify missing index on WHERE clause columns and recommend creating an appropriate index", "Agent should identify missing index on WHERE clause columns and recommend creating an appropriate index",
category: "query-performance", category: "query-performance",
difficulty: "basic", difficulty: "basic",
input: { input: {
schema: ` schema: `
CREATE TABLE orders ( CREATE TABLE orders (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
customer_id INT NOT NULL, customer_id INT NOT NULL,
@@ -21,36 +21,36 @@ CREATE TABLE orders (
-- No indexes besides primary key -- No indexes besides primary key
-- Table has 5 million rows -- Table has 5 million rows
`, `,
userQuery: userQuery:
"This query is slow and takes 3 seconds: SELECT * FROM orders WHERE customer_id = 12345 AND status = 'pending'", "This query is slow and takes 3 seconds: SELECT * FROM orders WHERE customer_id = 12345 AND status = 'pending'",
}, },
expectedOutput: { expectedOutput: {
shouldRecommendRules: ["1.1"], // query-missing-indexes shouldRecommendRules: ["1.1"], // query-missing-indexes
mustContain: ["index", "customer_id"], mustContain: ["index", "customer_id"],
}, },
expectedReasoning: [ expectedReasoning: [
"Identify that the query filters on customer_id and status", "Identify that the query filters on customer_id and status",
"Recognize that without an index, this causes a sequential scan", "Recognize that without an index, this causes a sequential scan",
"Recommend creating an index on the filtered columns", "Recommend creating an index on the filtered columns",
], ],
}; };
describe("Missing Index Detection", () => {
  it("should recommend creating an index on filtered columns", async () => {
    const result = await runEval(scenario);
    console.log("Response:", result.response);
    console.log("Criteria results:", result.criteriaResults);

    // runEval reports failures through `error` with empty results; assert on
    // it first so a failed call is not misreported as a failed criterion.
    expect(result.error).toBeUndefined();

    // Check that key criteria passed
    expect(
      result.criteriaResults.some(
        (c) => c.criterion.includes("index") && c.passed,
      ),
    ).toBe(true);

    // Response should mention creating an index
    const responseLower = result.response.toLowerCase();
    expect(responseLower).toContain("index");
    expect(responseLower).toContain("customer_id");
  });
});
export { scenario };

View File

@@ -1,16 +1,16 @@
import { describe, it, expect } from "vitest"; import { describe, expect, it } from "vitest";
import { runEval } from "../runner.js"; import { runEval } from "../runner.js";
import type { EvalScenario } from "../types.js"; import type { EvalScenario } from "../types.js";
const scenario: EvalScenario = { const scenario: EvalScenario = {
id: "n-plus-one-detection", id: "n-plus-one-detection",
name: "N+1 Query Detection", name: "N+1 Query Detection",
description: description:
"Agent should identify N+1 query pattern in application code and recommend using JOINs or batch queries", "Agent should identify N+1 query pattern in application code and recommend using JOINs or batch queries",
category: "query-performance", category: "query-performance",
difficulty: "intermediate", difficulty: "intermediate",
input: { input: {
schema: ` schema: `
CREATE TABLE users ( CREATE TABLE users (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
name VARCHAR(100), name VARCHAR(100),
@@ -25,7 +25,7 @@ CREATE TABLE posts (
created_at TIMESTAMPTZ DEFAULT NOW() created_at TIMESTAMPTZ DEFAULT NOW()
); );
`, `,
userQuery: `My API endpoint is slow. Here's the code: userQuery: `My API endpoint is slow. Here's the code:
\`\`\`typescript \`\`\`typescript
// Get all posts // Get all posts
@@ -39,33 +39,33 @@ for (const post of posts) {
\`\`\` \`\`\`
This makes 101 database queries. How can I optimize it?`, This makes 101 database queries. How can I optimize it?`,
}, },
expectedOutput: { expectedOutput: {
shouldRecommendRules: ["6.1"], // data-n-plus-one shouldRecommendRules: ["6.1"], // data-n-plus-one
mustContain: ["join", "n+1"], mustContain: ["join", "n+1"],
}, },
expectedReasoning: [ expectedReasoning: [
"Identify the N+1 query pattern (1 query for posts + N queries for users)", "Identify the N+1 query pattern (1 query for posts + N queries for users)",
"Recognize this as a common performance anti-pattern", "Recognize this as a common performance anti-pattern",
"Recommend using a JOIN to fetch all data in a single query", "Recommend using a JOIN to fetch all data in a single query",
"Optionally suggest using IN clause for batch fetching", "Optionally suggest using IN clause for batch fetching",
], ],
}; };
describe("N+1 Query Detection", () => { describe("N+1 Query Detection", () => {
it("should identify N+1 pattern and recommend JOIN", async () => { it("should identify N+1 pattern and recommend JOIN", async () => {
const result = await runEval(scenario); const result = await runEval(scenario);
console.log("Response:", result.response); console.log("Response:", result.response);
console.log("Criteria results:", result.criteriaResults); console.log("Criteria results:", result.criteriaResults);
// Response should mention JOIN // Response should mention JOIN
expect(result.response.toLowerCase()).toContain("join"); expect(result.response.toLowerCase()).toContain("join");
// Response should explain the N+1 problem // Response should explain the N+1 problem
const responseLower = result.response.toLowerCase(); const responseLower = result.response.toLowerCase();
expect(responseLower.includes("n+1") || responseLower.includes("n + 1")).toBe(true); expect(
}); responseLower.includes("n+1") || responseLower.includes("n + 1"),
).toBe(true);
});
}); });
export { scenario };

View File

@@ -1,4 +1,4 @@
import { describe, it, expect } from "vitest"; import { describe, expect, it } from "vitest";
import { runEval } from "../runner.js"; import { runEval } from "../runner.js";
import type { EvalScenario } from "../types.js"; import type { EvalScenario } from "../types.js";
@@ -6,14 +6,14 @@ import type { EvalScenario } from "../types.js";
* Scenario 1: PG10 - Should NOT recommend covering indexes (requires PG11+) * Scenario 1: PG10 - Should NOT recommend covering indexes (requires PG11+)
*/ */
const scenarioPg10NoCoveringIndex: EvalScenario = { const scenarioPg10NoCoveringIndex: EvalScenario = {
id: "version-constraint-pg10-no-covering", id: "version-constraint-pg10-no-covering",
name: "Version Constraint - PG10 No Covering Index", name: "Version Constraint - PG10 No Covering Index",
description: description:
"Agent should NOT recommend INCLUDE clause on PostgreSQL 10 since it requires PG11+", "Agent should NOT recommend INCLUDE clause on PostgreSQL 10 since it requires PG11+",
category: "version-constraints", category: "version-constraints",
difficulty: "intermediate", difficulty: "intermediate",
input: { input: {
schema: ` schema: `
CREATE TABLE users ( CREATE TABLE users (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
email VARCHAR(255) NOT NULL, email VARCHAR(255) NOT NULL,
@@ -23,36 +23,36 @@ CREATE TABLE users (
CREATE INDEX users_email_idx ON users (email); CREATE INDEX users_email_idx ON users (email);
`, `,
userQuery: userQuery:
"How can I optimize this query to avoid heap fetches? SELECT email, name FROM users WHERE email = 'test@example.com'", "How can I optimize this query to avoid heap fetches? SELECT email, name FROM users WHERE email = 'test@example.com'",
postgresVersion: "10.0", postgresVersion: "10.0",
}, },
expectedOutput: { expectedOutput: {
shouldRecommendRules: [], shouldRecommendRules: [],
shouldNotRecommendRules: ["1.2"], // query-covering-indexes requires PG11 shouldNotRecommendRules: ["1.2"], // query-covering-indexes requires PG11
mustContain: ["index"], mustContain: ["index"],
mustNotContain: ["include"], mustNotContain: ["include"],
}, },
expectedReasoning: [ expectedReasoning: [
"Recognize that PostgreSQL 10 is specified", "Recognize that PostgreSQL 10 is specified",
"Check that covering indexes (INCLUDE clause) require PG11+", "Check that covering indexes (INCLUDE clause) require PG11+",
"Avoid recommending INCLUDE clause", "Avoid recommending INCLUDE clause",
"Suggest alternative optimization strategies appropriate for PG10", "Suggest alternative optimization strategies appropriate for PG10",
], ],
}; };
/** /**
* Scenario 2: PG9.3 - Should NOT recommend UPSERT (requires PG9.5+) * Scenario 2: PG9.3 - Should NOT recommend UPSERT (requires PG9.5+)
*/ */
const scenarioPg93NoUpsert: EvalScenario = { const scenarioPg93NoUpsert: EvalScenario = {
id: "version-constraint-pg93-no-upsert", id: "version-constraint-pg93-no-upsert",
name: "Version Constraint - PG9.3 No UPSERT", name: "Version Constraint - PG9.3 No UPSERT",
description: description:
"Agent should NOT recommend ON CONFLICT on PostgreSQL 9.3 since it requires PG9.5+", "Agent should NOT recommend ON CONFLICT on PostgreSQL 9.3 since it requires PG9.5+",
category: "version-constraints", category: "version-constraints",
difficulty: "intermediate", difficulty: "intermediate",
input: { input: {
schema: ` schema: `
CREATE TABLE settings ( CREATE TABLE settings (
user_id INT NOT NULL, user_id INT NOT NULL,
key VARCHAR(50) NOT NULL, key VARCHAR(50) NOT NULL,
@@ -60,49 +60,47 @@ CREATE TABLE settings (
PRIMARY KEY (user_id, key) PRIMARY KEY (user_id, key)
); );
`, `,
userQuery: userQuery:
"I need to insert a setting if it doesn't exist, or update it if it does. How should I do this?", "I need to insert a setting if it doesn't exist, or update it if it does. How should I do this?",
postgresVersion: "9.3", postgresVersion: "9.3",
}, },
expectedOutput: { expectedOutput: {
shouldRecommendRules: [], shouldRecommendRules: [],
shouldNotRecommendRules: ["6.3"], // data-upsert requires PG9.5 shouldNotRecommendRules: ["6.3"], // data-upsert requires PG9.5
mustContain: ["insert", "update"], mustContain: ["insert", "update"],
mustNotContain: ["on conflict"], mustNotContain: ["on conflict"],
}, },
expectedReasoning: [ expectedReasoning: [
"Recognize that PostgreSQL 9.3 is specified", "Recognize that PostgreSQL 9.3 is specified",
"Check that ON CONFLICT (UPSERT) requires PG9.5+", "Check that ON CONFLICT (UPSERT) requires PG9.5+",
"Avoid recommending ON CONFLICT syntax", "Avoid recommending ON CONFLICT syntax",
"Suggest alternative pattern (e.g., CTE with INSERT/UPDATE, or try/catch approach)", "Suggest alternative pattern (e.g., CTE with INSERT/UPDATE, or try/catch approach)",
], ],
}; };
describe("Version Constraint Tests", () => { describe("Version Constraint Tests", () => {
describe("PG10 - No Covering Index", () => { describe("PG10 - No Covering Index", () => {
it("should NOT recommend INCLUDE clause for PG10", async () => { it("should NOT recommend INCLUDE clause for PG10", async () => {
const result = await runEval(scenarioPg10NoCoveringIndex); const result = await runEval(scenarioPg10NoCoveringIndex);
console.log("Response:", result.response); console.log("Response:", result.response);
console.log("Criteria results:", result.criteriaResults); console.log("Criteria results:", result.criteriaResults);
// Response should NOT contain INCLUDE recommendation // Response should NOT contain INCLUDE recommendation
expect(result.response.toLowerCase()).not.toContain("include ("); expect(result.response.toLowerCase()).not.toContain("include (");
expect(result.response.toLowerCase()).not.toContain("include("); expect(result.response.toLowerCase()).not.toContain("include(");
}); });
}); });
describe("PG9.3 - No UPSERT", () => { describe("PG9.3 - No UPSERT", () => {
it("should NOT recommend ON CONFLICT for PG9.3", async () => { it("should NOT recommend ON CONFLICT for PG9.3", async () => {
const result = await runEval(scenarioPg93NoUpsert); const result = await runEval(scenarioPg93NoUpsert);
console.log("Response:", result.response); console.log("Response:", result.response);
console.log("Criteria results:", result.criteriaResults); console.log("Criteria results:", result.criteriaResults);
// Response should NOT recommend ON CONFLICT // Response should NOT recommend ON CONFLICT
expect(result.response.toLowerCase()).not.toContain("on conflict"); expect(result.response.toLowerCase()).not.toContain("on conflict");
}); });
}); });
}); });
export { scenarioPg10NoCoveringIndex, scenarioPg93NoUpsert };

View File

@@ -1,13 +1,13 @@
{ {
"compilerOptions": { "compilerOptions": {
"target": "ES2022", "target": "ES2022",
"module": "ESNext", "module": "ESNext",
"moduleResolution": "bundler", "moduleResolution": "bundler",
"esModuleInterop": true, "esModuleInterop": true,
"strict": true, "strict": true,
"skipLibCheck": true, "skipLibCheck": true,
"outDir": "dist", "outDir": "dist",
"declaration": true "declaration": true
}, },
"include": ["*.ts", "scenarios/**/*.ts"] "include": ["*.ts", "scenarios/**/*.ts"]
} }

View File

@@ -2,111 +2,114 @@
* Evaluation scenario definition * Evaluation scenario definition
*/ */
export interface EvalScenario { export interface EvalScenario {
/** Unique identifier for the scenario */ /** Unique identifier for the scenario */
id: string; id: string;
/** Human-readable name */ /** Human-readable name */
name: string; name: string;
/** Description of what this scenario tests */ /** Description of what this scenario tests */
description: string; description: string;
/** Category of the scenario */ /** Category of the scenario */
category: "query-performance" | "version-constraints" | "extension-requirements"; category:
| "query-performance"
| "version-constraints"
| "extension-requirements";
/** Difficulty level */ /** Difficulty level */
difficulty: "basic" | "intermediate" | "advanced"; difficulty: "basic" | "intermediate" | "advanced";
/** Input for the scenario */ /** Input for the scenario */
input: { input: {
/** SQL schema context */ /** SQL schema context */
schema: string; schema: string;
/** User's question or request */ /** User's question or request */
userQuery: string; userQuery: string;
/** Optional PostgreSQL version (e.g., "10.0", "15.4") */ /** Optional PostgreSQL version (e.g., "10.0", "15.4") */
postgresVersion?: string; postgresVersion?: string;
/** Optional list of available extensions */ /** Optional list of available extensions */
availableExtensions?: string[]; availableExtensions?: string[];
/** Additional context */ /** Additional context */
context?: string; context?: string;
}; };
/** Expected output criteria */ /** Expected output criteria */
expectedOutput: { expectedOutput: {
/** Rule IDs that should be recommended */ /** Rule IDs that should be recommended */
shouldRecommendRules: string[]; shouldRecommendRules: string[];
/** Rule IDs that should NOT be recommended (version/extension constraints) */ /** Rule IDs that should NOT be recommended (version/extension constraints) */
shouldNotRecommendRules?: string[]; shouldNotRecommendRules?: string[];
/** Strings that must appear in the response */ /** Strings that must appear in the response */
mustContain: string[]; mustContain: string[];
/** Strings that must NOT appear in the response */ /** Strings that must NOT appear in the response */
mustNotContain?: string[]; mustNotContain?: string[];
}; };
/** Expected reasoning steps the agent should follow */ /** Expected reasoning steps the agent should follow */
expectedReasoning: string[]; expectedReasoning: string[];
} }
/** /**
* Result of evaluating a single criterion * Result of evaluating a single criterion
*/ */
export interface CriterionResult { export interface CriterionResult {
/** Description of the criterion */ /** Description of the criterion */
criterion: string; criterion: string;
/** Whether the criterion passed */ /** Whether the criterion passed */
passed: boolean; passed: boolean;
/** Evidence or explanation */ /** Evidence or explanation */
evidence?: string; evidence?: string;
} }
/** /**
* Result of running an evaluation scenario * Result of running an evaluation scenario
*/ */
export interface EvalResult { export interface EvalResult {
/** Scenario ID */ /** Scenario ID */
scenarioId: string; scenarioId: string;
/** Whether all criteria passed */ /** Whether all criteria passed */
passed: boolean; passed: boolean;
/** Rule IDs that were referenced in the response */ /** Rule IDs that were referenced in the response */
rulesReferenced: string[]; rulesReferenced: string[];
/** Results for each evaluation criterion */ /** Results for each evaluation criterion */
criteriaResults: CriterionResult[]; criteriaResults: CriterionResult[];
/** The agent's full response */ /** The agent's full response */
response: string; response: string;
/** Time taken in milliseconds */ /** Time taken in milliseconds */
latencyMs: number; latencyMs: number;
/** Error message if evaluation failed */ /** Error message if evaluation failed */
error?: string; error?: string;
} }
/** /**
* Configuration for the eval runner * Configuration for the eval runner
*/ */
export interface EvalConfig { export interface EvalConfig {
/** Path to AGENTS.md file */ /** Path to AGENTS.md file */
agentsPath: string; agentsPath: string;
/** Model to use for evaluation */ /** Model to use for evaluation */
model?: string; model?: string;
/** Maximum tokens for response */ /** Maximum tokens for response */
maxTokens?: number; maxTokens?: number;
/** Temperature for generation */ /** Temperature for generation */
temperature?: number; temperature?: number;
} }

View File

@@ -4,69 +4,71 @@ import type { EvalResult, EvalScenario } from "./types.js";
* Format eval results as a summary table * Format eval results as a summary table
*/ */
export function formatResultsSummary(results: EvalResult[]): string { export function formatResultsSummary(results: EvalResult[]): string {
const lines: string[] = []; const lines: string[] = [];
lines.push("## Eval Results Summary\n"); lines.push("## Eval Results Summary\n");
const passed = results.filter((r) => r.passed).length; const passed = results.filter((r) => r.passed).length;
const total = results.length; const total = results.length;
const passRate = ((passed / total) * 100).toFixed(1); const passRate = ((passed / total) * 100).toFixed(1);
lines.push(`**Pass Rate:** ${passed}/${total} (${passRate}%)\n`); lines.push(`**Pass Rate:** ${passed}/${total} (${passRate}%)\n`);
lines.push("| Scenario | Status | Latency | Rules Referenced |"); lines.push("| Scenario | Status | Latency | Rules Referenced |");
lines.push("|----------|--------|---------|------------------|"); lines.push("|----------|--------|---------|------------------|");
for (const result of results) { for (const result of results) {
const status = result.passed ? "PASS" : "FAIL"; const status = result.passed ? "PASS" : "FAIL";
const latency = `${result.latencyMs}ms`; const latency = `${result.latencyMs}ms`;
const rules = result.rulesReferenced.join(", ") || "none"; const rules = result.rulesReferenced.join(", ") || "none";
lines.push(`| ${result.scenarioId} | ${status} | ${latency} | ${rules} |`); lines.push(`| ${result.scenarioId} | ${status} | ${latency} | ${rules} |`);
} }
return lines.join("\n"); return lines.join("\n");
} }
/** /**
* Format detailed results for a single scenario * Format detailed results for a single scenario
*/ */
export function formatDetailedResult(result: EvalResult): string { export function formatDetailedResult(result: EvalResult): string {
const lines: string[] = []; const lines: string[] = [];
lines.push(`## ${result.scenarioId}\n`); lines.push(`## ${result.scenarioId}\n`);
lines.push(`**Status:** ${result.passed ? "PASS" : "FAIL"}`); lines.push(`**Status:** ${result.passed ? "PASS" : "FAIL"}`);
lines.push(`**Latency:** ${result.latencyMs}ms`); lines.push(`**Latency:** ${result.latencyMs}ms`);
lines.push(`**Rules Referenced:** ${result.rulesReferenced.join(", ") || "none"}\n`); lines.push(
`**Rules Referenced:** ${result.rulesReferenced.join(", ") || "none"}\n`,
);
if (result.error) { if (result.error) {
lines.push(`**Error:** ${result.error}\n`); lines.push(`**Error:** ${result.error}\n`);
} }
lines.push("### Criteria Results\n"); lines.push("### Criteria Results\n");
for (const criterion of result.criteriaResults) { for (const criterion of result.criteriaResults) {
const icon = criterion.passed ? "+" : "-"; const icon = criterion.passed ? "+" : "-";
lines.push(`${icon} ${criterion.criterion}`); lines.push(`${icon} ${criterion.criterion}`);
if (criterion.evidence) { if (criterion.evidence) {
lines.push(` Evidence: ${criterion.evidence}`); lines.push(` Evidence: ${criterion.evidence}`);
} }
} }
lines.push("\n### Response\n"); lines.push("\n### Response\n");
lines.push("```"); lines.push("```");
lines.push(result.response); lines.push(result.response);
lines.push("```"); lines.push("```");
return lines.join("\n"); return lines.join("\n");
} }
/** /**
* Create a scenario builder for cleaner test definitions * Create a scenario builder for cleaner test definitions
*/ */
export function createScenario( export function createScenario(
partial: Omit<EvalScenario, "id"> & { id?: string } partial: Omit<EvalScenario, "id"> & { id?: string },
): EvalScenario { ): EvalScenario {
return { return {
id: partial.id || partial.name.toLowerCase().replace(/\s+/g, "-"), id: partial.id || partial.name.toLowerCase().replace(/\s+/g, "-"),
...partial, ...partial,
} as EvalScenario; } as EvalScenario;
} }

View File

@@ -1,9 +1,9 @@
import { defineConfig } from "vitest/config"; import { defineConfig } from "vitest/config";
export default defineConfig({ export default defineConfig({
test: { test: {
include: ["scenarios/**/*.eval.ts"], include: ["scenarios/**/*.eval.ts"],
testTimeout: 60000, // 60 seconds for LLM calls testTimeout: 60000, // 60 seconds for LLM calls
reporters: ["verbose"], reporters: ["verbose"],
}, },
}); });

View File

@@ -1,23 +1,11 @@
{ {
"name": "aurora", "name": "aurora",
"minVersion": "13", "minVersion": "13",
"maxVersion": "16", "maxVersion": "16",
"extensions": { "extensions": {
"available": [ "available": ["pg_stat_statements", "pgcrypto", "uuid-ossp"],
"pg_stat_statements", "installable": ["postgis", "pg_hint_plan", "pg_similarity"],
"pgcrypto", "unavailable": ["pg_cron", "pg_partman", "timescaledb"]
"uuid-ossp" },
], "notes": "AWS Aurora PostgreSQL. Some extensions are not available due to managed service restrictions. Aurora has its own connection pooling (RDS Proxy) and automatic failover."
"installable": [
"postgis",
"pg_hint_plan",
"pg_similarity"
],
"unavailable": [
"pg_cron",
"pg_partman",
"timescaledb"
]
},
"notes": "AWS Aurora PostgreSQL. Some extensions are not available due to managed service restrictions. Aurora has its own connection pooling (RDS Proxy) and automatic failover."
} }

View File

@@ -1,18 +1,18 @@
{ {
"name": "self-hosted", "name": "self-hosted",
"minVersion": "12", "minVersion": "12",
"extensions": { "extensions": {
"available": [], "available": [],
"installable": [ "installable": [
"pg_stat_statements", "pg_stat_statements",
"pgcrypto", "pgcrypto",
"uuid-ossp", "uuid-ossp",
"postgis", "postgis",
"pg_trgm", "pg_trgm",
"btree_gin", "btree_gin",
"btree_gist" "btree_gist"
], ],
"unavailable": [] "unavailable": []
}, },
"notes": "Generic self-hosted PostgreSQL. Extension availability depends on server configuration. Check pg_available_extensions for what can be installed." "notes": "Generic self-hosted PostgreSQL. Extension availability depends on server configuration. Check pg_available_extensions for what can be installed."
} }

View File

@@ -1,27 +1,27 @@
{ {
"name": "supabase", "name": "supabase",
"minVersion": "15", "minVersion": "15",
"extensions": { "extensions": {
"available": [ "available": [
"pg_stat_statements", "pg_stat_statements",
"pgcrypto", "pgcrypto",
"uuid-ossp", "uuid-ossp",
"pgjwt", "pgjwt",
"pg_graphql", "pg_graphql",
"pg_net", "pg_net",
"pgsodium", "pgsodium",
"supabase_vault", "supabase_vault",
"pg_jsonschema" "pg_jsonschema"
], ],
"installable": [ "installable": [
"postgis", "postgis",
"pg_cron", "pg_cron",
"pgtap", "pgtap",
"plv8", "plv8",
"http", "http",
"pg_hashids" "pg_hashids"
], ],
"unavailable": [] "unavailable": []
}, },
"notes": "Supabase manages connection pooling via Supavisor. Direct connection limits differ from pooled connections. All standard Postgres extensions are available." "notes": "Supabase manages connection pooling via Supavisor. Direct connection limits differ from pooled connections. All standard Postgres extensions are available."
} }