mirror of
https://github.com/supabase/agent-skills.git
synced 2026-01-26 19:09:51 +08:00
fix format
This commit is contained in:
@@ -7,7 +7,11 @@ import {
|
||||
validateSkillExists,
|
||||
} from "./config.js";
|
||||
import { parseRuleFile } from "./parser.js";
|
||||
import { filterRulesForProfile, listProfiles, loadProfile } from "./profiles.js";
|
||||
import {
|
||||
filterRulesForProfile,
|
||||
listProfiles,
|
||||
loadProfile,
|
||||
} from "./profiles.js";
|
||||
import type { Metadata, Profile, Rule, Section } from "./types.js";
|
||||
import { validateRuleFile } from "./validate.js";
|
||||
|
||||
@@ -118,10 +122,7 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
|
||||
// Check if rules directory exists
|
||||
if (!existsSync(paths.rulesDir)) {
|
||||
console.log(` No rules directory found. Generating empty AGENTS.md.`);
|
||||
writeFileSync(
|
||||
outputFile,
|
||||
`# ${skillTitle}\n\nNo rules defined yet.\n`,
|
||||
);
|
||||
writeFileSync(outputFile, `# ${skillTitle}\n\nNo rules defined yet.\n`);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -157,7 +158,9 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
|
||||
let filteredRules = rules;
|
||||
if (profile) {
|
||||
filteredRules = filterRulesForProfile(rules, profile);
|
||||
console.log(` Filtered to ${filteredRules.length} rules for profile "${profile.name}"`);
|
||||
console.log(
|
||||
` Filtered to ${filteredRules.length} rules for profile "${profile.name}"`,
|
||||
);
|
||||
}
|
||||
|
||||
// Group rules by section and assign IDs
|
||||
@@ -244,7 +247,9 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
|
||||
prerequisites.push(`PostgreSQL ${rule.minVersion}+`);
|
||||
}
|
||||
if (rule.extensions && rule.extensions.length > 0) {
|
||||
prerequisites.push(`Extension${rule.extensions.length > 1 ? "s" : ""}: ${rule.extensions.join(", ")}`);
|
||||
prerequisites.push(
|
||||
`Extension${rule.extensions.length > 1 ? "s" : ""}: ${rule.extensions.join(", ")}`,
|
||||
);
|
||||
}
|
||||
if (prerequisites.length > 0) {
|
||||
output.push(`**Prerequisites:** ${prerequisites.join(" | ")}\n`);
|
||||
@@ -302,7 +307,11 @@ function buildSkill(paths: SkillPaths, profile?: Profile): void {
|
||||
/**
|
||||
* Parse CLI arguments
|
||||
*/
|
||||
function parseArgs(): { skill?: string; profile?: string; allProfiles: boolean } {
|
||||
function parseArgs(): {
|
||||
skill?: string;
|
||||
profile?: string;
|
||||
allProfiles: boolean;
|
||||
} {
|
||||
const args = process.argv.slice(2);
|
||||
let skill: string | undefined;
|
||||
let profile: string | undefined;
|
||||
|
||||
@@ -251,7 +251,8 @@ export function parseRuleFile(
|
||||
const examples = extractExamples(body);
|
||||
|
||||
const tags = frontmatter.tags?.split(",").map((t) => t.trim()) || [];
|
||||
const extensions = frontmatter.extensions?.split(",").map((e) => e.trim()) || [];
|
||||
const extensions =
|
||||
frontmatter.extensions?.split(",").map((e) => e.trim()) || [];
|
||||
|
||||
// Validation warnings
|
||||
if (!explanation || explanation.length < 20) {
|
||||
|
||||
@@ -5,7 +5,10 @@ import type { Profile, Rule } from "./types.js";
|
||||
/**
|
||||
* Load a profile from the profiles directory
|
||||
*/
|
||||
export function loadProfile(profilesDir: string, profileName: string): Profile | null {
|
||||
export function loadProfile(
|
||||
profilesDir: string,
|
||||
profileName: string,
|
||||
): Profile | null {
|
||||
const profileFile = join(profilesDir, `${profileName}.json`);
|
||||
if (!existsSync(profileFile)) {
|
||||
return null;
|
||||
@@ -54,14 +57,20 @@ function compareVersions(a: string, b: string): number {
|
||||
/**
|
||||
* Check if a rule is compatible with a profile
|
||||
*/
|
||||
export function isRuleCompatibleWithProfile(rule: Rule, profile: Profile): boolean {
|
||||
export function isRuleCompatibleWithProfile(
|
||||
rule: Rule,
|
||||
profile: Profile,
|
||||
): boolean {
|
||||
// Check version requirement
|
||||
if (rule.minVersion) {
|
||||
if (compareVersions(rule.minVersion, profile.minVersion) > 0) {
|
||||
// Rule requires a higher version than profile supports
|
||||
return false;
|
||||
}
|
||||
if (profile.maxVersion && compareVersions(rule.minVersion, profile.maxVersion) > 0) {
|
||||
if (
|
||||
profile.maxVersion &&
|
||||
compareVersions(rule.minVersion, profile.maxVersion) > 0
|
||||
) {
|
||||
// Rule requires a version higher than profile's max
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -26,8 +26,8 @@ export interface Rule {
|
||||
references?: string[];
|
||||
tags?: string[];
|
||||
supabaseNotes?: string;
|
||||
minVersion?: string; // Minimum PostgreSQL version required (e.g., "11", "14")
|
||||
extensions?: string[]; // Required PostgreSQL extensions (e.g., ["pg_stat_statements"])
|
||||
minVersion?: string; // Minimum PostgreSQL version required (e.g., "11", "14")
|
||||
extensions?: string[]; // Required PostgreSQL extensions (e.g., ["pg_stat_statements"])
|
||||
}
|
||||
|
||||
export interface Section {
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
{
|
||||
"name": "postgres-best-practices-evals",
|
||||
"version": "1.0.0",
|
||||
"description": "Evaluation scenarios for Postgres Best Practices skill",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"eval": "vitest run",
|
||||
"eval:watch": "vitest",
|
||||
"eval:ui": "vitest --ui"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@ai-sdk/anthropic": "^0.0.30",
|
||||
"@types/node": "^20.0.0",
|
||||
"ai": "^3.0.0",
|
||||
"typescript": "^5.0.0",
|
||||
"vitest": "^1.0.0"
|
||||
}
|
||||
"name": "postgres-best-practices-evals",
|
||||
"version": "1.0.0",
|
||||
"description": "Evaluation scenarios for Postgres Best Practices skill",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"eval": "vitest run",
|
||||
"eval:watch": "vitest",
|
||||
"eval:ui": "vitest --ui"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@ai-sdk/anthropic": "^0.0.30",
|
||||
"@types/node": "^20.0.0",
|
||||
"ai": "^3.0.0",
|
||||
"typescript": "^5.0.0",
|
||||
"vitest": "^1.0.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,129 +1,143 @@
|
||||
import { generateText } from "ai";
|
||||
import { anthropic } from "@ai-sdk/anthropic";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { CriterionResult, EvalConfig, EvalResult, EvalScenario } from "./types.js";
|
||||
import { anthropic } from "@ai-sdk/anthropic";
|
||||
import { generateText } from "ai";
|
||||
import type {
|
||||
CriterionResult,
|
||||
EvalConfig,
|
||||
EvalResult,
|
||||
EvalScenario,
|
||||
} from "./types.js";
|
||||
|
||||
const DEFAULT_CONFIG: EvalConfig = {
|
||||
agentsPath: join(import.meta.dirname, "..", "AGENTS.md"),
|
||||
model: "claude-sonnet-4-20250514",
|
||||
maxTokens: 2048,
|
||||
temperature: 0,
|
||||
agentsPath: join(import.meta.dirname, "..", "AGENTS.md"),
|
||||
model: "claude-sonnet-4-20250514",
|
||||
maxTokens: 2048,
|
||||
temperature: 0,
|
||||
};
|
||||
|
||||
/**
|
||||
* Build the user prompt from a scenario
|
||||
*/
|
||||
function buildUserPrompt(scenario: EvalScenario): string {
|
||||
const parts: string[] = [];
|
||||
const parts: string[] = [];
|
||||
|
||||
// Add version context if specified
|
||||
if (scenario.input.postgresVersion) {
|
||||
parts.push(`PostgreSQL Version: ${scenario.input.postgresVersion}`);
|
||||
}
|
||||
// Add version context if specified
|
||||
if (scenario.input.postgresVersion) {
|
||||
parts.push(`PostgreSQL Version: ${scenario.input.postgresVersion}`);
|
||||
}
|
||||
|
||||
// Add extensions context if specified
|
||||
if (scenario.input.availableExtensions) {
|
||||
if (scenario.input.availableExtensions.length === 0) {
|
||||
parts.push("Available Extensions: None installed");
|
||||
} else {
|
||||
parts.push(`Available Extensions: ${scenario.input.availableExtensions.join(", ")}`);
|
||||
}
|
||||
}
|
||||
// Add extensions context if specified
|
||||
if (scenario.input.availableExtensions) {
|
||||
if (scenario.input.availableExtensions.length === 0) {
|
||||
parts.push("Available Extensions: None installed");
|
||||
} else {
|
||||
parts.push(
|
||||
`Available Extensions: ${scenario.input.availableExtensions.join(", ")}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Add additional context if provided
|
||||
if (scenario.input.context) {
|
||||
parts.push(`Context: ${scenario.input.context}`);
|
||||
}
|
||||
// Add additional context if provided
|
||||
if (scenario.input.context) {
|
||||
parts.push(`Context: ${scenario.input.context}`);
|
||||
}
|
||||
|
||||
// Add schema
|
||||
parts.push(`\nSchema:\n\`\`\`sql\n${scenario.input.schema}\n\`\`\``);
|
||||
// Add schema
|
||||
parts.push(`\nSchema:\n\`\`\`sql\n${scenario.input.schema}\n\`\`\``);
|
||||
|
||||
// Add user query
|
||||
parts.push(`\nQuestion: ${scenario.input.userQuery}`);
|
||||
// Add user query
|
||||
parts.push(`\nQuestion: ${scenario.input.userQuery}`);
|
||||
|
||||
return parts.join("\n");
|
||||
return parts.join("\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract rule IDs mentioned in a response
|
||||
*/
|
||||
function extractRuleIds(response: string): string[] {
|
||||
// Match patterns like "1.1", "2.3", etc.
|
||||
const rulePattern = /\b(\d+\.\d+)\b/g;
|
||||
const matches = response.match(rulePattern) || [];
|
||||
return [...new Set(matches)];
|
||||
// Match patterns like "1.1", "2.3", etc.
|
||||
const rulePattern = /\b(\d+\.\d+)\b/g;
|
||||
const matches = response.match(rulePattern) || [];
|
||||
return [...new Set(matches)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluate the response against expected criteria
|
||||
*/
|
||||
function evaluateCriteria(scenario: EvalScenario, response: string): CriterionResult[] {
|
||||
const results: CriterionResult[] = [];
|
||||
const responseLower = response.toLowerCase();
|
||||
function evaluateCriteria(
|
||||
scenario: EvalScenario,
|
||||
response: string,
|
||||
): CriterionResult[] {
|
||||
const results: CriterionResult[] = [];
|
||||
const responseLower = response.toLowerCase();
|
||||
|
||||
// Check mustContain criteria
|
||||
for (const term of scenario.expectedOutput.mustContain) {
|
||||
const found = responseLower.includes(term.toLowerCase());
|
||||
results.push({
|
||||
criterion: `Response should contain "${term}"`,
|
||||
passed: found,
|
||||
evidence: found ? "Found in response" : "Not found in response",
|
||||
});
|
||||
}
|
||||
// Check mustContain criteria
|
||||
for (const term of scenario.expectedOutput.mustContain) {
|
||||
const found = responseLower.includes(term.toLowerCase());
|
||||
results.push({
|
||||
criterion: `Response should contain "${term}"`,
|
||||
passed: found,
|
||||
evidence: found ? "Found in response" : "Not found in response",
|
||||
});
|
||||
}
|
||||
|
||||
// Check mustNotContain criteria
|
||||
if (scenario.expectedOutput.mustNotContain) {
|
||||
for (const term of scenario.expectedOutput.mustNotContain) {
|
||||
const found = responseLower.includes(term.toLowerCase());
|
||||
results.push({
|
||||
criterion: `Response should NOT contain "${term}"`,
|
||||
passed: !found,
|
||||
evidence: found ? "Found in response (should not be present)" : "Not found (correct)",
|
||||
});
|
||||
}
|
||||
}
|
||||
// Check mustNotContain criteria
|
||||
if (scenario.expectedOutput.mustNotContain) {
|
||||
for (const term of scenario.expectedOutput.mustNotContain) {
|
||||
const found = responseLower.includes(term.toLowerCase());
|
||||
results.push({
|
||||
criterion: `Response should NOT contain "${term}"`,
|
||||
passed: !found,
|
||||
evidence: found
|
||||
? "Found in response (should not be present)"
|
||||
: "Not found (correct)",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Check shouldRecommendRules
|
||||
const referencedRules = extractRuleIds(response);
|
||||
for (const ruleId of scenario.expectedOutput.shouldRecommendRules) {
|
||||
const found = referencedRules.includes(ruleId);
|
||||
results.push({
|
||||
criterion: `Should recommend rule ${ruleId}`,
|
||||
passed: found,
|
||||
evidence: found ? "Rule referenced" : "Rule not referenced",
|
||||
});
|
||||
}
|
||||
// Check shouldRecommendRules
|
||||
const referencedRules = extractRuleIds(response);
|
||||
for (const ruleId of scenario.expectedOutput.shouldRecommendRules) {
|
||||
const found = referencedRules.includes(ruleId);
|
||||
results.push({
|
||||
criterion: `Should recommend rule ${ruleId}`,
|
||||
passed: found,
|
||||
evidence: found ? "Rule referenced" : "Rule not referenced",
|
||||
});
|
||||
}
|
||||
|
||||
// Check shouldNotRecommendRules
|
||||
if (scenario.expectedOutput.shouldNotRecommendRules) {
|
||||
for (const ruleId of scenario.expectedOutput.shouldNotRecommendRules) {
|
||||
const found = referencedRules.includes(ruleId);
|
||||
results.push({
|
||||
criterion: `Should NOT recommend rule ${ruleId}`,
|
||||
passed: !found,
|
||||
evidence: found ? "Rule referenced (should not be)" : "Rule not referenced (correct)",
|
||||
});
|
||||
}
|
||||
}
|
||||
// Check shouldNotRecommendRules
|
||||
if (scenario.expectedOutput.shouldNotRecommendRules) {
|
||||
for (const ruleId of scenario.expectedOutput.shouldNotRecommendRules) {
|
||||
const found = referencedRules.includes(ruleId);
|
||||
results.push({
|
||||
criterion: `Should NOT recommend rule ${ruleId}`,
|
||||
passed: !found,
|
||||
evidence: found
|
||||
? "Rule referenced (should not be)"
|
||||
: "Rule not referenced (correct)",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a single evaluation scenario
|
||||
*/
|
||||
export async function runEval(
|
||||
scenario: EvalScenario,
|
||||
config: Partial<EvalConfig> = {}
|
||||
scenario: EvalScenario,
|
||||
config: Partial<EvalConfig> = {},
|
||||
): Promise<EvalResult> {
|
||||
const finalConfig = { ...DEFAULT_CONFIG, ...config };
|
||||
const finalConfig = { ...DEFAULT_CONFIG, ...config };
|
||||
|
||||
try {
|
||||
// Load AGENTS.md
|
||||
const agentsMd = readFileSync(finalConfig.agentsPath, "utf-8");
|
||||
try {
|
||||
// Load AGENTS.md
|
||||
const agentsMd = readFileSync(finalConfig.agentsPath, "utf-8");
|
||||
|
||||
const systemPrompt = `You are a PostgreSQL expert assistant. Use the following knowledge base to provide accurate recommendations:
|
||||
const systemPrompt = `You are a PostgreSQL expert assistant. Use the following knowledge base to provide accurate recommendations:
|
||||
|
||||
${agentsMd}
|
||||
|
||||
@@ -134,59 +148,59 @@ IMPORTANT: When the user specifies a PostgreSQL version or available extensions,
|
||||
|
||||
When making recommendations, reference specific rule IDs (e.g., "1.1", "2.3") from the knowledge base.`;
|
||||
|
||||
const userPrompt = buildUserPrompt(scenario);
|
||||
const userPrompt = buildUserPrompt(scenario);
|
||||
|
||||
const start = Date.now();
|
||||
const { text } = await generateText({
|
||||
model: anthropic(finalConfig.model!),
|
||||
system: systemPrompt,
|
||||
prompt: userPrompt,
|
||||
maxTokens: finalConfig.maxTokens,
|
||||
temperature: finalConfig.temperature,
|
||||
});
|
||||
const latencyMs = Date.now() - start;
|
||||
const start = Date.now();
|
||||
const { text } = await generateText({
|
||||
model: anthropic(finalConfig.model ?? DEFAULT_CONFIG.model),
|
||||
system: systemPrompt,
|
||||
prompt: userPrompt,
|
||||
maxTokens: finalConfig.maxTokens,
|
||||
temperature: finalConfig.temperature,
|
||||
});
|
||||
const latencyMs = Date.now() - start;
|
||||
|
||||
// Evaluate the response
|
||||
const criteriaResults = evaluateCriteria(scenario, text);
|
||||
const rulesReferenced = extractRuleIds(text);
|
||||
const passed = criteriaResults.every((r) => r.passed);
|
||||
// Evaluate the response
|
||||
const criteriaResults = evaluateCriteria(scenario, text);
|
||||
const rulesReferenced = extractRuleIds(text);
|
||||
const passed = criteriaResults.every((r) => r.passed);
|
||||
|
||||
return {
|
||||
scenarioId: scenario.id,
|
||||
passed,
|
||||
rulesReferenced,
|
||||
criteriaResults,
|
||||
response: text,
|
||||
latencyMs,
|
||||
};
|
||||
} catch (error) {
|
||||
return {
|
||||
scenarioId: scenario.id,
|
||||
passed: false,
|
||||
rulesReferenced: [],
|
||||
criteriaResults: [],
|
||||
response: "",
|
||||
latencyMs: 0,
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
};
|
||||
}
|
||||
return {
|
||||
scenarioId: scenario.id,
|
||||
passed,
|
||||
rulesReferenced,
|
||||
criteriaResults,
|
||||
response: text,
|
||||
latencyMs,
|
||||
};
|
||||
} catch (error) {
|
||||
return {
|
||||
scenarioId: scenario.id,
|
||||
passed: false,
|
||||
rulesReferenced: [],
|
||||
criteriaResults: [],
|
||||
response: "",
|
||||
latencyMs: 0,
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run multiple evaluation scenarios
|
||||
*/
|
||||
export async function runEvals(
|
||||
scenarios: EvalScenario[],
|
||||
config: Partial<EvalConfig> = {}
|
||||
scenarios: EvalScenario[],
|
||||
config: Partial<EvalConfig> = {},
|
||||
): Promise<EvalResult[]> {
|
||||
const results: EvalResult[] = [];
|
||||
const results: EvalResult[] = [];
|
||||
|
||||
for (const scenario of scenarios) {
|
||||
console.log(`Running eval: ${scenario.name}...`);
|
||||
const result = await runEval(scenario, config);
|
||||
results.push(result);
|
||||
console.log(` ${result.passed ? "PASS" : "FAIL"} (${result.latencyMs}ms)`);
|
||||
}
|
||||
for (const scenario of scenarios) {
|
||||
console.log(`Running eval: ${scenario.name}...`);
|
||||
const result = await runEval(scenario, config);
|
||||
results.push(result);
|
||||
console.log(` ${result.passed ? "PASS" : "FAIL"} (${result.latencyMs}ms)`);
|
||||
}
|
||||
|
||||
return results;
|
||||
return results;
|
||||
}
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { runEval } from "../runner.js";
|
||||
import type { EvalScenario } from "../types.js";
|
||||
|
||||
const scenario: EvalScenario = {
|
||||
id: "covering-index-suggestion",
|
||||
name: "Covering Index Suggestion",
|
||||
description:
|
||||
"Agent should suggest using INCLUDE clause for columns in SELECT that aren't in WHERE clause",
|
||||
category: "query-performance",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
id: "covering-index-suggestion",
|
||||
name: "Covering Index Suggestion",
|
||||
description:
|
||||
"Agent should suggest using INCLUDE clause for columns in SELECT that aren't in WHERE clause",
|
||||
category: "query-performance",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
CREATE TABLE users (
|
||||
id SERIAL PRIMARY KEY,
|
||||
email VARCHAR(255) NOT NULL,
|
||||
@@ -22,41 +22,40 @@ CREATE TABLE users (
|
||||
CREATE INDEX users_email_idx ON users (email);
|
||||
-- Table has 2 million rows
|
||||
`,
|
||||
userQuery: `This query still does heap fetches even though we have an index on email:
|
||||
userQuery: `This query still does heap fetches even though we have an index on email:
|
||||
|
||||
SELECT email, name, department FROM users WHERE email = 'user@example.com'
|
||||
|
||||
EXPLAIN shows "Index Scan" but not "Index Only Scan". How can I avoid the table lookup?`,
|
||||
postgresVersion: "15.4",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["1.2"], // query-covering-indexes
|
||||
mustContain: ["include", "covering"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Identify that the query selects columns (name, department) not in the index",
|
||||
"Recognize this causes additional heap fetches after the index scan",
|
||||
"Recommend using INCLUDE clause to create a covering index",
|
||||
"Explain that this enables index-only scans",
|
||||
],
|
||||
postgresVersion: "15.4",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["1.2"], // query-covering-indexes
|
||||
mustContain: ["include", "covering"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Identify that the query selects columns (name, department) not in the index",
|
||||
"Recognize this causes additional heap fetches after the index scan",
|
||||
"Recommend using INCLUDE clause to create a covering index",
|
||||
"Explain that this enables index-only scans",
|
||||
],
|
||||
};
|
||||
|
||||
describe("Covering Index Suggestion", () => {
|
||||
it("should recommend INCLUDE clause for covering index", async () => {
|
||||
const result = await runEval(scenario);
|
||||
it("should recommend INCLUDE clause for covering index", async () => {
|
||||
const result = await runEval(scenario);
|
||||
|
||||
console.log("Response:", result.response);
|
||||
console.log("Criteria results:", result.criteriaResults);
|
||||
console.log("Response:", result.response);
|
||||
console.log("Criteria results:", result.criteriaResults);
|
||||
|
||||
// Response should mention INCLUDE keyword
|
||||
expect(result.response.toLowerCase()).toContain("include");
|
||||
// Response should mention INCLUDE keyword
|
||||
expect(result.response.toLowerCase()).toContain("include");
|
||||
|
||||
// Response should mention covering index concept
|
||||
const responseLower = result.response.toLowerCase();
|
||||
expect(
|
||||
responseLower.includes("covering") || responseLower.includes("index-only")
|
||||
).toBe(true);
|
||||
});
|
||||
// Response should mention covering index concept
|
||||
const responseLower = result.response.toLowerCase();
|
||||
expect(
|
||||
responseLower.includes("covering") ||
|
||||
responseLower.includes("index-only"),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
export { scenario };
|
||||
|
||||
@@ -1,56 +1,54 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { runEval } from "../runner.js";
|
||||
import type { EvalScenario } from "../types.js";
|
||||
|
||||
const scenario: EvalScenario = {
|
||||
id: "extension-available-pg-stat-statements",
|
||||
name: "Extension Available - pg_stat_statements",
|
||||
description:
|
||||
"Agent should recommend pg_stat_statements for query monitoring when the extension is available",
|
||||
category: "extension-requirements",
|
||||
difficulty: "basic",
|
||||
input: {
|
||||
schema: `
|
||||
id: "extension-available-pg-stat-statements",
|
||||
name: "Extension Available - pg_stat_statements",
|
||||
description:
|
||||
"Agent should recommend pg_stat_statements for query monitoring when the extension is available",
|
||||
category: "extension-requirements",
|
||||
difficulty: "basic",
|
||||
input: {
|
||||
schema: `
|
||||
-- Production database with various tables
|
||||
CREATE TABLE users (id SERIAL PRIMARY KEY, email VARCHAR(255));
|
||||
CREATE TABLE orders (id SERIAL PRIMARY KEY, user_id INT, total DECIMAL);
|
||||
CREATE TABLE products (id SERIAL PRIMARY KEY, name VARCHAR(200), price DECIMAL);
|
||||
`,
|
||||
userQuery:
|
||||
"Our database is slow but we don't know which queries are causing the problem. How can we identify the slowest queries?",
|
||||
postgresVersion: "15.4",
|
||||
availableExtensions: ["pg_stat_statements", "pgcrypto", "uuid-ossp"],
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["7.1"], // monitor-pg-stat-statements
|
||||
mustContain: ["pg_stat_statements"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize this is a query monitoring/performance diagnosis problem",
|
||||
"Check that pg_stat_statements is available in the extensions list",
|
||||
"Recommend enabling pg_stat_statements for query analysis",
|
||||
"Explain how to use it to find slow queries",
|
||||
],
|
||||
userQuery:
|
||||
"Our database is slow but we don't know which queries are causing the problem. How can we identify the slowest queries?",
|
||||
postgresVersion: "15.4",
|
||||
availableExtensions: ["pg_stat_statements", "pgcrypto", "uuid-ossp"],
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["7.1"], // monitor-pg-stat-statements
|
||||
mustContain: ["pg_stat_statements"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize this is a query monitoring/performance diagnosis problem",
|
||||
"Check that pg_stat_statements is available in the extensions list",
|
||||
"Recommend enabling pg_stat_statements for query analysis",
|
||||
"Explain how to use it to find slow queries",
|
||||
],
|
||||
};
|
||||
|
||||
describe("Extension Available - pg_stat_statements", () => {
|
||||
it("should recommend pg_stat_statements when available", async () => {
|
||||
const result = await runEval(scenario);
|
||||
it("should recommend pg_stat_statements when available", async () => {
|
||||
const result = await runEval(scenario);
|
||||
|
||||
console.log("Response:", result.response);
|
||||
console.log("Criteria results:", result.criteriaResults);
|
||||
console.log("Response:", result.response);
|
||||
console.log("Criteria results:", result.criteriaResults);
|
||||
|
||||
// Response should mention pg_stat_statements
|
||||
expect(result.response.toLowerCase()).toContain("pg_stat_statements");
|
||||
// Response should mention pg_stat_statements
|
||||
expect(result.response.toLowerCase()).toContain("pg_stat_statements");
|
||||
|
||||
// Should suggest enabling/using the extension
|
||||
const responseLower = result.response.toLowerCase();
|
||||
expect(
|
||||
responseLower.includes("create extension") ||
|
||||
responseLower.includes("enable") ||
|
||||
responseLower.includes("query")
|
||||
).toBe(true);
|
||||
});
|
||||
// Should suggest enabling/using the extension
|
||||
const responseLower = result.response.toLowerCase();
|
||||
expect(
|
||||
responseLower.includes("create extension") ||
|
||||
responseLower.includes("enable") ||
|
||||
responseLower.includes("query"),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
export { scenario };
|
||||
|
||||
@@ -1,56 +1,56 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { runEval } from "../runner.js";
|
||||
import type { EvalScenario } from "../types.js";
|
||||
|
||||
const scenario: EvalScenario = {
|
||||
id: "extension-unavailable-no-pg-stat-statements",
|
||||
name: "Extension Unavailable - No pg_stat_statements",
|
||||
description:
|
||||
"Agent should provide alternatives when pg_stat_statements is not available for query monitoring",
|
||||
category: "extension-requirements",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
id: "extension-unavailable-no-pg-stat-statements",
|
||||
name: "Extension Unavailable - No pg_stat_statements",
|
||||
description:
|
||||
"Agent should provide alternatives when pg_stat_statements is not available for query monitoring",
|
||||
category: "extension-requirements",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
-- Production database with various tables
|
||||
CREATE TABLE users (id SERIAL PRIMARY KEY, email VARCHAR(255));
|
||||
CREATE TABLE orders (id SERIAL PRIMARY KEY, user_id INT, total DECIMAL);
|
||||
CREATE TABLE products (id SERIAL PRIMARY KEY, name VARCHAR(200), price DECIMAL);
|
||||
`,
|
||||
userQuery:
|
||||
"Our database is slow but we don't know which queries are causing the problem. How can we identify the slowest queries?",
|
||||
postgresVersion: "15.4",
|
||||
availableExtensions: [], // No extensions available
|
||||
context:
|
||||
"This is a managed database environment where we cannot install additional extensions.",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: [], // Should not recommend pg_stat_statements rule
|
||||
shouldNotRecommendRules: ["7.1"], // monitor-pg-stat-statements
|
||||
mustContain: ["explain", "analyze"],
|
||||
mustNotContain: ["pg_stat_statements"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize that no extensions are available",
|
||||
"Check that pg_stat_statements cannot be used",
|
||||
"Avoid recommending pg_stat_statements",
|
||||
"Suggest alternative approaches like EXPLAIN ANALYZE, log_min_duration_statement, or pg_stat_activity",
|
||||
],
|
||||
userQuery:
|
||||
"Our database is slow but we don't know which queries are causing the problem. How can we identify the slowest queries?",
|
||||
postgresVersion: "15.4",
|
||||
availableExtensions: [], // No extensions available
|
||||
context:
|
||||
"This is a managed database environment where we cannot install additional extensions.",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: [], // Should not recommend pg_stat_statements rule
|
||||
shouldNotRecommendRules: ["7.1"], // monitor-pg-stat-statements
|
||||
mustContain: ["explain", "analyze"],
|
||||
mustNotContain: ["pg_stat_statements"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize that no extensions are available",
|
||||
"Check that pg_stat_statements cannot be used",
|
||||
"Avoid recommending pg_stat_statements",
|
||||
"Suggest alternative approaches like EXPLAIN ANALYZE, log_min_duration_statement, or pg_stat_activity",
|
||||
],
|
||||
};
|
||||
|
||||
describe("Extension Unavailable - No pg_stat_statements", () => {
|
||||
it("should suggest alternatives when pg_stat_statements is unavailable", async () => {
|
||||
const result = await runEval(scenario);
|
||||
it("should suggest alternatives when pg_stat_statements is unavailable", async () => {
|
||||
const result = await runEval(scenario);
|
||||
|
||||
console.log("Response:", result.response);
|
||||
console.log("Criteria results:", result.criteriaResults);
|
||||
console.log("Response:", result.response);
|
||||
console.log("Criteria results:", result.criteriaResults);
|
||||
|
||||
// Response should NOT primarily recommend pg_stat_statements
|
||||
// (it might mention it as unavailable, but shouldn't suggest installing it)
|
||||
const responseLower = result.response.toLowerCase();
|
||||
// Response should NOT primarily recommend pg_stat_statements
|
||||
// (it might mention it as unavailable, but shouldn't suggest installing it)
|
||||
const responseLower = result.response.toLowerCase();
|
||||
|
||||
// Should suggest EXPLAIN ANALYZE as an alternative
|
||||
expect(responseLower.includes("explain") && responseLower.includes("analyze")).toBe(true);
|
||||
});
|
||||
// Should suggest EXPLAIN ANALYZE as an alternative
|
||||
expect(
|
||||
responseLower.includes("explain") && responseLower.includes("analyze"),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
export { scenario };
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { runEval } from "../runner.js";
|
||||
import type { EvalScenario } from "../types.js";
|
||||
|
||||
const scenario: EvalScenario = {
|
||||
id: "missing-index-detection",
|
||||
name: "Missing Index Detection",
|
||||
description:
|
||||
"Agent should identify missing index on WHERE clause columns and recommend creating an appropriate index",
|
||||
category: "query-performance",
|
||||
difficulty: "basic",
|
||||
input: {
|
||||
schema: `
|
||||
id: "missing-index-detection",
|
||||
name: "Missing Index Detection",
|
||||
description:
|
||||
"Agent should identify missing index on WHERE clause columns and recommend creating an appropriate index",
|
||||
category: "query-performance",
|
||||
difficulty: "basic",
|
||||
input: {
|
||||
schema: `
|
||||
CREATE TABLE orders (
|
||||
id SERIAL PRIMARY KEY,
|
||||
customer_id INT NOT NULL,
|
||||
@@ -21,36 +21,36 @@ CREATE TABLE orders (
|
||||
-- No indexes besides primary key
|
||||
-- Table has 5 million rows
|
||||
`,
|
||||
userQuery:
|
||||
"This query is slow and takes 3 seconds: SELECT * FROM orders WHERE customer_id = 12345 AND status = 'pending'",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["1.1"], // query-missing-indexes
|
||||
mustContain: ["index", "customer_id"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Identify that the query filters on customer_id and status",
|
||||
"Recognize that without an index, this causes a sequential scan",
|
||||
"Recommend creating an index on the filtered columns",
|
||||
],
|
||||
userQuery:
|
||||
"This query is slow and takes 3 seconds: SELECT * FROM orders WHERE customer_id = 12345 AND status = 'pending'",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["1.1"], // query-missing-indexes
|
||||
mustContain: ["index", "customer_id"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Identify that the query filters on customer_id and status",
|
||||
"Recognize that without an index, this causes a sequential scan",
|
||||
"Recommend creating an index on the filtered columns",
|
||||
],
|
||||
};
|
||||
|
||||
// Eval test: the agent must spot the missing index on the filtered columns.
describe("Missing Index Detection", () => {
	it("should recommend creating an index on filtered columns", async () => {
		const result = await runEval(scenario);

		// Logged so a failing run shows what the model actually answered.
		console.log("Response:", result.response);
		console.log("Criteria results:", result.criteriaResults);

		// Check that key criteria passed
		expect(
			result.criteriaResults.some(
				(c) => c.criterion.includes("index") && c.passed,
			),
		).toBe(true);

		// Response should mention creating an index (case-insensitive match,
		// so lowercase once and reuse it for both checks).
		const responseLower = result.response.toLowerCase();
		expect(responseLower).toContain("index");
		expect(responseLower).toContain("customer_id");
	});
});
|
||||
|
||||
export { scenario };
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { runEval } from "../runner.js";
|
||||
import type { EvalScenario } from "../types.js";
|
||||
|
||||
const scenario: EvalScenario = {
|
||||
id: "n-plus-one-detection",
|
||||
name: "N+1 Query Detection",
|
||||
description:
|
||||
"Agent should identify N+1 query pattern in application code and recommend using JOINs or batch queries",
|
||||
category: "query-performance",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
id: "n-plus-one-detection",
|
||||
name: "N+1 Query Detection",
|
||||
description:
|
||||
"Agent should identify N+1 query pattern in application code and recommend using JOINs or batch queries",
|
||||
category: "query-performance",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
CREATE TABLE users (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name VARCHAR(100),
|
||||
@@ -25,7 +25,7 @@ CREATE TABLE posts (
|
||||
created_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
`,
|
||||
userQuery: `My API endpoint is slow. Here's the code:
|
||||
userQuery: `My API endpoint is slow. Here's the code:
|
||||
|
||||
\`\`\`typescript
|
||||
// Get all posts
|
||||
@@ -39,33 +39,33 @@ for (const post of posts) {
|
||||
\`\`\`
|
||||
|
||||
This makes 101 database queries. How can I optimize it?`,
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["6.1"], // data-n-plus-one
|
||||
mustContain: ["join", "n+1"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Identify the N+1 query pattern (1 query for posts + N queries for users)",
|
||||
"Recognize this as a common performance anti-pattern",
|
||||
"Recommend using a JOIN to fetch all data in a single query",
|
||||
"Optionally suggest using IN clause for batch fetching",
|
||||
],
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: ["6.1"], // data-n-plus-one
|
||||
mustContain: ["join", "n+1"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Identify the N+1 query pattern (1 query for posts + N queries for users)",
|
||||
"Recognize this as a common performance anti-pattern",
|
||||
"Recommend using a JOIN to fetch all data in a single query",
|
||||
"Optionally suggest using IN clause for batch fetching",
|
||||
],
|
||||
};
|
||||
|
||||
// Eval test: the agent must recognise the N+1 pattern and propose a JOIN.
describe("N+1 Query Detection", () => {
	it("should identify N+1 pattern and recommend JOIN", async () => {
		const result = await runEval(scenario);

		// Logged so a failing run shows what the model actually answered.
		console.log("Response:", result.response);
		console.log("Criteria results:", result.criteriaResults);

		// All checks are case-insensitive; lowercase the response once.
		const responseLower = result.response.toLowerCase();

		// Response should mention JOIN
		expect(responseLower).toContain("join");

		// Response should explain the N+1 problem (either spelling)
		expect(
			responseLower.includes("n+1") || responseLower.includes("n + 1"),
		).toBe(true);
	});
});
|
||||
|
||||
export { scenario };
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { runEval } from "../runner.js";
|
||||
import type { EvalScenario } from "../types.js";
|
||||
|
||||
@@ -6,14 +6,14 @@ import type { EvalScenario } from "../types.js";
|
||||
* Scenario 1: PG10 - Should NOT recommend covering indexes (requires PG11+)
|
||||
*/
|
||||
const scenarioPg10NoCoveringIndex: EvalScenario = {
|
||||
id: "version-constraint-pg10-no-covering",
|
||||
name: "Version Constraint - PG10 No Covering Index",
|
||||
description:
|
||||
"Agent should NOT recommend INCLUDE clause on PostgreSQL 10 since it requires PG11+",
|
||||
category: "version-constraints",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
id: "version-constraint-pg10-no-covering",
|
||||
name: "Version Constraint - PG10 No Covering Index",
|
||||
description:
|
||||
"Agent should NOT recommend INCLUDE clause on PostgreSQL 10 since it requires PG11+",
|
||||
category: "version-constraints",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
CREATE TABLE users (
|
||||
id SERIAL PRIMARY KEY,
|
||||
email VARCHAR(255) NOT NULL,
|
||||
@@ -23,36 +23,36 @@ CREATE TABLE users (
|
||||
|
||||
CREATE INDEX users_email_idx ON users (email);
|
||||
`,
|
||||
userQuery:
|
||||
"How can I optimize this query to avoid heap fetches? SELECT email, name FROM users WHERE email = 'test@example.com'",
|
||||
postgresVersion: "10.0",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: [],
|
||||
shouldNotRecommendRules: ["1.2"], // query-covering-indexes requires PG11
|
||||
mustContain: ["index"],
|
||||
mustNotContain: ["include"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize that PostgreSQL 10 is specified",
|
||||
"Check that covering indexes (INCLUDE clause) require PG11+",
|
||||
"Avoid recommending INCLUDE clause",
|
||||
"Suggest alternative optimization strategies appropriate for PG10",
|
||||
],
|
||||
userQuery:
|
||||
"How can I optimize this query to avoid heap fetches? SELECT email, name FROM users WHERE email = 'test@example.com'",
|
||||
postgresVersion: "10.0",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: [],
|
||||
shouldNotRecommendRules: ["1.2"], // query-covering-indexes requires PG11
|
||||
mustContain: ["index"],
|
||||
mustNotContain: ["include"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize that PostgreSQL 10 is specified",
|
||||
"Check that covering indexes (INCLUDE clause) require PG11+",
|
||||
"Avoid recommending INCLUDE clause",
|
||||
"Suggest alternative optimization strategies appropriate for PG10",
|
||||
],
|
||||
};
|
||||
|
||||
/**
|
||||
* Scenario 2: PG9.3 - Should NOT recommend UPSERT (requires PG9.5+)
|
||||
*/
|
||||
const scenarioPg93NoUpsert: EvalScenario = {
|
||||
id: "version-constraint-pg93-no-upsert",
|
||||
name: "Version Constraint - PG9.3 No UPSERT",
|
||||
description:
|
||||
"Agent should NOT recommend ON CONFLICT on PostgreSQL 9.3 since it requires PG9.5+",
|
||||
category: "version-constraints",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
id: "version-constraint-pg93-no-upsert",
|
||||
name: "Version Constraint - PG9.3 No UPSERT",
|
||||
description:
|
||||
"Agent should NOT recommend ON CONFLICT on PostgreSQL 9.3 since it requires PG9.5+",
|
||||
category: "version-constraints",
|
||||
difficulty: "intermediate",
|
||||
input: {
|
||||
schema: `
|
||||
CREATE TABLE settings (
|
||||
user_id INT NOT NULL,
|
||||
key VARCHAR(50) NOT NULL,
|
||||
@@ -60,49 +60,47 @@ CREATE TABLE settings (
|
||||
PRIMARY KEY (user_id, key)
|
||||
);
|
||||
`,
|
||||
userQuery:
|
||||
"I need to insert a setting if it doesn't exist, or update it if it does. How should I do this?",
|
||||
postgresVersion: "9.3",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: [],
|
||||
shouldNotRecommendRules: ["6.3"], // data-upsert requires PG9.5
|
||||
mustContain: ["insert", "update"],
|
||||
mustNotContain: ["on conflict"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize that PostgreSQL 9.3 is specified",
|
||||
"Check that ON CONFLICT (UPSERT) requires PG9.5+",
|
||||
"Avoid recommending ON CONFLICT syntax",
|
||||
"Suggest alternative pattern (e.g., CTE with INSERT/UPDATE, or try/catch approach)",
|
||||
],
|
||||
userQuery:
|
||||
"I need to insert a setting if it doesn't exist, or update it if it does. How should I do this?",
|
||||
postgresVersion: "9.3",
|
||||
},
|
||||
expectedOutput: {
|
||||
shouldRecommendRules: [],
|
||||
shouldNotRecommendRules: ["6.3"], // data-upsert requires PG9.5
|
||||
mustContain: ["insert", "update"],
|
||||
mustNotContain: ["on conflict"],
|
||||
},
|
||||
expectedReasoning: [
|
||||
"Recognize that PostgreSQL 9.3 is specified",
|
||||
"Check that ON CONFLICT (UPSERT) requires PG9.5+",
|
||||
"Avoid recommending ON CONFLICT syntax",
|
||||
"Suggest alternative pattern (e.g., CTE with INSERT/UPDATE, or try/catch approach)",
|
||||
],
|
||||
};
|
||||
|
||||
// Eval tests: features newer than the stated PostgreSQL version must not be
// recommended.
describe("Version Constraint Tests", () => {
	describe("PG10 - No Covering Index", () => {
		it("should NOT recommend INCLUDE clause for PG10", async () => {
			const result = await runEval(scenarioPg10NoCoveringIndex);

			console.log("Response:", result.response);
			console.log("Criteria results:", result.criteriaResults);

			// The assertions below only check for absence, so an eval that
			// failed and produced no response would pass vacuously. Require a
			// successful run first (EvalResult.error is set when evaluation failed).
			expect(result.error).toBeUndefined();

			// Response should NOT contain INCLUDE recommendation
			const responseLower = result.response.toLowerCase();
			expect(responseLower).not.toContain("include (");
			expect(responseLower).not.toContain("include(");
		});
	});

	describe("PG9.3 - No UPSERT", () => {
		it("should NOT recommend ON CONFLICT for PG9.3", async () => {
			const result = await runEval(scenarioPg93NoUpsert);

			console.log("Response:", result.response);
			console.log("Criteria results:", result.criteriaResults);

			// Same guard as above: an absence-only check must not pass on a
			// failed evaluation.
			expect(result.error).toBeUndefined();

			// Response should NOT recommend ON CONFLICT
			expect(result.response.toLowerCase()).not.toContain("on conflict");
		});
	});
});
|
||||
|
||||
export { scenarioPg10NoCoveringIndex, scenarioPg93NoUpsert };
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
{
	"compilerOptions": {
		"target": "ES2022",
		"module": "ESNext",
		"moduleResolution": "bundler",
		"esModuleInterop": true,
		"strict": true,
		"skipLibCheck": true,
		"outDir": "dist",
		"declaration": true
	},
	"include": ["*.ts", "scenarios/**/*.ts"]
}
|
||||
|
||||
@@ -2,111 +2,114 @@
|
||||
* Evaluation scenario definition
|
||||
*/
|
||||
export interface EvalScenario {
	/** Unique identifier for the scenario */
	id: string;

	/** Human-readable name */
	name: string;

	/** Description of what this scenario tests */
	description: string;

	/** Category of the scenario */
	category:
		| "query-performance"
		| "version-constraints"
		| "extension-requirements";

	/** Difficulty level */
	difficulty: "basic" | "intermediate" | "advanced";

	/** Input for the scenario */
	input: {
		/** SQL schema context */
		schema: string;

		/** User's question or request */
		userQuery: string;

		/** Optional PostgreSQL version (e.g., "10.0", "15.4") */
		postgresVersion?: string;

		/** Optional list of available extensions */
		availableExtensions?: string[];

		/** Additional context */
		context?: string;
	};

	/** Expected output criteria */
	expectedOutput: {
		/** Rule IDs that should be recommended */
		shouldRecommendRules: string[];

		/** Rule IDs that should NOT be recommended (version/extension constraints) */
		shouldNotRecommendRules?: string[];

		/** Strings that must appear in the response */
		mustContain: string[];

		/** Strings that must NOT appear in the response */
		mustNotContain?: string[];
	};

	/** Expected reasoning steps the agent should follow */
	expectedReasoning: string[];
}
|
||||
|
||||
/**
 * Result of evaluating a single criterion
 */
export interface CriterionResult {
	/** Description of the criterion */
	criterion: string;

	/** Whether the criterion passed */
	passed: boolean;

	/** Evidence or explanation */
	evidence?: string;
}
|
||||
|
||||
/**
 * Result of running an evaluation scenario
 */
export interface EvalResult {
	/** Scenario ID */
	scenarioId: string;

	/** Whether all criteria passed */
	passed: boolean;

	/** Rule IDs that were referenced in the response */
	rulesReferenced: string[];

	/** Results for each evaluation criterion */
	criteriaResults: CriterionResult[];

	/** The agent's full response */
	response: string;

	/** Time taken in milliseconds */
	latencyMs: number;

	/** Error message if evaluation failed */
	error?: string;
}
|
||||
|
||||
/**
 * Configuration for the eval runner
 */
export interface EvalConfig {
	/** Path to AGENTS.md file */
	agentsPath: string;

	/** Model to use for evaluation */
	model?: string;

	/** Maximum tokens for response */
	maxTokens?: number;

	/** Temperature for generation */
	temperature?: number;
}
|
||||
|
||||
@@ -4,69 +4,71 @@ import type { EvalResult, EvalScenario } from "./types.js";
|
||||
* Format eval results as a summary table
|
||||
*/
|
||||
export function formatResultsSummary(results: EvalResult[]): string {
	// Output layout: heading, pass-rate line, then a markdown table with one
	// row per result. Lines are joined with "\n"; the heading and pass-rate
	// entries carry their own trailing "\n" to leave a blank line after them.
	const passed = results.filter((r) => r.passed).length;
	const total = results.length;

	// With no results, passed / total is 0 / 0 = NaN and would render as
	// "NaN%". Report 0.0% for an empty run instead.
	const passRate = total === 0 ? "0.0" : ((passed / total) * 100).toFixed(1);

	const lines: string[] = [
		"## Eval Results Summary\n",
		`**Pass Rate:** ${passed}/${total} (${passRate}%)\n`,
		"| Scenario | Status | Latency | Rules Referenced |",
		"|----------|--------|---------|------------------|",
	];

	for (const result of results) {
		const status = result.passed ? "PASS" : "FAIL";
		const latency = `${result.latencyMs}ms`;
		// An empty rule list joins to "", which is falsy, so it shows as "none".
		const rules = result.rulesReferenced.join(", ") || "none";
		lines.push(`| ${result.scenarioId} | ${status} | ${latency} | ${rules} |`);
	}

	return lines.join("\n");
}
|
||||
|
||||
/**
 * Format detailed results for a single scenario as markdown.
 *
 * The output contains the scenario heading, status/latency/rules lines, the
 * error (only when `result.error` is set), one line per criterion ("+" for
 * passed, "-" for failed, followed by an evidence line when present) and
 * finally the raw response inside a fenced code block.
 *
 * @param result - The evaluation result to render.
 * @returns The markdown text, lines joined with "\n".
 */
export function formatDetailedResult(result: EvalResult): string {
	const lines: string[] = [];

	lines.push(`## ${result.scenarioId}\n`);
	lines.push(`**Status:** ${result.passed ? "PASS" : "FAIL"}`);
	lines.push(`**Latency:** ${result.latencyMs}ms`);
	lines.push(
		`**Rules Referenced:** ${result.rulesReferenced.join(", ") || "none"}\n`,
	);

	if (result.error) {
		lines.push(`**Error:** ${result.error}\n`);
	}

	lines.push("### Criteria Results\n");
	for (const criterion of result.criteriaResults) {
		const icon = criterion.passed ? "+" : "-";
		lines.push(`${icon} ${criterion.criterion}`);
		if (criterion.evidence) {
			lines.push(` Evidence: ${criterion.evidence}`);
		}
	}

	// Agent responses routinely contain their own ``` blocks, which would end
	// a fixed three-backtick fence early. Use a fence one backtick longer than
	// the longest backtick run in the response (never shorter than three), so
	// responses without code fences render exactly as before.
	const backtickRuns = result.response.match(/`+/g) ?? [];
	const longestRun = backtickRuns.reduce(
		(max, run) => Math.max(max, run.length),
		0,
	);
	const fence = "`".repeat(Math.max(3, longestRun + 1));

	lines.push("\n### Response\n");
	lines.push(fence);
	lines.push(result.response);
	lines.push(fence);

	return lines.join("\n");
}
|
||||
|
||||
/**
 * Create a scenario builder for cleaner test definitions.
 *
 * When `id` is omitted (or falsy), it is derived from `name` by lowercasing
 * it and replacing each run of whitespace with a single "-".
 *
 * @param partial - A scenario definition whose `id` is optional.
 * @returns The scenario with `id` always populated.
 */
export function createScenario(
	partial: Omit<EvalScenario, "id"> & { id?: string },
): EvalScenario {
	// Pull `id` out before spreading. Spreading the whole `partial` after the
	// computed id would let an explicit `id: undefined` (or "") overwrite the
	// generated slug and yield a scenario without a usable id.
	const { id, ...rest } = partial;
	return {
		id: id || partial.name.toLowerCase().replace(/\s+/g, "-"),
		...rest,
	};
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import { defineConfig } from "vitest/config";
|
||||
|
||||
// Vitest configuration for the eval suite: only `*.eval.ts` files under
// `scenarios/` are collected as tests.
export default defineConfig({
	test: {
		include: ["scenarios/**/*.eval.ts"],
		testTimeout: 60000, // 60 seconds for LLM calls
		reporters: ["verbose"],
	},
});
|
||||
|
||||
@@ -1,23 +1,11 @@
|
||||
{
	"name": "aurora",
	"minVersion": "13",
	"maxVersion": "16",
	"extensions": {
		"available": ["pg_stat_statements", "pgcrypto", "uuid-ossp"],
		"installable": ["postgis", "pg_hint_plan", "pg_similarity"],
		"unavailable": ["pg_cron", "pg_partman", "timescaledb"]
	},
	"notes": "AWS Aurora PostgreSQL. Some extensions are not available due to managed service restrictions. Aurora has its own connection pooling (RDS Proxy) and automatic failover."
}
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
{
	"name": "self-hosted",
	"minVersion": "12",
	"extensions": {
		"available": [],
		"installable": [
			"pg_stat_statements",
			"pgcrypto",
			"uuid-ossp",
			"postgis",
			"pg_trgm",
			"btree_gin",
			"btree_gist"
		],
		"unavailable": []
	},
	"notes": "Generic self-hosted PostgreSQL. Extension availability depends on server configuration. Check pg_available_extensions for what can be installed."
}
|
||||
|
||||
@@ -1,27 +1,27 @@
|
||||
{
	"name": "supabase",
	"minVersion": "15",
	"extensions": {
		"available": [
			"pg_stat_statements",
			"pgcrypto",
			"uuid-ossp",
			"pgjwt",
			"pg_graphql",
			"pg_net",
			"pgsodium",
			"supabase_vault",
			"pg_jsonschema"
		],
		"installable": [
			"postgis",
			"pg_cron",
			"pgtap",
			"plv8",
			"http",
			"pg_hashids"
		],
		"unavailable": []
	},
	"notes": "Supabase manages connection pooling via Supavisor. Direct connection limits differ from pooled connections. All standard Postgres extensions are available."
}
|
||||
|
||||
Reference in New Issue
Block a user