KindLM Assertion Engine
Overview
The assertion engine evaluates model outputs against configured expectations. Each assertion produces a typed result with a score, pass/fail status, and failure reason code. Results are aggregated across repeat runs, then evaluated against gates to determine the final run status.
Assertion Lifecycle
Config (expect) → Assertion Instances → Execute → Results → Aggregate → Gates → Exit Code
- Parse: Config expect section is parsed into assertion instances
- Execute: Each assertion evaluates the model output independently
- Score: Each assertion produces a score (0-1) and pass/fail
- Aggregate: Multiple runs of the same test case are aggregated
- Gate: Aggregated results are checked against suite-level gates
- Report: Results are formatted for terminal, JSON, JUnit, and compliance
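One way the Execute step might be wired is sketched below — a simplified sketch only, omitting timing, cost tracking, and concurrency. Each assertion is evaluated independently, and an assertion that throws is converted into an INTERNAL_ERROR result rather than aborting the run.
// Sketch: the Execute step for a single run (not the full runner implementation).
import type { Assertion, AssertionContext, AssertionResult } from "./assertions/interface";
export async function executeAssertions(
  assertions: Assertion[],
  ctx: AssertionContext,
): Promise<AssertionResult[]> {
  const results: AssertionResult[] = [];
  for (const assertion of assertions) {
    try {
      results.push(...(await assertion.evaluate(ctx)));
    } catch (err) {
      // A throwing assertion becomes a failed result instead of crashing the run.
      results.push({
        assertionType: assertion.type,
        label: assertion.type,
        passed: false,
        score: 0,
        failureCode: "INTERNAL_ERROR",
        failureMessage: (err as Error).message,
      });
    }
  }
  return results;
}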
Assertion Interface
// packages/core/src/assertions/interface.ts
import type { ProviderAdapter, ProviderResponse, ProviderToolCall } from "../types/provider";
/** Canonical failure reason codes — used in reports, JUnit, and compliance docs */
export type FailureCode =
| "SCHEMA_INVALID"
| "SCHEMA_PARSE_ERROR"
| "PII_DETECTED"
| "KEYWORD_DENIED"
| "KEYWORD_MISSING"
| "CONTAINS_FAILED"
| "NOT_CONTAINS_FAILED"
| "MAX_LENGTH_EXCEEDED"
| "JUDGE_BELOW_THRESHOLD"
| "TOOL_CALL_MISSING"
| "TOOL_CALL_UNEXPECTED"
| "TOOL_CALL_ARGS_MISMATCH"
| "TOOL_CALL_ORDER_WRONG"
| "TOOL_CALL_ARGS_SCHEMA_INVALID"
| "DRIFT_EXCEEDED"
| "PROVIDER_TIMEOUT"
| "PROVIDER_AUTH_FAILED"
| "PROVIDER_ERROR"
| "INTERNAL_ERROR";
export interface AssertionResult {
/** Name of the assertion type (e.g., "schema", "pii", "judge") */
assertionType: string;
/** Human-readable label for this specific check */
label: string;
/** Whether this assertion passed */
passed: boolean;
/** Score from 0 to 1 (1 = perfect) */
score: number;
/** Failure code if not passed */
failureCode?: FailureCode;
/** Human-readable failure message */
failureMessage?: string;
/** Additional metadata (e.g., matched PII patterns, judge reasoning) */
metadata?: Record<string, unknown>;
}
export interface AssertionContext {
/** The model's text output */
outputText: string;
/** Parsed JSON output (if format=json and parse succeeded) */
outputJson?: unknown;
/** Tool calls made by the model */
toolCalls: ProviderToolCall[];
/** Baseline output for drift comparison (if baseline exists) */
baselineText?: string;
baselineJson?: unknown;
/** The provider adapter (needed for LLM-as-judge) */
judgeAdapter?: ProviderAdapter;
/** Judge model name */
judgeModel?: string;
/** Config directory (for resolving relative paths) */
configDir: string;
}
export interface Assertion {
/** Unique type identifier */
readonly type: string;
/** Execute the assertion and return results */
evaluate(context: AssertionContext): Promise<AssertionResult[]>;
}
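For illustration, a minimal assertion implementing this interface might look like the following. The regex check is hypothetical — it is not one of the built-ins described below.
// Hypothetical example assertion — not part of the built-in set.
import type { Assertion, AssertionContext, AssertionResult } from "./interface";
export class RegexAssertion implements Assertion {
  readonly type = "regex";
  constructor(private pattern: RegExp, private label: string) {}
  async evaluate(ctx: AssertionContext): Promise<AssertionResult[]> {
    const passed = this.pattern.test(ctx.outputText);
    return [{
      assertionType: this.type,
      label: this.label,
      passed,
      score: passed ? 1 : 0,
      // Reusing CONTAINS_FAILED is an assumption; a real assertion would
      // likely define its own FailureCode.
      ...(!passed && {
        failureCode: "CONTAINS_FAILED" as const,
        failureMessage: `Output does not match ${this.pattern}`,
      }),
    }];
  }
}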
Built-in Assertions
1. JSON Schema Validation
// packages/core/src/assertions/schema.ts
import Ajv from "ajv";
import addFormats from "ajv-formats";
import { readFileSync } from "fs";
import { resolve } from "path";
import type { Assertion, AssertionContext, AssertionResult } from "./interface";
export class SchemaAssertion implements Assertion {
readonly type = "schema";
constructor(
private format: "text" | "json",
private schemaFile?: string,
private contains?: string[],
private notContains?: string[],
private maxLength?: number,
) {}
async evaluate(ctx: AssertionContext): Promise<AssertionResult[]> {
const results: AssertionResult[] = [];
// --- JSON parse check ---
if (this.format === "json") {
let parsed: unknown;
try {
parsed = JSON.parse(ctx.outputText);
ctx.outputJson = parsed;
results.push({
assertionType: "schema",
label: "JSON parse",
passed: true,
score: 1,
});
} catch (err) {
results.push({
assertionType: "schema",
label: "JSON parse",
passed: false,
score: 0,
failureCode: "SCHEMA_PARSE_ERROR",
failureMessage: `Output is not valid JSON: ${(err as Error).message}`,
metadata: { outputPreview: ctx.outputText.slice(0, 200) },
});
return results; // Can't validate schema if parse failed
}
// --- JSON Schema validation ---
if (this.schemaFile) {
const schemaPath = resolve(ctx.configDir, this.schemaFile);
const schemaContent = JSON.parse(readFileSync(schemaPath, "utf-8"));
const ajv = new Ajv({ allErrors: true, strict: false });
addFormats(ajv);
const validate = ajv.compile(schemaContent);
const valid = validate(parsed);
if (valid) {
results.push({
assertionType: "schema",
label: `Schema: ${this.schemaFile}`,
passed: true,
score: 1,
});
} else {
const errors = validate.errors?.map(
(e) => `${e.instancePath || "/"}: ${e.message}`
) ?? [];
results.push({
assertionType: "schema",
label: `Schema: ${this.schemaFile}`,
passed: false,
score: 0,
failureCode: "SCHEMA_INVALID",
failureMessage: errors.join("; "),
metadata: { schemaErrors: validate.errors },
});
}
}
}
// --- Contains checks ---
if (this.contains) {
for (const substring of this.contains) {
const found = ctx.outputText.includes(substring);
results.push({
assertionType: "schema",
label: `Contains: "${substring}"`,
passed: found,
score: found ? 1 : 0,
...(!found && {
failureCode: "CONTAINS_FAILED" as const,
failureMessage: `Output does not contain "${substring}"`,
}),
});
}
}
// --- NotContains checks ---
if (this.notContains) {
for (const substring of this.notContains) {
const found = ctx.outputText.includes(substring);
results.push({
assertionType: "schema",
label: `Not contains: "${substring}"`,
passed: !found,
score: found ? 0 : 1,
...(found && {
failureCode: "NOT_CONTAINS_FAILED" as const,
failureMessage: `Output contains forbidden substring "${substring}"`,
}),
});
}
}
// --- Max length ---
if (this.maxLength !== undefined) {
const withinLimit = ctx.outputText.length <= this.maxLength;
results.push({
assertionType: "schema",
label: `Max length: ${this.maxLength}`,
passed: withinLimit,
score: withinLimit ? 1 : 0,
...(!withinLimit && {
failureCode: "MAX_LENGTH_EXCEEDED" as const,
failureMessage: `Output length ${ctx.outputText.length} exceeds max ${this.maxLength}`,
}),
});
}
return results;
}
}
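A usage sketch (the schema path and output below are illustrative values, not defaults):
// Usage sketch — illustrative values only; assumes an async context.
const schemaCheck = new SchemaAssertion(
  "json",               // format
  "schemas/reply.json", // schemaFile, resolved against configDir
  ["order_id"],         // contains
  undefined,            // notContains
  2000,                 // maxLength
);
const results = await schemaCheck.evaluate({
  outputText: '{"order_id": "A-1001", "status": "shipped"}',
  toolCalls: [],
  configDir: "/path/to/config",
});
// One result each for: JSON parse, schema validation, the contains check,
// and the max-length check.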
2. PII Detection
// packages/core/src/assertions/pii.ts
import type { Assertion, AssertionContext, AssertionResult } from "./interface";
export class PIIAssertion implements Assertion {
readonly type = "pii";
constructor(
private denyPatterns: string[],
private customPatterns?: Array<{ name: string; pattern: string }>,
) {}
async evaluate(ctx: AssertionContext): Promise<AssertionResult[]> {
const results: AssertionResult[] = [];
const allPatterns = [
...this.denyPatterns.map((p, i) => ({ name: `pii-pattern-${i}`, pattern: p })),
...(this.customPatterns ?? []),
];
for (const { name, pattern } of allPatterns) {
const regex = new RegExp(pattern, "gi");
const matches = ctx.outputText.match(regex);
if (matches && matches.length > 0) {
results.push({
assertionType: "pii",
label: `PII: ${name}`,
passed: false,
score: 0,
failureCode: "PII_DETECTED",
failureMessage: `Found ${matches.length} PII match(es) for pattern "${name}"`,
metadata: {
pattern: name,
matchCount: matches.length,
// Redact actual matches in metadata for safety
redactedMatches: matches.map((m) => m.slice(0, 3) + "***"),
},
});
} else {
results.push({
assertionType: "pii",
label: `PII: ${name}`,
passed: true,
score: 1,
});
}
}
return results;
}
}
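A usage sketch with example patterns (these are illustrative, not the patterns KindLM ships by default):
// Usage sketch — example patterns only.
const pii = new PIIAssertion(
  [
    "\\b\\d{3}-\\d{2}-\\d{4}\\b",                      // SSN-like number
    "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", // email address
  ],
  [{ name: "internal-ticket", pattern: "TICKET-\\d{6}" }],
);
const results = await pii.evaluate({
  outputText: "Contact jane.doe@example.com about TICKET-004211.",
  toolCalls: [],
  configDir: ".",
});
// The email and internal-ticket patterns match, so those two results fail
// with PII_DETECTED (matches are redacted in metadata); the SSN-like
// pattern produces a passing result.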
3. Keyword Guardrails
// packages/core/src/assertions/keywords.ts
import type { Assertion, AssertionContext, AssertionResult } from "./interface";
export class KeywordAssertion implements Assertion {
readonly type = "keywords";
constructor(
private deny: string[],
private allow?: string[],
) {}
async evaluate(ctx: AssertionContext): Promise<AssertionResult[]> {
const results: AssertionResult[] = [];
const lowerOutput = ctx.outputText.toLowerCase();
// Deny list
for (const keyword of this.deny) {
const found = lowerOutput.includes(keyword.toLowerCase());
if (found) {
results.push({
assertionType: "keywords",
label: `Keyword deny: "${keyword}"`,
passed: false,
score: 0,
failureCode: "KEYWORD_DENIED",
failureMessage: `Output contains denied keyword "${keyword}"`,
});
}
}
// If no denied keywords found, pass
if (this.deny.length > 0 && results.length === 0) {
results.push({
assertionType: "keywords",
label: "Keyword deny list",
passed: true,
score: 1,
});
}
// Allow list (at least one must be present)
if (this.allow && this.allow.length > 0) {
const found = this.allow.some((kw) =>
lowerOutput.includes(kw.toLowerCase())
);
results.push({
assertionType: "keywords",
label: "Keyword allow list",
passed: found,
score: found ? 1 : 0,
...(!found && {
failureCode: "KEYWORD_MISSING" as const,
failureMessage: `Output must contain at least one of: ${this.allow.join(", ")}`,
}),
});
}
return results;
}
}
4. LLM-as-Judge
// packages/core/src/assertions/judge.ts
import type { Assertion, AssertionContext, AssertionResult } from "./interface";
export class JudgeAssertion implements Assertion {
readonly type = "judge";
constructor(
private criteria: string,
private minScore: number,
private rubric?: string,
) {}
async evaluate(ctx: AssertionContext): Promise<AssertionResult[]> {
if (!ctx.judgeAdapter || !ctx.judgeModel) {
return [{
assertionType: "judge",
label: `Judge: ${this.criteria}`,
passed: false,
score: 0,
failureCode: "INTERNAL_ERROR",
failureMessage: "No judge model configured for LLM-as-judge assertions",
}];
}
const judgePrompt = this.buildJudgePrompt(ctx.outputText);
const response = await ctx.judgeAdapter.complete({
model: ctx.judgeModel,
messages: [
{ role: "system", content: JUDGE_SYSTEM_PROMPT },
{ role: "user", content: judgePrompt },
],
params: {
temperature: 0,
maxTokens: 512,
},
});
const parsed = this.parseJudgeResponse(response.text);
return [{
assertionType: "judge",
label: `Judge: ${this.criteria}`,
passed: parsed.score >= this.minScore,
score: parsed.score,
...(parsed.score < this.minScore && {
failureCode: "JUDGE_BELOW_THRESHOLD" as const,
failureMessage: `Judge score ${parsed.score.toFixed(2)} below threshold ${this.minScore}`,
}),
metadata: {
reasoning: parsed.reasoning,
judgeModel: ctx.judgeModel,
criteria: this.criteria,
},
}];
}
private buildJudgePrompt(output: string): string {
const rubricSection = this.rubric
? `\n\nRubric:\n${this.rubric}`
: "";
return `Evaluate the following AI output against this criterion:
Criterion: ${this.criteria}${rubricSection}
AI Output:
---
${output}
---
Respond with ONLY a JSON object:
{
"score": <number 0.0 to 1.0>,
"reasoning": "<brief explanation>"
}`;
}
private parseJudgeResponse(text: string): { score: number; reasoning: string } {
try {
// Extract JSON from response (handle markdown code blocks)
const jsonMatch = text.match(/\{[\s\S]*\}/);
if (!jsonMatch) throw new Error("No JSON found");
const parsed = JSON.parse(jsonMatch[0]);
return {
score: Math.max(0, Math.min(1, Number(parsed.score) || 0)),
reasoning: String(parsed.reasoning || ""),
};
} catch {
return { score: 0, reasoning: `Failed to parse judge response: ${text.slice(0, 200)}` };
}
}
}
const JUDGE_SYSTEM_PROMPT = `You are an impartial AI output evaluator. Your job is to score an AI system's output against a specific criterion.
Rules:
- Score from 0.0 (completely fails the criterion) to 1.0 (perfectly meets the criterion)
- Be objective and consistent
- Provide brief, specific reasoning
- Respond ONLY with a JSON object containing "score" and "reasoning"
- Do not be lenient — a score of 1.0 means genuinely excellent`;
5. Tool Call Assertions
// packages/core/src/assertions/tool-calls.ts
import type { Assertion, AssertionContext, AssertionResult } from "./interface";
export class ToolCallAssertion implements Assertion {
readonly type = "tool-calls";
constructor(
private expectations: Array<{
tool: string;
shouldNotCall?: boolean;
argsMatch?: Record<string, unknown>;
argsSchema?: string;
order?: number;
}>,
) {}
async evaluate(ctx: AssertionContext): Promise<AssertionResult[]> {
const results: AssertionResult[] = [];
for (const expect of this.expectations) {
const matchingCalls = ctx.toolCalls.filter((tc) => tc.name === expect.tool);
// --- shouldNotCall ---
if (expect.shouldNotCall) {
const passed = matchingCalls.length === 0;
results.push({
assertionType: "tool-calls",
label: `Tool NOT called: ${expect.tool}`,
passed,
score: passed ? 1 : 0,
...(!passed && {
failureCode: "TOOL_CALL_UNEXPECTED",
failureMessage: `Tool "${expect.tool}" was called ${matchingCalls.length} time(s) but should not have been`,
}),
});
continue;
}
// --- Tool was called ---
if (matchingCalls.length === 0) {
results.push({
assertionType: "tool-calls",
label: `Tool called: ${expect.tool}`,
passed: false,
score: 0,
failureCode: "TOOL_CALL_MISSING",
failureMessage: `Expected tool "${expect.tool}" was never called. Called: ${
ctx.toolCalls.map((tc) => tc.name).join(", ") || "(none)"
}`,
});
continue;
}
results.push({
assertionType: "tool-calls",
label: `Tool called: ${expect.tool}`,
passed: true,
score: 1,
});
// --- Argument matching ---
if (expect.argsMatch) {
const call = matchingCalls[0]; // Check first matching call
const mismatches: string[] = [];
for (const [key, expectedValue] of Object.entries(expect.argsMatch)) {
const actualValue = call.arguments[key];
if (JSON.stringify(actualValue) !== JSON.stringify(expectedValue)) {
mismatches.push(
`${key}: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(actualValue)}`
);
}
}
const passed = mismatches.length === 0;
results.push({
assertionType: "tool-calls",
label: `Tool args: ${expect.tool}`,
passed,
score: passed ? 1 : 0,
...(!passed && {
failureCode: "TOOL_CALL_ARGS_MISMATCH",
failureMessage: `Argument mismatches: ${mismatches.join("; ")}`,
metadata: { mismatches, actualArgs: call.arguments },
}),
});
}
// --- Order check ---
if (expect.order !== undefined) {
const actualIndex = ctx.toolCalls.findIndex((tc) => tc.name === expect.tool);
const passed = actualIndex === expect.order;
results.push({
assertionType: "tool-calls",
label: `Tool order: ${expect.tool} at position ${expect.order}`,
passed,
score: passed ? 1 : 0,
...(!passed && {
failureCode: "TOOL_CALL_ORDER_WRONG",
failureMessage: `Expected "${expect.tool}" at position ${expect.order}, found at ${actualIndex}`,
}),
});
}
}
return results;
}
}
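An expectation sketch (tool names and arguments are illustrative; the tool-call literal assumes ProviderToolCall exposes name and arguments, as used above, and may omit other fields of the real type):
// Expectation sketch — illustrative tool names and arguments.
const toolChecks = new ToolCallAssertion([
  { tool: "lookup_order", order: 0, argsMatch: { order_id: "A-1001" } },
  { tool: "issue_refund", shouldNotCall: true },
]);
const results = await toolChecks.evaluate({
  outputText: "",
  toolCalls: [{ name: "lookup_order", arguments: { order_id: "A-1001" } }],
  configDir: ".",
});
// lookup_order yields passing "Tool called", "Tool args", and "Tool order"
// results; issue_refund yields a passing "Tool NOT called" result.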
6. Baseline Drift
// packages/core/src/assertions/drift.ts
import type { Assertion, AssertionContext, AssertionResult } from "./interface";
export class DriftAssertion implements Assertion {
readonly type = "drift";
constructor(
private maxScore: number,
private method: "judge" | "embedding" | "field-diff",
private fields?: string[],
) {}
async evaluate(ctx: AssertionContext): Promise<AssertionResult[]> {
if (!ctx.baselineText) {
return [{
assertionType: "drift",
label: "Baseline drift",
passed: true,
score: 1,
metadata: { reason: "No baseline available — skipping drift check" },
}];
}
switch (this.method) {
case "judge":
return this.evaluateWithJudge(ctx);
case "field-diff":
return this.evaluateFieldDiff(ctx);
case "embedding":
return this.evaluateEmbedding(ctx);
}
}
private async evaluateWithJudge(ctx: AssertionContext): Promise<AssertionResult[]> {
if (!ctx.judgeAdapter || !ctx.judgeModel) {
return [{ assertionType: "drift", label: "Drift (judge)", passed: false, score: 0,
failureCode: "INTERNAL_ERROR", failureMessage: "No judge model for drift comparison" }];
}
const response = await ctx.judgeAdapter.complete({
model: ctx.judgeModel,
messages: [
{ role: "system", content: DRIFT_JUDGE_SYSTEM },
{ role: "user", content: `Baseline output:\n---\n${ctx.baselineText}\n---\n\nCurrent output:\n---\n${ctx.outputText}\n---` },
],
params: { temperature: 0, maxTokens: 512 },
});
const parsed = this.parseDriftResponse(response.text);
const passed = parsed.driftScore <= this.maxScore;
return [{
assertionType: "drift",
label: "Baseline drift (judge)",
passed,
score: 1 - parsed.driftScore, // Invert: higher score = less drift = better
...(!passed && {
failureCode: "DRIFT_EXCEEDED",
failureMessage: `Drift score ${parsed.driftScore.toFixed(3)} exceeds max ${this.maxScore}`,
}),
metadata: { driftScore: parsed.driftScore, reasoning: parsed.reasoning, method: "judge" },
}];
}
private evaluateFieldDiff(ctx: AssertionContext): Promise<AssertionResult[]> {
// Compare specific JSON fields between baseline and current
// Returns per-field drift results
// Implementation uses deep equality + Levenshtein for string fields
// ...
}
private evaluateEmbedding(ctx: AssertionContext): Promise<AssertionResult[]> {
// Cosine similarity between embeddings
// Falls back to judge if embedding provider not available
// ...
}
}
const DRIFT_JUDGE_SYSTEM = `You compare two AI outputs and score their semantic drift.
A drift score of 0.0 means the outputs are semantically identical.
A drift score of 1.0 means the outputs are completely different in meaning.
Focus on:
- Same factual content conveyed?
- Same actions taken (tool calls, decisions)?
- Same tone and professionalism?
- Same structure and format?
Minor wording changes = low drift. Different conclusions or missing information = high drift.
Respond ONLY with JSON: { "driftScore": <0.0-1.0>, "reasoning": "<brief explanation>" }`;
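The embedding method is elided above. As a sketch of the underlying idea — the embed call is hypothetical and not part of the KindLM API — the comparison is a cosine similarity whose complement is treated as the drift score:
// Sketch only — `embed` below is a hypothetical embedding call, not a KindLM API.
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
// driftScore = 1 - cosineSimilarity(await embed(baselineText), await embed(outputText));
// passed     = driftScore <= this.maxScore;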
Aggregation
// packages/core/src/engine/aggregator.ts
import type { AssertionResult } from "../assertions/interface";
export interface TestCaseRunResult {
testCaseName: string;
modelId: string;
runIndex: number;
outputText: string;
assertions: AssertionResult[];
latencyMs: number;
tokenUsage: { inputTokens: number; outputTokens: number; totalTokens: number };
costEstimateUsd: number | null;
}
export interface AggregatedTestResult {
testCaseName: string;
modelId: string;
runCount: number;
/** true if pass rate across runs meets threshold */
passed: boolean;
passRate: number;
/** Per-assertion aggregated scores (mean across runs) */
assertionScores: Record<string, { mean: number; min: number; max: number }>;
/** Unique failure codes across all runs */
failureCodes: string[];
/** Average latency */
latencyAvgMs: number;
/** Total cost across all runs */
totalCostUsd: number;
/** Total tokens across all runs */
totalTokens: number;
/** Individual run results (for detailed reports) */
runs: TestCaseRunResult[];
}
/**
* Aggregate multiple runs of the same test case + model into a single result.
*/
export function aggregateRuns(runs: TestCaseRunResult[]): AggregatedTestResult {
if (runs.length === 0) throw new Error("Cannot aggregate zero runs");
const { testCaseName, modelId } = runs[0];
// A single run passes if ALL its assertions pass
const runPassStates = runs.map((run) =>
run.assertions.every((a) => a.passed)
);
const passRate = runPassStates.filter(Boolean).length / runs.length;
// Aggregate assertion scores by label
const scoresByLabel = new Map<string, number[]>();
for (const run of runs) {
for (const assertion of run.assertions) {
const key = `${assertion.assertionType}:${assertion.label}`;
if (!scoresByLabel.has(key)) scoresByLabel.set(key, []);
scoresByLabel.get(key)!.push(assertion.score);
}
}
const assertionScores: Record<string, { mean: number; min: number; max: number }> = {};
for (const [key, scores] of scoresByLabel) {
assertionScores[key] = {
mean: scores.reduce((a, b) => a + b, 0) / scores.length,
min: Math.min(...scores),
max: Math.max(...scores),
};
}
// Collect unique failure codes
const failureCodes = [
...new Set(
runs.flatMap((r) =>
r.assertions
.filter((a) => !a.passed && a.failureCode)
.map((a) => a.failureCode!)
)
),
];
return {
testCaseName,
modelId,
runCount: runs.length,
passed: passRate >= 1.0, // Default: all runs must pass. Gates can override.
passRate,
assertionScores,
failureCodes,
latencyAvgMs: runs.reduce((sum, r) => sum + r.latencyMs, 0) / runs.length,
totalCostUsd: runs.reduce((sum, r) => sum + (r.costEstimateUsd ?? 0), 0),
totalTokens: runs.reduce(
(sum, r) => sum + r.tokenUsage.totalTokens, 0
),
runs,
};
}
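For example (all values illustrative), two runs of the same test case where one fails aggregate as follows:
// Illustrative aggregation — the values are made up.
const runA: TestCaseRunResult = {
  testCaseName: "refund-policy",
  modelId: "example-model",
  runIndex: 0,
  outputText: "...",
  assertions: [
    { assertionType: "schema", label: "JSON parse", passed: true, score: 1 },
  ],
  latencyMs: 820,
  tokenUsage: { inputTokens: 300, outputTokens: 120, totalTokens: 420 },
  costEstimateUsd: 0.002,
};
const runB: TestCaseRunResult = {
  ...runA,
  runIndex: 1,
  assertions: [
    {
      assertionType: "schema",
      label: "JSON parse",
      passed: false,
      score: 0,
      failureCode: "SCHEMA_PARSE_ERROR",
    },
  ],
  latencyMs: 910,
};
const agg = aggregateRuns([runA, runB]);
// agg.passRate === 0.5, agg.passed === false (the default requires every run to pass),
// agg.assertionScores["schema:JSON parse"] === { mean: 0.5, min: 0, max: 1 },
// agg.failureCodes === ["SCHEMA_PARSE_ERROR"]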
Gate Evaluation
// packages/core/src/engine/gate.ts
import type { GatesConfig } from "../types/config";
import type { AggregatedTestResult } from "./aggregator";
export interface GateResult {
gateName: string;
passed: boolean;
actual: number;
threshold: number;
message: string;
}
export interface GateEvaluation {
passed: boolean;
gates: GateResult[];
}
export function evaluateGates(
config: GatesConfig,
results: AggregatedTestResult[],
): GateEvaluation {
const gates: GateResult[] = [];
// --- Pass rate ---
const totalPassed = results.filter((r) => r.passed).length;
const passRate = results.length > 0 ? totalPassed / results.length : 0;
gates.push({
gateName: "passRateMin",
passed: passRate >= config.passRateMin,
actual: passRate,
threshold: config.passRateMin,
message: `Pass rate: ${(passRate * 100).toFixed(1)}% (min: ${(config.passRateMin * 100).toFixed(1)}%)`,
});
// --- Schema failures ---
const schemaFails = results.reduce(
(sum, r) => sum + r.failureCodes.filter((c) => c.startsWith("SCHEMA_")).length, 0
);
gates.push({
gateName: "schemaFailuresMax",
passed: schemaFails <= config.schemaFailuresMax,
actual: schemaFails,
threshold: config.schemaFailuresMax,
message: `Schema failures: ${schemaFails} (max: ${config.schemaFailuresMax})`,
});
// --- PII failures ---
const piiFails = results.reduce(
(sum, r) => sum + (r.failureCodes.includes("PII_DETECTED") ? 1 : 0), 0
);
gates.push({
gateName: "piiFailuresMax",
passed: piiFails <= config.piiFailuresMax,
actual: piiFails,
threshold: config.piiFailuresMax,
message: `PII failures: ${piiFails} (max: ${config.piiFailuresMax})`,
});
// --- Judge average ---
if (config.judgeAvgMin !== undefined) {
const judgeScores = results.flatMap((r) =>
Object.entries(r.assertionScores)
.filter(([key]) => key.startsWith("judge:"))
.map(([, v]) => v.mean)
);
const judgeAvg = judgeScores.length > 0
? judgeScores.reduce((a, b) => a + b, 0) / judgeScores.length
: 1;
gates.push({
gateName: "judgeAvgMin",
passed: judgeAvg >= config.judgeAvgMin,
actual: judgeAvg,
threshold: config.judgeAvgMin,
message: `Judge avg: ${(judgeAvg * 100).toFixed(1)}% (min: ${(config.judgeAvgMin * 100).toFixed(1)}%)`,
});
}
// --- Drift score ---
if (config.driftScoreMax !== undefined) {
const driftScores = results.flatMap((r) =>
Object.entries(r.assertionScores)
.filter(([key]) => key.startsWith("drift:"))
.map(([, v]) => 1 - v.mean) // Invert: assertion score is 1-drift
);
const maxDrift = driftScores.length > 0 ? Math.max(...driftScores) : 0;
gates.push({
gateName: "driftScoreMax",
passed: maxDrift <= config.driftScoreMax,
actual: maxDrift,
threshold: config.driftScoreMax,
message: `Drift score: ${maxDrift.toFixed(3)} (max: ${config.driftScoreMax})`,
});
}
return {
passed: gates.every((g) => g.passed),
gates,
};
}
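An illustrative gates configuration (field names follow GatesConfig as used above; the thresholds are examples only):
// Example thresholds — not defaults.
const gates: GatesConfig = {
  passRateMin: 0.95,    // at least 95% of aggregated results must pass
  schemaFailuresMax: 0, // no schema failures tolerated
  piiFailuresMax: 0,    // no PII detections tolerated
  judgeAvgMin: 0.8,     // optional: minimum average judge score
  driftScoreMax: 0.2,   // optional: maximum allowed drift
};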
Exit Code Mapping
| Exit Code | Meaning |
|---|---|
| 0 | All gates passed |
| 1 | One or more gates failed |
| 2 | Config invalid (parse or validation error) |
| 3 | Provider error (auth, network, timeout) |
| 4 | Internal error (unexpected exception) |
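A sketch of how a CLI entry point might map outcomes to these codes (the outcome union below is an assumption for illustration, not the actual error types):
// Sketch — the outcome discriminants are assumed, not real exports.
import type { GateEvaluation } from "./engine/gate";
type RunOutcome =
  | { kind: "gates"; evaluation: GateEvaluation }
  | { kind: "config-error" }
  | { kind: "provider-error" }
  | { kind: "internal-error" };
function exitCodeFor(outcome: RunOutcome): number {
  switch (outcome.kind) {
    case "gates":          return outcome.evaluation.passed ? 0 : 1;
    case "config-error":   return 2; // parse or validation error
    case "provider-error": return 3; // auth, network, timeout
    case "internal-error": return 4; // unexpected exception
  }
}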