Custom Evaluator¶
In this how-to guide, we will walk through the process of creating a custom evaluator.
As the first example we will create an evaluator that checks that the number of sentences in the output of an agent is within a specified range. This is a pretty simple example of an evaluator that needs custom parameters (min_sentences and max_sentences) and a custom evaluation logic (counting sentences and checking that the count is within the specified range).
To also show how we recommend creating an Evaluator that has some global state (which should be set via env-vars), we then will create a second version of the same evaluator that also checks that the output does not contain any words from a blocklist. The blocklist will be stored in a file and the path to the file will be specified via an env-var. This way we can have a single blocklist file that is used across multiple evaluators and easily update it without having to change the code of the evaluators.
import shutil
import subprocess
import sys

# Install nltk into the active environment. Prefer `uv` when it is on PATH;
# otherwise fall back to pip via the running interpreter, so the notebook
# works on machines without uv (the original hard-coded a per-user uv path).
uv = shutil.which("uv")
if uv:
    subprocess.check_call([uv, "pip", "install", "nltk"])
else:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
[2mUsing Python 3.11.15 environment at: /Users/janine/ragpill/.venv[0m
[2mChecked [1m1 package[0m [2min 6ms[0m[0m
0
import json
import os
from dataclasses import dataclass, field
from typing import Any
import nltk
nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import sent_tokenize
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext
from ragpill.base import BaseEvaluator, EvaluatorMetadata, TestCaseMetadata
Part 1: SentenceCountEvaluator¶
SentenceCountEvaluator checks that the number of sentences in the output falls within [min_sentences, max_sentences].
Per-instance configuration is passed as a JSON string in the CSV check column: {"min_sentences": 2, "max_sentences": 5}. See Loading from CSV at the bottom of this section.
@dataclass(kw_only=True, repr=False)
class SentenceCountEvaluator(BaseEvaluator):
    """Checks that the number of sentences in the output is within [min_sentences, max_sentences].

    Uses nltk's sent_tokenize for sentence boundary detection (handles abbreviations like
    "Dr." or "U.S.A." correctly, unlike a simple split on punctuation).

    Set max_sentences=-1 for no upper limit.
    """

    # Inclusive lower bound on the sentence count.
    min_sentences: int = field(default=1)
    # Inclusive upper bound; -1 disables the upper check.
    max_sentences: int = field(default=-1)

    @classmethod
    def from_csv_line(
        cls, expected: bool, tags: set[str], check: str, **kwargs: Any
    ) -> "SentenceCountEvaluator":
        """Build an evaluator from one CSV row.

        The `check` column may be a JSON object such as
        {"min_sentences": 2, "max_sentences": 5}, a plain integer (treated as
        min_sentences only), or empty/blank (defaults apply).
        """
        min_sentences = 1
        max_sentences = -1
        if check and check.strip():
            parsed: Any = None
            try:
                parsed = json.loads(check)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                min_sentences = parsed.get("min_sentences", min_sentences)
                max_sentences = parsed.get("max_sentences", max_sentences)
            else:
                # Plain integer → treat as min_sentences only.
                # NOTE: a bare "3" is *valid* JSON (parses to int), so it reaches
                # this branch via the isinstance check, not via JSONDecodeError —
                # relying on the exception alone would crash on `3.get(...)`.
                min_sentences = int(check)
        return cls(
            min_sentences=min_sentences,
            max_sentences=max_sentences,
            expected=expected,
            tags=tags,
            attributes=kwargs,
        )

    async def run(
        self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]
    ) -> EvaluationReason:
        """Tokenize the output into sentences and check the count against the range."""
        sentences = sent_tokenize(str(ctx.output))
        count = len(sentences)
        above_min = count >= self.min_sentences
        below_max = self.max_sentences == -1 or count <= self.max_sentences
        range_str = f"[{self.min_sentences}, {'∞' if self.max_sentences == -1 else self.max_sentences}]"
        if above_min and below_max:
            return EvaluationReason(
                value=True,
                reason=f"Found {count} sentence(s), within range {range_str}.",
            )
        if not above_min:
            return EvaluationReason(
                value=False,
                reason=f"Found only {count} sentence(s), minimum required: {self.min_sentences}.",
            )
        return EvaluationReason(
            value=False,
            reason=f"Found {count} sentence(s), exceeds maximum of {self.max_sentences}.",
        )
Try it out¶
We define a minimal mock agent and two test cases — one that should pass and one that should fail — then run them through a Dataset.
async def mock_agent(question: str) -> str:
    """Simulates an LLM agent. Replace with your real agent."""
    canned_responses = {
        "long": "Paris is the capital of France. It is home to the Eiffel Tower. Millions of tourists visit each year.",
        "short": "4.",
    }
    if question in canned_responses:
        return canned_responses[question]
    return "I don't know."
dataset = Dataset(
cases=[
Case(
inputs="long",
metadata=TestCaseMetadata(),
evaluators=[SentenceCountEvaluator(min_sentences=2, max_sentences=5)], # 3 sentences → passes
),
Case(
inputs="short",
metadata=TestCaseMetadata(),
evaluators=[SentenceCountEvaluator(min_sentences=2, max_sentences=5)], # 1 sentence → fails
),
]
)
report = await dataset.evaluate(mock_agent)
report.print()
Output()
/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71085/1990873887.py:9: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
dataset = Dataset(
Evaluation Summary: mock_agent ┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ Case ID ┃ Assertions ┃ Duration ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ │ Case 1 │ ✔ │ 2µs │ ├──────────┼────────────┼──────────┤ │ Case 2 │ ✗ │ 1µs │ ├──────────┼────────────┼──────────┤ │ Averages │ 50.0% ✔ │ 2µs │ └──────────┴────────────┴──────────┘
Loading from CSV¶
from_csv_line handles the CSV-to-evaluator mapping automatically when using load_testset. Register SentenceCountEvaluator in the evaluator_classes dict:
from ragpill.csv.testset import load_testset, default_evaluator_classes

# Map the CSV test_type value "sentence_count" to our evaluator class while
# keeping the built-in evaluators available alongside it.
testset = load_testset(
    "testset.csv",
    evaluator_classes={**default_evaluator_classes, "sentence_count": SentenceCountEvaluator},
)
The corresponding CSV rows look like:
| Question | test_type | check | expected | tags |
|---|---|---|---|---|
| What is the capital of France? | sentence_count | {"min_sentences": 2, "max_sentences": 5} | true | geography |
Part 2: SentenceCountWithBlocklistEvaluator¶
We extend SentenceCountEvaluator to also reject outputs that contain blocked terms. The blocklist lives in a file whose path is set via the RAGPILL_BLOCKLIST_FILE_PATH environment variable — this way a single file can be shared across many evaluator instances, and updating the list requires no code change.
The file path is read through a pydantic_settings.BaseSettings subclass (the same pattern as LLMJudgeSettings), and the file is opened once in __post_init__ when the evaluator is instantiated.
Note: We don't need to override from_csv_line. SentenceCountWithBlocklistEvaluator inherits it from SentenceCountEvaluator unchanged — the blocklist path comes from an environment variable, not from the CSV row.
from pydantic import ConfigDict, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class BlocklistSettings(BaseSettings):
    """Reads the blocklist file path from the RAGPILL_BLOCKLIST_FILE_PATH env var."""

    # BaseSettings subclasses must use SettingsConfigDict: pydantic's plain
    # ConfigDict does not declare the `env_prefix` key, so the original only
    # worked because TypedDicts are unchecked at runtime.
    model_config = SettingsConfigDict(env_prefix="RAGPILL_BLOCKLIST_")
    file_path: str = Field("", description="Path to a text file with one blocked term per line.")
@dataclass(kw_only=True, repr=False)
class SentenceCountWithBlocklistEvaluator(SentenceCountEvaluator):
    """Like SentenceCountEvaluator, but also checks the output contains no blocked terms.

    The blocklist is loaded at instantiation from the file at RAGPILL_BLOCKLIST_FILE_PATH.
    Each non-empty line is treated as one blocked term (matched case-insensitively).
    """

    def __post_init__(self) -> None:
        # NOTE(review): does not chain to a parent __post_init__ — assumes the
        # parent dataclasses define none; confirm against BaseEvaluator.
        blocklist_file = BlocklistSettings().file_path
        self._blocklist: list[str] = []
        if blocklist_file:
            with open(blocklist_file) as handle:
                self._blocklist = [
                    stripped for stripped in (raw.strip() for raw in handle) if stripped
                ]

    async def run(
        self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]
    ) -> EvaluationReason:
        """Run the parent sentence-count check, then scan the output for blocked terms."""
        parent_result = await super().run(ctx)
        # A sentence-count failure short-circuits the blocklist scan.
        if not parent_result.value:
            return parent_result
        haystack = str(ctx.output).lower()
        offending = next(
            (term for term in self._blocklist if term.lower() in haystack), None
        )
        if offending is not None:
            return EvaluationReason(
                value=False,
                reason=f"Output contains blocked term: '{offending}'.",
            )
        return parent_result
Try it out¶
We write a temporary blocklist file, point RAGPILL_BLOCKLIST_FILE_PATH at it, then run two cases — one clean response and one containing a blocked term.
import tempfile

# Create a throwaway blocklist file and point the env var at it.
tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
try:
    tmp.write("forbidden\nillegal\n")
finally:
    tmp.close()
blocklist_path = tmp.name
os.environ["RAGPILL_BLOCKLIST_FILE_PATH"] = blocklist_path
async def mock_agent_blocklist(question: str) -> str:
    """Mock agent with one clean and one blocklist-violating canned answer."""
    answers = {
        "clean": "Paris is the capital of France. It is a beautiful city.",
        "dirty": "This output contains a forbidden term. That is not allowed.",
    }
    if question in answers:
        return answers[question]
    return "I don't know."
dataset2 = Dataset(
cases=[
Case(
inputs="clean",
metadata=TestCaseMetadata(),
evaluators=[SentenceCountWithBlocklistEvaluator(min_sentences=1)], # passes
),
Case(
inputs="dirty",
metadata=TestCaseMetadata(),
evaluators=[SentenceCountWithBlocklistEvaluator(min_sentences=1)], # fails (blocked term)
),
]
)
report2 = await dataset2.evaluate(mock_agent_blocklist)
report2.print()
# Cleanup: delete the temporary blocklist file and unset the env var.
os.remove(blocklist_path)
os.environ.pop("RAGPILL_BLOCKLIST_FILE_PATH")
Output()
/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71085/2876377643.py:18: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
dataset2 = Dataset(
Evaluation Summary: mock_agent_blocklist ┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ Case ID ┃ Assertions ┃ Duration ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ │ Case 1 │ ✔ │ 2µs │ ├──────────┼────────────┼──────────┤ │ Case 2 │ ✗ │ 1µs │ ├──────────┼────────────┼──────────┤ │ Averages │ 50.0% ✔ │ 2µs │ └──────────┴────────────┴──────────┘