Skip to content

Custom Evaluator

In this how-to guide, we will walk through the process of creating a custom evaluator.

As the first example we will create an evaluator that checks that the number of sentences in the output of an agent is within a specified range. This is a simple example of an evaluator that needs custom parameters (min_sentences and max_sentences) and custom evaluation logic (counting sentences and checking that the count is within the specified range).

To also show how we recommend creating an Evaluator that has some global state (which should be set via env-vars), we then will create a second version of the same evaluator that also checks that the output does not contain any words from a blocklist. The blocklist will be stored in a file and the path to the file will be specified via an env-var. This way we can have a single blocklist file that is used across multiple evaluators and easily update it without having to change the code of the evaluators.

import shutil
import subprocess

# Locate uv on PATH, falling back to a known local install location.
uv_binary = shutil.which("uv") or "/Users/janine/.local/bin/uv"
subprocess.run([uv_binary, "pip", "install", "nltk"], check=True)
Using Python 3.11.15 environment at: /Users/janine/ragpill/.venv
Checked 1 package in 6ms





0
import json
import os
from dataclasses import dataclass, field
from typing import Any

import nltk
nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import sent_tokenize

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext

from ragpill.base import BaseEvaluator, EvaluatorMetadata, TestCaseMetadata

Part 1: SentenceCountEvaluator

SentenceCountEvaluator checks that the number of sentences in the output falls within [min_sentences, max_sentences].

Per-instance configuration is passed as a JSON string in the CSV check column: {"min_sentences": 2, "max_sentences": 5}. See Loading from CSV at the bottom of this section.

@dataclass(kw_only=True, repr=False)
class SentenceCountEvaluator(BaseEvaluator):
    """Checks that the number of sentences in the output is within [min_sentences, max_sentences].

    Uses nltk's sent_tokenize for sentence boundary detection (handles abbreviations like
    "Dr." or "U.S.A." correctly, unlike a simple split on punctuation).
    Set max_sentences=-1 for no upper limit.
    """

    # Inclusive lower bound on the sentence count.
    min_sentences: int = field(default=1)
    # Inclusive upper bound; -1 means "no upper limit".
    max_sentences: int = field(default=-1)

    @classmethod
    def from_csv_line(
        cls, expected: bool, tags: set[str], check: str, **kwargs: Any
    ) -> "SentenceCountEvaluator":
        """Build an instance from a CSV row.

        The `check` column may be:
        - a JSON object such as {"min_sentences": 2, "max_sentences": 5},
        - a plain integer, treated as min_sentences only,
        - empty/blank, in which case the defaults apply.
        """
        min_sentences = 1
        max_sentences = -1
        if check and check.strip():
            try:
                parsed = json.loads(check)
            except json.JSONDecodeError:
                # Not valid JSON at all → fall back to plain-int parsing.
                min_sentences = int(check)
            else:
                if isinstance(parsed, dict):
                    min_sentences = parsed.get("min_sentences", min_sentences)
                    max_sentences = parsed.get("max_sentences", max_sentences)
                else:
                    # BUG FIX: a bare integer like "3" IS valid JSON, so it never
                    # reached the JSONDecodeError branch and `.get` on the parsed
                    # int raised AttributeError. Treat any non-dict JSON value as
                    # the min_sentences shortcut, as documented.
                    min_sentences = int(parsed)
        return cls(
            min_sentences=min_sentences,
            max_sentences=max_sentences,
            expected=expected,
            tags=tags,
            attributes=kwargs,
        )

    async def run(
        self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]
    ) -> EvaluationReason:
        """Count sentences in ctx.output and check them against the configured range.

        Returns an EvaluationReason whose value is True when the count lies in
        [min_sentences, max_sentences], otherwise False with a reason naming
        the violated bound.
        """
        sentences = sent_tokenize(str(ctx.output))
        count = len(sentences)

        above_min = count >= self.min_sentences
        # max_sentences == -1 disables the upper bound entirely.
        below_max = self.max_sentences == -1 or count <= self.max_sentences
        range_str = f"[{self.min_sentences}, {'∞' if self.max_sentences == -1 else self.max_sentences}]"

        if above_min and below_max:
            return EvaluationReason(
                value=True,
                reason=f"Found {count} sentence(s), within range {range_str}.",
            )
        if not above_min:
            return EvaluationReason(
                value=False,
                reason=f"Found only {count} sentence(s), minimum required: {self.min_sentences}.",
            )
        return EvaluationReason(
            value=False,
            reason=f"Found {count} sentence(s), exceeds maximum of {self.max_sentences}.",
        )

Try it out

We define a minimal mock agent and two test cases — one that should pass and one that should fail — then run them through a Dataset.

async def mock_agent(question: str) -> str:
    """Simulates an LLM agent. Replace with your real agent."""
    canned_answers = {
        "long": (
            "Paris is the capital of France. It is home to the Eiffel Tower. "
            "Millions of tourists visit each year."
        ),
        "short": "4.",
    }
    return canned_answers.get(question, "I don't know.")


dataset = Dataset(
    cases=[
        Case(
            inputs="long",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountEvaluator(min_sentences=2, max_sentences=5)],  # 3 sentences → passes
        ),
        Case(
            inputs="short",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountEvaluator(min_sentences=2, max_sentences=5)],  # 1 sentence → fails
        ),
    ]
)

report = await dataset.evaluate(mock_agent)
report.print()
Output()


/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71085/1990873887.py:9: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
  dataset = Dataset(


   Evaluation Summary: mock_agent   
┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID  ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1   │ ✔          │      2µs │
├──────────┼────────────┼──────────┤
│ Case 2   │ ✗          │      1µs │
├──────────┼────────────┼──────────┤
│ Averages │ 50.0%      │      2µs │
└──────────┴────────────┴──────────┘

Loading from CSV

from_csv_line handles the CSV-to-evaluator mapping automatically when using load_testset. Register SentenceCountEvaluator in the evaluator_classes dict:

from ragpill.csv.testset import load_testset, default_evaluator_classes

# Register the custom evaluator under the "sentence_count" test_type key so
# load_testset can construct SentenceCountEvaluator instances from CSV rows.
evaluator_classes = {**default_evaluator_classes, "sentence_count": SentenceCountEvaluator}

testset = load_testset("testset.csv", evaluator_classes=evaluator_classes)

The corresponding CSV rows look like:

Question test_type check expected tags
What is the capital of France? sentence_count {"min_sentences": 2, "max_sentences": 5} true geography

Part 2: SentenceCountWithBlocklistEvaluator

We extend SentenceCountEvaluator to also reject outputs that contain blocked terms. The blocklist lives in a file whose path is set via the RAGPILL_BLOCKLIST_FILE_PATH environment variable — this way a single file can be shared across many evaluator instances, and updating the list requires no code change.

The file path is read through a pydantic_settings.BaseSettings subclass (the same pattern as LLMJudgeSettings), and the file is opened once in __post_init__ when the evaluator is instantiated.

Note: We don't need to override from_csv_line. SentenceCountWithBlocklistEvaluator inherits it from SentenceCountEvaluator unchanged — the blocklist path comes from an environment variable, not from the CSV row.

from pydantic import ConfigDict, Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class BlocklistSettings(BaseSettings):
    """Reads the blocklist file path from the RAGPILL_BLOCKLIST_FILE_PATH env-var."""

    # FIX: settings-specific options such as env_prefix belong in
    # pydantic_settings.SettingsConfigDict, not pydantic.ConfigDict — the
    # latter does not declare env_prefix.
    model_config = SettingsConfigDict(env_prefix="RAGPILL_BLOCKLIST_")

    file_path: str = Field("", description="Path to a text file with one blocked term per line.")

@dataclass(kw_only=True, repr=False)
class SentenceCountWithBlocklistEvaluator(SentenceCountEvaluator):
    """Like SentenceCountEvaluator, but also checks the output contains no blocked terms.

    The blocklist is loaded at instantiation from the file at RAGPILL_BLOCKLIST_FILE_PATH.
    Each non-empty line is treated as one blocked term (matched case-insensitively).
    """

    def __post_init__(self) -> None:
        """Load the blocklist once when the evaluator is instantiated."""
        # NOTE(review): this overrides any parent __post_init__ without chaining
        # to it — confirm BaseEvaluator does not rely on its own __post_init__.
        settings = BlocklistSettings()
        if settings.file_path:
            # Explicit UTF-8: a blocklist file shared across machines must not
            # depend on the platform's default locale encoding.
            with open(settings.file_path, encoding="utf-8") as f:
                self._blocklist = [line.strip() for line in f if line.strip()]
        else:
            # No RAGPILL_BLOCKLIST_FILE_PATH configured → nothing is blocked.
            self._blocklist = []

    async def run(
        self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]
    ) -> EvaluationReason:
        """Run the sentence-count check first, then scan the output for blocked terms."""
        # First run the parent sentence-count check; a range failure wins.
        result = await super().run(ctx)
        if not result.value:
            return result

        # Then scan for blocked terms (case-insensitive substring match).
        output_lower = str(ctx.output).lower()
        for term in self._blocklist:
            if term.lower() in output_lower:
                return EvaluationReason(
                    value=False,
                    reason=f"Output contains blocked term: '{term}'.",
                )
        return result

Try it out

We write a temporary blocklist file, point RAGPILL_BLOCKLIST_FILE_PATH at it, then run two cases — one clean response and one containing a blocked term.

import tempfile

# Write a throwaway blocklist file; delete=False so it survives the context
# manager and can be read back by the evaluator.
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
    handle.write("forbidden\nillegal\n")
    blocklist_path = handle.name

# Point the evaluator's settings at the file via the env-var it reads.
os.environ["RAGPILL_BLOCKLIST_FILE_PATH"] = blocklist_path


async def mock_agent_blocklist(question: str) -> str:
    """Canned agent: one clean response, one containing a blocked term."""
    if question == "clean":
        return "Paris is the capital of France. It is a beautiful city."
    if question == "dirty":
        return "This output contains a forbidden term. That is not allowed."
    return "I don't know."


dataset2 = Dataset(
    cases=[
        Case(
            inputs="clean",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountWithBlocklistEvaluator(min_sentences=1)],  # passes
        ),
        Case(
            inputs="dirty",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountWithBlocklistEvaluator(min_sentences=1)],  # fails (blocked term)
        ),
    ]
)

report2 = await dataset2.evaluate(mock_agent_blocklist)
report2.print()

# Cleanup
os.unlink(blocklist_path)
del os.environ["RAGPILL_BLOCKLIST_FILE_PATH"]
Output()


/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71085/2876377643.py:18: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
  dataset2 = Dataset(


        Evaluation Summary:         
        mock_agent_blocklist        
┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID  ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1   │ ✔          │      2µs │
├──────────┼────────────┼──────────┤
│ Case 2   │ ✗          │      1µs │
├──────────┼────────────┼──────────┤
│ Averages │ 50.0%      │      2µs │
└──────────┴────────────┴──────────┘