Skip to content

Custom Evaluator

In this how-to guide, we will walk through the process of creating a custom evaluator.

As the first example we will create an evaluator that checks that the number of sentences in the output of an agent is within a specified range. This is a simple example of an evaluator that needs custom parameters (min_sentences and max_sentences) and custom evaluation logic (counting sentences and checking that the count is within the specified range).

To also show how we recommend creating an Evaluator that has some global state (which should be set via env-vars), we then will create a second version of the same evaluator that also checks that the output does not contain any words from a blocklist. The blocklist will be stored in a file and the path to the file will be specified via an env-var. This way we can have a single blocklist file that is used across multiple evaluators and easily update it without having to change the code of the evaluators.

import shutil
import subprocess

# Locate uv on PATH, falling back to a known local install location.
uv_binary = shutil.which("uv") or "/Users/janine/.local/bin/uv"
subprocess.run([uv_binary, "pip", "install", "nltk"], check=True)
Using Python 3.11.15 environment at: /Users/janine/ragpill/.venv
Checked 1 package in 6ms





0
import json
import os
from dataclasses import dataclass, field
from typing import Any

import nltk
nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import sent_tokenize

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext

from ragpill.base import BaseEvaluator, EvaluatorMetadata, TestCaseMetadata

Part 1: SentenceCountEvaluator

SentenceCountEvaluator checks that the number of sentences in the output falls within [min_sentences, max_sentences].

Per-instance configuration is passed as a JSON string in the CSV check column: {"min_sentences": 2, "max_sentences": 5}. See Loading from CSV at the bottom of this section.

@dataclass(kw_only=True, repr=False)
class SentenceCountEvaluator(BaseEvaluator):
    """Checks that the number of sentences in the output is within [min_sentences, max_sentences].

    Uses nltk's sent_tokenize for sentence boundary detection (handles abbreviations like
    "Dr." or "U.S.A." correctly, unlike a simple split on punctuation).
    Set max_sentences=-1 for no upper limit.
    """

    # Inclusive lower bound on the sentence count.
    min_sentences: int = field(default=1)
    # Inclusive upper bound; -1 means "no upper limit".
    max_sentences: int = field(default=-1)

    @classmethod
    def from_csv_line(
        cls, expected: bool, tags: set[str], check: str, **kwargs: Any
    ) -> "SentenceCountEvaluator":
        """Build an instance from a CSV row.

        The `check` column may be:
        - a JSON object such as {"min_sentences": 2, "max_sentences": 5},
        - a plain integer, treated as min_sentences only,
        - empty/blank, in which case the defaults apply.
        """
        min_sentences = 1
        max_sentences = -1
        if check and check.strip():
            try:
                parsed = json.loads(check)
            except json.JSONDecodeError:
                # Not valid JSON at all → fall back to plain-int parsing.
                min_sentences = int(check)
            else:
                if isinstance(parsed, dict):
                    min_sentences = parsed.get("min_sentences", min_sentences)
                    max_sentences = parsed.get("max_sentences", max_sentences)
                else:
                    # BUG FIX: a bare integer like "3" IS valid JSON, so it never
                    # reached the JSONDecodeError branch and `.get` on the parsed
                    # int raised AttributeError. Treat any non-dict JSON value as
                    # the min_sentences shortcut, as documented.
                    min_sentences = int(parsed)
        return cls(
            min_sentences=min_sentences,
            max_sentences=max_sentences,
            expected=expected,
            tags=tags,
            attributes=kwargs,
        )

    async def run(
        self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]
    ) -> EvaluationReason:
        """Count sentences in ctx.output and check them against the configured range.

        Returns an EvaluationReason whose value is True when the count lies in
        [min_sentences, max_sentences], otherwise False with a reason naming
        the violated bound.
        """
        sentences = sent_tokenize(str(ctx.output))
        count = len(sentences)

        above_min = count >= self.min_sentences
        # max_sentences == -1 disables the upper bound entirely.
        below_max = self.max_sentences == -1 or count <= self.max_sentences
        range_str = f"[{self.min_sentences}, {'∞' if self.max_sentences == -1 else self.max_sentences}]"

        if above_min and below_max:
            return EvaluationReason(
                value=True,
                reason=f"Found {count} sentence(s), within range {range_str}.",
            )
        if not above_min:
            return EvaluationReason(
                value=False,
                reason=f"Found only {count} sentence(s), minimum required: {self.min_sentences}.",
            )
        return EvaluationReason(
            value=False,
            reason=f"Found {count} sentence(s), exceeds maximum of {self.max_sentences}.",
        )

Try it out

We define a minimal mock agent and two test cases — one that should pass and one that should fail — then run them through a Dataset.

async def mock_agent(question: str) -> str:
    """Simulates an LLM agent. Replace with your real agent."""
    canned_answers = {
        "long": (
            "Paris is the capital of France. It is home to the Eiffel Tower. "
            "Millions of tourists visit each year."
        ),
        "short": "4.",
    }
    return canned_answers.get(question, "I don't know.")


dataset = Dataset(
    cases=[
        Case(
            inputs="long",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountEvaluator(min_sentences=2, max_sentences=5)],  # 3 sentences → passes
        ),
        Case(
            inputs="short",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountEvaluator(min_sentences=2, max_sentences=5)],  # 1 sentence → fails
        ),
    ]
)

report = await dataset.evaluate(mock_agent)
report.print()
Output()


/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71085/1990873887.py:9: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
  dataset = Dataset(


   Evaluation Summary: mock_agent   
┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID  ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1   │ ✔          │      2µs │
├──────────┼────────────┼──────────┤
│ Case 2   │ ✗          │      1µs │
├──────────┼────────────┼──────────┤
│ Averages │ 50.0%      │      2µs │
└──────────┴────────────┴──────────┘

Loading from CSV

from_csv_line handles the CSV-to-evaluator mapping automatically when using load_testset. Register SentenceCountEvaluator in the evaluator_classes dict:

from ragpill.csv.testset import load_testset, default_evaluator_classes

# Register the custom evaluator under the "sentence_count" test_type key so
# load_testset can construct SentenceCountEvaluator instances from CSV rows.
evaluator_classes = {**default_evaluator_classes, "sentence_count": SentenceCountEvaluator}

testset = load_testset("testset.csv", evaluator_classes=evaluator_classes)

The corresponding CSV rows look like:

Question test_type check expected tags
What is the capital of France? sentence_count {"min_sentences": 2, "max_sentences": 5} true geography

Part 2: SentenceCountWithBlocklistEvaluator

We extend SentenceCountEvaluator to also reject outputs that contain blocked terms. The blocklist lives in a file whose path is set via the RAGPILL_BLOCKLIST_FILE_PATH environment variable — this way a single file can be shared across many evaluator instances, and updating the list requires no code change.

The file path is read through a pydantic_settings.BaseSettings subclass (the same pattern as LLMJudgeSettings), and the file is opened once in __post_init__ when the evaluator is instantiated.

Note: We don't need to override from_csv_line. SentenceCountWithBlocklistEvaluator inherits it from SentenceCountEvaluator unchanged — the blocklist path comes from an environment variable, not from the CSV row.

from pydantic import ConfigDict, Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class BlocklistSettings(BaseSettings):
    """Reads the blocklist file path from the RAGPILL_BLOCKLIST_FILE_PATH env-var."""

    # FIX: settings-specific options such as env_prefix belong in
    # pydantic_settings.SettingsConfigDict, not pydantic.ConfigDict — the
    # latter does not declare env_prefix.
    model_config = SettingsConfigDict(env_prefix="RAGPILL_BLOCKLIST_")

    file_path: str = Field("", description="Path to a text file with one blocked term per line.")

@dataclass(kw_only=True, repr=False)
class SentenceCountWithBlocklistEvaluator(SentenceCountEvaluator):
    """Like SentenceCountEvaluator, but also checks the output contains no blocked terms.

    The blocklist is loaded at instantiation from the file at RAGPILL_BLOCKLIST_FILE_PATH.
    Each non-empty line is treated as one blocked term (matched case-insensitively).
    """

    def __post_init__(self) -> None:
        """Load the blocklist once when the evaluator is instantiated."""
        # NOTE(review): this overrides any parent __post_init__ without chaining
        # to it — confirm BaseEvaluator does not rely on its own __post_init__.
        settings = BlocklistSettings()
        if settings.file_path:
            # Explicit UTF-8: a blocklist file shared across machines must not
            # depend on the platform's default locale encoding.
            with open(settings.file_path, encoding="utf-8") as f:
                self._blocklist = [line.strip() for line in f if line.strip()]
        else:
            # No RAGPILL_BLOCKLIST_FILE_PATH configured → nothing is blocked.
            self._blocklist = []

    async def run(
        self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]
    ) -> EvaluationReason:
        """Run the sentence-count check first, then scan the output for blocked terms."""
        # First run the parent sentence-count check; a range failure wins.
        result = await super().run(ctx)
        if not result.value:
            return result

        # Then scan for blocked terms (case-insensitive substring match).
        output_lower = str(ctx.output).lower()
        for term in self._blocklist:
            if term.lower() in output_lower:
                return EvaluationReason(
                    value=False,
                    reason=f"Output contains blocked term: '{term}'.",
                )
        return result

Try it out

We write a temporary blocklist file, point RAGPILL_BLOCKLIST_FILE_PATH at it, then run two cases — one clean response and one containing a blocked term.

import tempfile

# Write a throwaway blocklist file; delete=False so it survives the context
# manager and can be read back by the evaluator.
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
    handle.write("forbidden\nillegal\n")
    blocklist_path = handle.name

# Point the evaluator's settings at the file via the env-var it reads.
os.environ["RAGPILL_BLOCKLIST_FILE_PATH"] = blocklist_path


async def mock_agent_blocklist(question: str) -> str:
    """Canned agent: one clean response, one containing a blocked term."""
    if question == "clean":
        return "Paris is the capital of France. It is a beautiful city."
    if question == "dirty":
        return "This output contains a forbidden term. That is not allowed."
    return "I don't know."


dataset2 = Dataset(
    cases=[
        Case(
            inputs="clean",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountWithBlocklistEvaluator(min_sentences=1)],  # passes
        ),
        Case(
            inputs="dirty",
            metadata=TestCaseMetadata(),
            evaluators=[SentenceCountWithBlocklistEvaluator(min_sentences=1)],  # fails (blocked term)
        ),
    ]
)

report2 = await dataset2.evaluate(mock_agent_blocklist)
report2.print()

# Cleanup
os.unlink(blocklist_path)
del os.environ["RAGPILL_BLOCKLIST_FILE_PATH"]
Output()


/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71085/2876377643.py:18: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
  dataset2 = Dataset(


        Evaluation Summary:         
        mock_agent_blocklist        
┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID  ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1   │ ✔          │      2µs │
├──────────┼────────────┼──────────┤
│ Case 2   │ ✗          │      1µs │
├──────────┼────────────┼──────────┤
│ Averages │ 50.0%      │      2µs │
└──────────┴────────────┴──────────┘