Skip to content

Custom Type Evaluator

In this how-to guide, we will walk through the process of creating a custom evaluator for an agent that has a custom output type that is not str.

As an example we use an AnimalCreationAgent that returns a structured Animal object:

class Animal(BaseModel):
    name: str
    no_legs: int
    warm_blooded: bool
    habitat: str

When the output is a Pydantic model, evaluators can access fields directly on ctx.output instead of parsing strings. We show two evaluators:

  • LegCountEvaluator — checks that output.no_legs matches an expected value (parameterised per test case via the check CSV column)
  • WarmBloodedEvaluator — checks output.warm_blooded, using the expected flag to handle both warm- and cold-blooded cases
import json
from dataclasses import dataclass, field
from typing import Any

from pydantic import BaseModel, Field
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext

from ragpill.base import BaseEvaluator, EvaluatorMetadata, TestCaseMetadata
class Animal(BaseModel):
    """Structured output type produced by the animal-creation agent.

    Because the agent returns this model (not a plain ``str``), evaluators
    typed as ``EvaluatorContext[str, Animal, ...]`` can read fields directly
    off ``ctx.output`` instead of parsing a string response.
    """

    name: str = Field(..., description="The name of the animal")  # e.g. "dog"
    no_legs: int = Field(..., description="Number of legs")  # checked by LegCountEvaluator
    warm_blooded: bool = Field(..., description="Whether the animal is warm-blooded")  # checked by WarmBloodedEvaluator
    habitat: str = Field(..., description="Primary habitat (land, water, air, ...)")


async def animal_agent(question: str) -> Animal:
    """Mock animal-creation agent — stand-in for a real LLM agent.

    Looks the lowercased question up in a small hard-coded catalogue and
    returns the matching ``Animal``. Unknown questions raise ``KeyError``,
    exactly like a plain dict lookup.
    """
    catalogue: dict[str, Animal] = {
        "dog": Animal(name="dog", no_legs=4, warm_blooded=True, habitat="land"),
        "eagle": Animal(name="eagle", no_legs=2, warm_blooded=True, habitat="air"),
        "snake": Animal(name="snake", no_legs=0, warm_blooded=False, habitat="land"),
        "fish": Animal(name="fish", no_legs=0, warm_blooded=False, habitat="water"),
    }
    key = question.lower()
    return catalogue[key]

The Evaluators

The only difference from a plain-string evaluator is the type annotation on ctx:

# string output (generic)
async def run(self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]) -> EvaluationReason:
    output_str = str(ctx.output)  # must stringify manually

# structured output (typed)
async def run(self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]) -> EvaluationReason:
    count = ctx.output.no_legs  # direct field access, fully type-checked

Everything else — from_csv_line, expected, the evaluate wrapper — is identical.

@dataclass(kw_only=True, repr=False)
class LegCountEvaluator(BaseEvaluator):
    """Asserts that the agent's animal has exactly ``expected_legs`` legs.

    Annotating ``ctx`` as ``EvaluatorContext[str, Animal, ...]`` tells the
    type checker (and your IDE's autocomplete) that ``ctx.output`` is an
    ``Animal``, so ``ctx.output.no_legs`` can be read directly and safely —
    no string parsing required.
    """

    expected_legs: int  # target leg count for this test case

    @classmethod
    def from_csv_line(
        cls, expected: bool, tags: set[str], check: str, **kwargs: Any
    ) -> "LegCountEvaluator":
        # The per-row `check` CSV column carries the expected leg count.
        legs = int(check)
        return cls(
            expected_legs=legs,
            expected=expected,
            tags=tags,
            attributes=kwargs,
        )

    async def run(
        self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]
    ) -> EvaluationReason:
        actual = ctx.output.no_legs  # direct, type-checked field access
        matched = actual == self.expected_legs
        return EvaluationReason(
            value=matched,
            reason=f"Animal has {actual} leg(s) (expected {self.expected_legs}).",
        )
@dataclass(kw_only=True, repr=False)
class WarmBloodedEvaluator(BaseEvaluator):
    """Asserts that the agent's animal is warm-blooded.

    Pass expected=False on the Case or evaluator to assert the animal is
    cold-blooded instead. There is no per-instance configuration, so
    from_csv_line is not overridden — the base class already handles an
    empty `check` column.
    """

    async def run(
        self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]
    ) -> EvaluationReason:
        warm = ctx.output.warm_blooded
        blood = "warm" if warm else "cold"
        return EvaluationReason(
            value=warm,
            reason=f"Animal is {blood}-blooded.",
        )

Try it out

Four test cases — three correct predictions and one intentional failure (fish with expected_legs=4) to confirm the evaluator catches it.

dataset = Dataset(
    cases=[
        Case(
            inputs="dog",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=4),   # passes
                WarmBloodedEvaluator(),               # passes (warm-blooded, expected=True)
            ],
        ),
        Case(
            inputs="eagle",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=2),   # passes
                WarmBloodedEvaluator(),               # passes
            ],
        ),
        Case(
            inputs="snake",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=0),   # passes
                WarmBloodedEvaluator(expected=False),  # passes (cold-blooded, expected=False)
            ],
        ),
        Case(
            inputs="fish",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=4),   # fails (fish has 0 legs)
            ],
        ),
    ]
)

report = await dataset.evaluate(animal_agent)
report.print()
Output()


/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71405/133258251.py:1: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
  dataset = Dataset(


  Evaluation Summary: animal_agent  
┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID  ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1   │ ✔✔         │      9µs │
├──────────┼────────────┼──────────┤
│ Case 2   │ ✔✔         │      4µs │
├──────────┼────────────┼──────────┤
│ Case 3   │ ✔✔         │      3µs │
├──────────┼────────────┼──────────┤
│ Case 4   │ ✗          │     19µs │
├──────────┼────────────┼──────────┤
│ Averages │ 85.7%      │      9µs │
└──────────┴────────────┴──────────┘

Loading from CSV

Register the evaluators in evaluator_classes and use load_testset. The check column carries per-row config; expected controls polarity for WarmBloodedEvaluator.

from ragpill.csv.testset import load_testset, default_evaluator_classes

# Load cases from CSV; the CSV `test_type` column selects which evaluator
# class handles each row via this registry.
testset = load_testset(
    "testset.csv",
    # Package defaults first, then this guide's custom evaluators — later
    # keys override earlier ones on name collision.
    evaluator_classes={
        **default_evaluator_classes,
        "leg_count": LegCountEvaluator,
        "warm_blooded": WarmBloodedEvaluator,
    },
)
| Question | test_type    | check | expected | tags    |
|----------|--------------|-------|----------|---------|
| dog      | leg_count    | 4     | true     | biology |
| dog      | warm_blooded |       | true     | biology |
| snake    | leg_count    | 0     | true     | biology |
| snake    | warm_blooded |       | false    | biology |

WarmBloodedEvaluator has no per-instance config, so check is left empty. The base from_csv_line handles this automatically — no override needed.