Custom Type Evaluator¶
In this how-to guide, we will walk through the process of creating a custom evaluator for an agent that has a custom output type that is not str.
As an example we use an AnimalCreationAgent that returns a structured Animal object:
When the output is a Pydantic model, evaluators can access fields directly on ctx.output instead of parsing strings. We show two evaluators:
- LegCountEvaluator — checks that output.no_legs matches an expected value (parameterised per test case via the check CSV column)
- WarmBloodedEvaluator — checks output.warm_blooded, using the expected flag to handle both warm- and cold-blooded cases
import json
from dataclasses import dataclass, field
from typing import Any
from pydantic import BaseModel, Field
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext
from ragpill.base import BaseEvaluator, EvaluatorMetadata, TestCaseMetadata
class Animal(BaseModel):
    """Structured output type returned by the animal agent.

    Because this is a Pydantic model, evaluators typed with
    EvaluatorContext[str, Animal, ...] can read these fields directly on
    ctx.output — no string parsing required.
    """

    # `no_legs` is checked by LegCountEvaluator; `warm_blooded` by WarmBloodedEvaluator.
    name: str = Field(..., description="The name of the animal")
    no_legs: int = Field(..., description="Number of legs")
    warm_blooded: bool = Field(..., description="Whether the animal is warm-blooded")
    habitat: str = Field(..., description="Primary habitat (land, water, air, ...)")
async def animal_agent(question: str) -> Animal:
    """Mock animal creation agent. Replace with your real LLM agent."""
    # Canned responses keyed by the (lower-cased) question; a real agent
    # would call an LLM here instead.
    catalogue: dict[str, Animal] = {
        "dog": Animal(name="dog", no_legs=4, warm_blooded=True, habitat="land"),
        "eagle": Animal(name="eagle", no_legs=2, warm_blooded=True, habitat="air"),
        "snake": Animal(name="snake", no_legs=0, warm_blooded=False, habitat="land"),
        "fish": Animal(name="fish", no_legs=0, warm_blooded=False, habitat="water"),
    }
    key = question.lower()
    return catalogue[key]
The Evaluators¶
The only difference from a plain-string evaluator is the type annotation on ctx:
# string output (generic)
async def run(self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]) -> EvaluationReason:
output_str = str(ctx.output) # must stringify manually
# structured output (typed)
async def run(self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]) -> EvaluationReason:
count = ctx.output.no_legs # direct field access, fully type-checked
Everything else — from_csv_line, expected, the evaluate wrapper — is identical.
@dataclass(kw_only=True, repr=False)
class LegCountEvaluator(BaseEvaluator):
    """Asserts that the animal has exactly the expected number of legs.

    Annotating ctx as EvaluatorContext[str, Animal, ...] tells the type
    checker (and your IDE) that ctx.output is an Animal, so accessing
    ctx.output.no_legs is type-safe and auto-completed — no string parsing.
    """

    # Target leg count for this test case; comes from the CSV `check` column.
    expected_legs: int

    @classmethod
    def from_csv_line(
        cls, expected: bool, tags: set[str], check: str, **kwargs: Any
    ) -> "LegCountEvaluator":
        """Build an instance from one CSV row; `check` carries the leg count."""
        return cls(
            expected_legs=int(check),
            expected=expected,
            tags=tags,
            attributes=kwargs,
        )

    async def run(
        self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]
    ) -> EvaluationReason:
        actual = ctx.output.no_legs
        passed = actual == self.expected_legs
        return EvaluationReason(
            value=passed,
            reason=f"Animal has {actual} leg(s) (expected {self.expected_legs}).",
        )
@dataclass(kw_only=True, repr=False)
class WarmBloodedEvaluator(BaseEvaluator):
    """Asserts whether the animal is warm-blooded.

    Set expected=False on the Case or evaluator to assert the animal is
    cold-blooded. There is no per-instance configuration, so from_csv_line
    is not overridden — the base class handles an empty check column.
    """

    async def run(
        self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]
    ) -> EvaluationReason:
        warm = ctx.output.warm_blooded
        blood = "warm" if warm else "cold"
        return EvaluationReason(
            value=warm,
            reason=f"Animal is {blood}-blooded.",
        )
Try it out¶
Four test cases — three correct predictions and one intentional failure (fish with expected_legs=4) to confirm the evaluator catches it.
# Four test cases — three correct predictions and one intentional failure
# (fish with expected_legs=4) to confirm the evaluator catches it.
dataset = Dataset(
    # Fix: omitting `name` raises PydanticEvalsDeprecationWarning
    # ("Please provide a name for your `Dataset`."), so name it explicitly.
    name="animal_agent_examples",
    cases=[
        Case(
            inputs="dog",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=4),  # passes
                WarmBloodedEvaluator(),  # passes (warm-blooded, expected=True)
            ],
        ),
        Case(
            inputs="eagle",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=2),  # passes
                WarmBloodedEvaluator(),  # passes
            ],
        ),
        Case(
            inputs="snake",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=0),  # passes
                WarmBloodedEvaluator(expected=False),  # passes (cold-blooded, expected=False)
            ],
        ),
        Case(
            inputs="fish",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=4),  # fails (fish has 0 legs)
            ],
        ),
    ],
)
# Top-level `await` works in notebooks/IPython; in a plain script, wrap this
# in an async main and call it with `asyncio.run(...)`.
report = await dataset.evaluate(animal_agent)
report.print()
Output()
/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71405/133258251.py:1: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
dataset = Dataset(
Evaluation Summary: animal_agent ┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ Case ID ┃ Assertions ┃ Duration ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ │ Case 1 │ ✔✔ │ 9µs │ ├──────────┼────────────┼──────────┤ │ Case 2 │ ✔✔ │ 4µs │ ├──────────┼────────────┼──────────┤ │ Case 3 │ ✔✔ │ 3µs │ ├──────────┼────────────┼──────────┤ │ Case 4 │ ✗ │ 19µs │ ├──────────┼────────────┼──────────┤ │ Averages │ 85.7% ✔ │ 9µs │ └──────────┴────────────┴──────────┘
Loading from CSV¶
Register the evaluators in evaluator_classes and use load_testset. The check column carries per-row config; expected controls polarity for WarmBloodedEvaluator.
from ragpill.csv.testset import load_testset, default_evaluator_classes
# Merge the built-in evaluators with our custom ones; the `test_type` CSV
# column selects an evaluator by its key in this registry.
evaluator_registry = {
    **default_evaluator_classes,
    "leg_count": LegCountEvaluator,
    "warm_blooded": WarmBloodedEvaluator,
}
testset = load_testset("testset.csv", evaluator_classes=evaluator_registry)
| Question | test_type | check | expected | tags |
|---|---|---|---|---|
| dog | leg_count | 4 | true | biology |
| dog | warm_blooded | | true | biology |
| snake | leg_count | 0 | true | biology |
| snake | warm_blooded | | false | biology |
WarmBloodedEvaluator has no per-instance config, so check is left empty. The base from_csv_line handles this automatically — no override needed.