Skip to content

Custom Type Evaluator

In this how-to guide, we will walk through the process of creating a custom evaluator for an agent that has a custom output type that is not str.

As an example we use an AnimalCreationAgent that returns a structured Animal object:

class Animal(BaseModel):
    name: str
    no_legs: int
    warm_blooded: bool
    habitat: str

When the output is a Pydantic model, evaluators can access fields directly on ctx.output instead of parsing strings. We show two evaluators:

  • LegCountEvaluator — checks that output.no_legs matches an expected value (parameterised per test case via the check CSV column)
  • WarmBloodedEvaluator — checks output.warm_blooded, using the expected flag to handle both warm- and cold-blooded cases
import json
from dataclasses import dataclass, field
from typing import Any

from pydantic import BaseModel, Field
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext

from ragpill.base import BaseEvaluator, EvaluatorMetadata, TestCaseMetadata
class Animal(BaseModel):
    """Structured output type produced by the animal-creation agent.

    Because the agent returns this model (not a plain ``str``), evaluators
    typed as ``EvaluatorContext[str, Animal, ...]`` can read fields directly
    off ``ctx.output`` instead of parsing a string response.
    """

    name: str = Field(..., description="The name of the animal")  # e.g. "dog"
    no_legs: int = Field(..., description="Number of legs")  # checked by LegCountEvaluator
    warm_blooded: bool = Field(..., description="Whether the animal is warm-blooded")  # checked by WarmBloodedEvaluator
    habitat: str = Field(..., description="Primary habitat (land, water, air, ...)")


async def animal_agent(question: str) -> Animal:
    """Mock animal-creation agent — stand-in for a real LLM agent.

    Looks the lowercased question up in a small hard-coded catalogue and
    returns the matching ``Animal``. Unknown questions raise ``KeyError``,
    exactly like a plain dict lookup.
    """
    catalogue: dict[str, Animal] = {
        "dog": Animal(name="dog", no_legs=4, warm_blooded=True, habitat="land"),
        "eagle": Animal(name="eagle", no_legs=2, warm_blooded=True, habitat="air"),
        "snake": Animal(name="snake", no_legs=0, warm_blooded=False, habitat="land"),
        "fish": Animal(name="fish", no_legs=0, warm_blooded=False, habitat="water"),
    }
    key = question.lower()
    return catalogue[key]

The Evaluators

The only difference from a plain-string evaluator is the type annotation on ctx:

# string output (generic)
async def run(self, ctx: EvaluatorContext[Any, Any, EvaluatorMetadata]) -> EvaluationReason:
    output_str = str(ctx.output)  # must stringify manually

# structured output (typed)
async def run(self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]) -> EvaluationReason:
    count = ctx.output.no_legs  # direct field access, fully type-checked

Everything else — from_csv_line, expected, the evaluate wrapper — is identical.

@dataclass(kw_only=True, repr=False)
class LegCountEvaluator(BaseEvaluator):
    """Asserts that the agent's animal has exactly ``expected_legs`` legs.

    Annotating ``ctx`` as ``EvaluatorContext[str, Animal, ...]`` tells the
    type checker (and your IDE's autocomplete) that ``ctx.output`` is an
    ``Animal``, so ``ctx.output.no_legs`` can be read directly and safely —
    no string parsing required.
    """

    expected_legs: int  # target leg count for this test case

    @classmethod
    def from_csv_line(
        cls, expected: bool, tags: set[str], check: str, **kwargs: Any
    ) -> "LegCountEvaluator":
        # The per-row `check` CSV column carries the expected leg count.
        legs = int(check)
        return cls(
            expected_legs=legs,
            expected=expected,
            tags=tags,
            attributes=kwargs,
        )

    async def run(
        self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]
    ) -> EvaluationReason:
        actual = ctx.output.no_legs  # direct, type-checked field access
        matched = actual == self.expected_legs
        return EvaluationReason(
            value=matched,
            reason=f"Animal has {actual} leg(s) (expected {self.expected_legs}).",
        )
@dataclass(kw_only=True, repr=False)
class WarmBloodedEvaluator(BaseEvaluator):
    """Asserts that the agent's animal is warm-blooded.

    Pass expected=False on the Case or evaluator to assert the animal is
    cold-blooded instead. There is no per-instance configuration, so
    from_csv_line is not overridden — the base class already handles an
    empty `check` column.
    """

    async def run(
        self, ctx: EvaluatorContext[str, Animal, EvaluatorMetadata]
    ) -> EvaluationReason:
        warm = ctx.output.warm_blooded
        blood = "warm" if warm else "cold"
        return EvaluationReason(
            value=warm,
            reason=f"Animal is {blood}-blooded.",
        )

Try it out

Four test cases — three correct predictions and one intentional failure (fish with expected_legs=4) to confirm the evaluator catches it.

dataset = Dataset(
    cases=[
        Case(
            inputs="dog",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=4),   # passes
                WarmBloodedEvaluator(),               # passes (warm-blooded, expected=True)
            ],
        ),
        Case(
            inputs="eagle",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=2),   # passes
                WarmBloodedEvaluator(),               # passes
            ],
        ),
        Case(
            inputs="snake",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=0),   # passes
                WarmBloodedEvaluator(expected=False),  # passes (cold-blooded, expected=False)
            ],
        ),
        Case(
            inputs="fish",
            metadata=TestCaseMetadata(),
            evaluators=[
                LegCountEvaluator(expected_legs=4),   # fails (fish has 0 legs)
            ],
        ),
    ]
)

report = await dataset.evaluate(animal_agent)
report.print()
Output()


/var/folders/5y/v5v7q_1n5h7_jkyhjgbw8kj40000gn/T/ipykernel_71405/133258251.py:1: PydanticEvalsDeprecationWarning: Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.
  dataset = Dataset(


  Evaluation Summary: animal_agent  
┏━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID  ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1   │ ✔✔         │      9µs │
├──────────┼────────────┼──────────┤
│ Case 2   │ ✔✔         │      4µs │
├──────────┼────────────┼──────────┤
│ Case 3   │ ✔✔         │      3µs │
├──────────┼────────────┼──────────┤
│ Case 4   │ ✗          │     19µs │
├──────────┼────────────┼──────────┤
│ Averages │ 85.7%      │      9µs │
└──────────┴────────────┴──────────┘

Loading from CSV

Register the evaluators in evaluator_classes and use load_testset. The check column carries per-row config; expected controls polarity for WarmBloodedEvaluator.

from ragpill.csv.testset import load_testset, default_evaluator_classes

# Load cases from CSV; the CSV `test_type` column selects which evaluator
# class handles each row via this registry.
testset = load_testset(
    "testset.csv",
    # Package defaults first, then this guide's custom evaluators — later
    # keys override earlier ones on name collision.
    evaluator_classes={
        **default_evaluator_classes,
        "leg_count": LegCountEvaluator,
        "warm_blooded": WarmBloodedEvaluator,
    },
)
| Question | test_type    | check | expected | tags    |
|----------|--------------|-------|----------|---------|
| dog      | leg_count    | 4     | true     | biology |
| dog      | warm_blooded |       | true     | biology |
| snake    | leg_count    | 0     | true     | biology |
| snake    | warm_blooded |       | false    | biology |

WarmBloodedEvaluator has no per-instance config, so check is left empty. The base from_csv_line handles this automatically — no override needed.