## A/B Testing Configurations
Here's a complete example showing how to A/B test two agent configurations. For development, the example defines the configuration locally; in production, you'd create the versions and labels in the Logfire UI instead.
```python
from pydantic import BaseModel
from pydantic_ai import Agent

import logfire
from logfire.variables.config import (
    LabeledValue,
    LabelRef,
    LatestVersion,
    Rollout,
    VariableConfig,
    VariablesConfig,
)

class AgentConfig(BaseModel):
    """Configuration for a customer support agent."""

    instructions: str
    model: str
    temperature: float
    max_tokens: int


# For local development/testing, you can define versions and labels in code.
# In production, you'd configure these in the Logfire UI.
variables_config = VariablesConfig(
    variables={
        'support_agent_config': VariableConfig(
            name='support_agent_config',
            # The latest version (what traffic gets if no label matches)
            latest_version=LatestVersion(
                version=2,
                serialized_value="""{
                    "instructions": "You are an expert support agent. Provide thorough explanations with examples. Always acknowledge the customer's concern before providing assistance.",
                    "model": "openai:gpt-4o",
                    "temperature": 0.3,
                    "max_tokens": 800
                }""",
            ),
            # Labels pointing to specific versions
            labels={
                'control': LabeledValue(
                    version=1,
                    serialized_value="""{
                        "instructions": "You are a helpful support agent. Be brief and direct.",
                        "model": "openai:gpt-4o-mini",
                        "temperature": 0.7,
                        "max_tokens": 300
                    }""",
                ),
                'treatment': LabelRef(
                    version=2,
                    ref='latest',  # Points to the same value as latest_version
                ),
            },
            # 50/50 A/B test between control and treatment
            rollout=Rollout(labels={'control': 0.5, 'treatment': 0.5}),
            overrides=[],
            json_schema={
                'type': 'object',
                'properties': {
                    'instructions': {'type': 'string'},
                    'model': {'type': 'string'},
                    'temperature': {'type': 'number'},
                    'max_tokens': {'type': 'integer'},
                },
            },
        ),
    }
)

logfire.configure(
    variables=logfire.LocalVariablesOptions(config=variables_config),
)

# Define the variable
agent_config = logfire.var(
    name='support_agent_config',
    type=AgentConfig,
    default=AgentConfig(
        instructions='You are a helpful assistant.',
        model='openai:gpt-4o-mini',
        temperature=0.7,
        max_tokens=500,
    ),
)


async def handle_ticket(user_id: str, message: str) -> str:
    """Handle a support ticket with A/B tested configuration."""
    with agent_config.get(targeting_key=user_id) as config:
        # The label (control or treatment) and version are now in baggage.
        # All spans created below will be tagged with this info.
        agent = Agent(config.value.model, system_prompt=config.value.instructions)
        result = await agent.run(
            message,
            model_settings={
                'temperature': config.value.temperature,
                'max_tokens': config.value.max_tokens,
            },
        )
        return result.output
```
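Before sending real traffic, you can sanity-check the rollout by resolving the variable for many synthetic targeting keys and counting the buckets. The sketch below is an addition to the example above, not part of it: it relies only on the `agent_config` variable and the `config.value` attribute shown there, and infers each bucket from the resolved model name.

```python
from collections import Counter

# Sketch: sample many targeting keys and count which configuration each resolves to.
# Buckets are inferred from the model name, since `config.value` is the only
# attribute the example above guarantees.
counts: Counter[str] = Counter()
for i in range(1000):
    with agent_config.get(targeting_key=f'user-{i}') as config:
        counts[config.value.model] += 1

# Expect a roughly 50/50 split between 'openai:gpt-4o-mini' (control)
# and 'openai:gpt-4o' (treatment), and the same bucket every time for a
# fixed targeting key — that stickiness is the point of targeting_key.
print(counts)
```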
**Analyzing the A/B test in Logfire:**
After running traffic through both labels, you can:
- Filter traces by the label baggage to see only requests that used a specific version (see the query sketch after this list)
- Compare metrics like response latency, token usage, and error rates between labels
- Look at actual responses to qualitatively assess which configuration performs better
- Make data-driven decisions about which version to promote to all traffic
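For the first two points, you can also pull aggregates programmatically with Logfire's query API. The sketch below is hypothetical: the attribute key `'support_agent_config.label'` and the `duration` column are assumptions, so inspect a real span's attributes in the UI to confirm the exact names before relying on this query.

```python
from logfire.experimental.query_client import LogfireQueryClient

# Hypothetical analysis sketch: request counts and average latency per A/B label.
# Verify the attribute key holding the label (assumed here to be
# 'support_agent_config.label') against a real span in the Logfire UI.
SQL = """
SELECT
    attributes->>'support_agent_config.label' AS label,
    count(*) AS requests,
    avg(duration) AS avg_duration
FROM records
GROUP BY label
"""

with LogfireQueryClient(read_token='your-read-token') as client:
    print(client.query_json_rows(sql=SQL))
```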