Skip to content

Token usage is not consistent between inspect-ai and chatlas for the solver step #216

@karangattu

Description

@karangattu

This might not be a bug, but it needs some investigation, so I am creating this issue to track that work.
I am seeing inconsistent token usage when inspect-ai is used for the solver step compared with using chatlas in the solver step.
Also, how does one leverage the cache read on the chatlas side?

Image
inspect-ai eval code

script_using_inspect_ai_tool_calling.py

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import model_graded_qa
from inspect_ai.solver import generate, system_message, use_tools
from inspect_ai.tool import ToolError, tool


@tool
def get_weather():
    # Factory wrapped by `@tool`: it hands back the async `execute` callable
    # defined below, which takes no arguments.
    async def execute() -> dict[str, dict[str, int | str]]:
        """Get current weather data for various cities."""
        # Hard-coded fixture keyed by city name. City order and the
        # temp/condition/humidity field order are kept stable.
        return {
            "New York": {"temp": 72, "condition": "Sunny", "humidity": 65},
            "London": {"temp": 58, "condition": "Rainy", "humidity": 85},
            "Tokyo": {"temp": 68, "condition": "Cloudy", "humidity": 70},
            "Sydney": {"temp": 82, "condition": "Sunny", "humidity": 60},
            "Paris": {
                "temp": 61,
                "condition": "Partly Cloudy",
                "humidity": 72,
            },
        }

    return execute


@tool
def calculate_average():
    # Factory wrapped by `@tool`: returns the async `execute` callable that
    # averages a list of readings.
    async def execute(temperatures: list[float]) -> float:
        """Calculate the average temperature.

        Args:
            temperatures: Temperature readings in degrees Fahrenheit.
        """
        if temperatures:
            total = sum(temperatures)
            return total / len(temperatures)
        # Nothing to average: report it as a tool-level error rather than
        # letting the division fail.
        raise ToolError("No temperatures provided")

    return execute


@tool
def compare_values():
    # Factory wrapped by `@tool`: returns the async `execute` callable that
    # renders a one-sentence comparison of two labelled numbers.
    async def execute(
        value1: float,
        value2: float,
        label1: str,
        label2: str,
    ) -> str:
        """Compare two numeric values.

        Args:
            value1: First numeric value to compare.
            value2: Second numeric value to compare.
            label1: Description for the first value.
            label2: Description for the second value.
        """
        # Arrange the pair as (larger, smaller). Anything that is neither
        # strictly greater nor strictly smaller gets the "equal" message.
        if value1 > value2:
            larger, smaller = (label1, value1), (label2, value2)
        elif value2 > value1:
            larger, smaller = (label2, value2), (label1, value1)
        else:
            return f"{label1} and {label2} are equal ({value1})"
        return (
            f"{larger[0]} ({larger[1]}) is greater than "
            f"{smaller[0]} ({smaller[1]})"
        )

    return execute


@task
def weather_tool_task():
    """Task requiring multiple tool calls to analyze weather data."""
    # Each entry is a (prompt, expected answer) pair; the expected answers
    # are derived from the fixture returned by the get_weather tool.
    prompts_and_targets = [
        (
            (
                "Using the available tools, get the weather data, "
                "calculate the average temperature across all cities, "
                "and compare the average temperature to London's "
                "temperature. Tell me if the average is higher or "
                "lower than London's temperature."
            ),
            (
                "Average across cities is about 68.2°F. "
                "London's temperature is 58°F. "
                "Average (68.2°F) is about 10°F higher than London's "
                "value (58°F)."
            ),
        ),
        (
            (
                "Get the weather information and determine which city "
                "has the highest humidity. Then compare that city's "
                "temperature to Tokyo's temperature."
            ),
            (
                "London has the highest humidity at 85%. "
                "London's temperature is 58°F, while Tokyo's temperature "
                "is 68°F. Tokyo is warmer than London by 10 degrees."
            ),
        ),
        (
            (
                "Using the tools, find the weather data and calculate "
                "the average temperature of only the sunny cities. "
                "How does this compare to the overall average?"
            ),
            (
                "The sunny cities are New York (72°F) and Sydney (82°F), "
                "with an average of 77°F. This is higher than the overall "
                "average temperature of approximately 68.2°F across all "
                "cities."
            ),
        ),
    ]
    dataset = [
        Sample(input=prompt, target=expected)
        for prompt, expected in prompts_and_targets
    ]

    # Solver chain: system prompt, then tool registration, then generation.
    solver_chain = [
        system_message(
            "You are a helpful assistant with access to weather tools. "
            "Use the tools systematically to answer questions accurately. "
            "Make multiple tool calls as needed."
        ),
        use_tools([get_weather(), calculate_average(), compare_values()]),
        generate(),
    ]

    return Task(
        dataset=dataset,
        solver=solver_chain,
        scorer=model_graded_qa(),
        name="inspect_ai_weather",
        metadata={"tags": ["tool_calling", "weather_analysis", "inspect-ai"]},
        model="openai/gpt-5-nano-2025-08-07",
    )
chatlas eval code

script_using_chatlas_tool_calling.py

from chatlas import ChatOpenAI
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import model_graded_qa

WEATHER_DATA: dict[str, dict[str, int | str]] = {
    "New York": {
        "temp": 72,
        "condition": "Sunny",
        "humidity": 65,
    },
    "London": {
        "temp": 58,
        "condition": "Rainy",
        "humidity": 85,
    },
    "Tokyo": {
        "temp": 68,
        "condition": "Cloudy",
        "humidity": 70,
    },
    "Sydney": {
        "temp": 82,
        "condition": "Sunny",
        "humidity": 60,
    },
    "Paris": {
        "temp": 61,
        "condition": "Partly Cloudy",
        "humidity": 72,
    },
}


def get_weather() -> dict[str, dict[str, int | str]]:
    """Get current weather data for various cities."""

    # Returns the module-level WEATHER_DATA mapping itself (not a copy), so
    # every call hands back the same dict object.
    return WEATHER_DATA


def calculate_average(temperatures: list[float]) -> float:
    """Calculate the average temperature."""

    if temperatures:
        total = sum(temperatures)
        return total / len(temperatures)
    # Nothing to average: fail with a clear message rather than letting the
    # division raise.
    raise ValueError("No temperatures provided")


def compare_values(
    value1: float,
    value2: float,
    label1: str,
    label2: str,
) -> str:
    """Compare two numeric values."""

    # Arrange the pair as (larger, smaller). Anything that is neither
    # strictly greater nor strictly smaller gets the "equal" message.
    if value1 > value2:
        larger, smaller = (label1, value1), (label2, value2)
    elif value2 > value1:
        larger, smaller = (label2, value2), (label1, value1)
    else:
        return f"{label1} and {label2} are equal ({value1})"
    return (
        f"{larger[0]} ({larger[1]}) is greater than "
        f"{smaller[0]} ({smaller[1]})"
    )


# Module-level chat client configured with the weather-assistant system
# prompt; the three plain-Python functions above are registered on it as
# tools, in the same order as before.
chat = ChatOpenAI(
    model="gpt-5-nano-2025-08-07",
    system_prompt=(
        "You are a helpful assistant with access to weather tools. "
        "Use the tools systematically to answer questions accurately. "
        "Make multiple tool calls as needed."
    ),
)
for _tool_fn in (get_weather, calculate_average, compare_values):
    chat.register_tool(_tool_fn)


@task
def weather_tool_task():
    """Task requiring multiple tool calls to analyze weather data."""
    # Each entry is a (prompt, expected answer) pair; the expected answers
    # are derived from the WEATHER_DATA fixture.
    prompts_and_targets = [
        (
            (
                "Using the available tools, get the weather data, "
                "calculate the average temperature across all cities, "
                "and compare the average temperature to London's "
                "temperature. Tell me if the average is higher or "
                "lower than London's temperature."
            ),
            (
                "Average across cities is about 68.2°F. "
                "London's temperature is 58°F. "
                "Average (68.2°F) is about 10°F higher than London's "
                "value (58°F)."
            ),
        ),
        (
            (
                "Get the weather information and determine which city "
                "has the highest humidity. Then compare that city's "
                "temperature to Tokyo's temperature."
            ),
            (
                "London has the highest humidity at 85%. "
                "London's temperature is 58°F, while Tokyo's temperature "
                "is 68°F. Tokyo is warmer than London by 10 degrees."
            ),
        ),
        (
            (
                "Using the tools, find the weather data and calculate "
                "the average temperature of only the sunny cities. "
                "How does this compare to the overall average?"
            ),
            (
                "The sunny cities are New York (72°F) and Sydney (82°F), "
                "with an average of 77°F. This is higher than the overall "
                "average temperature of approximately 68.2°F across all "
                "cities."
            ),
        ),
    ]
    dataset = [
        Sample(input=prompt, target=expected)
        for prompt, expected in prompts_and_targets
    ]

    # The module-level chatlas client is converted into the solver, with
    # its system prompt included.
    chatlas_solver = chat.to_solver(include_system_prompt=True)

    return Task(
        dataset=dataset,
        solver=chatlas_solver,
        scorer=model_graded_qa(),
        name="chatlas_weather",
        metadata={"tags": ["tool_calling", "weather_analysis", "chatlas"]},
        model="openai/gpt-5-nano-2025-08-07",
    )

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions