Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/opengradient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ async def stream_example():
InferenceResult,
ModelOutput,
ModelRepository,
ResponseFormat,
SchedulerParams,
TextGenerationOutput,
TextGenerationStream,
Expand All @@ -105,6 +106,7 @@ async def stream_example():
"SchedulerParams",
"CandleType",
"CandleOrder",
"ResponseFormat",
"TextGenerationOutput",
"TextGenerationStream",
"x402SettlementMode",
Expand Down
22 changes: 21 additions & 1 deletion src/opengradient/client/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from x402.mechanisms.evm.exact.register import register_exact_evm_client
from x402.mechanisms.evm.upto.register import register_upto_evm_client

from ..types import TEE_LLM, StreamChoice, StreamChunk, StreamDelta, TextGenerationOutput, x402SettlementMode
from ..types import TEE_LLM, ResponseFormat, StreamChoice, StreamChunk, StreamDelta, TextGenerationOutput, x402SettlementMode
from .opg_token import Permit2ApprovalResult, ensure_opg_approval
from .tee_connection import RegistryTEEConnection, StaticTEEConnection, TEEConnectionInterface
from .tee_registry import TEERegistry
Expand Down Expand Up @@ -44,6 +44,7 @@ class _ChatParams:
stop_sequence: Optional[List[str]]
tools: Optional[List[Dict]]
tool_choice: Optional[str]
response_format: Optional[ResponseFormat]
x402_settlement_mode: x402SettlementMode


Expand Down Expand Up @@ -152,6 +153,8 @@ def _chat_payload(self, params: _ChatParams, messages: List[Dict], stream: bool
if params.tools:
payload["tools"] = params.tools
payload["tool_choice"] = params.tool_choice or "auto"
if params.response_format:
payload["response_format"] = params.response_format.to_dict()
return payload

async def _call_with_tee_retry(
Expand Down Expand Up @@ -297,6 +300,7 @@ async def chat(
temperature: float = 0.0,
tools: Optional[List[Dict]] = None,
tool_choice: Optional[str] = None,
response_format: Optional[ResponseFormat] = None,
x402_settlement_mode: x402SettlementMode = x402SettlementMode.BATCH_HASHED,
stream: bool = False,
) -> Union[TextGenerationOutput, AsyncGenerator[StreamChunk, None]]:
Expand All @@ -311,6 +315,11 @@ async def chat(
temperature (float): Temperature for LLM inference, between 0 and 1.
tools (List[dict], optional): Set of tools for function calling.
tool_choice (str, optional): Sets a specific tool to choose.
response_format (ResponseFormat, optional): Enforce a specific output format.
Use ``ResponseFormat(type="json_object")`` for any valid JSON (not supported
by Anthropic models). Use ``ResponseFormat(type="json_schema", json_schema={...})``
to enforce a strict schema (supported by all providers including Anthropic).
Defaults to None (plain text).
x402_settlement_mode (x402SettlementMode, optional): Settlement mode for x402 payments.
- PRIVATE: Payment only, no input/output data on-chain (most privacy-preserving).
- BATCH_HASHED: Aggregates inferences into a Merkle tree with input/output hashes and signatures (default, most cost-efficient).
Expand All @@ -324,15 +333,25 @@ async def chat(
- If stream=True: Async generator yielding StreamChunk objects

Raises:
ValueError: If ``response_format="json_object"`` is used with an Anthropic model.
RuntimeError: If the inference fails.
"""
if response_format is not None and response_format.type == "json_object":
provider = model.split("/")[0]
if provider == "anthropic":
raise ValueError(
"Anthropic models do not support response_format type 'json_object'. "
"Use ResponseFormat(type='json_schema', json_schema={...}) with an explicit schema instead."
)

params = _ChatParams(
model=model.split("/")[1],
max_tokens=max_tokens,
temperature=temperature,
stop_sequence=stop_sequence,
tools=tools,
tool_choice=tool_choice,
response_format=response_format,
x402_settlement_mode=x402_settlement_mode,
)

Expand Down Expand Up @@ -379,6 +398,7 @@ async def _request() -> TextGenerationOutput:
transaction_hash="external",
finish_reason=choices[0].get("finish_reason"),
chat_output=message,
usage=result.get("usage"),
tee_signature=result.get("tee_signature"),
tee_timestamp=result.get("tee_timestamp"),
**tee.metadata(),
Expand Down
72 changes: 71 additions & 1 deletion src/opengradient/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,9 @@ class TextGenerationOutput:
completion_output: Optional[str] = None
"""Raw text returned by a completion request."""

usage: Optional[Dict] = None
"""Token usage for the request. Contains ``prompt_tokens``, ``completion_tokens``, and ``total_tokens`` when reported by the server."""

payment_hash: Optional[str] = None
"""Payment hash for the x402 transaction."""

Expand Down Expand Up @@ -513,10 +516,12 @@ class TEE_LLM(str, Enum):
CLAUDE_OPUS_4_6 = "anthropic/claude-opus-4-6"

# Google models via TEE
# Note: gemini-2.5-flash, gemini-2.5-pro, and gemini-2.5-flash-lite are scheduled
# for deprecation on June 17, 2026 (flash-lite: July 22, 2026). Replacements will be
# gemini-3-flash-preview, gemini-3.1-pro-preview, and gemini-3.1-flash-lite-preview.
GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
GEMINI_2_5_PRO = "google/gemini-2.5-pro"
GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
GEMINI_3_PRO = "google/gemini-3-pro-preview"
GEMINI_3_FLASH = "google/gemini-3-flash-preview"

# xAI Grok models via TEE
Expand All @@ -526,6 +531,71 @@ class TEE_LLM(str, Enum):
GROK_4_1_FAST_NON_REASONING = "x-ai/grok-4-1-fast-non-reasoning"


@dataclass
class ResponseFormat:
    """Describes the output format the TEE gateway should enforce for a chat request.

    Three modes are available via ``type``:

    * ``"text"`` — plain text, the default behaviour.
    * ``"json_object"`` — the model must emit some valid JSON object
      (supported by OpenAI, Gemini, and Grok; not by Anthropic).
    * ``"json_schema"`` — the model must emit JSON conforming to the schema
      supplied in ``json_schema`` (supported by every provider, including
      Anthropic).

    Attributes:
        type: One of ``"text"``, ``"json_object"``, or ``"json_schema"``.
        json_schema: Schema definition, required when ``type="json_schema"``.
            Must contain ``name`` (str) and ``schema`` (dict); ``strict``
            (bool) is optional.

    Raises:
        ValueError: If ``type`` is not a recognised value, or if
            ``type="json_schema"`` is used without providing ``json_schema``.

    Examples::

        # Any valid JSON object — OpenAI, Gemini, Grok only
        ResponseFormat(type="json_object")

        # Strict schema — all providers including Anthropic
        ResponseFormat(
            type="json_schema",
            json_schema={
                "name": "person",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "age": {"type": "integer"},
                    },
                    "required": ["name", "age"],
                    "additionalProperties": False,
                },
            },
        )
    """

    type: str
    json_schema: Optional[Dict] = None

    def __post_init__(self) -> None:
        # Validate eagerly at construction time so a bad format surfaces
        # before any request is sent to the gateway.
        valid_types = ("text", "json_object", "json_schema")
        if self.type not in valid_types:
            raise ValueError(
                f"ResponseFormat.type must be one of {valid_types}, got '{self.type}'"
            )
        # An empty dict is treated the same as a missing schema on purpose.
        if self.type == "json_schema" and not self.json_schema:
            raise ValueError(
                "ResponseFormat.json_schema is required when type='json_schema'"
            )

    def to_dict(self) -> Dict:
        """Serialise to a JSON-compatible dict for the TEE gateway request payload."""
        if self.json_schema is None:
            return {"type": self.type}
        return {"type": self.type, "json_schema": self.json_schema}


@dataclass
class SchedulerParams:
frequency: int
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading