From e977b26bca4ad2cdcf866d3bd82db05f241f682f Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Tue, 24 Feb 2026 19:31:24 +0530 Subject: [PATCH 01/16] Enhance multimodal support: Add Image and PDF input types, update processing logic --- backend/app/models/llm/__init__.py | 4 ++ backend/app/models/llm/request.py | 50 ++++++++++++++-- backend/app/services/llm/jobs.py | 2 + backend/app/services/llm/providers/base.py | 2 +- backend/app/services/llm/providers/gai.py | 2 +- backend/app/services/llm/providers/oai.py | 9 ++- backend/app/utils.py | 69 +++++++++++++++++++++- 7 files changed, 128 insertions(+), 10 deletions(-) diff --git a/backend/app/models/llm/__init__.py b/backend/app/models/llm/__init__.py index b183543c4..67b288f39 100644 --- a/backend/app/models/llm/__init__.py +++ b/backend/app/models/llm/__init__.py @@ -9,6 +9,10 @@ LlmCall, AudioContent, TextContent, + ImageContent, + PDFContent, + ImageInput, + PDFInput, ) from app.models.llm.response import ( LLMCallResponse, diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 0991aeba8..23a7a09af 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -1,5 +1,5 @@ import sqlalchemy as sa -from typing import Annotated, Any, Literal, Union +from typing import Annotated, Any, List, Literal, Union from uuid import UUID, uuid4 from pydantic import model_validator, HttpUrl from datetime import datetime @@ -55,8 +55,20 @@ class TTSLLMParams(SQLModel): language: str response_format: Literal["mp3", "wav", "ogg"] | None = "wav" +class ImageLLMParams(SQLModel): + model: str + instructions: str + response_format: Literal["text"] | None = Field( + None, + description="Currently supports text type", + ) + temperature: float | None = Field( + default=0.2, + ge=0.0, + le=2.0, + ) -KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams] +KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams, ImageLLMParams] # Input type models for discriminated union @@ -74,6 +86,23 @@ class AudioContent(SQLModel): description="MIME type of the audio (e.g., audio/wav, audio/mp3, audio/ogg)", ) +class ImageContent(SQLModel): + format: Literal["base64", "public_url"] = "base64" + value: str = Field(..., description="Base64 encoded image or Public URL to the image") + # keeping the mime_type + mime_type: str | None = Field( + None, + description="MIME type of the image (e.g., image/png, image/jpeg)", + ) + +class PDFContent(SQLModel): + format: Literal["base64", "public_url"] = "base64" + value: str = Field(..., description="Base64 encoded PDF or Public URL to the PDF") + # keeping the mime_type + mime_type: str | None = Field( + None, + description="MIME type of the PDF (e.g., application/pdf)", + ) class TextInput(SQLModel): type: Literal["text"] = "text" @@ -84,10 +113,18 @@ class AudioInput(SQLModel): type: Literal["audio"] = "audio" content: AudioContent +class ImageInput(SQLModel): + type: Literal["image"] = "image" + content: ImageContent | list[ImageContent] + +class PDFInput(SQLModel): + type: Literal["pdf"] = "pdf" + content: PDFContent | list[PDFContent] + # Discriminated union for query input types QueryInput = Annotated[ - Union[TextInput, AudioInput], + Union[TextInput, AudioInput, ImageInput, PDFInput], Field(discriminator="type"), ] @@ -122,7 +159,7 @@ def validate_conversation_logic(self): class QueryParams(SQLModel): """Query-specific parameters for each LLM call.""" - input: str | QueryInput = Field( + input: str 
| QueryInput | list[QueryInput] = Field( ..., description=( "User input - either a plain string (text) or a structured input object. " @@ -193,6 +230,7 @@ def validate_params(self): "text": TextLLMParams, "stt": STTLLMParams, "tts": TTSLLMParams, + "image": ImageLLMParams, } model_class = param_models[self.type] validated = model_class.model_validate(self.params) @@ -389,12 +427,12 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image"] = Field( + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image", + comment="Input type: text, audio, image, pdf, multimodal (list of multiple input types)", ), ) diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 33aff370a..17afef456 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -20,6 +20,8 @@ KaapiCompletionConfig, TextInput, AudioInput, + ImageInput, + PDFInput, ) from app.models.llm.response import TextOutput from app.services.llm.guardrails import ( diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index d8f7cafe7..4559eac77 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -44,7 +44,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str, + resolved_input: str | list[dict], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: """Execute LLM API call. diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index ce9bf6ad4..9f83aadc5 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -333,7 +333,6 @@ def execute( ) -> tuple[LLMCallResponse | None, str | None]: try: completion_type = completion_config.type - if completion_type == "stt": return self._execute_stt( completion_config=completion_config, @@ -346,6 +345,7 @@ def execute( resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) + else: return ( None, diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index 83c0aa8d7..a634e8a1d 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -47,10 +47,17 @@ def execute( error_message: str | None = None try: + # if completeiton_type is not text: -> return Nonne , error we don't params = { **completion_config.params, } - params["input"] = resolved_input + if isinstance(resolved_input, list): + params["input"] = [{ + "role": "user", + "content": resolved_input # [{"type": "text", "value": "hello world"}, {"type": "image", "value": "base64encodedstring"}, {"type": "pdf", "value": "base64encodedstring"}] + }] + else: + params["input"] = resolved_input conversation_cfg = query.conversation diff --git a/backend/app/utils.py b/backend/app/utils.py index 37cd97053..72ac9e033 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -25,6 +25,7 @@ from app.core import security from app.core.config import settings from app.crud.credentials import get_provider_credential +from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -443,6 +444,61 @@ def resolve_audio_base64(data: str, mime_type: str) -> 
tuple[str, str | None]: except Exception as e: return "", f"Failed to write audio to temp file: {str(e)}" +def resolve_image_input(image_input) -> list[dict]: + contents = image_input.content if isinstance(image_input.content, list) else [image_input.content] + items = [] + for content in contents: + if content.format == "base64": + mime = content.mime_type or "image/png" + val = content.value + image_url = f"data:{mime};base64,{val}" + else: + image_url = content.value + items.append({ + "type": "input_image", + "image_url": image_url + }) + + return items + + +def resolve_pdf_input(pdf_input) -> list[dict]: + contents = pdf_input.content if isinstance(pdf_input.content, list) else [pdf_input.content] + items = [] + for content in contents: + if content.format == "base64": + mime = content.mime_type or "application/pdf" + val = content.value + pdf_url = f"data:{mime};base64,{val}" + else: + pdf_url = content.value + + items.append({ + "type": "input_file", + "file_url": pdf_url + }) + return items + + +def resolve_multimodal_list(inputs: list) -> tuple[list[dict], str | None]: + content_items = [] + + for item in inputs: + if isinstance(item, TextInput): + content_items.append({ + "type": "input_text", + "text": item.content.value, + }) + elif isinstance(item, ImageInput): + image_items = resolve_image_input(item) + content_items.extend(image_items) + elif isinstance(item, PDFInput): + pdf_items = resolve_pdf_input(item) + content_items.extend(pdf_items) + else: + return [], f"Unsupported input type in multimodal list: {type(item)}" + + return content_items, None def resolve_input(query_input) -> tuple[str, str | None]: """Resolve discriminated union input to content string. @@ -454,7 +510,7 @@ def resolve_input(query_input) -> tuple[str, str | None]: (content_string, None) on success - for text returns content value, for audio returns temp file path ("", error_message) on failure """ - from app.models.llm.request import TextInput, AudioInput + from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput try: if isinstance(query_input, TextInput): @@ -464,6 +520,17 @@ def resolve_input(query_input) -> tuple[str, str | None]: # AudioInput content is base64-encoded audio mime_type = query_input.content.mime_type or "audio/wav" return resolve_audio_base64(query_input.content.value, mime_type) + + elif isinstance(query_input, ImageInput): + content_items = resolve_image_input(query_input) + return content_items, None + + elif isinstance(query_input, PDFInput): + content_items = resolve_pdf_input(query_input) + return content_items, None + + elif isinstance(query_input, list): + return resolve_multimodal_list(query_input) else: return "", f"Unknown input type: {type(query_input)}" From cbfd85fefe292c32c08f9c325c1203a78a31a421 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Wed, 25 Feb 2026 17:59:16 +0530 Subject: [PATCH 02/16] added integration for multimodal for both providers --- backend/app/models/llm/request.py | 6 +- backend/app/services/llm/jobs.py | 2 + backend/app/services/llm/mappers.py | 3 +- backend/app/services/llm/providers/base.py | 4 +- backend/app/services/llm/providers/gai.py | 393 ++++++++++++++++++++- backend/app/services/llm/providers/oai.py | 40 ++- backend/app/utils.py | 85 ++--- 7 files changed, 458 insertions(+), 75 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 23a7a09af..ca8055218 100644 --- a/backend/app/models/llm/request.py +++ 
b/backend/app/models/llm/request.py
@@ -427,12 +427,12 @@ class LlmCall(SQLModel, table=True):
         },
     )
 
-    input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field(
+    input_type: Literal["text", "audio", "image"] = Field(
         ...,
         sa_column=sa.Column(
             sa.String,
             nullable=False,
-            comment="Input type: text, audio, image, pdf, multimodal (list of multiple input types)",
+            comment="Input type: text, audio, image",
         ),
     )
 
@@ -535,4 +535,4 @@ class LlmCall(SQLModel, table=True):
         default=None,
         nullable=True,
         sa_column_kwargs={"comment": "Timestamp when the record was soft-deleted"},
-    )
+    )
\ No newline at end of file
diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py
index 17afef456..b6f0df966 100644
--- a/backend/app/services/llm/jobs.py
+++ b/backend/app/services/llm/jobs.py
@@ -111,6 +111,8 @@ def resolved_input_context(query_input: TextInput | AudioInput):
     """
     resolved_input, error = resolve_input(query_input)
+    print(f"Resolved input: {resolved_input}, error: {error}")
+
     if error:
         raise ValueError(error)
 
diff --git a/backend/app/services/llm/mappers.py b/backend/app/services/llm/mappers.py
index 8b0b895e3..8e61b8bf9 100644
--- a/backend/app/services/llm/mappers.py
+++ b/backend/app/services/llm/mappers.py
@@ -129,6 +129,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]:
         google_params["response_format"] = response_format
     # Warn about unsupported parameters
     if kaapi_params.get("knowledge_base_ids"):
+        #TODO: Will take up later, when we add google filesearch tool support
         warnings.append(
             "Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored."
         )
@@ -141,7 +142,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]:
     return google_params, warnings
 
 
-def transform_kaapi_config_to_native(
+def transform_kaapi_config_to_native( 
     kaapi_config: KaapiCompletionConfig,
 ) -> tuple[NativeCompletionConfig, list[str]]:
     """Transform Kaapi completion config to native provider config with mapped parameters.
diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py
index d8f7cafe7..07195dbfd 100644
--- a/backend/app/services/llm/providers/base.py
+++ b/backend/app/services/llm/providers/base.py
@@ -8,7 +8,7 @@
 from typing import Any
 
 from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams
-
+from app.models.llm.request import TextContent, ImageContent, PDFContent
 
 class BaseProvider(ABC):
     """Abstract base class for LLM providers.
@@ -44,7 +44,7 @@ def execute(
         self,
         completion_config: NativeCompletionConfig,
         query: QueryParams,
-        resolved_input: str | list[dict],
+        resolved_input: str | list[TextContent | ImageContent | PDFContent],
         include_provider_raw_response: bool = False,
     ) -> tuple[LLMCallResponse | None, str | None]:
         """Execute LLM API call.
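
For context, the input models introduced in patch 01 compose as in the rough sketch below. It assumes TextContent exposes a plain `value` field (the resolve helpers in utils.py read `item.content.value`); the literal values are placeholders, not real data.

    # Sketch: building a mixed text + PDF input with the new discriminated union.
    # TextContent(value=...) is assumed from utils.py usage; values are placeholders.
    from app.models.llm.request import (
        PDFContent,
        PDFInput,
        TextContent,
        TextInput,
    )

    query_input = [
        TextInput(content=TextContent(value="Summarize the attached document")),
        PDFInput(
            content=PDFContent(
                format="base64",
                value="<base64-encoded-pdf>",  # placeholder
                mime_type="application/pdf",
            )
        ),
    ]
    # resolve_input() then flattens this list into provider-ready content parts.
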
diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 9f83aadc5..536b45dad 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -1,6 +1,7 @@ import logging import base64 from typing import Any +from typing import TypeAlias, List from google import genai from google.genai.types import ( @@ -20,13 +21,17 @@ Usage, TextOutput, TextContent, + ImageContent, + PDFContent, ) from app.models.llm.response import AudioOutput, AudioContent from app.services.llm.providers.base import BaseProvider from app.core.audio_utils import convert_pcm_to_mp3, convert_pcm_to_ogg logger = logging.getLogger(__name__) - +ContentItem: TypeAlias = TextContent | ImageContent | PDFContent +MultiModalInput: TypeAlias = List[ContentItem] +UserInput: TypeAlias = str | MultiModalInput class GoogleAIProvider(BaseProvider): def __init__(self, client: genai.Client): @@ -44,6 +49,57 @@ def create_client(credentials: dict[str, Any]) -> Any: raise ValueError("API Key for Google Gemini Not Set") return genai.Client(api_key=credentials["api_key"]) + @staticmethod + def format_parts( + parts: list[TextContent | ImageContent | PDFContent], + ) -> list[dict]: + items = [] + for part in parts: + if isinstance(part, TextContent): + items.append({"text": part.value}) + + elif isinstance(part, ImageContent): + if part.format == "base64": + items.append( + { + "inline_data": { + "data": part.value, + "mime_type": part.mime_type, + } + } + ) + else: + items.append( + { + "file_data": { + "file_uri": part.value, + "mime_type": part.mime_type, + "display_name": None, + } + } + ) + elif isinstance(part, PDFContent): + if part.format == "base64": + items.append( + { + "inline_data": { + "data": part.value, + "mime_type": part.mime_type, + } + } + ) + else: + items.append( + { + "file_data": { + "file_uri": part.value, + "mime_type": part.mime_type, + "display_name": None, + } + } + ) + return items + def _execute_stt( self, completion_config: NativeCompletionConfig, @@ -323,12 +379,313 @@ def _execute_tts( ) return llm_response, None + + def _execute_vision( + self, + completion_config: NativeCompletionConfig, + resolved_content: ImageContent | list[ImageContent], # using content here because we need mime type and format info for processing + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + + contents = [] + if isinstance(resolved_content, list): + gemini_parts = self.format_parts(resolved_content) + contents = [{"role": "user", "parts": gemini_parts}] + else: + contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = 
response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" + ) + return llm_response, None + + def _execute_pdf( + self, + completion_config: NativeCompletionConfig, + resolved_content: PDFContent | list[PDFContent], # using content here because we need mime type and format info for processing + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + + contents = [] + if isinstance(resolved_content, list): + gemini_parts = self.format_parts(resolved_content) + contents = [{"role": "user", "parts": gemini_parts}] + else: + contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text 
response: {response.response_id}" + ) + return llm_response, None + + def _execute_text( + self, + completion_config: NativeCompletionConfig, + resolved_input: str | list[TextContent | ImageContent | PDFContent], + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + + contents = [] + + if isinstance(resolved_input, list): + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] + else: + contents = [{"role": "user", "parts": [{"text": resolved_input}]}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" + ) + return llm_response, None + + def _execute_multimodal( + self, + completion_config: NativeCompletionConfig, + resolved_input: MultiModalInput, + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + """ + Convert multimodal input's list of content parts into text response. 
+ """ + + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + if not isinstance(resolved_input, MultiModalInput): + return None, "Invalid input type for multimodal completion, expected list of content parts" + + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" + ) + return llm_response, None + + def execute( self, completion_config: NativeCompletionConfig, - query: QueryParams, # Not used by Google AI provider (no conversation support yet) - resolved_input: str, + query: QueryParams, + resolved_input: str | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: try: @@ -345,11 +702,33 @@ def execute( resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) + + elif completion_type == "text": + return self._execute_text( + completion_config=completion_config, + resolved_input=resolved_input, + include_provider_raw_response=include_provider_raw_response, + ) + + elif completion_type == "vision": + return self._execute_vision( + completion_config=completion_config, + resolved_content=resolved_input, + include_provider_raw_response=include_provider_raw_response, + ) + + elif completion_type == "pdf": + return self._execute_pdf( + completion_config=completion_config, + resolved_content=resolved_input, + include_provider_raw_response=include_provider_raw_response, + ) - else: - return ( - None, - f"Unsupported completion type '{completion_type}' for Google AI provider", + elif completion_type == "multimodal": + return self._execute_text( + completion_config=completion_config, + resolved_input=resolved_input, + 
include_provider_raw_response=include_provider_raw_response, ) except TypeError as e: diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index a634e8a1d..b02cd0d2c 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -1,5 +1,6 @@ import logging from typing import Any +from typing import TypeAlias, List import openai from openai import OpenAI @@ -15,10 +16,12 @@ TextContent, ) from app.services.llm.providers.base import BaseProvider - +from app.models.llm.request import TextContent, ImageContent, PDFContent logger = logging.getLogger(__name__) - +ContentItem: TypeAlias = TextContent | ImageContent | PDFContent +MultiModalInput: TypeAlias = List[ContentItem] +UserInput: TypeAlias = str | MultiModalInput class OpenAIProvider(BaseProvider): def __init__(self, client: OpenAI): @@ -36,6 +39,35 @@ def create_client(credentials: dict[str, Any]) -> Any: raise ValueError("OpenAI credentials not configured for this project.") return OpenAI(api_key=credentials["api_key"]) + @staticmethod + def format_parts(parts: list[TextContent | ImageContent | PDFContent]) -> list[dict]: + items = [] + for part in parts: + if isinstance(part, TextContent): + items.append({"type": "input_text", "text": part.value}) + + elif isinstance(part, ImageContent): + if part.format == "base64": + url = f"data:{part.mime_type};base64,{part.value}" + else: + url = part.value + items.append({ + "type": "input_image", + "image_url": url + }) + + elif isinstance(part, PDFContent): + if part.format == "base64": + url = f"data:{part.mime_type};base64,{part.value}" + else: + url = part.value + items.append({ + "type": "input_file", + "file_url": url + }) + + return items + def execute( self, completion_config: NativeCompletionConfig, @@ -51,10 +83,10 @@ def execute( params = { **completion_config.params, } - if isinstance(resolved_input, list): + if isinstance(resolved_input, MultiModalInput): params["input"] = [{ "role": "user", - "content": resolved_input # [{"type": "text", "value": "hello world"}, {"type": "image", "value": "base64encodedstring"}, {"type": "pdf", "value": "base64encodedstring"}] + "content": self.format_parts(resolved_input) }] else: params["input"] = resolved_input diff --git a/backend/app/utils.py b/backend/app/utils.py index 72ac9e033..330cbfdbc 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -26,12 +26,13 @@ from app.core.config import settings from app.crud.credentials import get_provider_credential from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput +from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) T = TypeVar("T") - +ContentPart = TextContent | AudioContent | ImageContent | PDFContent class APIResponse(BaseModel, Generic[T]): success: bool @@ -444,61 +445,21 @@ def resolve_audio_base64(data: str, mime_type: str) -> tuple[str, str | None]: except Exception as e: return "", f"Failed to write audio to temp file: {str(e)}" -def resolve_image_input(image_input) -> list[dict]: + +def resolve_image_content(image_input: ImageInput) -> list[ImageContent]: contents = image_input.content if isinstance(image_input.content, list) else [image_input.content] - items = [] - for content in contents: - if content.format == "base64": - mime = content.mime_type or "image/png" - val = content.value - image_url = f"data:{mime};base64,{val}" - else: - image_url = 
content.value
-        items.append({
-            "type": "input_image",
-            "image_url": image_url
-        })
+    for c in contents:
+        if not c.mime_type:
+            c.mime_type = "image/png"
+    return contents
 
-    return items
-
-def resolve_pdf_input(pdf_input) -> list[dict]:
+def resolve_pdf_content(pdf_input: PDFInput) -> list[PDFContent]:
     contents = pdf_input.content if isinstance(pdf_input.content, list) else [pdf_input.content]
-    items = []
-    for content in contents:
-        if content.format == "base64":
-            mime = content.mime_type or "application/pdf"
-            val = content.value
-            pdf_url = f"data:{mime};base64,{val}"
-        else:
-            pdf_url = content.value
-
-        items.append({
-            "type": "input_file",
-            "file_url": pdf_url
-        })
-    return items
-
-
-def resolve_multimodal_list(inputs: list) -> tuple[list[dict], str | None]:
-    content_items = []
-
-    for item in inputs:
-        if isinstance(item, TextInput):
-            content_items.append({
-                "type": "input_text",
-                "text": item.content.value,
-            })
-        elif isinstance(item, ImageInput):
-            image_items = resolve_image_input(item)
-            content_items.extend(image_items)
-        elif isinstance(item, PDFInput):
-            pdf_items = resolve_pdf_input(item)
-            content_items.extend(pdf_items)
-        else:
-            return [], f"Unsupported input type in multimodal list: {type(item)}"
-
-    return content_items, None
+    for c in contents:
+        if not c.mime_type:
+            c.mime_type = "application/pdf"
+    return contents
 
 def resolve_input(query_input) -> tuple[str, str | None]:
     """Resolve discriminated union input to content string.
@@ -522,15 +483,23 @@ def resolve_input(query_input) -> tuple[str, str | None]:
         # AudioInput content is base64-encoded audio
         mime_type = query_input.content.mime_type or "audio/wav"
         return resolve_audio_base64(query_input.content.value, mime_type)
 
     elif isinstance(query_input, ImageInput):
-        content_items = resolve_image_input(query_input)
-        return content_items, None
+        return resolve_image_content(query_input), None
 
     elif isinstance(query_input, PDFInput):
-        content_items = resolve_pdf_input(query_input)
-        return content_items, None
-
+        return resolve_pdf_content(query_input), None
+
     elif isinstance(query_input, list):
-        return resolve_multimodal_list(query_input)
+        parts: list[ContentPart] = []
+        for item in query_input:
+            if isinstance(item, TextInput):
+                parts.append(item.content)  # TextContent instance
+            elif isinstance(item, ImageInput):
+                parts.extend(resolve_image_content(item))
+            elif isinstance(item, PDFInput):
+                parts.extend(resolve_pdf_content(item))
+            else:
+                return [], f"Unsupported input type: {type(item)}"
+        return parts, None
     else:
         return "", f"Unknown input type: {type(query_input)}"

From 1f3f13e0bd0986519c07893a4a865df4ee1a2d90 Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Wed, 25 Feb 2026 18:38:25 +0530
Subject: [PATCH 03/16] prettify the code and added support for image, pdf and multimodal input types

---
 backend/app/models/llm/request.py          |  21 +++-
 backend/app/services/llm/jobs.py           |   2 +-
 backend/app/services/llm/mappers.py        |   4 +-
 backend/app/services/llm/providers/base.py |   5 +-
 backend/app/services/llm/providers/gai.py  | 127 ++++++++++++---------
 backend/app/services/llm/providers/oai.py  |  36 +++---
 backend/app/utils.py                       |  22 +++-
 7 files changed, 122 insertions(+), 95 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index ca8055218..245d95af2 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -55,6 +55,7 @@ class TTSLLMParams(SQLModel):
     language: str
     response_format: Literal["mp3", "wav", "ogg"] | None = "wav"
 
+
 class ImageLLMParams(SQLModel):
     model: str
     instructions: 
str @@ -68,6 +69,7 @@ class ImageLLMParams(SQLModel): le=2.0, ) + KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams, ImageLLMParams] @@ -86,17 +88,21 @@ class AudioContent(SQLModel): description="MIME type of the audio (e.g., audio/wav, audio/mp3, audio/ogg)", ) + class ImageContent(SQLModel): - format: Literal["base64", "public_url"] = "base64" - value: str = Field(..., description="Base64 encoded image or Public URL to the image") + format: Literal["base64", "url"] = "base64" + value: str = Field( + ..., description="Base64 encoded image or Public URL to the image" + ) # keeping the mime_type mime_type: str | None = Field( None, description="MIME type of the image (e.g., image/png, image/jpeg)", ) + class PDFContent(SQLModel): - format: Literal["base64", "public_url"] = "base64" + format: Literal["base64", "url"] = "base64" value: str = Field(..., description="Base64 encoded PDF or Public URL to the PDF") # keeping the mime_type mime_type: str | None = Field( @@ -104,6 +110,7 @@ class PDFContent(SQLModel): description="MIME type of the PDF (e.g., application/pdf)", ) + class TextInput(SQLModel): type: Literal["text"] = "text" content: TextContent @@ -113,10 +120,12 @@ class AudioInput(SQLModel): type: Literal["audio"] = "audio" content: AudioContent + class ImageInput(SQLModel): type: Literal["image"] = "image" content: ImageContent | list[ImageContent] + class PDFInput(SQLModel): type: Literal["pdf"] = "pdf" content: PDFContent | list[PDFContent] @@ -427,12 +436,12 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image"] = Field( + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image", + comment="Input type: text, audio, image, pdf, multimodal", ), ) @@ -535,4 +544,4 @@ class LlmCall(SQLModel, table=True): default=None, nullable=True, sa_column_kwargs={"comment": "Timestamp when the record was soft-deleted"}, - ) \ No newline at end of file + ) diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index b6f0df966..9d45366ab 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -112,7 +112,7 @@ def resolved_input_context(query_input: TextInput | AudioInput): """ resolved_input, error = resolve_input(query_input) print(f"Resolved input: {resolved_input}, error: {error}") - + if error: raise ValueError(error) diff --git a/backend/app/services/llm/mappers.py b/backend/app/services/llm/mappers.py index 8e61b8bf9..838912cdf 100644 --- a/backend/app/services/llm/mappers.py +++ b/backend/app/services/llm/mappers.py @@ -129,7 +129,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: google_params["response_format"] = response_format # Warn about unsupported parameters if kaapi_params.get("knowledge_base_ids"): - #TODO: Will take up later, when we add google filesearch tool support + # TODO: Will take up later, when we add google filesearch tool support warnings.append( "Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored." ) @@ -142,7 +142,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: return google_params, warnings -def transform_kaapi_config_to_native( +def transform_kaapi_config_to_native( kaapi_config: KaapiCompletionConfig, ) -> tuple[NativeCompletionConfig, list[str]]: """Transform Kaapi completion config to native provider config with mapped parameters. 
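
For context, the mapper touched above behaves roughly as sketched below; any parameter keys beyond those visible in these hunks are assumptions.

    # Sketch: expected map_kaapi_to_google_params behavior per the hunks above.
    kaapi_params = {
        "response_format": "text",
        "knowledge_base_ids": ["kb-123"],  # unsupported on Google AI
    }
    google_params, warnings = map_kaapi_to_google_params(kaapi_params)
    # google_params["response_format"] == "text"
    # warnings == ["Parameter 'knowledge_base_ids' is not supported by Google AI "
    #              "and was ignored."]
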
diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 07195dbfd..fcd64dfc7 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -5,11 +5,14 @@ """ from abc import ABC, abstractmethod -from typing import Any +from typing import Any, List, TypeAlias from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams from app.models.llm.request import TextContent, ImageContent, PDFContent +ContentItem: TypeAlias = TextContent | ImageContent | PDFContent +MultiModalInput: TypeAlias = List[ContentItem] + class BaseProvider(ABC): """Abstract base class for LLM providers. diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 536b45dad..a7eaffecf 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -1,7 +1,6 @@ import logging import base64 from typing import Any -from typing import TypeAlias, List from google import genai from google.genai.types import ( @@ -25,13 +24,11 @@ PDFContent, ) from app.models.llm.response import AudioOutput, AudioContent -from app.services.llm.providers.base import BaseProvider +from app.services.llm.providers.base import BaseProvider, MultiModalInput from app.core.audio_utils import convert_pcm_to_mp3, convert_pcm_to_ogg logger = logging.getLogger(__name__) -ContentItem: TypeAlias = TextContent | ImageContent | PDFContent -MultiModalInput: TypeAlias = List[ContentItem] -UserInput: TypeAlias = str | MultiModalInput + class GoogleAIProvider(BaseProvider): def __init__(self, client: genai.Client): @@ -379,24 +376,29 @@ def _execute_tts( ) return llm_response, None - + def _execute_vision( self, completion_config: NativeCompletionConfig, - resolved_content: ImageContent | list[ImageContent], # using content here because we need mime type and format info for processing + resolved_content: ImageContent + | list[ + ImageContent + ], # using content here because we need mime type and format info for processing include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - + contents = [] if isinstance(resolved_content, list): gemini_parts = self.format_parts(resolved_content) contents = [{"role": "user", "parts": gemini_parts}] else: - contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] - + contents = [ + {"role": "user", "parts": self.format_parts([resolved_content])} + ] + instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) thinking_level = completion_config.params.get("reasoning", None) @@ -404,17 +406,19 @@ def _execute_vision( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -431,7 +435,6 @@ def 
_execute_vision( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -444,33 +447,38 @@ def _execute_vision( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None - + def _execute_pdf( self, completion_config: NativeCompletionConfig, - resolved_content: PDFContent | list[PDFContent], # using content here because we need mime type and format info for processing + resolved_content: PDFContent + | list[ + PDFContent + ], # using content here because we need mime type and format info for processing include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - + contents = [] if isinstance(resolved_content, list): gemini_parts = self.format_parts(resolved_content) contents = [{"role": "user", "parts": gemini_parts}] else: - contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] - + contents = [ + {"role": "user", "parts": self.format_parts([resolved_content])} + ] + instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) thinking_level = completion_config.params.get("reasoning", None) @@ -478,17 +486,19 @@ def _execute_pdf( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -505,7 +515,6 @@ def _execute_pdf( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -518,18 +527,18 @@ def _execute_pdf( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None def _execute_text( - self, + self, completion_config: NativeCompletionConfig, resolved_input: str | list[TextContent | ImageContent | PDFContent], include_provider_raw_response: bool = False, @@ -553,17 +562,19 @@ def _execute_text( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, 
thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -580,7 +591,6 @@ def _execute_text( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -593,18 +603,18 @@ def _execute_text( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None - + def _execute_multimodal( - self, + self, completion_config: NativeCompletionConfig, resolved_input: MultiModalInput, include_provider_raw_response: bool = False, @@ -612,13 +622,16 @@ def _execute_multimodal( """ Convert multimodal input's list of content parts into text response. """ - + model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" if not isinstance(resolved_input, MultiModalInput): - return None, "Invalid input type for multimodal completion, expected list of content parts" + return ( + None, + "Invalid input type for multimodal completion, expected list of content parts", + ) gemini_parts = self.format_parts(resolved_input) contents = [{"role": "user", "parts": gemini_parts}] @@ -630,17 +643,19 @@ def _execute_multimodal( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -657,7 +672,6 @@ def _execute_multimodal( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -670,17 +684,16 @@ def _execute_multimodal( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None - def execute( self, completion_config: NativeCompletionConfig, @@ -710,8 +723,8 @@ def execute( include_provider_raw_response=include_provider_raw_response, ) - elif completion_type == "vision": - return self._execute_vision( + elif completion_type == "image": + return self._execute_image( completion_config=completion_config, resolved_content=resolved_input, include_provider_raw_response=include_provider_raw_response, @@ -723,9 +736,9 @@ def execute( resolved_content=resolved_input, include_provider_raw_response=include_provider_raw_response, ) - + elif completion_type == "multimodal": - return self._execute_text( + return self._execute_multimodal( 
completion_config=completion_config, resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index b02cd0d2c..6368d0429 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -1,6 +1,5 @@ import logging from typing import Any -from typing import TypeAlias, List import openai from openai import OpenAI @@ -14,14 +13,13 @@ Usage, TextOutput, TextContent, + ImageContent, + PDFContent, ) -from app.services.llm.providers.base import BaseProvider -from app.models.llm.request import TextContent, ImageContent, PDFContent +from app.services.llm.providers.base import BaseProvider, MultiModalInput logger = logging.getLogger(__name__) -ContentItem: TypeAlias = TextContent | ImageContent | PDFContent -MultiModalInput: TypeAlias = List[ContentItem] -UserInput: TypeAlias = str | MultiModalInput + class OpenAIProvider(BaseProvider): def __init__(self, client: OpenAI): @@ -40,32 +38,28 @@ def create_client(credentials: dict[str, Any]) -> Any: return OpenAI(api_key=credentials["api_key"]) @staticmethod - def format_parts(parts: list[TextContent | ImageContent | PDFContent]) -> list[dict]: + def format_parts( + parts: list[TextContent | ImageContent | PDFContent], + ) -> list[dict]: items = [] for part in parts: if isinstance(part, TextContent): items.append({"type": "input_text", "text": part.value}) - + elif isinstance(part, ImageContent): if part.format == "base64": url = f"data:{part.mime_type};base64,{part.value}" else: url = part.value - items.append({ - "type": "input_image", - "image_url": url - }) + items.append({"type": "input_image", "image_url": url}) elif isinstance(part, PDFContent): if part.format == "base64": url = f"data:{part.mime_type};base64,{part.value}" else: url = part.value - items.append({ - "type": "input_file", - "file_url": url - }) - + items.append({"type": "input_file", "file_url": url}) + return items def execute( @@ -79,15 +73,13 @@ def execute( error_message: str | None = None try: - # if completeiton_type is not text: -> return Nonne , error we don't params = { **completion_config.params, } if isinstance(resolved_input, MultiModalInput): - params["input"] = [{ - "role": "user", - "content": self.format_parts(resolved_input) - }] + params["input"] = [ + {"role": "user", "content": self.format_parts(resolved_input)} + ] else: params["input"] = resolved_input diff --git a/backend/app/utils.py b/backend/app/utils.py index 330cbfdbc..7fa0973b9 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -34,6 +34,7 @@ T = TypeVar("T") ContentPart = TextContent | AudioContent | ImageContent | PDFContent + class APIResponse(BaseModel, Generic[T]): success: bool data: Optional[T] = None @@ -447,20 +448,29 @@ def resolve_audio_base64(data: str, mime_type: str) -> tuple[str, str | None]: def resolve_image_content(image_input: ImageInput) -> list[ImageContent]: - contents = image_input.content if isinstance(image_input.content, list) else [image_input.content] + contents = ( + image_input.content + if isinstance(image_input.content, list) + else [image_input.content] + ) for c in contents: if not c.mime_type: c.mime_type = "image/png" - return contents + return contents def resolve_pdf_content(pdf_input: PDFInput) -> list[PDFContent]: - contents = pdf_input.content if isinstance(pdf_input.content, list) else [pdf_input.content] + contents = ( + pdf_input.content + if isinstance(pdf_input.content, 
list)
+        else [pdf_input.content]
+    )
     for c in contents:
         if not c.mime_type:
             c.mime_type = "application/pdf"
     return contents
 
+
 def resolve_input(query_input) -> tuple[str, str | None]:
     """Resolve discriminated union input to content string.
@@ -481,13 +491,13 @@ def resolve_input(query_input) -> tuple[str, str | None]:
         # AudioInput content is base64-encoded audio
         mime_type = query_input.content.mime_type or "audio/wav"
         return resolve_audio_base64(query_input.content.value, mime_type)
-    
+
     elif isinstance(query_input, ImageInput):
         return resolve_image_content(query_input), None
-    
+
     elif isinstance(query_input, PDFInput):
         return resolve_pdf_content(query_input), None
-    
+
     elif isinstance(query_input, list):
         parts: list[ContentPart] = []
         for item in query_input:

From e5a9fa8121420b97d30f564cb8a8f9ef6d050b90 Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Wed, 25 Feb 2026 18:57:24 +0530
Subject: [PATCH 04/16] fixes to function name and added the input_type

---
 backend/app/models/llm/request.py          | 4 ++--
 backend/app/services/llm/providers/gai.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index 245d95af2..82eda2139 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -208,7 +208,7 @@ class NativeCompletionConfig(SQLModel):
         ...,
         description="Provider-specific parameters (schema varies by provider), should exactly match the provider's endpoint params structure",
     )
-    type: Literal["text", "stt", "tts"] = Field(
+    type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field(
         ..., description="Completion config type. Params schema varies by type"
     )
 
@@ -224,7 +224,7 @@ class KaapiCompletionConfig(SQLModel):
         ..., description="LLM provider (openai)"
     )
 
-    type: Literal["text", "stt", "tts"] = Field(
+    type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field(
         ..., description="Completion config type. 
Params schema varies by type"
     )
     params: dict[str, Any] = Field(
diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py
index a7eaffecf..58cd3d43c 100644
--- a/backend/app/services/llm/providers/gai.py
+++ b/backend/app/services/llm/providers/gai.py
@@ -377,7 +377,7 @@ def _execute_tts(
 
         return llm_response, None
 
-    def _execute_vision(
+    def _execute_image(
         self,
         completion_config: NativeCompletionConfig,
         resolved_content: ImageContent

From 668e3fa62b509b73e1daf18acecad5532f33ebf8 Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Thu, 26 Feb 2026 00:28:58 +0530
Subject: [PATCH 05/16] Add support for multimodal input types: Image, PDF, and enhance validation

---
 backend/app/models/llm/request.py           | 42 +++++++++++++-
 backend/app/services/llm/jobs.py            | 17 +++++-
 backend/app/services/llm/mappers.py         | 10 ++--
 backend/app/services/llm/providers/base.py  | 43 +++++++++++++-
 backend/app/services/llm/providers/gai.py   | 56 +++++--------------
 backend/app/services/llm/providers/oai.py   |  6 +-
 .../app/services/llm/providers/registry.py  |  3 +-
 backend/app/utils.py                        | 40 ++++++++-----
 8 files changed, 148 insertions(+), 69 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index 82eda2139..81efcb89a 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -68,9 +68,47 @@ class ImageLLMParams(SQLModel):
         ge=0.0,
         le=2.0,
     )
+    reasoning: Literal["low", "medium", "high"] | None = None
 
-KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams, ImageLLMParams]
+
+class PDFLLMParams(SQLModel):
+    model: str
+    instructions: str
+    response_format: Literal["text"] | None = Field(
+        None,
+        description="Currently supports text type",
+    )
+    temperature: float | None = Field(
+        default=0.2,
+        ge=0.0,
+        le=2.0,
+    )
+    reasoning: Literal["low", "medium", "high"] | None = None
+
+
+class MultimodalLLMParams(SQLModel):
+    model: str
+    instructions: str
+    response_format: Literal["text"] | None = Field(
+        None,
+        description="Currently supports text type",
+    )
+    temperature: float | None = Field(
+        default=0.2,
+        ge=0.0,
+        le=2.0,
+    )
+    reasoning: Literal["low", "medium", "high"] | None = None
+
+
+KaapiLLMParams = Union[
+    TextLLMParams,
+    STTLLMParams,
+    TTSLLMParams,
+    ImageLLMParams,
+    PDFLLMParams,
+    MultimodalLLMParams,
+]
 
 
 # Input type models for discriminated union
@@ -240,6 +278,8 @@ def validate_params(self):
             "stt": STTLLMParams,
             "tts": TTSLLMParams,
             "image": ImageLLMParams,
+            "pdf": PDFLLMParams,
+            "multimodal": MultimodalLLMParams,
         }
         model_class = param_models[self.type]
         validated = model_class.model_validate(self.params)
diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py
index 9d45366ab..511516139 100644
--- a/backend/app/services/llm/jobs.py
+++ b/backend/app/services/llm/jobs.py
@@ -29,6 +29,7 @@
     run_guardrails_validation,
 )
 from app.services.llm.providers.registry import get_llm_provider
+from app.services.llm.providers.base import validate_completion_input
 from app.services.llm.mappers import transform_kaapi_config_to_native
 from app.utils import APIResponse, send_callback, resolve_input, cleanup_temp_file
 
@@ -104,7 +105,9 @@ def handle_job_error(
 
 
 @contextmanager
-def resolved_input_context(query_input: TextInput | AudioInput):
+def resolved_input_context(
+    query_input: TextInput | AudioInput | ImageInput | PDFInput | list,
+):
     """Context manager for resolving and cleaning up input resources.
Ensures temporary files (e.g., downloaded audio) are cleaned up @@ -394,6 +397,18 @@ def execute_job( # Resolve input and execute LLM (context manager handles cleanup) try: with resolved_input_context(request.query.input) as resolved_input: + mismatch = validate_completion_input( + completion_config.type, resolved_input + ) + if mismatch: + callback_response = APIResponse.failure_response( + error=mismatch, + metadata=request.request_metadata, + ) + return handle_job_error( + job_uuid, callback_url_str, callback_response + ) + response, error = decorated_execute( completion_config=completion_config, query=request.query, diff --git a/backend/app/services/llm/mappers.py b/backend/app/services/llm/mappers.py index 838912cdf..d4efc2e9f 100644 --- a/backend/app/services/llm/mappers.py +++ b/backend/app/services/llm/mappers.py @@ -127,6 +127,11 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: response_format = kaapi_params.get("response_format") if response_format: google_params["response_format"] = response_format + + reasoning = kaapi_params.get("reasoning") + if reasoning: + google_params["reasoning"] = reasoning + # Warn about unsupported parameters if kaapi_params.get("knowledge_base_ids"): # TODO: Will take up later, when we add google filesearch tool support @@ -134,11 +139,6 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: "Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored." ) - if kaapi_params.get("reasoning") is not None: - warnings.append( - "Parameter 'reasoning' is not applicable for Google AI and was ignored." - ) - return google_params, warnings diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index fcd64dfc7..5414cfcf9 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -5,13 +5,50 @@ """ from abc import ABC, abstractmethod -from typing import Any, List, TypeAlias +from typing import Any, Literal + +from pydantic import model_validator +from sqlmodel import SQLModel from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams from app.models.llm.request import TextContent, ImageContent, PDFContent -ContentItem: TypeAlias = TextContent | ImageContent | PDFContent -MultiModalInput: TypeAlias = List[ContentItem] + +class MultiModalInput(SQLModel): + """Resolved multimodal input containing a list of content parts.""" + + parts: list[TextContent | ImageContent | PDFContent] + + @model_validator(mode="after") + def validate_parts(self): + if not self.parts: + raise ValueError("MultiModalInput requires at least one content part") + return self + + +COMPLETION_TYPE_ALLOWED_INPUT: dict[str, set[type]] = { + "text": {str}, + "stt": {str}, + "tts": {str}, + "image": {list}, + "pdf": {list}, + "multimodal": {MultiModalInput}, +} + + +def validate_completion_input(completion_type: str, resolved_input: Any) -> str | None: + """Returns error message if mismatch, else None.""" + allowed = COMPLETION_TYPE_ALLOWED_INPUT.get(completion_type) + if allowed is None: + return f"Unknown completion type: '{completion_type}'" + if type(resolved_input) not in allowed: + expected = " or ".join(t.__name__ for t in allowed) + return ( + f"completion type '{completion_type}' expects {expected} input, " + f"got {type(resolved_input).__name__}" + ) + return None + class BaseProvider(ABC): """Abstract base class for LLM providers. 
diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 58cd3d43c..342e92eb3 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -380,24 +380,15 @@ def _execute_tts( def _execute_image( self, completion_config: NativeCompletionConfig, - resolved_content: ImageContent - | list[ - ImageContent - ], # using content here because we need mime type and format info for processing + resolved_input: list[ImageContent], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - contents = [] - if isinstance(resolved_content, list): - gemini_parts = self.format_parts(resolved_content) - contents = [{"role": "user", "parts": gemini_parts}] - else: - contents = [ - {"role": "user", "parts": self.format_parts([resolved_content])} - ] + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) @@ -460,24 +451,15 @@ def _execute_image( def _execute_pdf( self, completion_config: NativeCompletionConfig, - resolved_content: PDFContent - | list[ - PDFContent - ], # using content here because we need mime type and format info for processing + resolved_input: list[PDFContent], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - contents = [] - if isinstance(resolved_content, list): - gemini_parts = self.format_parts(resolved_content) - contents = [{"role": "user", "parts": gemini_parts}] - else: - contents = [ - {"role": "user", "parts": self.format_parts([resolved_content])} - ] + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) @@ -540,17 +522,15 @@ def _execute_pdf( def _execute_text( self, completion_config: NativeCompletionConfig, - resolved_input: str | list[TextContent | ImageContent | PDFContent], + resolved_input: str | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - contents = [] - - if isinstance(resolved_input, list): - gemini_parts = self.format_parts(resolved_input) + if isinstance(resolved_input, MultiModalInput): + gemini_parts = self.format_parts(resolved_input.parts) contents = [{"role": "user", "parts": gemini_parts}] else: contents = [{"role": "user", "parts": [{"text": resolved_input}]}] @@ -619,21 +599,11 @@ def _execute_multimodal( resolved_input: MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: - """ - Convert multimodal input's list of content parts into text response. 
- """ - model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - if not isinstance(resolved_input, MultiModalInput): - return ( - None, - "Invalid input type for multimodal completion, expected list of content parts", - ) - - gemini_parts = self.format_parts(resolved_input) + gemini_parts = self.format_parts(resolved_input.parts) contents = [{"role": "user", "parts": gemini_parts}] instructions = completion_config.params.get("instructions", "") @@ -698,7 +668,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str | MultiModalInput, + resolved_input: str | list[ImageContent] | list[PDFContent] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: try: @@ -726,14 +696,14 @@ def execute( elif completion_type == "image": return self._execute_image( completion_config=completion_config, - resolved_content=resolved_input, + resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) elif completion_type == "pdf": return self._execute_pdf( completion_config=completion_config, - resolved_content=resolved_input, + resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index 6368d0429..2f05a3aab 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -66,7 +66,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str, + resolved_input: str | list[ImageContent] | list[PDFContent] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: response: Response | None = None @@ -77,6 +77,10 @@ def execute( **completion_config.params, } if isinstance(resolved_input, MultiModalInput): + params["input"] = [ + {"role": "user", "content": self.format_parts(resolved_input.parts)} + ] + elif isinstance(resolved_input, list): params["input"] = [ {"role": "user", "content": self.format_parts(resolved_input)} ] diff --git a/backend/app/services/llm/providers/registry.py b/backend/app/services/llm/providers/registry.py index 15236b8d7..5eff4db19 100644 --- a/backend/app/services/llm/providers/registry.py +++ b/backend/app/services/llm/providers/registry.py @@ -3,7 +3,6 @@ import logging from sqlmodel import Session -from app.crud import get_provider_credential from app.services.llm.providers.base import BaseProvider from app.services.llm.providers.oai import OpenAIProvider from app.services.llm.providers.gai import GoogleAIProvider @@ -46,6 +45,8 @@ def supported_providers(cls) -> list[str]: def get_llm_provider( session: Session, provider_type: str, project_id: int, organization_id: int ) -> BaseProvider: + from app.crud.credentials import get_provider_credential + provider_class = LLMProvider.get_provider_class(provider_type) # e.g., "openai-native" → "openai", "claude-native" → "claude" diff --git a/backend/app/utils.py b/backend/app/utils.py index 7fa0973b9..29100adb6 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import base64 import functools as ft import ipaddress @@ -8,6 +10,7 @@ from pathlib import Path import requests import socket + from typing import Any, Dict, Generic, Optional, TypeVar from urllib.parse import urlparse @@ -25,8 +28,17 @@ from app.core 
import security from app.core.config import settings from app.crud.credentials import get_provider_credential -from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput -from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent +from app.models.llm.request import ( + TextInput, + AudioInput, + ImageInput, + PDFInput, + TextContent, + AudioContent, + ImageContent, + PDFContent, +) +from app.services.llm.providers.base import MultiModalInput logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -471,24 +483,24 @@ def resolve_pdf_content(pdf_input: PDFInput) -> list[PDFContent]: return contents -def resolve_input(query_input) -> tuple[str, str | None]: - """Resolve discriminated union input to content string. - - Args: - query_input: The input from QueryParams (TextInput or AudioInput) +def resolve_input( + query_input, +) -> tuple[str | list[ImageContent] | list[PDFContent] | "MultiModalInput", str | None]: + """Resolve query input to provider-ready format. Returns: - (content_string, None) on success - for text returns content value, for audio returns temp file path - ("", error_message) on failure + - TextInput/AudioInput: (str, None) + - ImageInput: (list[ImageContent], None) + - PDFInput: (list[PDFContent], None) + - list[QueryInput]: (MultiModalInput, None) + - Error: ("", error_message) """ - from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput try: if isinstance(query_input, TextInput): return query_input.content.value, None elif isinstance(query_input, AudioInput): - # AudioInput content is base64-encoded audio mime_type = query_input.content.mime_type or "audio/wav" return resolve_audio_base64(query_input.content.value, mime_type) @@ -502,14 +514,14 @@ def resolve_input(query_input) -> tuple[str, str | None]: parts: list[ContentPart] = [] for item in query_input: if isinstance(item, TextInput): - parts.append(item.content) # TextContent instance + parts.append(item.content) elif isinstance(item, ImageInput): parts.extend(resolve_image_content(item)) elif isinstance(item, PDFInput): parts.extend(resolve_pdf_content(item)) else: - return [], f"Unsupported input type: {type(item)}" - return parts, None + return "", f"Unsupported input type: {type(item)}" + return MultiModalInput(parts=parts), None else: return "", f"Unknown input type: {type(query_input)}" From 74e328e0edfae48b0363181b1cf56cdfbf1182c1 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 07:51:52 +0530 Subject: [PATCH 06/16] Enhance multimodal support: Allow None for instructions in Image, PDF, and Multimodal parameters; update validation to restrict audio input in multimodal processing. 
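
A rough sketch of the rejection path this patch adds (model and helper
names are the ones defined in this series; the asserted error fragment
follows the tests added later in the series):

    from app.models.llm.request import (
        AudioContent,
        AudioInput,
        TextContent,
        TextInput,
    )
    from app.utils import resolve_input

    # Mixed lists may combine text, image, and pdf parts; audio is now
    # rejected with a hint to use the 'stt' completion type instead.
    mixed = [
        TextInput(content=TextContent(value="transcribe this clip")),
        AudioInput(content=AudioContent(value="<base64-audio>", mime_type="audio/wav")),
    ]
    resolved, error = resolve_input(mixed)
    assert resolved == "" and "stt" in error.lower()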
--- backend/app/models/llm/request.py | 6 +- backend/app/services/llm/providers/base.py | 80 ++++++++++++++++++---- backend/app/utils.py | 10 ++- 3 files changed, 77 insertions(+), 19 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 81efcb89a..1c557d397 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -58,7 +58,7 @@ class TTSLLMParams(SQLModel): class ImageLLMParams(SQLModel): model: str - instructions: str + instructions: str | None = None response_format: Literal["text"] | None = Field( None, description="Currently supports text type", @@ -73,7 +73,7 @@ class ImageLLMParams(SQLModel): class PDFLLMParams(SQLModel): model: str - instructions: str + instructions: str | None = None response_format: Literal["text"] | None = Field( None, description="Currently supports text type", @@ -88,7 +88,7 @@ class PDFLLMParams(SQLModel): class MultimodalLLMParams(SQLModel): model: str - instructions: str + instructions: str | None = None response_format: Literal["text"] | None = Field( None, description="Currently supports text type", diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 5414cfcf9..5fc75e711 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -11,7 +11,9 @@ from sqlmodel import SQLModel from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams -from app.models.llm.request import TextContent, ImageContent, PDFContent +from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent + +MULTIMODAL_ALLOWED_PARTS = (TextContent, ImageContent, PDFContent) class MultiModalInput(SQLModel): @@ -26,27 +28,75 @@ def validate_parts(self): return self -COMPLETION_TYPE_ALLOWED_INPUT: dict[str, set[type]] = { - "text": {str}, - "stt": {str}, - "tts": {str}, - "image": {list}, - "pdf": {list}, - "multimodal": {MultiModalInput}, +CONTENT_TYPE_LABEL: dict[type, str] = { + TextContent: "text", + AudioContent: "audio", + ImageContent: "image", + PDFContent: "pdf", +} + +INPUT_TYPE_LABEL: dict[type, str] = { + str: "text", + list: "list", + MultiModalInput: "multimodal (mixed input types)", } +COMPLETION_TYPE_RULES: dict[str, dict] = { + "text": {"type": str, "label": "text"}, + "stt": {"type": str, "label": "audio"}, + "tts": {"type": str, "label": "text"}, + "image": {"type": list, "element_type": ImageContent, "label": "image"}, + "pdf": {"type": list, "element_type": PDFContent, "label": "pdf"}, + "multimodal": {"type": MultiModalInput, "label": "multimodal"}, +} + + +def _get_content_label(content: Any) -> str: + return CONTENT_TYPE_LABEL.get(type(content), type(content).__name__) + def validate_completion_input(completion_type: str, resolved_input: Any) -> str | None: - """Returns error message if mismatch, else None.""" - allowed = COMPLETION_TYPE_ALLOWED_INPUT.get(completion_type) - if allowed is None: + """Returns error message if input type doesn't match completion type, else None.""" + rule = COMPLETION_TYPE_RULES.get(completion_type) + if rule is None: return f"Unknown completion type: '{completion_type}'" - if type(resolved_input) not in allowed: - expected = " or ".join(t.__name__ for t in allowed) + + expected_type = rule["type"] + label = rule["label"] + + if not isinstance(resolved_input, expected_type): + actual_label = INPUT_TYPE_LABEL.get( + type(resolved_input), type(resolved_input).__name__ + ) + hint = ( + " Please set completion type to 
'multimodal' when sending mixed input types." + if isinstance(resolved_input, MultiModalInput) + else f" Please ensure the input type matches the completion type." + ) return ( - f"completion type '{completion_type}' expects {expected} input, " - f"got {type(resolved_input).__name__}" + f"Input type mismatch: completion type '{completion_type}' expects " + f"'{label}' input, but received {actual_label}.{hint}" ) + + if isinstance(resolved_input, list): + element_type = rule.get("element_type") + if element_type: + for item in resolved_input: + if not isinstance(item, element_type): + return ( + f"Input type mismatch: completion type '{completion_type}' expects " + f"'{label}' input, but received '{_get_content_label(item)}' content. " + f"Please ensure the input type matches the completion type." + ) + + if isinstance(resolved_input, MultiModalInput): + for part in resolved_input.parts: + if not isinstance(part, MULTIMODAL_ALLOWED_PARTS): + return ( + f"Unsupported content in multimodal input: '{_get_content_label(part)}'. " + f"Multimodal supports text, image, and pdf only. Audio is not supported." + ) + return None diff --git a/backend/app/utils.py b/backend/app/utils.py index 29100adb6..7fb218077 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -519,8 +519,16 @@ def resolve_input( parts.extend(resolve_image_content(item)) elif isinstance(item, PDFInput): parts.extend(resolve_pdf_content(item)) + elif isinstance(item, AudioInput): + return ( + "", + "Audio input is not supported in multimodal. Please use completion type 'stt' for audio processing.", + ) else: - return "", f"Unsupported input type: {type(item)}" + return ( + "", + "Unsupported input type in multimodal list. Multimodal only supports text, image, and pdf inputs.", + ) return MultiModalInput(parts=parts), None else: From 42a001dda34385af0c18c92d7ef0ca8082ca6930 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 08:34:07 +0530 Subject: [PATCH 07/16] Refactor multimodal input handling: Introduce ContentPart type for better type management and update relevant classes to use it. 
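
A minimal sketch of the resolved shape after this refactor (names are
the ones introduced here; the expected dicts mirror the format_parts
output asserted by the tests in this series):

    from app.models.llm.request import ImageContent, TextContent
    from app.services.llm.providers.base import ContentPart, MultiModalInput
    from app.services.llm.providers.oai import OpenAIProvider

    parts: list[ContentPart] = [
        TextContent(value="describe this image"),
        ImageContent(format="base64", value="abc123", mime_type="image/png"),
    ]
    mm = MultiModalInput(parts=parts)

    # Providers consume mm.parts, not the wrapper itself.
    assert OpenAIProvider.format_parts(mm.parts) == [
        {"type": "input_text", "text": "describe this image"},
        {"type": "input_image", "image_url": "data:image/png;base64,abc123"},
    ]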
--- backend/app/models/llm/request.py | 33 ++++++++++++++++------ backend/app/services/llm/jobs.py | 1 - backend/app/services/llm/providers/base.py | 7 +++-- backend/app/services/llm/providers/gai.py | 10 +++---- backend/app/services/llm/providers/oai.py | 4 +-- backend/app/utils.py | 5 +--- 6 files changed, 36 insertions(+), 24 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 1c557d397..87e3ded9c 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -59,9 +59,9 @@ class TTSLLMParams(SQLModel): class ImageLLMParams(SQLModel): model: str instructions: str | None = None - response_format: Literal["text"] | None = Field( - None, - description="Currently supports text type", + knowledge_base_ids: list[str] | None = Field( + default=None, + description="List of vector store IDs to use for knowledge retrieval", ) temperature: float | None = Field( default=0.2, @@ -69,14 +69,19 @@ class ImageLLMParams(SQLModel): le=2.0, ) reasoning: Literal["low", "medium", "high"] | None = None + max_num_results: int | None = Field( + default=None, + ge=1, + description="Maximum number of candidate results to return", + ) class PDFLLMParams(SQLModel): model: str instructions: str | None = None - response_format: Literal["text"] | None = Field( - None, - description="Currently supports text type", + knowledge_base_ids: list[str] | None = Field( + default=None, + description="List of vector store IDs to use for knowledge retrieval", ) temperature: float | None = Field( default=0.2, @@ -84,14 +89,19 @@ class PDFLLMParams(SQLModel): le=2.0, ) reasoning: Literal["low", "medium", "high"] | None = None + max_num_results: int | None = Field( + default=None, + ge=1, + description="Maximum number of candidate results to return", + ) class MultimodalLLMParams(SQLModel): model: str instructions: str | None = None - response_format: Literal["text"] | None = Field( - None, - description="Currently supports text type", + knowledge_base_ids: list[str] | None = Field( + default=None, + description="List of vector store IDs to use for knowledge retrieval", ) temperature: float | None = Field( default=0.2, @@ -99,6 +109,11 @@ class MultimodalLLMParams(SQLModel): le=2.0, ) reasoning: Literal["low", "medium", "high"] | None = None + max_num_results: int | None = Field( + default=None, + ge=1, + description="Maximum number of candidate results to return", + ) KaapiLLMParams = Union[ diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 511516139..4ea45ba22 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -114,7 +114,6 @@ def resolved_input_context( even if errors occur during LLM execution. 
""" resolved_input, error = resolve_input(query_input) - print(f"Resolved input: {resolved_input}, error: {error}") if error: raise ValueError(error) diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 5fc75e711..959494f6f 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -13,13 +13,14 @@ from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent +ContentPart = TextContent | ImageContent | PDFContent MULTIMODAL_ALLOWED_PARTS = (TextContent, ImageContent, PDFContent) class MultiModalInput(SQLModel): """Resolved multimodal input containing a list of content parts.""" - parts: list[TextContent | ImageContent | PDFContent] + parts: list[ContentPart] @model_validator(mode="after") def validate_parts(self): @@ -71,7 +72,7 @@ def validate_completion_input(completion_type: str, resolved_input: Any) -> str hint = ( " Please set completion type to 'multimodal' when sending mixed input types." if isinstance(resolved_input, MultiModalInput) - else f" Please ensure the input type matches the completion type." + else " Please ensure the input type matches the completion type." ) return ( f"Input type mismatch: completion type '{completion_type}' expects " @@ -134,7 +135,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str | list[TextContent | ImageContent | PDFContent], + resolved_input: str | list[ContentPart], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: """Execute LLM API call. diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 342e92eb3..fe920ba07 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -24,7 +24,7 @@ PDFContent, ) from app.models.llm.response import AudioOutput, AudioContent -from app.services.llm.providers.base import BaseProvider, MultiModalInput +from app.services.llm.providers.base import BaseProvider, ContentPart, MultiModalInput from app.core.audio_utils import convert_pcm_to_mp3, convert_pcm_to_ogg logger = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def create_client(credentials: dict[str, Any]) -> Any: @staticmethod def format_parts( - parts: list[TextContent | ImageContent | PDFContent], + parts: list[ContentPart], ) -> list[dict]: items = [] for part in parts: @@ -419,7 +419,7 @@ def _execute_image( reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 else: logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + f"[GoogleAIProvider._execute_image] Response missing usage_metadata, using zeros" ) input_tokens = 0 output_tokens = 0 @@ -564,7 +564,7 @@ def _execute_text( reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 else: logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + f"[GoogleAIProvider._execute_text] Response missing usage_metadata, using zeros" ) input_tokens = 0 output_tokens = 0 @@ -635,7 +635,7 @@ def _execute_multimodal( reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 else: logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + f"[GoogleAIProvider._execute_multimodal] Response missing usage_metadata, using zeros" ) input_tokens = 0 output_tokens = 
0 diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index 2f05a3aab..392487eea 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -16,7 +16,7 @@ ImageContent, PDFContent, ) -from app.services.llm.providers.base import BaseProvider, MultiModalInput +from app.services.llm.providers.base import BaseProvider, ContentPart, MultiModalInput logger = logging.getLogger(__name__) @@ -39,7 +39,7 @@ def create_client(credentials: dict[str, Any]) -> Any: @staticmethod def format_parts( - parts: list[TextContent | ImageContent | PDFContent], + parts: list[ContentPart], ) -> list[dict]: items = [] for part in parts: diff --git a/backend/app/utils.py b/backend/app/utils.py index 7fb218077..9c1be2a11 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -33,18 +33,15 @@ AudioInput, ImageInput, PDFInput, - TextContent, - AudioContent, ImageContent, PDFContent, ) -from app.services.llm.providers.base import MultiModalInput +from app.services.llm.providers.base import ContentPart, MultiModalInput logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) T = TypeVar("T") -ContentPart = TextContent | AudioContent | ImageContent | PDFContent class APIResponse(BaseModel, Generic[T]): From 335e59b3e5e59f466ee921ad3670f2a645402189 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 09:49:11 +0530 Subject: [PATCH 08/16] Add comprehensive tests for multimodal input validation and processing --- .../app/tests/services/llm/test_multimodal.py | 450 ++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100644 backend/app/tests/services/llm/test_multimodal.py diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py new file mode 100644 index 000000000..9744c82ce --- /dev/null +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -0,0 +1,450 @@ +import pytest + +from app.models.llm.request import ( + TextInput, + AudioInput, + ImageInput, + PDFInput, + TextContent, + AudioContent, + ImageContent, + PDFContent, +) +from app.services.llm.providers.base import ( + ContentPart, + MultiModalInput, + validate_completion_input, + _get_content_label, +) +from app.services.llm.providers.oai import OpenAIProvider +from app.services.llm.providers.gai import GoogleAIProvider +from app.utils import ( + resolve_input, + resolve_image_content, + resolve_pdf_content, +) + + +class TestValidateCompletionInput: + def test_text_with_str_passes(self): + assert validate_completion_input("text", "hello") is None + + def test_stt_with_str_passes(self): + assert validate_completion_input("stt", "/tmp/audio.wav") is None + + def test_tts_with_str_passes(self): + assert validate_completion_input("tts", "say this") is None + + def test_image_with_image_content_list_passes(self): + parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] + assert validate_completion_input("image", parts) is None + + def test_pdf_with_pdf_content_list_passes(self): + parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] + assert validate_completion_input("pdf", parts) is None + + def test_multimodal_with_multimodal_input_passes(self): + mm = MultiModalInput( + parts=[ + TextContent(value="hello"), + ImageContent(format="base64", value="abc", mime_type="image/png"), + ] + ) + assert validate_completion_input("multimodal", mm) is None + + def 
test_text_input_with_pdf_completion_fails(self): + error = validate_completion_input("pdf", "some text") + assert error is not None + assert "input type mismatch" in error.lower() + assert "'pdf'" in error + assert "text" in error + + def test_multimodal_input_with_image_completion_fails(self): + mm = MultiModalInput( + parts=[ + TextContent(value="hello"), + ImageContent(format="base64", value="abc", mime_type="image/png"), + ] + ) + error = validate_completion_input("image", mm) + assert error is not None + assert "multimodal" in error.lower() + assert "set completion type to 'multimodal'" in error + + def test_text_input_with_image_completion_no_multimodal_hint(self): + error = validate_completion_input("image", "some text") + assert error is not None + assert "set completion type to 'multimodal'" not in error + assert "Please ensure the input type matches" in error + + def test_pdf_content_in_image_completion_fails(self): + parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] + error = validate_completion_input("image", parts) + assert error is not None + assert "'pdf'" in error + + def test_image_content_in_pdf_completion_fails(self): + parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] + error = validate_completion_input("pdf", parts) + assert error is not None + assert "'image'" in error + + def test_unknown_completion_type(self): + error = validate_completion_input("unknown_type", "hello") + assert error is not None + assert "Unknown completion type" in error + + def test_list_input_with_text_completion_fails(self): + parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] + error = validate_completion_input("text", parts) + assert error is not None + assert "text" in error + + +class TestMultiModalInput: + def test_valid_parts(self): + mm = MultiModalInput( + parts=[ + TextContent(value="hello"), + ImageContent(format="base64", value="abc", mime_type="image/png"), + PDFContent(format="base64", value="abc", mime_type="application/pdf"), + ] + ) + assert len(mm.parts) == 3 + + def test_empty_parts_raises(self): + with pytest.raises(Exception): + MultiModalInput(parts=[]) + + def test_single_text_part(self): + mm = MultiModalInput(parts=[TextContent(value="only text")]) + assert len(mm.parts) == 1 + + +class TestGetContentLabel: + def test_text_content(self): + assert _get_content_label(TextContent(value="hi")) == "text" + + def test_image_content(self): + assert ( + _get_content_label( + ImageContent(format="base64", value="abc", mime_type="image/png") + ) + == "image" + ) + + def test_pdf_content(self): + assert ( + _get_content_label( + PDFContent(format="base64", value="abc", mime_type="application/pdf") + ) + == "pdf" + ) + + def test_audio_content(self): + assert ( + _get_content_label(AudioContent(value="abc", mime_type="audio/wav")) + == "audio" + ) + + +class TestResolveInputMultimodal: + def test_image_input_returns_image_content_list(self): + img = ImageInput( + content=ImageContent(format="base64", value="abc", mime_type="image/png") + ) + result, error = resolve_input(img) + assert error is None + assert isinstance(result, list) + assert len(result) == 1 + assert isinstance(result[0], ImageContent) + + def test_pdf_input_returns_pdf_content_list(self): + pdf = PDFInput( + content=PDFContent( + format="base64", value="abc", mime_type="application/pdf" + ) + ) + result, error = resolve_input(pdf) + assert error is None + assert isinstance(result, list) + assert len(result) == 1 + assert 
isinstance(result[0], PDFContent) + + def test_multimodal_list_returns_multimodal_input(self): + inputs = [ + TextInput(content=TextContent(value="describe")), + ImageInput( + content=ImageContent( + format="base64", value="abc", mime_type="image/png" + ) + ), + ] + result, error = resolve_input(inputs) + assert error is None + assert isinstance(result, MultiModalInput) + assert len(result.parts) == 2 + + def test_multimodal_list_with_pdf(self): + inputs = [ + TextInput(content=TextContent(value="analyze")), + PDFInput( + content=PDFContent( + format="base64", value="abc", mime_type="application/pdf" + ) + ), + ] + result, error = resolve_input(inputs) + assert error is None + assert isinstance(result, MultiModalInput) + assert len(result.parts) == 2 + + def test_multimodal_list_with_audio_rejected(self): + inputs = [ + TextInput(content=TextContent(value="hello")), + AudioInput(content=AudioContent(value="abc", mime_type="audio/wav")), + ] + result, error = resolve_input(inputs) + assert error is not None + assert "audio" in error.lower() + assert "stt" in error.lower() + + def test_image_input_default_mime_type(self): + img = ImageInput(content=ImageContent(format="base64", value="abc")) + result, error = resolve_input(img) + assert error is None + assert result[0].mime_type == "image/png" + + def test_pdf_input_default_mime_type(self): + pdf = PDFInput(content=PDFContent(format="base64", value="abc")) + result, error = resolve_input(pdf) + assert error is None + assert result[0].mime_type == "application/pdf" + + def test_image_input_multiple_contents(self): + img = ImageInput( + content=[ + ImageContent(format="base64", value="abc1", mime_type="image/png"), + ImageContent( + format="url", + value="https://example.com/img.jpg", + mime_type="image/jpeg", + ), + ] + ) + result, error = resolve_input(img) + assert error is None + assert len(result) == 2 + + def test_multimodal_mixed_types_in_parts(self): + inputs = [ + TextInput(content=TextContent(value="look at these")), + ImageInput( + content=ImageContent( + format="base64", value="img", mime_type="image/png" + ) + ), + PDFInput( + content=PDFContent( + format="base64", value="pdf", mime_type="application/pdf" + ) + ), + ] + result, error = resolve_input(inputs) + assert error is None + assert isinstance(result, MultiModalInput) + assert len(result.parts) == 3 + assert isinstance(result.parts[0], TextContent) + assert isinstance(result.parts[1], ImageContent) + assert isinstance(result.parts[2], PDFContent) + + +class TestOpenAIFormatParts: + def test_text_part(self): + parts = [TextContent(value="hello")] + result = OpenAIProvider.format_parts(parts) + assert result == [{"type": "input_text", "text": "hello"}] + + def test_image_base64_part(self): + parts = [ImageContent(format="base64", value="abc123", mime_type="image/png")] + result = OpenAIProvider.format_parts(parts) + assert len(result) == 1 + assert result[0]["type"] == "input_image" + assert result[0]["image_url"] == "data:image/png;base64,abc123" + + def test_image_url_part(self): + parts = [ + ImageContent( + format="url", + value="https://example.com/img.jpg", + mime_type="image/jpeg", + ) + ] + result = OpenAIProvider.format_parts(parts) + assert result[0]["type"] == "input_image" + assert result[0]["image_url"] == "https://example.com/img.jpg" + + def test_pdf_base64_part(self): + parts = [ + PDFContent(format="base64", value="pdf123", mime_type="application/pdf") + ] + result = OpenAIProvider.format_parts(parts) + assert len(result) == 1 + assert result[0]["type"] == 
"input_file" + assert result[0]["file_url"] == "data:application/pdf;base64,pdf123" + + def test_pdf_url_part(self): + parts = [ + PDFContent( + format="url", + value="https://example.com/doc.pdf", + mime_type="application/pdf", + ) + ] + result = OpenAIProvider.format_parts(parts) + assert result[0]["type"] == "input_file" + assert result[0]["file_url"] == "https://example.com/doc.pdf" + + def test_mixed_parts(self): + parts = [ + TextContent(value="describe"), + ImageContent(format="base64", value="img", mime_type="image/png"), + PDFContent( + format="url", + value="https://example.com/doc.pdf", + mime_type="application/pdf", + ), + ] + result = OpenAIProvider.format_parts(parts) + assert len(result) == 3 + assert result[0]["type"] == "input_text" + assert result[1]["type"] == "input_image" + assert result[2]["type"] == "input_file" + + +class TestGoogleAIFormatParts: + def test_text_part(self): + parts = [TextContent(value="hello")] + result = GoogleAIProvider.format_parts(parts) + assert result == [{"text": "hello"}] + + def test_image_base64_part(self): + parts = [ImageContent(format="base64", value="abc123", mime_type="image/png")] + result = GoogleAIProvider.format_parts(parts) + assert len(result) == 1 + assert result[0] == { + "inline_data": {"data": "abc123", "mime_type": "image/png"} + } + + def test_image_url_part(self): + parts = [ + ImageContent( + format="url", + value="https://example.com/img.jpg", + mime_type="image/jpeg", + ) + ] + result = GoogleAIProvider.format_parts(parts) + assert result[0] == { + "file_data": { + "file_uri": "https://example.com/img.jpg", + "mime_type": "image/jpeg", + "display_name": None, + } + } + + def test_pdf_base64_part(self): + parts = [ + PDFContent(format="base64", value="pdf123", mime_type="application/pdf") + ] + result = GoogleAIProvider.format_parts(parts) + assert result[0] == { + "inline_data": {"data": "pdf123", "mime_type": "application/pdf"} + } + + def test_pdf_url_part(self): + parts = [ + PDFContent( + format="url", + value="https://example.com/doc.pdf", + mime_type="application/pdf", + ) + ] + result = GoogleAIProvider.format_parts(parts) + assert result[0] == { + "file_data": { + "file_uri": "https://example.com/doc.pdf", + "mime_type": "application/pdf", + "display_name": None, + } + } + + def test_mixed_parts(self): + parts = [ + TextContent(value="analyze"), + ImageContent( + format="url", value="https://img.com/a.jpg", mime_type="image/jpeg" + ), + PDFContent(format="base64", value="pdf", mime_type="application/pdf"), + ] + result = GoogleAIProvider.format_parts(parts) + assert len(result) == 3 + assert "text" in result[0] + assert "file_data" in result[1] + assert "inline_data" in result[2] + + +class TestResolveImageContent: + def test_single_content(self): + img = ImageInput( + content=ImageContent(format="base64", value="abc", mime_type="image/png") + ) + result = resolve_image_content(img) + assert len(result) == 1 + assert result[0].mime_type == "image/png" + + def test_default_mime_type(self): + img = ImageInput(content=ImageContent(format="base64", value="abc")) + result = resolve_image_content(img) + assert result[0].mime_type == "image/png" + + def test_list_content(self): + img = ImageInput( + content=[ + ImageContent(format="base64", value="a", mime_type="image/png"), + ImageContent(format="base64", value="b", mime_type="image/jpeg"), + ] + ) + result = resolve_image_content(img) + assert len(result) == 2 + + +class TestResolvePdfContent: + def test_single_content(self): + pdf = PDFInput( + content=PDFContent( 
+                format="base64", value="abc", mime_type="application/pdf"
+            )
+        )
+        result = resolve_pdf_content(pdf)
+        assert len(result) == 1
+        assert result[0].mime_type == "application/pdf"
+
+    def test_default_mime_type(self):
+        pdf = PDFInput(content=PDFContent(format="base64", value="abc"))
+        result = resolve_pdf_content(pdf)
+        assert result[0].mime_type == "application/pdf"
+
+    def test_list_content(self):
+        pdf = PDFInput(
+            content=[
+                PDFContent(format="base64", value="a", mime_type="application/pdf"),
+                PDFContent(
+                    format="url",
+                    value="https://example.com/doc.pdf",
+                    mime_type="application/pdf",
+                ),
+            ]
+        )
+        result = resolve_pdf_content(pdf)
+        assert len(result) == 2

From 56c7a44164e69098d37389461eed4bbe9042cd9f Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Thu, 26 Feb 2026 17:16:21 +0530
Subject: [PATCH 09/16] Remove multimodal, image, and pdf LLM params from both
 input type and completion type

---
 backend/app/models/llm/request.py             | 72 +-----------
 backend/app/services/llm/jobs.py              | 13 ---
 backend/app/services/llm/providers/base.py    | 77 +------------
 .../app/tests/services/llm/test_multimodal.py | 105 ------------------
 4 files changed, 5 insertions(+), 262 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index 87e3ded9c..778039b62 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -56,73 +56,10 @@ class TTSLLMParams(SQLModel):
     response_format: Literal["mp3", "wav", "ogg"] | None = "wav"
 
 
-class ImageLLMParams(SQLModel):
-    model: str
-    instructions: str | None = None
-    knowledge_base_ids: list[str] | None = Field(
-        default=None,
-        description="List of vector store IDs to use for knowledge retrieval",
-    )
-    temperature: float | None = Field(
-        default=0.2,
-        ge=0.0,
-        le=2.0,
-    )
-    reasoning: Literal["low", "medium", "high"] | None = None
-    max_num_results: int | None = Field(
-        default=None,
-        ge=1,
-        description="Maximum number of candidate results to return",
-    )
-
-
-class PDFLLMParams(SQLModel):
-    model: str
-    instructions: str | None = None
-    knowledge_base_ids: list[str] | None = Field(
-        default=None,
-        description="List of vector store IDs to use for knowledge retrieval",
-    )
-    temperature: float | None = Field(
-        default=0.2,
-        ge=0.0,
-        le=2.0,
-    )
-    reasoning: Literal["low", "medium", "high"] | None = None
-    max_num_results: int | None = Field(
-        default=None,
-        ge=1,
-        description="Maximum number of candidate results to return",
-    )
-
-
-class MultimodalLLMParams(SQLModel):
-    model: str
-    instructions: str | None = None
-    knowledge_base_ids: list[str] | None = Field(
-        default=None,
-        description="List of vector store IDs to use for knowledge retrieval",
-    )
-    temperature: float | None = Field(
-        default=0.2,
-        ge=0.0,
-        le=2.0,
-    )
-    reasoning: Literal["low", "medium", "high"] | None = None
-    max_num_results: int | None = Field(
-        default=None,
-        ge=1,
-        description="Maximum number of candidate results to return",
-    )
-
-
 KaapiLLMParams = Union[
     TextLLMParams,
     STTLLMParams,
     TTSLLMParams,
-    ImageLLMParams,
-    PDFLLMParams,
-    MultimodalLLMParams,
 ]
 
 
@@ -277,7 +214,7 @@ class KaapiCompletionConfig(SQLModel):
         ..., description="LLM provider (openai)"
     )
 
-    type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field(
+    type: Literal["text", "stt", "tts"] = Field(
         ..., description="Completion config type.
Params schema varies by type" ) params: dict[str, Any] = Field( @@ -292,9 +229,6 @@ def validate_params(self): "text": TextLLMParams, "stt": STTLLMParams, "tts": TTSLLMParams, - "image": ImageLLMParams, - "pdf": PDFLLMParams, - "multimodal": MultimodalLLMParams, } model_class = param_models[self.type] validated = model_class.model_validate(self.params) @@ -491,12 +425,12 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( + input_type: Literal["text", "audio", "image"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image, pdf, multimodal", + comment="Input type: text, audio, image", ), ) diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 4ea45ba22..5cdc0d32b 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -29,7 +29,6 @@ run_guardrails_validation, ) from app.services.llm.providers.registry import get_llm_provider -from app.services.llm.providers.base import validate_completion_input from app.services.llm.mappers import transform_kaapi_config_to_native from app.utils import APIResponse, send_callback, resolve_input, cleanup_temp_file @@ -396,18 +395,6 @@ def execute_job( # Resolve input and execute LLM (context manager handles cleanup) try: with resolved_input_context(request.query.input) as resolved_input: - mismatch = validate_completion_input( - completion_config.type, resolved_input - ) - if mismatch: - callback_response = APIResponse.failure_response( - error=mismatch, - metadata=request.request_metadata, - ) - return handle_job_error( - job_uuid, callback_url_str, callback_response - ) - response, error = decorated_execute( completion_config=completion_config, query=request.query, diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 959494f6f..f159f0f1c 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -5,16 +5,15 @@ """ from abc import ABC, abstractmethod -from typing import Any, Literal +from typing import Any from pydantic import model_validator from sqlmodel import SQLModel from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams -from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent +from app.models.llm.request import TextContent, ImageContent, PDFContent ContentPart = TextContent | ImageContent | PDFContent -MULTIMODAL_ALLOWED_PARTS = (TextContent, ImageContent, PDFContent) class MultiModalInput(SQLModel): @@ -29,78 +28,6 @@ def validate_parts(self): return self -CONTENT_TYPE_LABEL: dict[type, str] = { - TextContent: "text", - AudioContent: "audio", - ImageContent: "image", - PDFContent: "pdf", -} - -INPUT_TYPE_LABEL: dict[type, str] = { - str: "text", - list: "list", - MultiModalInput: "multimodal (mixed input types)", -} - -COMPLETION_TYPE_RULES: dict[str, dict] = { - "text": {"type": str, "label": "text"}, - "stt": {"type": str, "label": "audio"}, - "tts": {"type": str, "label": "text"}, - "image": {"type": list, "element_type": ImageContent, "label": "image"}, - "pdf": {"type": list, "element_type": PDFContent, "label": "pdf"}, - "multimodal": {"type": MultiModalInput, "label": "multimodal"}, -} - - -def _get_content_label(content: Any) -> str: - return CONTENT_TYPE_LABEL.get(type(content), type(content).__name__) - - -def validate_completion_input(completion_type: str, resolved_input: Any) -> str | None: - 
"""Returns error message if input type doesn't match completion type, else None.""" - rule = COMPLETION_TYPE_RULES.get(completion_type) - if rule is None: - return f"Unknown completion type: '{completion_type}'" - - expected_type = rule["type"] - label = rule["label"] - - if not isinstance(resolved_input, expected_type): - actual_label = INPUT_TYPE_LABEL.get( - type(resolved_input), type(resolved_input).__name__ - ) - hint = ( - " Please set completion type to 'multimodal' when sending mixed input types." - if isinstance(resolved_input, MultiModalInput) - else " Please ensure the input type matches the completion type." - ) - return ( - f"Input type mismatch: completion type '{completion_type}' expects " - f"'{label}' input, but received {actual_label}.{hint}" - ) - - if isinstance(resolved_input, list): - element_type = rule.get("element_type") - if element_type: - for item in resolved_input: - if not isinstance(item, element_type): - return ( - f"Input type mismatch: completion type '{completion_type}' expects " - f"'{label}' input, but received '{_get_content_label(item)}' content. " - f"Please ensure the input type matches the completion type." - ) - - if isinstance(resolved_input, MultiModalInput): - for part in resolved_input.parts: - if not isinstance(part, MULTIMODAL_ALLOWED_PARTS): - return ( - f"Unsupported content in multimodal input: '{_get_content_label(part)}'. " - f"Multimodal supports text, image, and pdf only. Audio is not supported." - ) - - return None - - class BaseProvider(ABC): """Abstract base class for LLM providers. diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py index 9744c82ce..5d019ce95 100644 --- a/backend/app/tests/services/llm/test_multimodal.py +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -13,8 +13,6 @@ from app.services.llm.providers.base import ( ContentPart, MultiModalInput, - validate_completion_input, - _get_content_label, ) from app.services.llm.providers.oai import OpenAIProvider from app.services.llm.providers.gai import GoogleAIProvider @@ -25,82 +23,6 @@ ) -class TestValidateCompletionInput: - def test_text_with_str_passes(self): - assert validate_completion_input("text", "hello") is None - - def test_stt_with_str_passes(self): - assert validate_completion_input("stt", "/tmp/audio.wav") is None - - def test_tts_with_str_passes(self): - assert validate_completion_input("tts", "say this") is None - - def test_image_with_image_content_list_passes(self): - parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] - assert validate_completion_input("image", parts) is None - - def test_pdf_with_pdf_content_list_passes(self): - parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] - assert validate_completion_input("pdf", parts) is None - - def test_multimodal_with_multimodal_input_passes(self): - mm = MultiModalInput( - parts=[ - TextContent(value="hello"), - ImageContent(format="base64", value="abc", mime_type="image/png"), - ] - ) - assert validate_completion_input("multimodal", mm) is None - - def test_text_input_with_pdf_completion_fails(self): - error = validate_completion_input("pdf", "some text") - assert error is not None - assert "input type mismatch" in error.lower() - assert "'pdf'" in error - assert "text" in error - - def test_multimodal_input_with_image_completion_fails(self): - mm = MultiModalInput( - parts=[ - TextContent(value="hello"), - ImageContent(format="base64", value="abc", mime_type="image/png"), - ] - ) - 
error = validate_completion_input("image", mm) - assert error is not None - assert "multimodal" in error.lower() - assert "set completion type to 'multimodal'" in error - - def test_text_input_with_image_completion_no_multimodal_hint(self): - error = validate_completion_input("image", "some text") - assert error is not None - assert "set completion type to 'multimodal'" not in error - assert "Please ensure the input type matches" in error - - def test_pdf_content_in_image_completion_fails(self): - parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] - error = validate_completion_input("image", parts) - assert error is not None - assert "'pdf'" in error - - def test_image_content_in_pdf_completion_fails(self): - parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] - error = validate_completion_input("pdf", parts) - assert error is not None - assert "'image'" in error - - def test_unknown_completion_type(self): - error = validate_completion_input("unknown_type", "hello") - assert error is not None - assert "Unknown completion type" in error - - def test_list_input_with_text_completion_fails(self): - parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] - error = validate_completion_input("text", parts) - assert error is not None - assert "text" in error - - class TestMultiModalInput: def test_valid_parts(self): mm = MultiModalInput( @@ -121,33 +43,6 @@ def test_single_text_part(self): assert len(mm.parts) == 1 -class TestGetContentLabel: - def test_text_content(self): - assert _get_content_label(TextContent(value="hi")) == "text" - - def test_image_content(self): - assert ( - _get_content_label( - ImageContent(format="base64", value="abc", mime_type="image/png") - ) - == "image" - ) - - def test_pdf_content(self): - assert ( - _get_content_label( - PDFContent(format="base64", value="abc", mime_type="application/pdf") - ) - == "pdf" - ) - - def test_audio_content(self): - assert ( - _get_content_label(AudioContent(value="abc", mime_type="audio/wav")) - == "audio" - ) - - class TestResolveInputMultimodal: def test_image_input_returns_image_content_list(self): img = ImageInput( From 54076f32972da46c1821bf186d294bb7897a9c62 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 17:28:53 +0530 Subject: [PATCH 10/16] added the table reference for image, pdf and multimodal --- backend/app/crud/llm.py | 15 ++++++++++++++- backend/app/models/llm/request.py | 5 +++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/backend/app/crud/llm.py b/backend/app/crud/llm.py index c1e01e7e7..360bab4f2 100644 --- a/backend/app/crud/llm.py +++ b/backend/app/crud/llm.py @@ -11,6 +11,8 @@ TextInput, AudioInput, QueryInput, + ImageInput, + PDFInput, ) logger = logging.getLogger(__name__) @@ -73,15 +75,26 @@ def create_llm_call( else getattr(completion_config.params, "type", "text") ) - input_type: Literal["text", "audio", "image"] + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] output_type: Literal["text", "audio", "image"] | None + query_input = request.query.input + if completion_type == "stt": input_type = "audio" output_type = "text" elif completion_type == "tts": input_type = "text" output_type = "audio" + elif isinstance(query_input, ImageInput): + input_type = "image" + output_type = "text" + elif isinstance(query_input, PDFInput): + input_type = "pdf" + output_type = "text" + elif isinstance(query_input, list): + input_type = "multimodal" + 
output_type = "text" else: input_type = "text" output_type = "text" diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 778039b62..71e5c1480 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -425,12 +425,13 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image"] = Field( + # NOTE: image, pdf, multimodal are internal labels stored in the table not user facing. + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image", + comment="Input type: text, audio, image, pdf, multimodal", ), ) From 2c76a4aa9e86f2f7088b7ac63c9bf69e534aa87c Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 28 Feb 2026 13:24:43 +0530 Subject: [PATCH 11/16] Remove completion_type for image, pdf, and multimodal types in NativeCompletionConfig and related methods in GoogleAIProvider --- backend/app/models/llm/request.py | 2 +- backend/app/services/llm/providers/gai.py | 241 +--------------------- 2 files changed, 6 insertions(+), 237 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 71e5c1480..57ccf2740 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -198,7 +198,7 @@ class NativeCompletionConfig(SQLModel): ..., description="Provider-specific parameters (schema varies by provider), should exactly match the provider's endpoint params structure", ) - type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field( + type: Literal["text", "stt", "tts"] = Field( ..., description="Completion config type. Params schema varies by type" ) diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index fe920ba07..db9dc6e20 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -377,152 +377,10 @@ def _execute_tts( return llm_response, None - def _execute_image( - self, - completion_config: NativeCompletionConfig, - resolved_input: list[ImageContent], - include_provider_raw_response: bool = False, - ) -> tuple[LLMCallResponse | None, str | None]: - model = completion_config.params.get("model") - if not model: - return None, "Missing 'model' in native params" - - gemini_parts = self.format_parts(resolved_input) - contents = [{"role": "user", "parts": gemini_parts}] - - instructions = completion_config.params.get("instructions", "") - temperature = completion_config.params.get("temperature", None) - thinking_level = completion_config.params.get("reasoning", None) - - generation_kwargs = {} - if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) - - if temperature is not None: - generation_kwargs["temperature"] = temperature - - if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig( - include_thoughts=False, thinking_level=thinking_level - ) - - response = self.client.models.generate_content( - model=model, - contents=contents, - config=GenerateContentConfig(**generation_kwargs), - ) - - if response.usage_metadata: - input_tokens = response.usage_metadata.prompt_token_count or 0 - output_tokens = response.usage_metadata.candidates_token_count or 0 - total_tokens = response.usage_metadata.total_token_count or 0 - reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 - else: - 
logger.warning( - f"[GoogleAIProvider._execute_image] Response missing usage_metadata, using zeros" - ) - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - reasoning_tokens = 0 - - llm_response = LLMCallResponse( - response=LLMResponse( - provider_response_id=response.response_id, - model=response.model_version or model, - provider=completion_config.provider, - output=TextOutput(content=TextContent(value=response.text)), - ), - usage=Usage( - input_tokens=input_tokens, - output_tokens=output_tokens, - total_tokens=total_tokens, - reasoning_tokens=reasoning_tokens, - ), - ) - if include_provider_raw_response: - llm_response.provider_raw_response = response.model_dump(mode="json") - - logger.info( - f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" - ) - return llm_response, None - - def _execute_pdf( - self, - completion_config: NativeCompletionConfig, - resolved_input: list[PDFContent], - include_provider_raw_response: bool = False, - ) -> tuple[LLMCallResponse | None, str | None]: - model = completion_config.params.get("model") - if not model: - return None, "Missing 'model' in native params" - - gemini_parts = self.format_parts(resolved_input) - contents = [{"role": "user", "parts": gemini_parts}] - - instructions = completion_config.params.get("instructions", "") - temperature = completion_config.params.get("temperature", None) - thinking_level = completion_config.params.get("reasoning", None) - - generation_kwargs = {} - if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) - - if temperature is not None: - generation_kwargs["temperature"] = temperature - - if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig( - include_thoughts=False, thinking_level=thinking_level - ) - - response = self.client.models.generate_content( - model=model, - contents=contents, - config=GenerateContentConfig(**generation_kwargs), - ) - - if response.usage_metadata: - input_tokens = response.usage_metadata.prompt_token_count or 0 - output_tokens = response.usage_metadata.candidates_token_count or 0 - total_tokens = response.usage_metadata.total_token_count or 0 - reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 - else: - logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" - ) - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - reasoning_tokens = 0 - - llm_response = LLMCallResponse( - response=LLMResponse( - provider_response_id=response.response_id, - model=response.model_version or model, - provider=completion_config.provider, - output=TextOutput(content=TextContent(value=response.text)), - ), - usage=Usage( - input_tokens=input_tokens, - output_tokens=output_tokens, - total_tokens=total_tokens, - reasoning_tokens=reasoning_tokens, - ), - ) - if include_provider_raw_response: - llm_response.provider_raw_response = response.model_dump(mode="json") - - logger.info( - f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" - ) - return llm_response, None - def _execute_text( self, completion_config: NativeCompletionConfig, - resolved_input: str | MultiModalInput, + resolved_input: str | list[ContentPart] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") @@ -532,6 +390,9 @@ def _execute_text( if isinstance(resolved_input, MultiModalInput): gemini_parts = 
self.format_parts(resolved_input.parts) contents = [{"role": "user", "parts": gemini_parts}] + elif isinstance(resolved_input, list): + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] else: contents = [{"role": "user", "parts": [{"text": resolved_input}]}] @@ -593,82 +454,11 @@ def _execute_text( ) return llm_response, None - def _execute_multimodal( - self, - completion_config: NativeCompletionConfig, - resolved_input: MultiModalInput, - include_provider_raw_response: bool = False, - ) -> tuple[LLMCallResponse | None, str | None]: - model = completion_config.params.get("model") - if not model: - return None, "Missing 'model' in native params" - - gemini_parts = self.format_parts(resolved_input.parts) - contents = [{"role": "user", "parts": gemini_parts}] - - instructions = completion_config.params.get("instructions", "") - temperature = completion_config.params.get("temperature", None) - thinking_level = completion_config.params.get("reasoning", None) - - generation_kwargs = {} - if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) - - if temperature is not None: - generation_kwargs["temperature"] = temperature - - if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig( - include_thoughts=False, thinking_level=thinking_level - ) - - response = self.client.models.generate_content( - model=model, - contents=contents, - config=GenerateContentConfig(**generation_kwargs), - ) - - if response.usage_metadata: - input_tokens = response.usage_metadata.prompt_token_count or 0 - output_tokens = response.usage_metadata.candidates_token_count or 0 - total_tokens = response.usage_metadata.total_token_count or 0 - reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 - else: - logger.warning( - f"[GoogleAIProvider._execute_multimodal] Response missing usage_metadata, using zeros" - ) - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - reasoning_tokens = 0 - - llm_response = LLMCallResponse( - response=LLMResponse( - provider_response_id=response.response_id, - model=response.model_version or model, - provider=completion_config.provider, - output=TextOutput(content=TextContent(value=response.text)), - ), - usage=Usage( - input_tokens=input_tokens, - output_tokens=output_tokens, - total_tokens=total_tokens, - reasoning_tokens=reasoning_tokens, - ), - ) - if include_provider_raw_response: - llm_response.provider_raw_response = response.model_dump(mode="json") - - logger.info( - f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" - ) - return llm_response, None - def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str | list[ImageContent] | list[PDFContent] | MultiModalInput, + resolved_input: str | list[ContentPart] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: try: @@ -693,27 +483,6 @@ def execute( include_provider_raw_response=include_provider_raw_response, ) - elif completion_type == "image": - return self._execute_image( - completion_config=completion_config, - resolved_input=resolved_input, - include_provider_raw_response=include_provider_raw_response, - ) - - elif completion_type == "pdf": - return self._execute_pdf( - completion_config=completion_config, - resolved_input=resolved_input, - include_provider_raw_response=include_provider_raw_response, - ) - - elif completion_type == "multimodal": - 
return self._execute_multimodal( - completion_config=completion_config, - resolved_input=resolved_input, - include_provider_raw_response=include_provider_raw_response, - ) - except TypeError as e: # handle unexpected arguments gracefully error_message = f"Invalid or unexpected parameter in Config: {str(e)}" From 2ed9af916849ee8048f2481dc9bcb0744d37a103 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 18:14:41 +0530 Subject: [PATCH 12/16] Refactor credential patching in LLM provider tests and update reasoning parameter handling in Google params mapping tests --- .../services/llm/providers/test_registry.py | 8 ++---- .../app/tests/services/llm/test_mappers.py | 26 +++++++------------ 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/backend/app/tests/services/llm/providers/test_registry.py b/backend/app/tests/services/llm/providers/test_registry.py index b3daa44c4..4349da107 100644 --- a/backend/app/tests/services/llm/providers/test_registry.py +++ b/backend/app/tests/services/llm/providers/test_registry.py @@ -40,9 +40,7 @@ def test_get_llm_provider_with_openai(self, db: Session): """Test getting OpenAI provider successfully.""" project = get_project(db) - with patch( - "app.services.llm.providers.registry.get_provider_credential" - ) as mock_get_creds: + with patch("app.crud.credentials.get_provider_credential") as mock_get_creds: mock_get_creds.return_value = {"api_key": "test-api-key"} provider = get_llm_provider( @@ -94,9 +92,7 @@ def test_get_llm_provider_with_missing_credentials(self, db: Session): """Test handling of errors when credentials are not found.""" project = get_project(db) - with patch( - "app.services.llm.providers.registry.get_provider_credential" - ) as mock_get_creds: + with patch("app.crud.credentials.get_provider_credential") as mock_get_creds: mock_get_creds.return_value = None with pytest.raises(ValueError) as exc_info: diff --git a/backend/app/tests/services/llm/test_mappers.py b/backend/app/tests/services/llm/test_mappers.py index 2ecbcd7b2..7a70cf46c 100644 --- a/backend/app/tests/services/llm/test_mappers.py +++ b/backend/app/tests/services/llm/test_mappers.py @@ -292,8 +292,7 @@ def test_knowledge_base_ids_warning(self): assert "knowledge_base_ids" in warnings[0].lower() assert "not supported" in warnings[0] - def test_reasoning_warning(self): - """Test that reasoning parameter is not supported and generates warning.""" + def test_reasoning_passed_through(self): kaapi_params = TextLLMParams( model="gemini-2.5-pro", reasoning="high", @@ -304,13 +303,10 @@ def test_reasoning_warning(self): ) assert result["model"] == "gemini-2.5-pro" - assert "reasoning" not in result - assert len(warnings) == 1 - assert "reasoning" in warnings[0].lower() - assert "not applicable" in warnings[0] + assert result["reasoning"] == "high" + assert len(warnings) == 0 - def test_multiple_unsupported_params(self): - """Test that multiple unsupported parameters generate multiple warnings.""" + def test_knowledge_base_ids_unsupported(self): kaapi_params = TextLLMParams( model="gemini-2.5-pro", reasoning="medium", @@ -322,13 +318,10 @@ def test_multiple_unsupported_params(self): ) assert result["model"] == "gemini-2.5-pro" - assert "reasoning" not in result + assert result["reasoning"] == "medium" assert "knowledge_base_ids" not in result - assert len(warnings) == 2 - # Check both warnings are present - warning_text = " ".join(warnings).lower() - assert "reasoning" in warning_text - assert 
"knowledge_base_ids" in warning_text + assert len(warnings) == 1 + assert "knowledge_base_ids" in warnings[0].lower() class TestTransformKaapiConfigToNative: @@ -476,7 +469,6 @@ def test_transform_google_config(self): assert warnings == [] def test_transform_google_with_unsupported_params(self): - """Test that Google transformation warns about unsupported parameters.""" kaapi_config = KaapiCompletionConfig( provider="google", type="text", @@ -491,6 +483,6 @@ def test_transform_google_with_unsupported_params(self): assert result.provider == "google-native" assert result.params["model"] == "gemini-2.5-pro" + assert result.params["reasoning"] == "high" assert "knowledge_base_ids" not in result.params - assert "reasoning" not in result.params - assert len(warnings) == 2 + assert len(warnings) == 1 From 2b265db6333c5508d89a2e78cd3a645b41fdec90 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 19:50:35 +0530 Subject: [PATCH 13/16] Add tests for edge cases in multimodal input handling and enhance OpenAI/Google AI execution routing --- .../app/tests/services/llm/test_multimodal.py | 190 ++++++++++++++++++ 1 file changed, 190 insertions(+) diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py index 5d019ce95..b5185995b 100644 --- a/backend/app/tests/services/llm/test_multimodal.py +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -1,4 +1,5 @@ import pytest +from unittest.mock import MagicMock from app.models.llm.request import ( TextInput, @@ -9,6 +10,8 @@ AudioContent, ImageContent, PDFContent, + NativeCompletionConfig, + QueryParams, ) from app.services.llm.providers.base import ( ContentPart, @@ -343,3 +346,190 @@ def test_list_content(self): ) result = resolve_pdf_content(pdf) assert len(result) == 2 + + +class TestResolveInputEdgeCases: + def test_unknown_input_type(self): + result, error = resolve_input(12345) + assert error is not None + assert "Unknown input type" in error + + def test_unsupported_type_in_multimodal_list(self): + result, error = resolve_input(["not_a_valid_input"]) + assert error is not None + assert "Unsupported input type" in error + + def test_text_input_resolves_string(self): + text = TextInput(content=TextContent(value="hello world")) + result, error = resolve_input(text) + assert error is None + assert result == "hello world" + + +class TestOpenAIExecuteInputRouting: + def _make_provider(self): + mock_client = MagicMock() + mock_resp = MagicMock() + mock_resp.id = "resp_123" + mock_resp.model = "gpt-4o-mini" + mock_resp.output_text = "result" + mock_resp.usage.input_tokens = 10 + mock_resp.usage.output_tokens = 5 + mock_resp.usage.total_tokens = 15 + mock_resp.conversation = None + mock_client.responses.create.return_value = mock_resp + return OpenAIProvider(client=mock_client), mock_client + + def _make_config(self): + return NativeCompletionConfig( + provider="openai-native", type="text", params={"model": "gpt-4o-mini"} + ) + + def _make_query(self): + return QueryParams(input="test") + + def test_multimodal_input(self): + provider, mock_client = self._make_provider() + mm = MultiModalInput( + parts=[ + TextContent(value="describe"), + ImageContent(format="base64", value="img", mime_type="image/png"), + ] + ) + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=mm, + ) + assert error is None + call_kwargs = mock_client.responses.create.call_args[1] + assert 
call_kwargs["input"][0]["role"] == "user" + assert len(call_kwargs["input"][0]["content"]) == 2 + + def test_list_input(self): + provider, mock_client = self._make_provider() + parts = [ImageContent(format="base64", value="img", mime_type="image/png")] + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=parts, + ) + assert error is None + call_kwargs = mock_client.responses.create.call_args[1] + assert call_kwargs["input"][0]["role"] == "user" + + def test_string_input(self): + provider, mock_client = self._make_provider() + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + call_kwargs = mock_client.responses.create.call_args[1] + assert call_kwargs["input"] == "hello" + + +class TestGoogleAIExecuteTextRouting: + def _make_provider(self): + mock_client = MagicMock() + mock_resp = MagicMock() + mock_resp.response_id = "resp_gai_123" + mock_resp.model_version = "gemini-2.0-flash" + mock_resp.text = "response text" + mock_resp.usage_metadata.prompt_token_count = 10 + mock_resp.usage_metadata.candidates_token_count = 5 + mock_resp.usage_metadata.total_token_count = 15 + mock_resp.usage_metadata.thoughts_token_count = 0 + mock_client.models.generate_content.return_value = mock_resp + return GoogleAIProvider(client=mock_client), mock_client + + def _make_config(self, **extra_params): + params = {"model": "gemini-2.0-flash"} + params.update(extra_params) + return NativeCompletionConfig( + provider="google-native", type="text", params=params + ) + + def _make_query(self): + return QueryParams(input="test") + + def test_multimodal_input(self): + provider, mock_client = self._make_provider() + mm = MultiModalInput( + parts=[ + TextContent(value="describe"), + ImageContent(format="base64", value="img", mime_type="image/png"), + ] + ) + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=mm, + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + assert call_kwargs["contents"][0]["role"] == "user" + assert len(call_kwargs["contents"][0]["parts"]) == 2 + + def test_list_input(self): + provider, mock_client = self._make_provider() + parts = [ImageContent(format="base64", value="img", mime_type="image/png")] + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=parts, + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + assert call_kwargs["contents"][0]["role"] == "user" + + def test_string_input(self): + provider, mock_client = self._make_provider() + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + assert call_kwargs["contents"][0]["parts"] == [{"text": "hello"}] + + def test_missing_model(self): + provider, _ = self._make_provider() + config = NativeCompletionConfig( + provider="google-native", type="text", params={} + ) + response, error = provider.execute( + completion_config=config, + query=self._make_query(), + resolved_input="hello", + ) + assert response is None + assert "Missing 'model'" in error + + def test_instructions_appended(self): + provider, mock_client = self._make_provider() + response, error = provider.execute( + 
completion_config=self._make_config(instructions="be helpful"), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + contents = call_kwargs["contents"] + assert len(contents) == 2 + assert contents[1]["role"] == "system" + + def test_no_usage_metadata(self): + provider, mock_client = self._make_provider() + mock_resp = mock_client.models.generate_content.return_value + mock_resp.usage_metadata = None + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + assert response.usage.input_tokens == 0 + assert response.usage.output_tokens == 0 From fdb82b01bd994bd975e4790a639a04da99159891 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 20:10:15 +0530 Subject: [PATCH 14/16] replaced role: "system" --- backend/app/services/llm/providers/gai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index db9dc6e20..05fa46fc1 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -402,7 +402,7 @@ def _execute_text( generation_kwargs = {} if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) + generation_kwargs["system_instruction"] = instructions if temperature is not None: generation_kwargs["temperature"] = temperature From a00bb6d354b2b25e1f7e047e8d21de19a7a4a0eb Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 20:20:54 +0530 Subject: [PATCH 15/16] Rename test_instructions_appended to test_instructions_passed_to_config and update assertions to validate system_instruction in Google AI execution routing --- backend/app/tests/services/llm/test_multimodal.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py index b5185995b..bae09308a 100644 --- a/backend/app/tests/services/llm/test_multimodal.py +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -508,7 +508,7 @@ def test_missing_model(self): assert response is None assert "Missing 'model'" in error - def test_instructions_appended(self): + def test_instructions_passed_to_config(self): provider, mock_client = self._make_provider() response, error = provider.execute( completion_config=self._make_config(instructions="be helpful"), @@ -517,9 +517,8 @@ def test_instructions_appended(self): ) assert error is None call_kwargs = mock_client.models.generate_content.call_args[1] - contents = call_kwargs["contents"] - assert len(contents) == 2 - assert contents[1]["role"] == "system" + config = call_kwargs["config"] + assert config.system_instruction == "be helpful" def test_no_usage_metadata(self): provider, mock_client = self._make_provider() From ad2e1be278b2c4e97e8c1b28eceaff1f17d1093a Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Mon, 2 Mar 2026 13:43:43 +0530 Subject: [PATCH 16/16] Enhance LLM API documentation to support multimodal input types and clarify configuration parameters --- backend/app/api/docs/llm/llm_call.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/backend/app/api/docs/llm/llm_call.md 
b/backend/app/api/docs/llm/llm_call.md
index fec4fbc49..8a594390c 100644
--- a/backend/app/api/docs/llm/llm_call.md
+++ b/backend/app/api/docs/llm/llm_call.md
@@ -6,7 +6,14 @@ for processing, and results are delivered via the callback URL when complete.
 ### Key Parameters
 
 **`query`** (required) - Query parameters for this LLM call:
-- `input` (required, string, min 1 char): User question/prompt/query
+- `input` (required): User input — accepts one of:
+  - A plain **string**, e.g. `"input": "Hello"` (automatically normalized to a text input internally)
+  - A **structured input object** with `type` and `content` fields, e.g. `"input": {"type": "text", "content": {"format": "text", "value": "Hello"}}`
+  - A **list of structured input objects** for multimodal inputs, e.g. `"input": [{"type": "text", ...}, {"type": "image", ...}]`
+  - Supported input types: `text`, `audio`, `image`, `pdf`
+  - For `image` and `pdf` types, `content` accepts a single object or a list, e.g. `"content": [{"format": "base64", "value": "..."}, ...]`
+  - Content `format` varies by type: `"text"` for text, `"base64"` for encoded data, `"public_url"` for image/pdf URLs
+  - Default MIME types when not specified: `image/png` for images, `application/pdf` for PDFs
 - `conversation` (optional, object): Conversation configuration
   - `id` (optional, string): Existing conversation ID to continue
   - `auto_create` (optional, boolean, default false): Create new conversation if no ID provided
@@ -23,8 +30,9 @@ for processing, and results are delivered via the callback URL when complete.
 - **Mode 2: Ad-hoc Configuration**
   - `blob` (object): Complete configuration object
     - `completion` (required, object): Completion configuration
-      - `provider` (required, string): Provider type - either `"openai"` (Kaapi abstraction) or `"openai-native"` (pass-through)
-      - `params` (required, object): Parameters structure depends on provider type (see schema for detailed structure)
+      - `provider` (required, string): Provider type — `"openai"` or `"google"` (Kaapi abstraction), or `"openai-native"` or `"google-native"` (pass-through)
+      - `type` (required, string): Completion type, one of `"text"`, `"stt"`, or `"tts"`; image, PDF, and multimodal inputs are handled under the `"text"` type
+      - `params` (required, object): Parameters structure depends on provider and type (see schema for detailed structure)
  - **Note**
     - When using ad-hoc configuration, do not include `id` and `version` fields
     - When using the Kaapi abstraction, parameters that are not supported by the selected provider or model are automatically suppressed. If any parameters are ignored, a list of warnings is included in the metadata.warnings. For example, the GPT-5 model does not support the temperature parameter, so Kaapi will neither throw an error nor pass this parameter to the model; instead, it will return a warning in the metadata.warnings response.
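
A condensed sketch of why patch 11 can drop `_execute_image`, `_execute_pdf`, and `_execute_multimodal`: after the change, `_execute_text` normalizes every resolved-input shape into the same Gemini `contents` structure, so the dedicated methods were near-identical duplicates. The import path follows the test imports earlier in this series, and `format_parts` is the provider's own helper (its body is not shown here, so it is passed in as a callable); this illustrates the branch logic only, not the full provider.

```python
# Condensed from GoogleAIProvider._execute_text as of patch 11.
from typing import Any, Callable

from app.services.llm.providers.base import ContentPart, MultiModalInput  # per the test imports


def build_contents(
    resolved_input: str | list[ContentPart] | MultiModalInput,
    format_parts: Callable[[list[ContentPart]], list[dict[str, Any]]],
) -> list[dict[str, Any]]:
    if isinstance(resolved_input, MultiModalInput):
        # Mixed text/image/pdf parts wrapped in a MultiModalInput
        parts = format_parts(resolved_input.parts)
    elif isinstance(resolved_input, list):
        # A bare list of ContentPart, e.g. resolved image or PDF content
        parts = format_parts(resolved_input)
    else:
        # Plain string prompt
        parts = [{"text": resolved_input}]
    return [{"role": "user", "parts": parts}]
```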
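
Patch 14 fixes a real API mismatch: Gemini's `contents` accepts only `user` and `model` roles, so appending a `{"role": "system"}` message would be rejected; system prompts belong in `GenerateContentConfig.system_instruction`, which is exactly what the renamed test in patch 15 asserts. A minimal sketch of the resulting call shape with the google-genai SDK; the API key and model name are placeholders.

```python
# Minimal sketch of the call shape after patch 14 (google-genai SDK).
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")  # placeholder key

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[{"role": "user", "parts": [{"text": "hello"}]}],
    # System prompts go here, not into `contents` as a "system" role message.
    config=types.GenerateContentConfig(
        system_instruction="be helpful",
        temperature=0.2,
    ),
)
print(response.text)
```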
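
Finally, a request-level sketch of the multimodal input shape documented in patch 16. The `query.input` and `completion` fields follow the documented schema; the endpoint URL, auth header name, top-level `config` nesting, and model name are illustrative assumptions that this series does not pin down.

```python
# Sketch of a multimodal LLM call per the llm_call.md additions above.
import base64

import requests

with open("chart.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("ascii")

payload = {
    "query": {
        # A list of structured input objects -> multimodal input
        "input": [
            {
                "type": "text",
                "content": {"format": "text", "value": "What does this chart show?"},
            },
            {
                "type": "image",
                # `content` may also be a list of image content objects
                "content": {
                    "format": "base64",
                    "value": image_b64,
                    "mime_type": "image/png",  # defaults to image/png if omitted
                },
            },
        ],
    },
    # Mode 2: ad-hoc configuration (no `id`/`version` fields)
    "config": {
        "blob": {
            "completion": {
                "provider": "google-native",
                "type": "text",  # multimodal inputs are handled under "text"
                "params": {"model": "gemini-2.5-pro", "instructions": "Be concise."},
            }
        }
    },
    "callback_url": "https://example.com/hooks/llm",  # results arrive via callback
}

resp = requests.post(
    "https://kaapi.example.com/api/v1/llm/call",  # hypothetical endpoint path
    json=payload,
    headers={"X-API-Key": "sk-..."},  # hypothetical auth header
    timeout=30,
)
resp.raise_for_status()
```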