From e977b26bca4ad2cdcf866d3bd82db05f241f682f Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Tue, 24 Feb 2026 19:31:24 +0530 Subject: [PATCH 01/16] Enhance multimodal support: Add Image and PDF input types, update processing logic --- backend/app/models/llm/__init__.py | 4 ++ backend/app/models/llm/request.py | 50 ++++++++++++++-- backend/app/services/llm/jobs.py | 2 + backend/app/services/llm/providers/base.py | 2 +- backend/app/services/llm/providers/gai.py | 2 +- backend/app/services/llm/providers/oai.py | 9 ++- backend/app/utils.py | 69 +++++++++++++++++++++- 7 files changed, 128 insertions(+), 10 deletions(-) diff --git a/backend/app/models/llm/__init__.py b/backend/app/models/llm/__init__.py index b183543c4..67b288f39 100644 --- a/backend/app/models/llm/__init__.py +++ b/backend/app/models/llm/__init__.py @@ -9,6 +9,10 @@ LlmCall, AudioContent, TextContent, + ImageContent, + PDFContent, + ImageInput, + PDFInput, ) from app.models.llm.response import ( LLMCallResponse, diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 0991aeba8..23a7a09af 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -1,5 +1,5 @@ import sqlalchemy as sa -from typing import Annotated, Any, Literal, Union +from typing import Annotated, Any, List, Literal, Union from uuid import UUID, uuid4 from pydantic import model_validator, HttpUrl from datetime import datetime @@ -55,8 +55,20 @@ class TTSLLMParams(SQLModel): language: str response_format: Literal["mp3", "wav", "ogg"] | None = "wav" +class ImageLLMParams(SQLModel): + model: str + instructions: str + response_format: Literal["text"] | None = Field( + None, + description="Currently supports text type", + ) + temperature: float | None = Field( + default=0.2, + ge=0.0, + le=2.0, + ) -KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams] +KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams, ImageLLMParams] # Input type models for discriminated union @@ -74,6 +86,23 @@ class AudioContent(SQLModel): description="MIME type of the audio (e.g., audio/wav, audio/mp3, audio/ogg)", ) +class ImageContent(SQLModel): + format: Literal["base64", "public_url"] = "base64" + value: str = Field(..., description="Base64 encoded image or Public URL to the image") + # keeping the mime_type + mime_type: str | None = Field( + None, + description="MIME type of the image (e.g., image/png, image/jpeg)", + ) + +class PDFContent(SQLModel): + format: Literal["base64", "public_url"] = "base64" + value: str = Field(..., description="Base64 encoded PDF or Public URL to the PDF") + # keeping the mime_type + mime_type: str | None = Field( + None, + description="MIME type of the PDF (e.g., application/pdf)", + ) class TextInput(SQLModel): type: Literal["text"] = "text" @@ -84,10 +113,18 @@ class AudioInput(SQLModel): type: Literal["audio"] = "audio" content: AudioContent +class ImageInput(SQLModel): + type: Literal["image"] = "image" + content: ImageContent | list[ImageContent] + +class PDFInput(SQLModel): + type: Literal["pdf"] = "pdf" + content: PDFContent | list[PDFContent] + # Discriminated union for query input types QueryInput = Annotated[ - Union[TextInput, AudioInput], + Union[TextInput, AudioInput, ImageInput, PDFInput], Field(discriminator="type"), ] @@ -122,7 +159,7 @@ def validate_conversation_logic(self): class QueryParams(SQLModel): """Query-specific parameters for each LLM call.""" - input: str | QueryInput = Field( + input: str 
| QueryInput | list[QueryInput] = Field( ..., description=( "User input - either a plain string (text) or a structured input object. " @@ -193,6 +230,7 @@ def validate_params(self): "text": TextLLMParams, "stt": STTLLMParams, "tts": TTSLLMParams, + "image": ImageLLMParams, } model_class = param_models[self.type] validated = model_class.model_validate(self.params) @@ -389,12 +427,12 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image"] = Field( + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image", + comment="Input type: text, audio, image, pdf, multimodal (list of multiple input types)", ), ) diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 33aff370a..17afef456 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -20,6 +20,8 @@ KaapiCompletionConfig, TextInput, AudioInput, + ImageInput, + PDFInput, ) from app.models.llm.response import TextOutput from app.services.llm.guardrails import ( diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index d8f7cafe7..4559eac77 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -44,7 +44,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str, + resolved_input: str | list[dict], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: """Execute LLM API call. diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index ce9bf6ad4..9f83aadc5 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -333,7 +333,6 @@ def execute( ) -> tuple[LLMCallResponse | None, str | None]: try: completion_type = completion_config.type - if completion_type == "stt": return self._execute_stt( completion_config=completion_config, @@ -346,6 +345,7 @@ def execute( resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) + else: return ( None, diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index 83c0aa8d7..a634e8a1d 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -47,10 +47,17 @@ def execute( error_message: str | None = None try: + # if completeiton_type is not text: -> return Nonne , error we don't params = { **completion_config.params, } - params["input"] = resolved_input + if isinstance(resolved_input, list): + params["input"] = [{ + "role": "user", + "content": resolved_input # [{"type": "text", "value": "hello world"}, {"type": "image", "value": "base64encodedstring"}, {"type": "pdf", "value": "base64encodedstring"}] + }] + else: + params["input"] = resolved_input conversation_cfg = query.conversation diff --git a/backend/app/utils.py b/backend/app/utils.py index 37cd97053..72ac9e033 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -25,6 +25,7 @@ from app.core import security from app.core.config import settings from app.crud.credentials import get_provider_credential +from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -443,6 +444,61 @@ def resolve_audio_base64(data: str, mime_type: str) -> 
tuple[str, str | None]: except Exception as e: return "", f"Failed to write audio to temp file: {str(e)}" +def resolve_image_input(image_input) -> list[dict]: + contents = image_input.content if isinstance(image_input.content, list) else [image_input.content] + items = [] + for content in contents: + if content.format == "base64": + mime = content.mime_type or "image/png" + val = content.value + image_url = f"data:{mime};base64,{val}" + else: + image_url = content.value + items.append({ + "type": "input_image", + "image_url": image_url + }) + + return items + + +def resolve_pdf_input(pdf_input) -> list[dict]: + contents = pdf_input.content if isinstance(pdf_input.content, list) else [pdf_input.content] + items = [] + for content in contents: + if content.format == "base64": + mime = content.mime_type or "application/pdf" + val = content.value + pdf_url = f"data:{mime};base64,{val}" + else: + pdf_url = content.value + + items.append({ + "type": "input_file", + "file_url": pdf_url + }) + return items + + +def resolve_multimodal_list(inputs: list) -> tuple[list[dict], str | None]: + content_items = [] + + for item in inputs: + if isinstance(item, TextInput): + content_items.append({ + "type": "input_text", + "text": item.content.value, + }) + elif isinstance(item, ImageInput): + image_items = resolve_image_input(item) + content_items.extend(image_items) + elif isinstance(item, PDFInput): + pdf_items = resolve_pdf_input(item) + content_items.extend(pdf_items) + else: + return [], f"Unsupported input type in multimodal list: {type(item)}" + + return content_items, None def resolve_input(query_input) -> tuple[str, str | None]: """Resolve discriminated union input to content string. @@ -454,7 +510,7 @@ def resolve_input(query_input) -> tuple[str, str | None]: (content_string, None) on success - for text returns content value, for audio returns temp file path ("", error_message) on failure """ - from app.models.llm.request import TextInput, AudioInput + from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput try: if isinstance(query_input, TextInput): @@ -464,6 +520,17 @@ def resolve_input(query_input) -> tuple[str, str | None]: # AudioInput content is base64-encoded audio mime_type = query_input.content.mime_type or "audio/wav" return resolve_audio_base64(query_input.content.value, mime_type) + + elif isinstance(query_input, ImageInput): + content_items = resolve_image_input(query_input) + return content_items, None + + elif isinstance(query_input, PDFInput): + content_items = resolve_pdf_input(query_input) + return content_items, None + + elif isinstance(query_input, list): + return resolve_multimodal_list(query_input) else: return "", f"Unknown input type: {type(query_input)}" From cbfd85fefe292c32c08f9c325c1203a78a31a421 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Wed, 25 Feb 2026 17:59:16 +0530 Subject: [PATCH 02/16] added integration for multimodal for both providers --- backend/app/models/llm/request.py | 6 +- backend/app/services/llm/jobs.py | 2 + backend/app/services/llm/mappers.py | 3 +- backend/app/services/llm/providers/base.py | 4 +- backend/app/services/llm/providers/gai.py | 393 ++++++++++++++++++++- backend/app/services/llm/providers/oai.py | 40 ++- backend/app/utils.py | 85 ++--- 7 files changed, 458 insertions(+), 75 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 23a7a09af..ca8055218 100644 --- a/backend/app/models/llm/request.py +++ 
b/backend/app/models/llm/request.py
@@ -427,12 +427,12 @@ class LlmCall(SQLModel, table=True):
         },
     )
 
-    input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field(
+    input_type: Literal["text", "audio", "image"] = Field(
         ...,
         sa_column=sa.Column(
             sa.String,
             nullable=False,
-            comment="Input type: text, audio, image, pdf, multimodal (list of multiple input types)",
+            comment="Input type: text, audio, image",
         ),
     )
 
@@ -535,4 +535,4 @@ class LlmCall(SQLModel, table=True):
         default=None,
         nullable=True,
         sa_column_kwargs={"comment": "Timestamp when the record was soft-deleted"},
-    )
+    )
\ No newline at end of file
diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py
index 17afef456..b6f0df966 100644
--- a/backend/app/services/llm/jobs.py
+++ b/backend/app/services/llm/jobs.py
@@ -111,6 +111,8 @@ def resolved_input_context(query_input: TextInput | AudioInput):
     """
     resolved_input, error = resolve_input(query_input)
+    print(f"Resolved input: {resolved_input}, error: {error}")
+
     if error:
         raise ValueError(error)
 
diff --git a/backend/app/services/llm/mappers.py b/backend/app/services/llm/mappers.py
index 8b0b895e3..8e61b8bf9 100644
--- a/backend/app/services/llm/mappers.py
+++ b/backend/app/services/llm/mappers.py
@@ -129,6 +129,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]:
         google_params["response_format"] = response_format
     # Warn about unsupported parameters
     if kaapi_params.get("knowledge_base_ids"):
+        #TODO: Will take up later, when we add google filesearch tool support
         warnings.append(
             "Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored."
         )
@@ -141,7 +142,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]:
     return google_params, warnings
 
 
-def transform_kaapi_config_to_native(
+def transform_kaapi_config_to_native( 
     kaapi_config: KaapiCompletionConfig,
 ) -> tuple[NativeCompletionConfig, list[str]]:
     """Transform Kaapi completion config to native provider config with mapped parameters.
diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py
index d8f7cafe7..07195dbfd 100644
--- a/backend/app/services/llm/providers/base.py
+++ b/backend/app/services/llm/providers/base.py
@@ -8,7 +8,7 @@
 from typing import Any
 
 from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams
-
+from app.models.llm.request import TextContent, ImageContent, PDFContent
 
 class BaseProvider(ABC):
     """Abstract base class for LLM providers.
@@ -44,7 +44,7 @@ def execute(
         self,
         completion_config: NativeCompletionConfig,
         query: QueryParams,
-        resolved_input: str | list[dict],
+        resolved_input: str | list[TextContent | ImageContent | PDFContent],
         include_provider_raw_response: bool = False,
     ) -> tuple[LLMCallResponse | None, str | None]:
         """Execute LLM API call.
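
For context, the input models introduced in patch 01 compose as in the rough sketch below. It assumes TextContent exposes a plain `value` field (the resolve helpers in utils.py read `item.content.value`); the literal values are placeholders, not real data.

    # Sketch: building a mixed text + PDF input with the new discriminated union.
    # TextContent(value=...) is assumed from utils.py usage; values are placeholders.
    from app.models.llm.request import (
        PDFContent,
        PDFInput,
        TextContent,
        TextInput,
    )

    query_input = [
        TextInput(content=TextContent(value="Summarize the attached document")),
        PDFInput(
            content=PDFContent(
                format="base64",
                value="<base64-encoded-pdf>",  # placeholder
                mime_type="application/pdf",
            )
        ),
    ]
    # resolve_input() then flattens this list into provider-ready content parts.
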
diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 9f83aadc5..536b45dad 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -1,6 +1,7 @@ import logging import base64 from typing import Any +from typing import TypeAlias, List from google import genai from google.genai.types import ( @@ -20,13 +21,17 @@ Usage, TextOutput, TextContent, + ImageContent, + PDFContent, ) from app.models.llm.response import AudioOutput, AudioContent from app.services.llm.providers.base import BaseProvider from app.core.audio_utils import convert_pcm_to_mp3, convert_pcm_to_ogg logger = logging.getLogger(__name__) - +ContentItem: TypeAlias = TextContent | ImageContent | PDFContent +MultiModalInput: TypeAlias = List[ContentItem] +UserInput: TypeAlias = str | MultiModalInput class GoogleAIProvider(BaseProvider): def __init__(self, client: genai.Client): @@ -44,6 +49,57 @@ def create_client(credentials: dict[str, Any]) -> Any: raise ValueError("API Key for Google Gemini Not Set") return genai.Client(api_key=credentials["api_key"]) + @staticmethod + def format_parts( + parts: list[TextContent | ImageContent | PDFContent], + ) -> list[dict]: + items = [] + for part in parts: + if isinstance(part, TextContent): + items.append({"text": part.value}) + + elif isinstance(part, ImageContent): + if part.format == "base64": + items.append( + { + "inline_data": { + "data": part.value, + "mime_type": part.mime_type, + } + } + ) + else: + items.append( + { + "file_data": { + "file_uri": part.value, + "mime_type": part.mime_type, + "display_name": None, + } + } + ) + elif isinstance(part, PDFContent): + if part.format == "base64": + items.append( + { + "inline_data": { + "data": part.value, + "mime_type": part.mime_type, + } + } + ) + else: + items.append( + { + "file_data": { + "file_uri": part.value, + "mime_type": part.mime_type, + "display_name": None, + } + } + ) + return items + def _execute_stt( self, completion_config: NativeCompletionConfig, @@ -323,12 +379,313 @@ def _execute_tts( ) return llm_response, None + + def _execute_vision( + self, + completion_config: NativeCompletionConfig, + resolved_content: ImageContent | list[ImageContent], # using content here because we need mime type and format info for processing + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + + contents = [] + if isinstance(resolved_content, list): + gemini_parts = self.format_parts(resolved_content) + contents = [{"role": "user", "parts": gemini_parts}] + else: + contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = 
response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" + ) + return llm_response, None + + def _execute_pdf( + self, + completion_config: NativeCompletionConfig, + resolved_content: PDFContent | list[PDFContent], # using content here because we need mime type and format info for processing + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + + contents = [] + if isinstance(resolved_content, list): + gemini_parts = self.format_parts(resolved_content) + contents = [{"role": "user", "parts": gemini_parts}] + else: + contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text 
response: {response.response_id}" + ) + return llm_response, None + + def _execute_text( + self, + completion_config: NativeCompletionConfig, + resolved_input: str | list[TextContent | ImageContent | PDFContent], + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + + contents = [] + + if isinstance(resolved_input, list): + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] + else: + contents = [{"role": "user", "parts": [{"text": resolved_input}]}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" + ) + return llm_response, None + + def _execute_multimodal( + self, + completion_config: NativeCompletionConfig, + resolved_input: MultiModalInput, + include_provider_raw_response: bool = False, + ) -> tuple[LLMCallResponse | None, str | None]: + """ + Convert multimodal input's list of content parts into text response. 
+ """ + + model = completion_config.params.get("model") + if not model: + return None, "Missing 'model' in native params" + if not isinstance(resolved_input, MultiModalInput): + return None, "Invalid input type for multimodal completion, expected list of content parts" + + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] + + instructions = completion_config.params.get("instructions", "") + temperature = completion_config.params.get("temperature", None) + thinking_level = completion_config.params.get("reasoning", None) + + generation_kwargs = {} + if instructions: + contents.append({"role": "system", "parts": [{"text": instructions}]}) + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if thinking_level is not None: + generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) + + response = self.client.models.generate_content( + model=model, + contents=contents, + config=GenerateContentConfig(**generation_kwargs) + ) + + if response.usage_metadata: + input_tokens = response.usage_metadata.prompt_token_count or 0 + output_tokens = response.usage_metadata.candidates_token_count or 0 + total_tokens = response.usage_metadata.total_token_count or 0 + reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 + else: + logger.warning( + f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + ) + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + reasoning_tokens = 0 + + + llm_response = LLMCallResponse( + response=LLMResponse( + provider_response_id=response.response_id, + model=response.model_version or model, + provider=completion_config.provider, + output=TextOutput(content=TextContent(value=response.text)), + ), + usage=Usage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + reasoning_tokens=reasoning_tokens, + ) + ) + if include_provider_raw_response: + llm_response.provider_raw_response = response.model_dump(mode="json") + + logger.info( + f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" + ) + return llm_response, None + + def execute( self, completion_config: NativeCompletionConfig, - query: QueryParams, # Not used by Google AI provider (no conversation support yet) - resolved_input: str, + query: QueryParams, + resolved_input: str | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: try: @@ -345,11 +702,33 @@ def execute( resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) + + elif completion_type == "text": + return self._execute_text( + completion_config=completion_config, + resolved_input=resolved_input, + include_provider_raw_response=include_provider_raw_response, + ) + + elif completion_type == "vision": + return self._execute_vision( + completion_config=completion_config, + resolved_content=resolved_input, + include_provider_raw_response=include_provider_raw_response, + ) + + elif completion_type == "pdf": + return self._execute_pdf( + completion_config=completion_config, + resolved_content=resolved_input, + include_provider_raw_response=include_provider_raw_response, + ) - else: - return ( - None, - f"Unsupported completion type '{completion_type}' for Google AI provider", + elif completion_type == "multimodal": + return self._execute_text( + completion_config=completion_config, + resolved_input=resolved_input, + 
include_provider_raw_response=include_provider_raw_response, ) except TypeError as e: diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index a634e8a1d..b02cd0d2c 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -1,5 +1,6 @@ import logging from typing import Any +from typing import TypeAlias, List import openai from openai import OpenAI @@ -15,10 +16,12 @@ TextContent, ) from app.services.llm.providers.base import BaseProvider - +from app.models.llm.request import TextContent, ImageContent, PDFContent logger = logging.getLogger(__name__) - +ContentItem: TypeAlias = TextContent | ImageContent | PDFContent +MultiModalInput: TypeAlias = List[ContentItem] +UserInput: TypeAlias = str | MultiModalInput class OpenAIProvider(BaseProvider): def __init__(self, client: OpenAI): @@ -36,6 +39,35 @@ def create_client(credentials: dict[str, Any]) -> Any: raise ValueError("OpenAI credentials not configured for this project.") return OpenAI(api_key=credentials["api_key"]) + @staticmethod + def format_parts(parts: list[TextContent | ImageContent | PDFContent]) -> list[dict]: + items = [] + for part in parts: + if isinstance(part, TextContent): + items.append({"type": "input_text", "text": part.value}) + + elif isinstance(part, ImageContent): + if part.format == "base64": + url = f"data:{part.mime_type};base64,{part.value}" + else: + url = part.value + items.append({ + "type": "input_image", + "image_url": url + }) + + elif isinstance(part, PDFContent): + if part.format == "base64": + url = f"data:{part.mime_type};base64,{part.value}" + else: + url = part.value + items.append({ + "type": "input_file", + "file_url": url + }) + + return items + def execute( self, completion_config: NativeCompletionConfig, @@ -51,10 +83,10 @@ def execute( params = { **completion_config.params, } - if isinstance(resolved_input, list): + if isinstance(resolved_input, MultiModalInput): params["input"] = [{ "role": "user", - "content": resolved_input # [{"type": "text", "value": "hello world"}, {"type": "image", "value": "base64encodedstring"}, {"type": "pdf", "value": "base64encodedstring"}] + "content": self.format_parts(resolved_input) }] else: params["input"] = resolved_input diff --git a/backend/app/utils.py b/backend/app/utils.py index 72ac9e033..330cbfdbc 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -26,12 +26,13 @@ from app.core.config import settings from app.crud.credentials import get_provider_credential from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput +from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) T = TypeVar("T") - +ContentPart = TextContent | AudioContent | ImageContent | PDFContent class APIResponse(BaseModel, Generic[T]): success: bool @@ -444,61 +445,21 @@ def resolve_audio_base64(data: str, mime_type: str) -> tuple[str, str | None]: except Exception as e: return "", f"Failed to write audio to temp file: {str(e)}" -def resolve_image_input(image_input) -> list[dict]: + +def resolve_image_content(image_input: ImageInput) -> list[ImageContent]: contents = image_input.content if isinstance(image_input.content, list) else [image_input.content] - items = [] - for content in contents: - if content.format == "base64": - mime = content.mime_type or "image/png" - val = content.value - image_url = f"data:{mime};base64,{val}" - else: - image_url = 
content.value
-        items.append({
-            "type": "input_image",
-            "image_url": image_url
-        })
+    for c in contents:
+        if not c.mime_type:
+            c.mime_type = "image/png"
+    return contents
 
-    return items
-
-def resolve_pdf_input(pdf_input) -> list[dict]:
+def resolve_pdf_content(pdf_input: PDFInput) -> list[PDFContent]:
     contents = pdf_input.content if isinstance(pdf_input.content, list) else [pdf_input.content]
-    items = []
-    for content in contents:
-        if content.format == "base64":
-            mime = content.mime_type or "application/pdf"
-            val = content.value
-            pdf_url = f"data:{mime};base64,{val}"
-        else:
-            pdf_url = content.value
-
-        items.append({
-            "type": "input_file",
-            "file_url": pdf_url
-        })
-    return items
-
-
-def resolve_multimodal_list(inputs: list) -> tuple[list[dict], str | None]:
-    content_items = []
-
-    for item in inputs:
-        if isinstance(item, TextInput):
-            content_items.append({
-                "type": "input_text",
-                "text": item.content.value,
-            })
-        elif isinstance(item, ImageInput):
-            image_items = resolve_image_input(item)
-            content_items.extend(image_items)
-        elif isinstance(item, PDFInput):
-            pdf_items = resolve_pdf_input(item)
-            content_items.extend(pdf_items)
-        else:
-            return [], f"Unsupported input type in multimodal list: {type(item)}"
-
-    return content_items, None
+    for c in contents:
+        if not c.mime_type:
+            c.mime_type = "application/pdf"
+    return contents
 
 def resolve_input(query_input) -> tuple[str, str | None]:
     """Resolve discriminated union input to content string.
@@ -522,15 +483,23 @@ def resolve_input(query_input) -> tuple[str, str | None]:
         # AudioInput content is base64-encoded audio
         mime_type = query_input.content.mime_type or "audio/wav"
         return resolve_audio_base64(query_input.content.value, mime_type)
 
     elif isinstance(query_input, ImageInput):
-        content_items = resolve_image_input(query_input)
-        return content_items, None
+        return resolve_image_content(query_input), None
 
     elif isinstance(query_input, PDFInput):
-        content_items = resolve_pdf_input(query_input)
-        return content_items, None
-
+        return resolve_pdf_content(query_input), None
+
     elif isinstance(query_input, list):
-        return resolve_multimodal_list(query_input)
+        parts: list[ContentPart] = []
+        for item in query_input:
+            if isinstance(item, TextInput):
+                parts.append(item.content)  # TextContent instance
+            elif isinstance(item, ImageInput):
+                parts.extend(resolve_image_content(item))
+            elif isinstance(item, PDFInput):
+                parts.extend(resolve_pdf_content(item))
+            else:
+                return [], f"Unsupported input type: {type(item)}"
+        return parts, None
     else:
         return "", f"Unknown input type: {type(query_input)}"

From 1f3f13e0bd0986519c07893a4a865df4ee1a2d90 Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Wed, 25 Feb 2026 18:38:25 +0530
Subject: [PATCH 03/16] prettify the code and added support for image, pdf and multimodal input types

---
 backend/app/models/llm/request.py          |  21 +++-
 backend/app/services/llm/jobs.py           |   2 +-
 backend/app/services/llm/mappers.py        |   4 +-
 backend/app/services/llm/providers/base.py |   5 +-
 backend/app/services/llm/providers/gai.py  | 127 ++++++++++++---------
 backend/app/services/llm/providers/oai.py  |  36 +++---
 backend/app/utils.py                       |  22 +++-
 7 files changed, 122 insertions(+), 95 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index ca8055218..245d95af2 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -55,6 +55,7 @@ class TTSLLMParams(SQLModel):
     language: str
     response_format: Literal["mp3", "wav", "ogg"] | None = "wav"
 
+
 class ImageLLMParams(SQLModel):
     model: str
     instructions: 
str @@ -68,6 +69,7 @@ class ImageLLMParams(SQLModel): le=2.0, ) + KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams, ImageLLMParams] @@ -86,17 +88,21 @@ class AudioContent(SQLModel): description="MIME type of the audio (e.g., audio/wav, audio/mp3, audio/ogg)", ) + class ImageContent(SQLModel): - format: Literal["base64", "public_url"] = "base64" - value: str = Field(..., description="Base64 encoded image or Public URL to the image") + format: Literal["base64", "url"] = "base64" + value: str = Field( + ..., description="Base64 encoded image or Public URL to the image" + ) # keeping the mime_type mime_type: str | None = Field( None, description="MIME type of the image (e.g., image/png, image/jpeg)", ) + class PDFContent(SQLModel): - format: Literal["base64", "public_url"] = "base64" + format: Literal["base64", "url"] = "base64" value: str = Field(..., description="Base64 encoded PDF or Public URL to the PDF") # keeping the mime_type mime_type: str | None = Field( @@ -104,6 +110,7 @@ class PDFContent(SQLModel): description="MIME type of the PDF (e.g., application/pdf)", ) + class TextInput(SQLModel): type: Literal["text"] = "text" content: TextContent @@ -113,10 +120,12 @@ class AudioInput(SQLModel): type: Literal["audio"] = "audio" content: AudioContent + class ImageInput(SQLModel): type: Literal["image"] = "image" content: ImageContent | list[ImageContent] + class PDFInput(SQLModel): type: Literal["pdf"] = "pdf" content: PDFContent | list[PDFContent] @@ -427,12 +436,12 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image"] = Field( + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image", + comment="Input type: text, audio, image, pdf, multimodal", ), ) @@ -535,4 +544,4 @@ class LlmCall(SQLModel, table=True): default=None, nullable=True, sa_column_kwargs={"comment": "Timestamp when the record was soft-deleted"}, - ) \ No newline at end of file + ) diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index b6f0df966..9d45366ab 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -112,7 +112,7 @@ def resolved_input_context(query_input: TextInput | AudioInput): """ resolved_input, error = resolve_input(query_input) print(f"Resolved input: {resolved_input}, error: {error}") - + if error: raise ValueError(error) diff --git a/backend/app/services/llm/mappers.py b/backend/app/services/llm/mappers.py index 8e61b8bf9..838912cdf 100644 --- a/backend/app/services/llm/mappers.py +++ b/backend/app/services/llm/mappers.py @@ -129,7 +129,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: google_params["response_format"] = response_format # Warn about unsupported parameters if kaapi_params.get("knowledge_base_ids"): - #TODO: Will take up later, when we add google filesearch tool support + # TODO: Will take up later, when we add google filesearch tool support warnings.append( "Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored." ) @@ -142,7 +142,7 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: return google_params, warnings -def transform_kaapi_config_to_native( +def transform_kaapi_config_to_native( kaapi_config: KaapiCompletionConfig, ) -> tuple[NativeCompletionConfig, list[str]]: """Transform Kaapi completion config to native provider config with mapped parameters. 
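
For context, the mapper touched above behaves roughly as sketched below; any parameter keys beyond those visible in these hunks are assumptions.

    # Sketch: expected map_kaapi_to_google_params behavior per the hunks above.
    kaapi_params = {
        "response_format": "text",
        "knowledge_base_ids": ["kb-123"],  # unsupported on Google AI
    }
    google_params, warnings = map_kaapi_to_google_params(kaapi_params)
    # google_params["response_format"] == "text"
    # warnings == ["Parameter 'knowledge_base_ids' is not supported by Google AI "
    #              "and was ignored."]
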
diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 07195dbfd..fcd64dfc7 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -5,11 +5,14 @@ """ from abc import ABC, abstractmethod -from typing import Any +from typing import Any, List, TypeAlias from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams from app.models.llm.request import TextContent, ImageContent, PDFContent +ContentItem: TypeAlias = TextContent | ImageContent | PDFContent +MultiModalInput: TypeAlias = List[ContentItem] + class BaseProvider(ABC): """Abstract base class for LLM providers. diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 536b45dad..a7eaffecf 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -1,7 +1,6 @@ import logging import base64 from typing import Any -from typing import TypeAlias, List from google import genai from google.genai.types import ( @@ -25,13 +24,11 @@ PDFContent, ) from app.models.llm.response import AudioOutput, AudioContent -from app.services.llm.providers.base import BaseProvider +from app.services.llm.providers.base import BaseProvider, MultiModalInput from app.core.audio_utils import convert_pcm_to_mp3, convert_pcm_to_ogg logger = logging.getLogger(__name__) -ContentItem: TypeAlias = TextContent | ImageContent | PDFContent -MultiModalInput: TypeAlias = List[ContentItem] -UserInput: TypeAlias = str | MultiModalInput + class GoogleAIProvider(BaseProvider): def __init__(self, client: genai.Client): @@ -379,24 +376,29 @@ def _execute_tts( ) return llm_response, None - + def _execute_vision( self, completion_config: NativeCompletionConfig, - resolved_content: ImageContent | list[ImageContent], # using content here because we need mime type and format info for processing + resolved_content: ImageContent + | list[ + ImageContent + ], # using content here because we need mime type and format info for processing include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - + contents = [] if isinstance(resolved_content, list): gemini_parts = self.format_parts(resolved_content) contents = [{"role": "user", "parts": gemini_parts}] else: - contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] - + contents = [ + {"role": "user", "parts": self.format_parts([resolved_content])} + ] + instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) thinking_level = completion_config.params.get("reasoning", None) @@ -404,17 +406,19 @@ def _execute_vision( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -431,7 +435,6 @@ def 
_execute_vision( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -444,33 +447,38 @@ def _execute_vision( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None - + def _execute_pdf( self, completion_config: NativeCompletionConfig, - resolved_content: PDFContent | list[PDFContent], # using content here because we need mime type and format info for processing + resolved_content: PDFContent + | list[ + PDFContent + ], # using content here because we need mime type and format info for processing include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - + contents = [] if isinstance(resolved_content, list): gemini_parts = self.format_parts(resolved_content) contents = [{"role": "user", "parts": gemini_parts}] else: - contents = [{"role": "user", "parts": self.format_parts([resolved_content])}] - + contents = [ + {"role": "user", "parts": self.format_parts([resolved_content])} + ] + instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) thinking_level = completion_config.params.get("reasoning", None) @@ -478,17 +486,19 @@ def _execute_pdf( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -505,7 +515,6 @@ def _execute_pdf( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -518,18 +527,18 @@ def _execute_pdf( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None def _execute_text( - self, + self, completion_config: NativeCompletionConfig, resolved_input: str | list[TextContent | ImageContent | PDFContent], include_provider_raw_response: bool = False, @@ -553,17 +562,19 @@ def _execute_text( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, 
thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -580,7 +591,6 @@ def _execute_text( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -593,18 +603,18 @@ def _execute_text( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None - + def _execute_multimodal( - self, + self, completion_config: NativeCompletionConfig, resolved_input: MultiModalInput, include_provider_raw_response: bool = False, @@ -612,13 +622,16 @@ def _execute_multimodal( """ Convert multimodal input's list of content parts into text response. """ - + model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" if not isinstance(resolved_input, MultiModalInput): - return None, "Invalid input type for multimodal completion, expected list of content parts" + return ( + None, + "Invalid input type for multimodal completion, expected list of content parts", + ) gemini_parts = self.format_parts(resolved_input) contents = [{"role": "user", "parts": gemini_parts}] @@ -630,17 +643,19 @@ def _execute_multimodal( generation_kwargs = {} if instructions: contents.append({"role": "system", "parts": [{"text": instructions}]}) - + if temperature is not None: generation_kwargs["temperature"] = temperature - + if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig(include_thoughts=False,thinking_level=thinking_level) - + generation_kwargs["thinking_config"] = ThinkingConfig( + include_thoughts=False, thinking_level=thinking_level + ) + response = self.client.models.generate_content( model=model, contents=contents, - config=GenerateContentConfig(**generation_kwargs) + config=GenerateContentConfig(**generation_kwargs), ) if response.usage_metadata: @@ -657,7 +672,6 @@ def _execute_multimodal( total_tokens = 0 reasoning_tokens = 0 - llm_response = LLMCallResponse( response=LLMResponse( provider_response_id=response.response_id, @@ -670,17 +684,16 @@ def _execute_multimodal( output_tokens=output_tokens, total_tokens=total_tokens, reasoning_tokens=reasoning_tokens, - ) + ), ) if include_provider_raw_response: llm_response.provider_raw_response = response.model_dump(mode="json") - + logger.info( f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" ) return llm_response, None - def execute( self, completion_config: NativeCompletionConfig, @@ -710,8 +723,8 @@ def execute( include_provider_raw_response=include_provider_raw_response, ) - elif completion_type == "vision": - return self._execute_vision( + elif completion_type == "image": + return self._execute_image( completion_config=completion_config, resolved_content=resolved_input, include_provider_raw_response=include_provider_raw_response, @@ -723,9 +736,9 @@ def execute( resolved_content=resolved_input, include_provider_raw_response=include_provider_raw_response, ) - + elif completion_type == "multimodal": - return self._execute_text( + return self._execute_multimodal( 
completion_config=completion_config, resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index b02cd0d2c..6368d0429 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -1,6 +1,5 @@ import logging from typing import Any -from typing import TypeAlias, List import openai from openai import OpenAI @@ -14,14 +13,13 @@ Usage, TextOutput, TextContent, + ImageContent, + PDFContent, ) -from app.services.llm.providers.base import BaseProvider -from app.models.llm.request import TextContent, ImageContent, PDFContent +from app.services.llm.providers.base import BaseProvider, MultiModalInput logger = logging.getLogger(__name__) -ContentItem: TypeAlias = TextContent | ImageContent | PDFContent -MultiModalInput: TypeAlias = List[ContentItem] -UserInput: TypeAlias = str | MultiModalInput + class OpenAIProvider(BaseProvider): def __init__(self, client: OpenAI): @@ -40,32 +38,28 @@ def create_client(credentials: dict[str, Any]) -> Any: return OpenAI(api_key=credentials["api_key"]) @staticmethod - def format_parts(parts: list[TextContent | ImageContent | PDFContent]) -> list[dict]: + def format_parts( + parts: list[TextContent | ImageContent | PDFContent], + ) -> list[dict]: items = [] for part in parts: if isinstance(part, TextContent): items.append({"type": "input_text", "text": part.value}) - + elif isinstance(part, ImageContent): if part.format == "base64": url = f"data:{part.mime_type};base64,{part.value}" else: url = part.value - items.append({ - "type": "input_image", - "image_url": url - }) + items.append({"type": "input_image", "image_url": url}) elif isinstance(part, PDFContent): if part.format == "base64": url = f"data:{part.mime_type};base64,{part.value}" else: url = part.value - items.append({ - "type": "input_file", - "file_url": url - }) - + items.append({"type": "input_file", "file_url": url}) + return items def execute( @@ -79,15 +73,13 @@ def execute( error_message: str | None = None try: - # if completeiton_type is not text: -> return Nonne , error we don't params = { **completion_config.params, } if isinstance(resolved_input, MultiModalInput): - params["input"] = [{ - "role": "user", - "content": self.format_parts(resolved_input) - }] + params["input"] = [ + {"role": "user", "content": self.format_parts(resolved_input)} + ] else: params["input"] = resolved_input diff --git a/backend/app/utils.py b/backend/app/utils.py index 330cbfdbc..7fa0973b9 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -34,6 +34,7 @@ T = TypeVar("T") ContentPart = TextContent | AudioContent | ImageContent | PDFContent + class APIResponse(BaseModel, Generic[T]): success: bool data: Optional[T] = None @@ -447,20 +448,29 @@ def resolve_audio_base64(data: str, mime_type: str) -> tuple[str, str | None]: def resolve_image_content(image_input: ImageInput) -> list[ImageContent]: - contents = image_input.content if isinstance(image_input.content, list) else [image_input.content] + contents = ( + image_input.content + if isinstance(image_input.content, list) + else [image_input.content] + ) for c in contents: if not c.mime_type: c.mime_type = "image/png" - return contents + return contents def resolve_pdf_content(pdf_input: PDFInput) -> list[PDFContent]: - contents = pdf_input.content if isinstance(pdf_input.content, list) else [pdf_input.content] + contents = ( + pdf_input.content + if isinstance(pdf_input.content, 
list)
+        else [pdf_input.content]
+    )
     for c in contents:
         if not c.mime_type:
             c.mime_type = "application/pdf"
     return contents
 
+
 def resolve_input(query_input) -> tuple[str, str | None]:
     """Resolve discriminated union input to content string.
@@ -481,13 +491,13 @@ def resolve_input(query_input) -> tuple[str, str | None]:
         # AudioInput content is base64-encoded audio
         mime_type = query_input.content.mime_type or "audio/wav"
         return resolve_audio_base64(query_input.content.value, mime_type)
-    
+
     elif isinstance(query_input, ImageInput):
         return resolve_image_content(query_input), None
-    
+
     elif isinstance(query_input, PDFInput):
         return resolve_pdf_content(query_input), None
-    
+
     elif isinstance(query_input, list):
         parts: list[ContentPart] = []
         for item in query_input:

From e5a9fa8121420b97d30f564cb8a8f9ef6d050b90 Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Wed, 25 Feb 2026 18:57:24 +0530
Subject: [PATCH 04/16] fixes to function name and added the input_type

---
 backend/app/models/llm/request.py          | 4 ++--
 backend/app/services/llm/providers/gai.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index 245d95af2..82eda2139 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -208,7 +208,7 @@ class NativeCompletionConfig(SQLModel):
         ...,
         description="Provider-specific parameters (schema varies by provider), should exactly match the provider's endpoint params structure",
     )
-    type: Literal["text", "stt", "tts"] = Field(
+    type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field(
         ..., description="Completion config type. Params schema varies by type"
     )
 
@@ -224,7 +224,7 @@ class KaapiCompletionConfig(SQLModel):
         ..., description="LLM provider (openai)"
     )
 
-    type: Literal["text", "stt", "tts"] = Field(
+    type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field(
         ..., description="Completion config type. 
Params schema varies by type"
     )
     params: dict[str, Any] = Field(
diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py
index a7eaffecf..58cd3d43c 100644
--- a/backend/app/services/llm/providers/gai.py
+++ b/backend/app/services/llm/providers/gai.py
@@ -377,7 +377,7 @@ def _execute_tts(
 
         return llm_response, None
 
-    def _execute_vision(
+    def _execute_image(
         self,
         completion_config: NativeCompletionConfig,
         resolved_content: ImageContent

From 668e3fa62b509b73e1daf18acecad5532f33ebf8 Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Thu, 26 Feb 2026 00:28:58 +0530
Subject: [PATCH 05/16] Add support for multimodal input types: Image, PDF, and enhance validation

---
 backend/app/models/llm/request.py           | 42 +++++++++++++-
 backend/app/services/llm/jobs.py            | 17 +++++-
 backend/app/services/llm/mappers.py         | 10 ++--
 backend/app/services/llm/providers/base.py  | 43 +++++++++++++-
 backend/app/services/llm/providers/gai.py   | 56 +++++--------------
 backend/app/services/llm/providers/oai.py   |  6 +-
 .../app/services/llm/providers/registry.py  |  3 +-
 backend/app/utils.py                        | 40 ++++++++-----
 8 files changed, 148 insertions(+), 69 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index 82eda2139..81efcb89a 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -68,9 +68,47 @@ class ImageLLMParams(SQLModel):
         ge=0.0,
         le=2.0,
     )
+    reasoning: Literal["low", "medium", "high"] | None = None
 
-KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams, ImageLLMParams]
+
+class PDFLLMParams(SQLModel):
+    model: str
+    instructions: str
+    response_format: Literal["text"] | None = Field(
+        None,
+        description="Currently supports text type",
+    )
+    temperature: float | None = Field(
+        default=0.2,
+        ge=0.0,
+        le=2.0,
+    )
+    reasoning: Literal["low", "medium", "high"] | None = None
+
+
+class MultimodalLLMParams(SQLModel):
+    model: str
+    instructions: str
+    response_format: Literal["text"] | None = Field(
+        None,
+        description="Currently supports text type",
+    )
+    temperature: float | None = Field(
+        default=0.2,
+        ge=0.0,
+        le=2.0,
+    )
+    reasoning: Literal["low", "medium", "high"] | None = None
+
+
+KaapiLLMParams = Union[
+    TextLLMParams,
+    STTLLMParams,
+    TTSLLMParams,
+    ImageLLMParams,
+    PDFLLMParams,
+    MultimodalLLMParams,
+]
 
 
 # Input type models for discriminated union
@@ -240,6 +278,8 @@ def validate_params(self):
             "stt": STTLLMParams,
             "tts": TTSLLMParams,
             "image": ImageLLMParams,
+            "pdf": PDFLLMParams,
+            "multimodal": MultimodalLLMParams,
         }
         model_class = param_models[self.type]
         validated = model_class.model_validate(self.params)
diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py
index 9d45366ab..511516139 100644
--- a/backend/app/services/llm/jobs.py
+++ b/backend/app/services/llm/jobs.py
@@ -29,6 +29,7 @@
     run_guardrails_validation,
 )
 from app.services.llm.providers.registry import get_llm_provider
+from app.services.llm.providers.base import validate_completion_input
 from app.services.llm.mappers import transform_kaapi_config_to_native
 from app.utils import APIResponse, send_callback, resolve_input, cleanup_temp_file
 
@@ -104,7 +105,9 @@ def handle_job_error(
 
 
 @contextmanager
-def resolved_input_context(query_input: TextInput | AudioInput):
+def resolved_input_context(
+    query_input: TextInput | AudioInput | ImageInput | PDFInput | list,
+):
     """Context manager for resolving and cleaning up input resources.
Ensures temporary files (e.g., downloaded audio) are cleaned up @@ -394,6 +397,18 @@ def execute_job( # Resolve input and execute LLM (context manager handles cleanup) try: with resolved_input_context(request.query.input) as resolved_input: + mismatch = validate_completion_input( + completion_config.type, resolved_input + ) + if mismatch: + callback_response = APIResponse.failure_response( + error=mismatch, + metadata=request.request_metadata, + ) + return handle_job_error( + job_uuid, callback_url_str, callback_response + ) + response, error = decorated_execute( completion_config=completion_config, query=request.query, diff --git a/backend/app/services/llm/mappers.py b/backend/app/services/llm/mappers.py index 838912cdf..d4efc2e9f 100644 --- a/backend/app/services/llm/mappers.py +++ b/backend/app/services/llm/mappers.py @@ -127,6 +127,11 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: response_format = kaapi_params.get("response_format") if response_format: google_params["response_format"] = response_format + + reasoning = kaapi_params.get("reasoning") + if reasoning: + google_params["reasoning"] = reasoning + # Warn about unsupported parameters if kaapi_params.get("knowledge_base_ids"): # TODO: Will take up later, when we add google filesearch tool support @@ -134,11 +139,6 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]: "Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored." ) - if kaapi_params.get("reasoning") is not None: - warnings.append( - "Parameter 'reasoning' is not applicable for Google AI and was ignored." - ) - return google_params, warnings diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index fcd64dfc7..5414cfcf9 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -5,13 +5,50 @@ """ from abc import ABC, abstractmethod -from typing import Any, List, TypeAlias +from typing import Any, Literal + +from pydantic import model_validator +from sqlmodel import SQLModel from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams from app.models.llm.request import TextContent, ImageContent, PDFContent -ContentItem: TypeAlias = TextContent | ImageContent | PDFContent -MultiModalInput: TypeAlias = List[ContentItem] + +class MultiModalInput(SQLModel): + """Resolved multimodal input containing a list of content parts.""" + + parts: list[TextContent | ImageContent | PDFContent] + + @model_validator(mode="after") + def validate_parts(self): + if not self.parts: + raise ValueError("MultiModalInput requires at least one content part") + return self + + +COMPLETION_TYPE_ALLOWED_INPUT: dict[str, set[type]] = { + "text": {str}, + "stt": {str}, + "tts": {str}, + "image": {list}, + "pdf": {list}, + "multimodal": {MultiModalInput}, +} + + +def validate_completion_input(completion_type: str, resolved_input: Any) -> str | None: + """Returns error message if mismatch, else None.""" + allowed = COMPLETION_TYPE_ALLOWED_INPUT.get(completion_type) + if allowed is None: + return f"Unknown completion type: '{completion_type}'" + if type(resolved_input) not in allowed: + expected = " or ".join(t.__name__ for t in allowed) + return ( + f"completion type '{completion_type}' expects {expected} input, " + f"got {type(resolved_input).__name__}" + ) + return None + class BaseProvider(ABC): """Abstract base class for LLM providers. 
diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 58cd3d43c..342e92eb3 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -380,24 +380,15 @@ def _execute_tts( def _execute_image( self, completion_config: NativeCompletionConfig, - resolved_content: ImageContent - | list[ - ImageContent - ], # using content here because we need mime type and format info for processing + resolved_input: list[ImageContent], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - contents = [] - if isinstance(resolved_content, list): - gemini_parts = self.format_parts(resolved_content) - contents = [{"role": "user", "parts": gemini_parts}] - else: - contents = [ - {"role": "user", "parts": self.format_parts([resolved_content])} - ] + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) @@ -460,24 +451,15 @@ def _execute_image( def _execute_pdf( self, completion_config: NativeCompletionConfig, - resolved_content: PDFContent - | list[ - PDFContent - ], # using content here because we need mime type and format info for processing + resolved_input: list[PDFContent], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - contents = [] - if isinstance(resolved_content, list): - gemini_parts = self.format_parts(resolved_content) - contents = [{"role": "user", "parts": gemini_parts}] - else: - contents = [ - {"role": "user", "parts": self.format_parts([resolved_content])} - ] + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] instructions = completion_config.params.get("instructions", "") temperature = completion_config.params.get("temperature", None) @@ -540,17 +522,15 @@ def _execute_pdf( def _execute_text( self, completion_config: NativeCompletionConfig, - resolved_input: str | list[TextContent | ImageContent | PDFContent], + resolved_input: str | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - contents = [] - - if isinstance(resolved_input, list): - gemini_parts = self.format_parts(resolved_input) + if isinstance(resolved_input, MultiModalInput): + gemini_parts = self.format_parts(resolved_input.parts) contents = [{"role": "user", "parts": gemini_parts}] else: contents = [{"role": "user", "parts": [{"text": resolved_input}]}] @@ -619,21 +599,11 @@ def _execute_multimodal( resolved_input: MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: - """ - Convert multimodal input's list of content parts into text response. 
- """ - model = completion_config.params.get("model") if not model: return None, "Missing 'model' in native params" - if not isinstance(resolved_input, MultiModalInput): - return ( - None, - "Invalid input type for multimodal completion, expected list of content parts", - ) - - gemini_parts = self.format_parts(resolved_input) + gemini_parts = self.format_parts(resolved_input.parts) contents = [{"role": "user", "parts": gemini_parts}] instructions = completion_config.params.get("instructions", "") @@ -698,7 +668,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str | MultiModalInput, + resolved_input: str | list[ImageContent] | list[PDFContent] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: try: @@ -726,14 +696,14 @@ def execute( elif completion_type == "image": return self._execute_image( completion_config=completion_config, - resolved_content=resolved_input, + resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) elif completion_type == "pdf": return self._execute_pdf( completion_config=completion_config, - resolved_content=resolved_input, + resolved_input=resolved_input, include_provider_raw_response=include_provider_raw_response, ) diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index 6368d0429..2f05a3aab 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -66,7 +66,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str, + resolved_input: str | list[ImageContent] | list[PDFContent] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: response: Response | None = None @@ -77,6 +77,10 @@ def execute( **completion_config.params, } if isinstance(resolved_input, MultiModalInput): + params["input"] = [ + {"role": "user", "content": self.format_parts(resolved_input.parts)} + ] + elif isinstance(resolved_input, list): params["input"] = [ {"role": "user", "content": self.format_parts(resolved_input)} ] diff --git a/backend/app/services/llm/providers/registry.py b/backend/app/services/llm/providers/registry.py index 15236b8d7..5eff4db19 100644 --- a/backend/app/services/llm/providers/registry.py +++ b/backend/app/services/llm/providers/registry.py @@ -3,7 +3,6 @@ import logging from sqlmodel import Session -from app.crud import get_provider_credential from app.services.llm.providers.base import BaseProvider from app.services.llm.providers.oai import OpenAIProvider from app.services.llm.providers.gai import GoogleAIProvider @@ -46,6 +45,8 @@ def supported_providers(cls) -> list[str]: def get_llm_provider( session: Session, provider_type: str, project_id: int, organization_id: int ) -> BaseProvider: + from app.crud.credentials import get_provider_credential + provider_class = LLMProvider.get_provider_class(provider_type) # e.g., "openai-native" → "openai", "claude-native" → "claude" diff --git a/backend/app/utils.py b/backend/app/utils.py index 7fa0973b9..29100adb6 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import base64 import functools as ft import ipaddress @@ -8,6 +10,7 @@ from pathlib import Path import requests import socket + from typing import Any, Dict, Generic, Optional, TypeVar from urllib.parse import urlparse @@ -25,8 +28,17 @@ from app.core 
import security from app.core.config import settings from app.crud.credentials import get_provider_credential -from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput -from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent +from app.models.llm.request import ( + TextInput, + AudioInput, + ImageInput, + PDFInput, + TextContent, + AudioContent, + ImageContent, + PDFContent, +) +from app.services.llm.providers.base import MultiModalInput logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -471,24 +483,24 @@ def resolve_pdf_content(pdf_input: PDFInput) -> list[PDFContent]: return contents -def resolve_input(query_input) -> tuple[str, str | None]: - """Resolve discriminated union input to content string. - - Args: - query_input: The input from QueryParams (TextInput or AudioInput) +def resolve_input( + query_input, +) -> tuple[str | list[ImageContent] | list[PDFContent] | "MultiModalInput", str | None]: + """Resolve query input to provider-ready format. Returns: - (content_string, None) on success - for text returns content value, for audio returns temp file path - ("", error_message) on failure + - TextInput/AudioInput: (str, None) + - ImageInput: (list[ImageContent], None) + - PDFInput: (list[PDFContent], None) + - list[QueryInput]: (MultiModalInput, None) + - Error: ("", error_message) """ - from app.models.llm.request import TextInput, AudioInput, ImageInput, PDFInput try: if isinstance(query_input, TextInput): return query_input.content.value, None elif isinstance(query_input, AudioInput): - # AudioInput content is base64-encoded audio mime_type = query_input.content.mime_type or "audio/wav" return resolve_audio_base64(query_input.content.value, mime_type) @@ -502,14 +514,14 @@ def resolve_input(query_input) -> tuple[str, str | None]: parts: list[ContentPart] = [] for item in query_input: if isinstance(item, TextInput): - parts.append(item.content) # TextContent instance + parts.append(item.content) elif isinstance(item, ImageInput): parts.extend(resolve_image_content(item)) elif isinstance(item, PDFInput): parts.extend(resolve_pdf_content(item)) else: - return [], f"Unsupported input type: {type(item)}" - return parts, None + return "", f"Unsupported input type: {type(item)}" + return MultiModalInput(parts=parts), None else: return "", f"Unknown input type: {type(query_input)}" From 74e328e0edfae48b0363181b1cf56cdfbf1182c1 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 07:51:52 +0530 Subject: [PATCH 06/16] Enhance multimodal support: Allow None for instructions in Image, PDF, and Multimodal parameters; update validation to restrict audio input in multimodal processing. 
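
A rough sketch of the rejection path this patch adds (model and helper
names are the ones defined in this series; the asserted error fragment
follows the tests added later in the series):

    from app.models.llm.request import (
        AudioContent,
        AudioInput,
        TextContent,
        TextInput,
    )
    from app.utils import resolve_input

    # Mixed lists may combine text, image, and pdf parts; audio is now
    # rejected with a hint to use the 'stt' completion type instead.
    mixed = [
        TextInput(content=TextContent(value="transcribe this clip")),
        AudioInput(content=AudioContent(value="<base64-audio>", mime_type="audio/wav")),
    ]
    resolved, error = resolve_input(mixed)
    assert resolved == "" and "stt" in error.lower()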
--- backend/app/models/llm/request.py | 6 +- backend/app/services/llm/providers/base.py | 80 ++++++++++++++++++---- backend/app/utils.py | 10 ++- 3 files changed, 77 insertions(+), 19 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 81efcb89a..1c557d397 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -58,7 +58,7 @@ class TTSLLMParams(SQLModel): class ImageLLMParams(SQLModel): model: str - instructions: str + instructions: str | None = None response_format: Literal["text"] | None = Field( None, description="Currently supports text type", @@ -73,7 +73,7 @@ class ImageLLMParams(SQLModel): class PDFLLMParams(SQLModel): model: str - instructions: str + instructions: str | None = None response_format: Literal["text"] | None = Field( None, description="Currently supports text type", @@ -88,7 +88,7 @@ class PDFLLMParams(SQLModel): class MultimodalLLMParams(SQLModel): model: str - instructions: str + instructions: str | None = None response_format: Literal["text"] | None = Field( None, description="Currently supports text type", diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 5414cfcf9..5fc75e711 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -11,7 +11,9 @@ from sqlmodel import SQLModel from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams -from app.models.llm.request import TextContent, ImageContent, PDFContent +from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent + +MULTIMODAL_ALLOWED_PARTS = (TextContent, ImageContent, PDFContent) class MultiModalInput(SQLModel): @@ -26,27 +28,75 @@ def validate_parts(self): return self -COMPLETION_TYPE_ALLOWED_INPUT: dict[str, set[type]] = { - "text": {str}, - "stt": {str}, - "tts": {str}, - "image": {list}, - "pdf": {list}, - "multimodal": {MultiModalInput}, +CONTENT_TYPE_LABEL: dict[type, str] = { + TextContent: "text", + AudioContent: "audio", + ImageContent: "image", + PDFContent: "pdf", +} + +INPUT_TYPE_LABEL: dict[type, str] = { + str: "text", + list: "list", + MultiModalInput: "multimodal (mixed input types)", } +COMPLETION_TYPE_RULES: dict[str, dict] = { + "text": {"type": str, "label": "text"}, + "stt": {"type": str, "label": "audio"}, + "tts": {"type": str, "label": "text"}, + "image": {"type": list, "element_type": ImageContent, "label": "image"}, + "pdf": {"type": list, "element_type": PDFContent, "label": "pdf"}, + "multimodal": {"type": MultiModalInput, "label": "multimodal"}, +} + + +def _get_content_label(content: Any) -> str: + return CONTENT_TYPE_LABEL.get(type(content), type(content).__name__) + def validate_completion_input(completion_type: str, resolved_input: Any) -> str | None: - """Returns error message if mismatch, else None.""" - allowed = COMPLETION_TYPE_ALLOWED_INPUT.get(completion_type) - if allowed is None: + """Returns error message if input type doesn't match completion type, else None.""" + rule = COMPLETION_TYPE_RULES.get(completion_type) + if rule is None: return f"Unknown completion type: '{completion_type}'" - if type(resolved_input) not in allowed: - expected = " or ".join(t.__name__ for t in allowed) + + expected_type = rule["type"] + label = rule["label"] + + if not isinstance(resolved_input, expected_type): + actual_label = INPUT_TYPE_LABEL.get( + type(resolved_input), type(resolved_input).__name__ + ) + hint = ( + " Please set completion type to 
'multimodal' when sending mixed input types." + if isinstance(resolved_input, MultiModalInput) + else f" Please ensure the input type matches the completion type." + ) return ( - f"completion type '{completion_type}' expects {expected} input, " - f"got {type(resolved_input).__name__}" + f"Input type mismatch: completion type '{completion_type}' expects " + f"'{label}' input, but received {actual_label}.{hint}" ) + + if isinstance(resolved_input, list): + element_type = rule.get("element_type") + if element_type: + for item in resolved_input: + if not isinstance(item, element_type): + return ( + f"Input type mismatch: completion type '{completion_type}' expects " + f"'{label}' input, but received '{_get_content_label(item)}' content. " + f"Please ensure the input type matches the completion type." + ) + + if isinstance(resolved_input, MultiModalInput): + for part in resolved_input.parts: + if not isinstance(part, MULTIMODAL_ALLOWED_PARTS): + return ( + f"Unsupported content in multimodal input: '{_get_content_label(part)}'. " + f"Multimodal supports text, image, and pdf only. Audio is not supported." + ) + return None diff --git a/backend/app/utils.py b/backend/app/utils.py index 29100adb6..7fb218077 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -519,8 +519,16 @@ def resolve_input( parts.extend(resolve_image_content(item)) elif isinstance(item, PDFInput): parts.extend(resolve_pdf_content(item)) + elif isinstance(item, AudioInput): + return ( + "", + "Audio input is not supported in multimodal. Please use completion type 'stt' for audio processing.", + ) else: - return "", f"Unsupported input type: {type(item)}" + return ( + "", + "Unsupported input type in multimodal list. Multimodal only supports text, image, and pdf inputs.", + ) return MultiModalInput(parts=parts), None else: From 42a001dda34385af0c18c92d7ef0ca8082ca6930 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 08:34:07 +0530 Subject: [PATCH 07/16] Refactor multimodal input handling: Introduce ContentPart type for better type management and update relevant classes to use it. 
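
A minimal sketch of the resolved shape after this refactor (names are
the ones introduced here; the expected dicts mirror the format_parts
output asserted by the tests in this series):

    from app.models.llm.request import ImageContent, TextContent
    from app.services.llm.providers.base import ContentPart, MultiModalInput
    from app.services.llm.providers.oai import OpenAIProvider

    parts: list[ContentPart] = [
        TextContent(value="describe this image"),
        ImageContent(format="base64", value="abc123", mime_type="image/png"),
    ]
    mm = MultiModalInput(parts=parts)

    # Providers consume mm.parts, not the wrapper itself.
    assert OpenAIProvider.format_parts(mm.parts) == [
        {"type": "input_text", "text": "describe this image"},
        {"type": "input_image", "image_url": "data:image/png;base64,abc123"},
    ]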
--- backend/app/models/llm/request.py | 33 ++++++++++++++++------ backend/app/services/llm/jobs.py | 1 - backend/app/services/llm/providers/base.py | 7 +++-- backend/app/services/llm/providers/gai.py | 10 +++---- backend/app/services/llm/providers/oai.py | 4 +-- backend/app/utils.py | 5 +--- 6 files changed, 36 insertions(+), 24 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 1c557d397..87e3ded9c 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -59,9 +59,9 @@ class TTSLLMParams(SQLModel): class ImageLLMParams(SQLModel): model: str instructions: str | None = None - response_format: Literal["text"] | None = Field( - None, - description="Currently supports text type", + knowledge_base_ids: list[str] | None = Field( + default=None, + description="List of vector store IDs to use for knowledge retrieval", ) temperature: float | None = Field( default=0.2, @@ -69,14 +69,19 @@ class ImageLLMParams(SQLModel): le=2.0, ) reasoning: Literal["low", "medium", "high"] | None = None + max_num_results: int | None = Field( + default=None, + ge=1, + description="Maximum number of candidate results to return", + ) class PDFLLMParams(SQLModel): model: str instructions: str | None = None - response_format: Literal["text"] | None = Field( - None, - description="Currently supports text type", + knowledge_base_ids: list[str] | None = Field( + default=None, + description="List of vector store IDs to use for knowledge retrieval", ) temperature: float | None = Field( default=0.2, @@ -84,14 +89,19 @@ class PDFLLMParams(SQLModel): le=2.0, ) reasoning: Literal["low", "medium", "high"] | None = None + max_num_results: int | None = Field( + default=None, + ge=1, + description="Maximum number of candidate results to return", + ) class MultimodalLLMParams(SQLModel): model: str instructions: str | None = None - response_format: Literal["text"] | None = Field( - None, - description="Currently supports text type", + knowledge_base_ids: list[str] | None = Field( + default=None, + description="List of vector store IDs to use for knowledge retrieval", ) temperature: float | None = Field( default=0.2, @@ -99,6 +109,11 @@ class MultimodalLLMParams(SQLModel): le=2.0, ) reasoning: Literal["low", "medium", "high"] | None = None + max_num_results: int | None = Field( + default=None, + ge=1, + description="Maximum number of candidate results to return", + ) KaapiLLMParams = Union[ diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 511516139..4ea45ba22 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -114,7 +114,6 @@ def resolved_input_context( even if errors occur during LLM execution. 
""" resolved_input, error = resolve_input(query_input) - print(f"Resolved input: {resolved_input}, error: {error}") if error: raise ValueError(error) diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 5fc75e711..959494f6f 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -13,13 +13,14 @@ from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent +ContentPart = TextContent | ImageContent | PDFContent MULTIMODAL_ALLOWED_PARTS = (TextContent, ImageContent, PDFContent) class MultiModalInput(SQLModel): """Resolved multimodal input containing a list of content parts.""" - parts: list[TextContent | ImageContent | PDFContent] + parts: list[ContentPart] @model_validator(mode="after") def validate_parts(self): @@ -71,7 +72,7 @@ def validate_completion_input(completion_type: str, resolved_input: Any) -> str hint = ( " Please set completion type to 'multimodal' when sending mixed input types." if isinstance(resolved_input, MultiModalInput) - else f" Please ensure the input type matches the completion type." + else " Please ensure the input type matches the completion type." ) return ( f"Input type mismatch: completion type '{completion_type}' expects " @@ -134,7 +135,7 @@ def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str | list[TextContent | ImageContent | PDFContent], + resolved_input: str | list[ContentPart], include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: """Execute LLM API call. diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index 342e92eb3..fe920ba07 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -24,7 +24,7 @@ PDFContent, ) from app.models.llm.response import AudioOutput, AudioContent -from app.services.llm.providers.base import BaseProvider, MultiModalInput +from app.services.llm.providers.base import BaseProvider, ContentPart, MultiModalInput from app.core.audio_utils import convert_pcm_to_mp3, convert_pcm_to_ogg logger = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def create_client(credentials: dict[str, Any]) -> Any: @staticmethod def format_parts( - parts: list[TextContent | ImageContent | PDFContent], + parts: list[ContentPart], ) -> list[dict]: items = [] for part in parts: @@ -419,7 +419,7 @@ def _execute_image( reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 else: logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + f"[GoogleAIProvider._execute_image] Response missing usage_metadata, using zeros" ) input_tokens = 0 output_tokens = 0 @@ -564,7 +564,7 @@ def _execute_text( reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 else: logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + f"[GoogleAIProvider._execute_text] Response missing usage_metadata, using zeros" ) input_tokens = 0 output_tokens = 0 @@ -635,7 +635,7 @@ def _execute_multimodal( reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 else: logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" + f"[GoogleAIProvider._execute_multimodal] Response missing usage_metadata, using zeros" ) input_tokens = 0 output_tokens = 
0 diff --git a/backend/app/services/llm/providers/oai.py b/backend/app/services/llm/providers/oai.py index 2f05a3aab..392487eea 100644 --- a/backend/app/services/llm/providers/oai.py +++ b/backend/app/services/llm/providers/oai.py @@ -16,7 +16,7 @@ ImageContent, PDFContent, ) -from app.services.llm.providers.base import BaseProvider, MultiModalInput +from app.services.llm.providers.base import BaseProvider, ContentPart, MultiModalInput logger = logging.getLogger(__name__) @@ -39,7 +39,7 @@ def create_client(credentials: dict[str, Any]) -> Any: @staticmethod def format_parts( - parts: list[TextContent | ImageContent | PDFContent], + parts: list[ContentPart], ) -> list[dict]: items = [] for part in parts: diff --git a/backend/app/utils.py b/backend/app/utils.py index 7fb218077..9c1be2a11 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -33,18 +33,15 @@ AudioInput, ImageInput, PDFInput, - TextContent, - AudioContent, ImageContent, PDFContent, ) -from app.services.llm.providers.base import MultiModalInput +from app.services.llm.providers.base import ContentPart, MultiModalInput logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) T = TypeVar("T") -ContentPart = TextContent | AudioContent | ImageContent | PDFContent class APIResponse(BaseModel, Generic[T]): From 335e59b3e5e59f466ee921ad3670f2a645402189 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 09:49:11 +0530 Subject: [PATCH 08/16] Add comprehensive tests for multimodal input validation and processing --- .../app/tests/services/llm/test_multimodal.py | 450 ++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100644 backend/app/tests/services/llm/test_multimodal.py diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py new file mode 100644 index 000000000..9744c82ce --- /dev/null +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -0,0 +1,450 @@ +import pytest + +from app.models.llm.request import ( + TextInput, + AudioInput, + ImageInput, + PDFInput, + TextContent, + AudioContent, + ImageContent, + PDFContent, +) +from app.services.llm.providers.base import ( + ContentPart, + MultiModalInput, + validate_completion_input, + _get_content_label, +) +from app.services.llm.providers.oai import OpenAIProvider +from app.services.llm.providers.gai import GoogleAIProvider +from app.utils import ( + resolve_input, + resolve_image_content, + resolve_pdf_content, +) + + +class TestValidateCompletionInput: + def test_text_with_str_passes(self): + assert validate_completion_input("text", "hello") is None + + def test_stt_with_str_passes(self): + assert validate_completion_input("stt", "/tmp/audio.wav") is None + + def test_tts_with_str_passes(self): + assert validate_completion_input("tts", "say this") is None + + def test_image_with_image_content_list_passes(self): + parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] + assert validate_completion_input("image", parts) is None + + def test_pdf_with_pdf_content_list_passes(self): + parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] + assert validate_completion_input("pdf", parts) is None + + def test_multimodal_with_multimodal_input_passes(self): + mm = MultiModalInput( + parts=[ + TextContent(value="hello"), + ImageContent(format="base64", value="abc", mime_type="image/png"), + ] + ) + assert validate_completion_input("multimodal", mm) is None + + def 
test_text_input_with_pdf_completion_fails(self): + error = validate_completion_input("pdf", "some text") + assert error is not None + assert "input type mismatch" in error.lower() + assert "'pdf'" in error + assert "text" in error + + def test_multimodal_input_with_image_completion_fails(self): + mm = MultiModalInput( + parts=[ + TextContent(value="hello"), + ImageContent(format="base64", value="abc", mime_type="image/png"), + ] + ) + error = validate_completion_input("image", mm) + assert error is not None + assert "multimodal" in error.lower() + assert "set completion type to 'multimodal'" in error + + def test_text_input_with_image_completion_no_multimodal_hint(self): + error = validate_completion_input("image", "some text") + assert error is not None + assert "set completion type to 'multimodal'" not in error + assert "Please ensure the input type matches" in error + + def test_pdf_content_in_image_completion_fails(self): + parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] + error = validate_completion_input("image", parts) + assert error is not None + assert "'pdf'" in error + + def test_image_content_in_pdf_completion_fails(self): + parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] + error = validate_completion_input("pdf", parts) + assert error is not None + assert "'image'" in error + + def test_unknown_completion_type(self): + error = validate_completion_input("unknown_type", "hello") + assert error is not None + assert "Unknown completion type" in error + + def test_list_input_with_text_completion_fails(self): + parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] + error = validate_completion_input("text", parts) + assert error is not None + assert "text" in error + + +class TestMultiModalInput: + def test_valid_parts(self): + mm = MultiModalInput( + parts=[ + TextContent(value="hello"), + ImageContent(format="base64", value="abc", mime_type="image/png"), + PDFContent(format="base64", value="abc", mime_type="application/pdf"), + ] + ) + assert len(mm.parts) == 3 + + def test_empty_parts_raises(self): + with pytest.raises(Exception): + MultiModalInput(parts=[]) + + def test_single_text_part(self): + mm = MultiModalInput(parts=[TextContent(value="only text")]) + assert len(mm.parts) == 1 + + +class TestGetContentLabel: + def test_text_content(self): + assert _get_content_label(TextContent(value="hi")) == "text" + + def test_image_content(self): + assert ( + _get_content_label( + ImageContent(format="base64", value="abc", mime_type="image/png") + ) + == "image" + ) + + def test_pdf_content(self): + assert ( + _get_content_label( + PDFContent(format="base64", value="abc", mime_type="application/pdf") + ) + == "pdf" + ) + + def test_audio_content(self): + assert ( + _get_content_label(AudioContent(value="abc", mime_type="audio/wav")) + == "audio" + ) + + +class TestResolveInputMultimodal: + def test_image_input_returns_image_content_list(self): + img = ImageInput( + content=ImageContent(format="base64", value="abc", mime_type="image/png") + ) + result, error = resolve_input(img) + assert error is None + assert isinstance(result, list) + assert len(result) == 1 + assert isinstance(result[0], ImageContent) + + def test_pdf_input_returns_pdf_content_list(self): + pdf = PDFInput( + content=PDFContent( + format="base64", value="abc", mime_type="application/pdf" + ) + ) + result, error = resolve_input(pdf) + assert error is None + assert isinstance(result, list) + assert len(result) == 1 + assert 
isinstance(result[0], PDFContent) + + def test_multimodal_list_returns_multimodal_input(self): + inputs = [ + TextInput(content=TextContent(value="describe")), + ImageInput( + content=ImageContent( + format="base64", value="abc", mime_type="image/png" + ) + ), + ] + result, error = resolve_input(inputs) + assert error is None + assert isinstance(result, MultiModalInput) + assert len(result.parts) == 2 + + def test_multimodal_list_with_pdf(self): + inputs = [ + TextInput(content=TextContent(value="analyze")), + PDFInput( + content=PDFContent( + format="base64", value="abc", mime_type="application/pdf" + ) + ), + ] + result, error = resolve_input(inputs) + assert error is None + assert isinstance(result, MultiModalInput) + assert len(result.parts) == 2 + + def test_multimodal_list_with_audio_rejected(self): + inputs = [ + TextInput(content=TextContent(value="hello")), + AudioInput(content=AudioContent(value="abc", mime_type="audio/wav")), + ] + result, error = resolve_input(inputs) + assert error is not None + assert "audio" in error.lower() + assert "stt" in error.lower() + + def test_image_input_default_mime_type(self): + img = ImageInput(content=ImageContent(format="base64", value="abc")) + result, error = resolve_input(img) + assert error is None + assert result[0].mime_type == "image/png" + + def test_pdf_input_default_mime_type(self): + pdf = PDFInput(content=PDFContent(format="base64", value="abc")) + result, error = resolve_input(pdf) + assert error is None + assert result[0].mime_type == "application/pdf" + + def test_image_input_multiple_contents(self): + img = ImageInput( + content=[ + ImageContent(format="base64", value="abc1", mime_type="image/png"), + ImageContent( + format="url", + value="https://example.com/img.jpg", + mime_type="image/jpeg", + ), + ] + ) + result, error = resolve_input(img) + assert error is None + assert len(result) == 2 + + def test_multimodal_mixed_types_in_parts(self): + inputs = [ + TextInput(content=TextContent(value="look at these")), + ImageInput( + content=ImageContent( + format="base64", value="img", mime_type="image/png" + ) + ), + PDFInput( + content=PDFContent( + format="base64", value="pdf", mime_type="application/pdf" + ) + ), + ] + result, error = resolve_input(inputs) + assert error is None + assert isinstance(result, MultiModalInput) + assert len(result.parts) == 3 + assert isinstance(result.parts[0], TextContent) + assert isinstance(result.parts[1], ImageContent) + assert isinstance(result.parts[2], PDFContent) + + +class TestOpenAIFormatParts: + def test_text_part(self): + parts = [TextContent(value="hello")] + result = OpenAIProvider.format_parts(parts) + assert result == [{"type": "input_text", "text": "hello"}] + + def test_image_base64_part(self): + parts = [ImageContent(format="base64", value="abc123", mime_type="image/png")] + result = OpenAIProvider.format_parts(parts) + assert len(result) == 1 + assert result[0]["type"] == "input_image" + assert result[0]["image_url"] == "data:image/png;base64,abc123" + + def test_image_url_part(self): + parts = [ + ImageContent( + format="url", + value="https://example.com/img.jpg", + mime_type="image/jpeg", + ) + ] + result = OpenAIProvider.format_parts(parts) + assert result[0]["type"] == "input_image" + assert result[0]["image_url"] == "https://example.com/img.jpg" + + def test_pdf_base64_part(self): + parts = [ + PDFContent(format="base64", value="pdf123", mime_type="application/pdf") + ] + result = OpenAIProvider.format_parts(parts) + assert len(result) == 1 + assert result[0]["type"] == 
"input_file" + assert result[0]["file_url"] == "data:application/pdf;base64,pdf123" + + def test_pdf_url_part(self): + parts = [ + PDFContent( + format="url", + value="https://example.com/doc.pdf", + mime_type="application/pdf", + ) + ] + result = OpenAIProvider.format_parts(parts) + assert result[0]["type"] == "input_file" + assert result[0]["file_url"] == "https://example.com/doc.pdf" + + def test_mixed_parts(self): + parts = [ + TextContent(value="describe"), + ImageContent(format="base64", value="img", mime_type="image/png"), + PDFContent( + format="url", + value="https://example.com/doc.pdf", + mime_type="application/pdf", + ), + ] + result = OpenAIProvider.format_parts(parts) + assert len(result) == 3 + assert result[0]["type"] == "input_text" + assert result[1]["type"] == "input_image" + assert result[2]["type"] == "input_file" + + +class TestGoogleAIFormatParts: + def test_text_part(self): + parts = [TextContent(value="hello")] + result = GoogleAIProvider.format_parts(parts) + assert result == [{"text": "hello"}] + + def test_image_base64_part(self): + parts = [ImageContent(format="base64", value="abc123", mime_type="image/png")] + result = GoogleAIProvider.format_parts(parts) + assert len(result) == 1 + assert result[0] == { + "inline_data": {"data": "abc123", "mime_type": "image/png"} + } + + def test_image_url_part(self): + parts = [ + ImageContent( + format="url", + value="https://example.com/img.jpg", + mime_type="image/jpeg", + ) + ] + result = GoogleAIProvider.format_parts(parts) + assert result[0] == { + "file_data": { + "file_uri": "https://example.com/img.jpg", + "mime_type": "image/jpeg", + "display_name": None, + } + } + + def test_pdf_base64_part(self): + parts = [ + PDFContent(format="base64", value="pdf123", mime_type="application/pdf") + ] + result = GoogleAIProvider.format_parts(parts) + assert result[0] == { + "inline_data": {"data": "pdf123", "mime_type": "application/pdf"} + } + + def test_pdf_url_part(self): + parts = [ + PDFContent( + format="url", + value="https://example.com/doc.pdf", + mime_type="application/pdf", + ) + ] + result = GoogleAIProvider.format_parts(parts) + assert result[0] == { + "file_data": { + "file_uri": "https://example.com/doc.pdf", + "mime_type": "application/pdf", + "display_name": None, + } + } + + def test_mixed_parts(self): + parts = [ + TextContent(value="analyze"), + ImageContent( + format="url", value="https://img.com/a.jpg", mime_type="image/jpeg" + ), + PDFContent(format="base64", value="pdf", mime_type="application/pdf"), + ] + result = GoogleAIProvider.format_parts(parts) + assert len(result) == 3 + assert "text" in result[0] + assert "file_data" in result[1] + assert "inline_data" in result[2] + + +class TestResolveImageContent: + def test_single_content(self): + img = ImageInput( + content=ImageContent(format="base64", value="abc", mime_type="image/png") + ) + result = resolve_image_content(img) + assert len(result) == 1 + assert result[0].mime_type == "image/png" + + def test_default_mime_type(self): + img = ImageInput(content=ImageContent(format="base64", value="abc")) + result = resolve_image_content(img) + assert result[0].mime_type == "image/png" + + def test_list_content(self): + img = ImageInput( + content=[ + ImageContent(format="base64", value="a", mime_type="image/png"), + ImageContent(format="base64", value="b", mime_type="image/jpeg"), + ] + ) + result = resolve_image_content(img) + assert len(result) == 2 + + +class TestResolvePdfContent: + def test_single_content(self): + pdf = PDFInput( + content=PDFContent( 
+                format="base64", value="abc", mime_type="application/pdf"
+            )
+        )
+        result = resolve_pdf_content(pdf)
+        assert len(result) == 1
+        assert result[0].mime_type == "application/pdf"
+
+    def test_default_mime_type(self):
+        pdf = PDFInput(content=PDFContent(format="base64", value="abc"))
+        result = resolve_pdf_content(pdf)
+        assert result[0].mime_type == "application/pdf"
+
+    def test_list_content(self):
+        pdf = PDFInput(
+            content=[
+                PDFContent(format="base64", value="a", mime_type="application/pdf"),
+                PDFContent(
+                    format="url",
+                    value="https://example.com/doc.pdf",
+                    mime_type="application/pdf",
+                ),
+            ]
+        )
+        result = resolve_pdf_content(pdf)
+        assert len(result) == 2

From 56c7a44164e69098d37389461eed4bbe9042cd9f Mon Sep 17 00:00:00 2001
From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com>
Date: Thu, 26 Feb 2026 17:16:21 +0530
Subject: [PATCH 09/16] Remove multimodal, image, and pdf LLM params from both
 input type and completion type

---
 backend/app/models/llm/request.py             | 72 +-----------
 backend/app/services/llm/jobs.py              | 13 ---
 backend/app/services/llm/providers/base.py    | 77 +------------
 .../app/tests/services/llm/test_multimodal.py | 105 ------------------
 4 files changed, 5 insertions(+), 262 deletions(-)

diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index 87e3ded9c..778039b62 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -56,73 +56,10 @@ class TTSLLMParams(SQLModel):
     response_format: Literal["mp3", "wav", "ogg"] | None = "wav"
 
 
-class ImageLLMParams(SQLModel):
-    model: str
-    instructions: str | None = None
-    knowledge_base_ids: list[str] | None = Field(
-        default=None,
-        description="List of vector store IDs to use for knowledge retrieval",
-    )
-    temperature: float | None = Field(
-        default=0.2,
-        ge=0.0,
-        le=2.0,
-    )
-    reasoning: Literal["low", "medium", "high"] | None = None
-    max_num_results: int | None = Field(
-        default=None,
-        ge=1,
-        description="Maximum number of candidate results to return",
-    )
-
-
-class PDFLLMParams(SQLModel):
-    model: str
-    instructions: str | None = None
-    knowledge_base_ids: list[str] | None = Field(
-        default=None,
-        description="List of vector store IDs to use for knowledge retrieval",
-    )
-    temperature: float | None = Field(
-        default=0.2,
-        ge=0.0,
-        le=2.0,
-    )
-    reasoning: Literal["low", "medium", "high"] | None = None
-    max_num_results: int | None = Field(
-        default=None,
-        ge=1,
-        description="Maximum number of candidate results to return",
-    )
-
-
-class MultimodalLLMParams(SQLModel):
-    model: str
-    instructions: str | None = None
-    knowledge_base_ids: list[str] | None = Field(
-        default=None,
-        description="List of vector store IDs to use for knowledge retrieval",
-    )
-    temperature: float | None = Field(
-        default=0.2,
-        ge=0.0,
-        le=2.0,
-    )
-    reasoning: Literal["low", "medium", "high"] | None = None
-    max_num_results: int | None = Field(
-        default=None,
-        ge=1,
-        description="Maximum number of candidate results to return",
-    )
-
-
 KaapiLLMParams = Union[
     TextLLMParams,
     STTLLMParams,
     TTSLLMParams,
-    ImageLLMParams,
-    PDFLLMParams,
-    MultimodalLLMParams,
 ]
 
 
@@ -277,7 +214,7 @@ class KaapiCompletionConfig(SQLModel):
         ..., description="LLM provider (openai)"
     )
 
-    type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field(
+    type: Literal["text", "stt", "tts"] = Field(
         ..., description="Completion config type.
Params schema varies by type" ) params: dict[str, Any] = Field( @@ -292,9 +229,6 @@ def validate_params(self): "text": TextLLMParams, "stt": STTLLMParams, "tts": TTSLLMParams, - "image": ImageLLMParams, - "pdf": PDFLLMParams, - "multimodal": MultimodalLLMParams, } model_class = param_models[self.type] validated = model_class.model_validate(self.params) @@ -491,12 +425,12 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( + input_type: Literal["text", "audio", "image"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image, pdf, multimodal", + comment="Input type: text, audio, image", ), ) diff --git a/backend/app/services/llm/jobs.py b/backend/app/services/llm/jobs.py index 4ea45ba22..5cdc0d32b 100644 --- a/backend/app/services/llm/jobs.py +++ b/backend/app/services/llm/jobs.py @@ -29,7 +29,6 @@ run_guardrails_validation, ) from app.services.llm.providers.registry import get_llm_provider -from app.services.llm.providers.base import validate_completion_input from app.services.llm.mappers import transform_kaapi_config_to_native from app.utils import APIResponse, send_callback, resolve_input, cleanup_temp_file @@ -396,18 +395,6 @@ def execute_job( # Resolve input and execute LLM (context manager handles cleanup) try: with resolved_input_context(request.query.input) as resolved_input: - mismatch = validate_completion_input( - completion_config.type, resolved_input - ) - if mismatch: - callback_response = APIResponse.failure_response( - error=mismatch, - metadata=request.request_metadata, - ) - return handle_job_error( - job_uuid, callback_url_str, callback_response - ) - response, error = decorated_execute( completion_config=completion_config, query=request.query, diff --git a/backend/app/services/llm/providers/base.py b/backend/app/services/llm/providers/base.py index 959494f6f..f159f0f1c 100644 --- a/backend/app/services/llm/providers/base.py +++ b/backend/app/services/llm/providers/base.py @@ -5,16 +5,15 @@ """ from abc import ABC, abstractmethod -from typing import Any, Literal +from typing import Any from pydantic import model_validator from sqlmodel import SQLModel from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams -from app.models.llm.request import TextContent, AudioContent, ImageContent, PDFContent +from app.models.llm.request import TextContent, ImageContent, PDFContent ContentPart = TextContent | ImageContent | PDFContent -MULTIMODAL_ALLOWED_PARTS = (TextContent, ImageContent, PDFContent) class MultiModalInput(SQLModel): @@ -29,78 +28,6 @@ def validate_parts(self): return self -CONTENT_TYPE_LABEL: dict[type, str] = { - TextContent: "text", - AudioContent: "audio", - ImageContent: "image", - PDFContent: "pdf", -} - -INPUT_TYPE_LABEL: dict[type, str] = { - str: "text", - list: "list", - MultiModalInput: "multimodal (mixed input types)", -} - -COMPLETION_TYPE_RULES: dict[str, dict] = { - "text": {"type": str, "label": "text"}, - "stt": {"type": str, "label": "audio"}, - "tts": {"type": str, "label": "text"}, - "image": {"type": list, "element_type": ImageContent, "label": "image"}, - "pdf": {"type": list, "element_type": PDFContent, "label": "pdf"}, - "multimodal": {"type": MultiModalInput, "label": "multimodal"}, -} - - -def _get_content_label(content: Any) -> str: - return CONTENT_TYPE_LABEL.get(type(content), type(content).__name__) - - -def validate_completion_input(completion_type: str, resolved_input: Any) -> str | None: - 
"""Returns error message if input type doesn't match completion type, else None.""" - rule = COMPLETION_TYPE_RULES.get(completion_type) - if rule is None: - return f"Unknown completion type: '{completion_type}'" - - expected_type = rule["type"] - label = rule["label"] - - if not isinstance(resolved_input, expected_type): - actual_label = INPUT_TYPE_LABEL.get( - type(resolved_input), type(resolved_input).__name__ - ) - hint = ( - " Please set completion type to 'multimodal' when sending mixed input types." - if isinstance(resolved_input, MultiModalInput) - else " Please ensure the input type matches the completion type." - ) - return ( - f"Input type mismatch: completion type '{completion_type}' expects " - f"'{label}' input, but received {actual_label}.{hint}" - ) - - if isinstance(resolved_input, list): - element_type = rule.get("element_type") - if element_type: - for item in resolved_input: - if not isinstance(item, element_type): - return ( - f"Input type mismatch: completion type '{completion_type}' expects " - f"'{label}' input, but received '{_get_content_label(item)}' content. " - f"Please ensure the input type matches the completion type." - ) - - if isinstance(resolved_input, MultiModalInput): - for part in resolved_input.parts: - if not isinstance(part, MULTIMODAL_ALLOWED_PARTS): - return ( - f"Unsupported content in multimodal input: '{_get_content_label(part)}'. " - f"Multimodal supports text, image, and pdf only. Audio is not supported." - ) - - return None - - class BaseProvider(ABC): """Abstract base class for LLM providers. diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py index 9744c82ce..5d019ce95 100644 --- a/backend/app/tests/services/llm/test_multimodal.py +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -13,8 +13,6 @@ from app.services.llm.providers.base import ( ContentPart, MultiModalInput, - validate_completion_input, - _get_content_label, ) from app.services.llm.providers.oai import OpenAIProvider from app.services.llm.providers.gai import GoogleAIProvider @@ -25,82 +23,6 @@ ) -class TestValidateCompletionInput: - def test_text_with_str_passes(self): - assert validate_completion_input("text", "hello") is None - - def test_stt_with_str_passes(self): - assert validate_completion_input("stt", "/tmp/audio.wav") is None - - def test_tts_with_str_passes(self): - assert validate_completion_input("tts", "say this") is None - - def test_image_with_image_content_list_passes(self): - parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] - assert validate_completion_input("image", parts) is None - - def test_pdf_with_pdf_content_list_passes(self): - parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] - assert validate_completion_input("pdf", parts) is None - - def test_multimodal_with_multimodal_input_passes(self): - mm = MultiModalInput( - parts=[ - TextContent(value="hello"), - ImageContent(format="base64", value="abc", mime_type="image/png"), - ] - ) - assert validate_completion_input("multimodal", mm) is None - - def test_text_input_with_pdf_completion_fails(self): - error = validate_completion_input("pdf", "some text") - assert error is not None - assert "input type mismatch" in error.lower() - assert "'pdf'" in error - assert "text" in error - - def test_multimodal_input_with_image_completion_fails(self): - mm = MultiModalInput( - parts=[ - TextContent(value="hello"), - ImageContent(format="base64", value="abc", mime_type="image/png"), - ] - ) - 
error = validate_completion_input("image", mm) - assert error is not None - assert "multimodal" in error.lower() - assert "set completion type to 'multimodal'" in error - - def test_text_input_with_image_completion_no_multimodal_hint(self): - error = validate_completion_input("image", "some text") - assert error is not None - assert "set completion type to 'multimodal'" not in error - assert "Please ensure the input type matches" in error - - def test_pdf_content_in_image_completion_fails(self): - parts = [PDFContent(format="base64", value="abc", mime_type="application/pdf")] - error = validate_completion_input("image", parts) - assert error is not None - assert "'pdf'" in error - - def test_image_content_in_pdf_completion_fails(self): - parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] - error = validate_completion_input("pdf", parts) - assert error is not None - assert "'image'" in error - - def test_unknown_completion_type(self): - error = validate_completion_input("unknown_type", "hello") - assert error is not None - assert "Unknown completion type" in error - - def test_list_input_with_text_completion_fails(self): - parts = [ImageContent(format="base64", value="abc", mime_type="image/png")] - error = validate_completion_input("text", parts) - assert error is not None - assert "text" in error - - class TestMultiModalInput: def test_valid_parts(self): mm = MultiModalInput( @@ -121,33 +43,6 @@ def test_single_text_part(self): assert len(mm.parts) == 1 -class TestGetContentLabel: - def test_text_content(self): - assert _get_content_label(TextContent(value="hi")) == "text" - - def test_image_content(self): - assert ( - _get_content_label( - ImageContent(format="base64", value="abc", mime_type="image/png") - ) - == "image" - ) - - def test_pdf_content(self): - assert ( - _get_content_label( - PDFContent(format="base64", value="abc", mime_type="application/pdf") - ) - == "pdf" - ) - - def test_audio_content(self): - assert ( - _get_content_label(AudioContent(value="abc", mime_type="audio/wav")) - == "audio" - ) - - class TestResolveInputMultimodal: def test_image_input_returns_image_content_list(self): img = ImageInput( From 54076f32972da46c1821bf186d294bb7897a9c62 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Thu, 26 Feb 2026 17:28:53 +0530 Subject: [PATCH 10/16] added the table reference for image, pdf and multimodal --- backend/app/crud/llm.py | 15 ++++++++++++++- backend/app/models/llm/request.py | 5 +++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/backend/app/crud/llm.py b/backend/app/crud/llm.py index c1e01e7e7..360bab4f2 100644 --- a/backend/app/crud/llm.py +++ b/backend/app/crud/llm.py @@ -11,6 +11,8 @@ TextInput, AudioInput, QueryInput, + ImageInput, + PDFInput, ) logger = logging.getLogger(__name__) @@ -73,15 +75,26 @@ def create_llm_call( else getattr(completion_config.params, "type", "text") ) - input_type: Literal["text", "audio", "image"] + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] output_type: Literal["text", "audio", "image"] | None + query_input = request.query.input + if completion_type == "stt": input_type = "audio" output_type = "text" elif completion_type == "tts": input_type = "text" output_type = "audio" + elif isinstance(query_input, ImageInput): + input_type = "image" + output_type = "text" + elif isinstance(query_input, PDFInput): + input_type = "pdf" + output_type = "text" + elif isinstance(query_input, list): + input_type = "multimodal" + 
output_type = "text" else: input_type = "text" output_type = "text" diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 778039b62..71e5c1480 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -425,12 +425,13 @@ class LlmCall(SQLModel, table=True): }, ) - input_type: Literal["text", "audio", "image"] = Field( + # NOTE: image, pdf, multimodal are internal labels stored in the table not user facing. + input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field( ..., sa_column=sa.Column( sa.String, nullable=False, - comment="Input type: text, audio, image", + comment="Input type: text, audio, image, pdf, multimodal", ), ) From 2c76a4aa9e86f2f7088b7ac63c9bf69e534aa87c Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sat, 28 Feb 2026 13:24:43 +0530 Subject: [PATCH 11/16] Remove completion_type for image, pdf, and multimodal types in NativeCompletionConfig and related methods in GoogleAIProvider --- backend/app/models/llm/request.py | 2 +- backend/app/services/llm/providers/gai.py | 241 +--------------------- 2 files changed, 6 insertions(+), 237 deletions(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 71e5c1480..57ccf2740 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -198,7 +198,7 @@ class NativeCompletionConfig(SQLModel): ..., description="Provider-specific parameters (schema varies by provider), should exactly match the provider's endpoint params structure", ) - type: Literal["text", "stt", "tts", "image", "pdf", "multimodal"] = Field( + type: Literal["text", "stt", "tts"] = Field( ..., description="Completion config type. Params schema varies by type" ) diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index fe920ba07..db9dc6e20 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -377,152 +377,10 @@ def _execute_tts( return llm_response, None - def _execute_image( - self, - completion_config: NativeCompletionConfig, - resolved_input: list[ImageContent], - include_provider_raw_response: bool = False, - ) -> tuple[LLMCallResponse | None, str | None]: - model = completion_config.params.get("model") - if not model: - return None, "Missing 'model' in native params" - - gemini_parts = self.format_parts(resolved_input) - contents = [{"role": "user", "parts": gemini_parts}] - - instructions = completion_config.params.get("instructions", "") - temperature = completion_config.params.get("temperature", None) - thinking_level = completion_config.params.get("reasoning", None) - - generation_kwargs = {} - if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) - - if temperature is not None: - generation_kwargs["temperature"] = temperature - - if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig( - include_thoughts=False, thinking_level=thinking_level - ) - - response = self.client.models.generate_content( - model=model, - contents=contents, - config=GenerateContentConfig(**generation_kwargs), - ) - - if response.usage_metadata: - input_tokens = response.usage_metadata.prompt_token_count or 0 - output_tokens = response.usage_metadata.candidates_token_count or 0 - total_tokens = response.usage_metadata.total_token_count or 0 - reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 - else: - 
logger.warning( - f"[GoogleAIProvider._execute_image] Response missing usage_metadata, using zeros" - ) - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - reasoning_tokens = 0 - - llm_response = LLMCallResponse( - response=LLMResponse( - provider_response_id=response.response_id, - model=response.model_version or model, - provider=completion_config.provider, - output=TextOutput(content=TextContent(value=response.text)), - ), - usage=Usage( - input_tokens=input_tokens, - output_tokens=output_tokens, - total_tokens=total_tokens, - reasoning_tokens=reasoning_tokens, - ), - ) - if include_provider_raw_response: - llm_response.provider_raw_response = response.model_dump(mode="json") - - logger.info( - f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" - ) - return llm_response, None - - def _execute_pdf( - self, - completion_config: NativeCompletionConfig, - resolved_input: list[PDFContent], - include_provider_raw_response: bool = False, - ) -> tuple[LLMCallResponse | None, str | None]: - model = completion_config.params.get("model") - if not model: - return None, "Missing 'model' in native params" - - gemini_parts = self.format_parts(resolved_input) - contents = [{"role": "user", "parts": gemini_parts}] - - instructions = completion_config.params.get("instructions", "") - temperature = completion_config.params.get("temperature", None) - thinking_level = completion_config.params.get("reasoning", None) - - generation_kwargs = {} - if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) - - if temperature is not None: - generation_kwargs["temperature"] = temperature - - if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig( - include_thoughts=False, thinking_level=thinking_level - ) - - response = self.client.models.generate_content( - model=model, - contents=contents, - config=GenerateContentConfig(**generation_kwargs), - ) - - if response.usage_metadata: - input_tokens = response.usage_metadata.prompt_token_count or 0 - output_tokens = response.usage_metadata.candidates_token_count or 0 - total_tokens = response.usage_metadata.total_token_count or 0 - reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 - else: - logger.warning( - f"[GoogleAIProvider._execute_stt] Response missing usage_metadata, using zeros" - ) - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - reasoning_tokens = 0 - - llm_response = LLMCallResponse( - response=LLMResponse( - provider_response_id=response.response_id, - model=response.model_version or model, - provider=completion_config.provider, - output=TextOutput(content=TextContent(value=response.text)), - ), - usage=Usage( - input_tokens=input_tokens, - output_tokens=output_tokens, - total_tokens=total_tokens, - reasoning_tokens=reasoning_tokens, - ), - ) - if include_provider_raw_response: - llm_response.provider_raw_response = response.model_dump(mode="json") - - logger.info( - f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" - ) - return llm_response, None - def _execute_text( self, completion_config: NativeCompletionConfig, - resolved_input: str | MultiModalInput, + resolved_input: str | list[ContentPart] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: model = completion_config.params.get("model") @@ -532,6 +390,9 @@ def _execute_text( if isinstance(resolved_input, MultiModalInput): gemini_parts = 
self.format_parts(resolved_input.parts) contents = [{"role": "user", "parts": gemini_parts}] + elif isinstance(resolved_input, list): + gemini_parts = self.format_parts(resolved_input) + contents = [{"role": "user", "parts": gemini_parts}] else: contents = [{"role": "user", "parts": [{"text": resolved_input}]}] @@ -593,82 +454,11 @@ def _execute_text( ) return llm_response, None - def _execute_multimodal( - self, - completion_config: NativeCompletionConfig, - resolved_input: MultiModalInput, - include_provider_raw_response: bool = False, - ) -> tuple[LLMCallResponse | None, str | None]: - model = completion_config.params.get("model") - if not model: - return None, "Missing 'model' in native params" - - gemini_parts = self.format_parts(resolved_input.parts) - contents = [{"role": "user", "parts": gemini_parts}] - - instructions = completion_config.params.get("instructions", "") - temperature = completion_config.params.get("temperature", None) - thinking_level = completion_config.params.get("reasoning", None) - - generation_kwargs = {} - if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) - - if temperature is not None: - generation_kwargs["temperature"] = temperature - - if thinking_level is not None: - generation_kwargs["thinking_config"] = ThinkingConfig( - include_thoughts=False, thinking_level=thinking_level - ) - - response = self.client.models.generate_content( - model=model, - contents=contents, - config=GenerateContentConfig(**generation_kwargs), - ) - - if response.usage_metadata: - input_tokens = response.usage_metadata.prompt_token_count or 0 - output_tokens = response.usage_metadata.candidates_token_count or 0 - total_tokens = response.usage_metadata.total_token_count or 0 - reasoning_tokens = response.usage_metadata.thoughts_token_count or 0 - else: - logger.warning( - f"[GoogleAIProvider._execute_multimodal] Response missing usage_metadata, using zeros" - ) - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - reasoning_tokens = 0 - - llm_response = LLMCallResponse( - response=LLMResponse( - provider_response_id=response.response_id, - model=response.model_version or model, - provider=completion_config.provider, - output=TextOutput(content=TextContent(value=response.text)), - ), - usage=Usage( - input_tokens=input_tokens, - output_tokens=output_tokens, - total_tokens=total_tokens, - reasoning_tokens=reasoning_tokens, - ), - ) - if include_provider_raw_response: - llm_response.provider_raw_response = response.model_dump(mode="json") - - logger.info( - f"[GoogleAIProvider._execute_text] Successfully generated text response: {response.response_id}" - ) - return llm_response, None - def execute( self, completion_config: NativeCompletionConfig, query: QueryParams, - resolved_input: str | list[ImageContent] | list[PDFContent] | MultiModalInput, + resolved_input: str | list[ContentPart] | MultiModalInput, include_provider_raw_response: bool = False, ) -> tuple[LLMCallResponse | None, str | None]: try: @@ -693,27 +483,6 @@ def execute( include_provider_raw_response=include_provider_raw_response, ) - elif completion_type == "image": - return self._execute_image( - completion_config=completion_config, - resolved_input=resolved_input, - include_provider_raw_response=include_provider_raw_response, - ) - - elif completion_type == "pdf": - return self._execute_pdf( - completion_config=completion_config, - resolved_input=resolved_input, - include_provider_raw_response=include_provider_raw_response, - ) - - elif completion_type == "multimodal": - 
return self._execute_multimodal( - completion_config=completion_config, - resolved_input=resolved_input, - include_provider_raw_response=include_provider_raw_response, - ) - except TypeError as e: # handle unexpected arguments gracefully error_message = f"Invalid or unexpected parameter in Config: {str(e)}" From 2ed9af916849ee8048f2481dc9bcb0744d37a103 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 18:14:41 +0530 Subject: [PATCH 12/16] Refactor credential patching in LLM provider tests and update reasoning parameter handling in Google params mapping tests --- .../services/llm/providers/test_registry.py | 8 ++---- .../app/tests/services/llm/test_mappers.py | 26 +++++++------------ 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/backend/app/tests/services/llm/providers/test_registry.py b/backend/app/tests/services/llm/providers/test_registry.py index b3daa44c4..4349da107 100644 --- a/backend/app/tests/services/llm/providers/test_registry.py +++ b/backend/app/tests/services/llm/providers/test_registry.py @@ -40,9 +40,7 @@ def test_get_llm_provider_with_openai(self, db: Session): """Test getting OpenAI provider successfully.""" project = get_project(db) - with patch( - "app.services.llm.providers.registry.get_provider_credential" - ) as mock_get_creds: + with patch("app.crud.credentials.get_provider_credential") as mock_get_creds: mock_get_creds.return_value = {"api_key": "test-api-key"} provider = get_llm_provider( @@ -94,9 +92,7 @@ def test_get_llm_provider_with_missing_credentials(self, db: Session): """Test handling of errors when credentials are not found.""" project = get_project(db) - with patch( - "app.services.llm.providers.registry.get_provider_credential" - ) as mock_get_creds: + with patch("app.crud.credentials.get_provider_credential") as mock_get_creds: mock_get_creds.return_value = None with pytest.raises(ValueError) as exc_info: diff --git a/backend/app/tests/services/llm/test_mappers.py b/backend/app/tests/services/llm/test_mappers.py index 2ecbcd7b2..7a70cf46c 100644 --- a/backend/app/tests/services/llm/test_mappers.py +++ b/backend/app/tests/services/llm/test_mappers.py @@ -292,8 +292,7 @@ def test_knowledge_base_ids_warning(self): assert "knowledge_base_ids" in warnings[0].lower() assert "not supported" in warnings[0] - def test_reasoning_warning(self): - """Test that reasoning parameter is not supported and generates warning.""" + def test_reasoning_passed_through(self): kaapi_params = TextLLMParams( model="gemini-2.5-pro", reasoning="high", @@ -304,13 +303,10 @@ def test_reasoning_warning(self): ) assert result["model"] == "gemini-2.5-pro" - assert "reasoning" not in result - assert len(warnings) == 1 - assert "reasoning" in warnings[0].lower() - assert "not applicable" in warnings[0] + assert result["reasoning"] == "high" + assert len(warnings) == 0 - def test_multiple_unsupported_params(self): - """Test that multiple unsupported parameters generate multiple warnings.""" + def test_knowledge_base_ids_unsupported(self): kaapi_params = TextLLMParams( model="gemini-2.5-pro", reasoning="medium", @@ -322,13 +318,10 @@ def test_multiple_unsupported_params(self): ) assert result["model"] == "gemini-2.5-pro" - assert "reasoning" not in result + assert result["reasoning"] == "medium" assert "knowledge_base_ids" not in result - assert len(warnings) == 2 - # Check both warnings are present - warning_text = " ".join(warnings).lower() - assert "reasoning" in warning_text - assert 
"knowledge_base_ids" in warning_text + assert len(warnings) == 1 + assert "knowledge_base_ids" in warnings[0].lower() class TestTransformKaapiConfigToNative: @@ -476,7 +469,6 @@ def test_transform_google_config(self): assert warnings == [] def test_transform_google_with_unsupported_params(self): - """Test that Google transformation warns about unsupported parameters.""" kaapi_config = KaapiCompletionConfig( provider="google", type="text", @@ -491,6 +483,6 @@ def test_transform_google_with_unsupported_params(self): assert result.provider == "google-native" assert result.params["model"] == "gemini-2.5-pro" + assert result.params["reasoning"] == "high" assert "knowledge_base_ids" not in result.params - assert "reasoning" not in result.params - assert len(warnings) == 2 + assert len(warnings) == 1 From 2b265db6333c5508d89a2e78cd3a645b41fdec90 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 19:50:35 +0530 Subject: [PATCH 13/16] Add tests for edge cases in multimodal input handling and enhance OpenAI/Google AI execution routing --- .../app/tests/services/llm/test_multimodal.py | 190 ++++++++++++++++++ 1 file changed, 190 insertions(+) diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py index 5d019ce95..b5185995b 100644 --- a/backend/app/tests/services/llm/test_multimodal.py +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -1,4 +1,5 @@ import pytest +from unittest.mock import MagicMock from app.models.llm.request import ( TextInput, @@ -9,6 +10,8 @@ AudioContent, ImageContent, PDFContent, + NativeCompletionConfig, + QueryParams, ) from app.services.llm.providers.base import ( ContentPart, @@ -343,3 +346,190 @@ def test_list_content(self): ) result = resolve_pdf_content(pdf) assert len(result) == 2 + + +class TestResolveInputEdgeCases: + def test_unknown_input_type(self): + result, error = resolve_input(12345) + assert error is not None + assert "Unknown input type" in error + + def test_unsupported_type_in_multimodal_list(self): + result, error = resolve_input(["not_a_valid_input"]) + assert error is not None + assert "Unsupported input type" in error + + def test_text_input_resolves_string(self): + text = TextInput(content=TextContent(value="hello world")) + result, error = resolve_input(text) + assert error is None + assert result == "hello world" + + +class TestOpenAIExecuteInputRouting: + def _make_provider(self): + mock_client = MagicMock() + mock_resp = MagicMock() + mock_resp.id = "resp_123" + mock_resp.model = "gpt-4o-mini" + mock_resp.output_text = "result" + mock_resp.usage.input_tokens = 10 + mock_resp.usage.output_tokens = 5 + mock_resp.usage.total_tokens = 15 + mock_resp.conversation = None + mock_client.responses.create.return_value = mock_resp + return OpenAIProvider(client=mock_client), mock_client + + def _make_config(self): + return NativeCompletionConfig( + provider="openai-native", type="text", params={"model": "gpt-4o-mini"} + ) + + def _make_query(self): + return QueryParams(input="test") + + def test_multimodal_input(self): + provider, mock_client = self._make_provider() + mm = MultiModalInput( + parts=[ + TextContent(value="describe"), + ImageContent(format="base64", value="img", mime_type="image/png"), + ] + ) + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=mm, + ) + assert error is None + call_kwargs = mock_client.responses.create.call_args[1] + assert 
call_kwargs["input"][0]["role"] == "user" + assert len(call_kwargs["input"][0]["content"]) == 2 + + def test_list_input(self): + provider, mock_client = self._make_provider() + parts = [ImageContent(format="base64", value="img", mime_type="image/png")] + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=parts, + ) + assert error is None + call_kwargs = mock_client.responses.create.call_args[1] + assert call_kwargs["input"][0]["role"] == "user" + + def test_string_input(self): + provider, mock_client = self._make_provider() + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + call_kwargs = mock_client.responses.create.call_args[1] + assert call_kwargs["input"] == "hello" + + +class TestGoogleAIExecuteTextRouting: + def _make_provider(self): + mock_client = MagicMock() + mock_resp = MagicMock() + mock_resp.response_id = "resp_gai_123" + mock_resp.model_version = "gemini-2.0-flash" + mock_resp.text = "response text" + mock_resp.usage_metadata.prompt_token_count = 10 + mock_resp.usage_metadata.candidates_token_count = 5 + mock_resp.usage_metadata.total_token_count = 15 + mock_resp.usage_metadata.thoughts_token_count = 0 + mock_client.models.generate_content.return_value = mock_resp + return GoogleAIProvider(client=mock_client), mock_client + + def _make_config(self, **extra_params): + params = {"model": "gemini-2.0-flash"} + params.update(extra_params) + return NativeCompletionConfig( + provider="google-native", type="text", params=params + ) + + def _make_query(self): + return QueryParams(input="test") + + def test_multimodal_input(self): + provider, mock_client = self._make_provider() + mm = MultiModalInput( + parts=[ + TextContent(value="describe"), + ImageContent(format="base64", value="img", mime_type="image/png"), + ] + ) + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=mm, + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + assert call_kwargs["contents"][0]["role"] == "user" + assert len(call_kwargs["contents"][0]["parts"]) == 2 + + def test_list_input(self): + provider, mock_client = self._make_provider() + parts = [ImageContent(format="base64", value="img", mime_type="image/png")] + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input=parts, + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + assert call_kwargs["contents"][0]["role"] == "user" + + def test_string_input(self): + provider, mock_client = self._make_provider() + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + assert call_kwargs["contents"][0]["parts"] == [{"text": "hello"}] + + def test_missing_model(self): + provider, _ = self._make_provider() + config = NativeCompletionConfig( + provider="google-native", type="text", params={} + ) + response, error = provider.execute( + completion_config=config, + query=self._make_query(), + resolved_input="hello", + ) + assert response is None + assert "Missing 'model'" in error + + def test_instructions_appended(self): + provider, mock_client = self._make_provider() + response, error = provider.execute( + 
completion_config=self._make_config(instructions="be helpful"), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + call_kwargs = mock_client.models.generate_content.call_args[1] + contents = call_kwargs["contents"] + assert len(contents) == 2 + assert contents[1]["role"] == "system" + + def test_no_usage_metadata(self): + provider, mock_client = self._make_provider() + mock_resp = mock_client.models.generate_content.return_value + mock_resp.usage_metadata = None + response, error = provider.execute( + completion_config=self._make_config(), + query=self._make_query(), + resolved_input="hello", + ) + assert error is None + assert response.usage.input_tokens == 0 + assert response.usage.output_tokens == 0 From fdb82b01bd994bd975e4790a639a04da99159891 Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 20:10:15 +0530 Subject: [PATCH 14/16] replaced role: "system" --- backend/app/services/llm/providers/gai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/services/llm/providers/gai.py b/backend/app/services/llm/providers/gai.py index db9dc6e20..05fa46fc1 100644 --- a/backend/app/services/llm/providers/gai.py +++ b/backend/app/services/llm/providers/gai.py @@ -402,7 +402,7 @@ def _execute_text( generation_kwargs = {} if instructions: - contents.append({"role": "system", "parts": [{"text": instructions}]}) + generation_kwargs["system_instruction"] = instructions if temperature is not None: generation_kwargs["temperature"] = temperature From a00bb6d354b2b25e1f7e047e8d21de19a7a4a0eb Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Sun, 1 Mar 2026 20:20:54 +0530 Subject: [PATCH 15/16] Rename test_instructions_appended to test_instructions_passed_to_config and update assertions to validate system_instruction in Google AI execution routing --- backend/app/tests/services/llm/test_multimodal.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/backend/app/tests/services/llm/test_multimodal.py b/backend/app/tests/services/llm/test_multimodal.py index b5185995b..bae09308a 100644 --- a/backend/app/tests/services/llm/test_multimodal.py +++ b/backend/app/tests/services/llm/test_multimodal.py @@ -508,7 +508,7 @@ def test_missing_model(self): assert response is None assert "Missing 'model'" in error - def test_instructions_appended(self): + def test_instructions_passed_to_config(self): provider, mock_client = self._make_provider() response, error = provider.execute( completion_config=self._make_config(instructions="be helpful"), @@ -517,9 +517,8 @@ def test_instructions_appended(self): ) assert error is None call_kwargs = mock_client.models.generate_content.call_args[1] - contents = call_kwargs["contents"] - assert len(contents) == 2 - assert contents[1]["role"] == "system" + config = call_kwargs["config"] + assert config.system_instruction == "be helpful" def test_no_usage_metadata(self): provider, mock_client = self._make_provider() From ad2e1be278b2c4e97e8c1b28eceaff1f17d1093a Mon Sep 17 00:00:00 2001 From: Prashant Vasudevan <71649489+vprashrex@users.noreply.github.com> Date: Mon, 2 Mar 2026 13:43:43 +0530 Subject: [PATCH 16/16] Enhance LLM API documentation to support multimodal input types and clarify configuration parameters --- backend/app/api/docs/llm/llm_call.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/backend/app/api/docs/llm/llm_call.md 
b/backend/app/api/docs/llm/llm_call.md
index fec4fbc49..8a594390c 100644
--- a/backend/app/api/docs/llm/llm_call.md
+++ b/backend/app/api/docs/llm/llm_call.md
@@ -6,7 +6,14 @@ for processing, and results are delivered via the callback URL when complete.
 ### Key Parameters
 
 **`query`** (required) - Query parameters for this LLM call:
-- `input` (required, string, min 1 char): User question/prompt/query
+- `input` (required): User input — accepts one of:
+  - A plain **string**, e.g. `"input": "Hello"` (automatically normalized to a text input internally)
+  - A **structured input object** with `type` and `content` fields, e.g. `"input": {"type": "text", "content": {"format": "text", "value": "Hello"}}`
+  - A **list of structured input objects** for multimodal inputs, e.g. `"input": [{"type": "text", ...}, {"type": "image", ...}]`
+  - Supported input types: `text`, `audio`, `image`, `pdf`
+  - For `image` and `pdf` types, `content` accepts a single object or a list, e.g. `"content": [{"format": "base64", "value": "..."}, ...]`
+  - Content `format` varies by type: `"text"` for text, `"base64"` for encoded data, `"public_url"` for image/pdf URLs
+  - Default MIME types when not specified: `image/png` for images, `application/pdf` for PDFs
 - `conversation` (optional, object): Conversation configuration
   - `id` (optional, string): Existing conversation ID to continue
   - `auto_create` (optional, boolean, default false): Create new conversation if no ID provided
@@ -23,8 +30,9 @@ for processing, and results are delivered via the callback URL when complete.
 - **Mode 2: Ad-hoc Configuration**
   - `blob` (object): Complete configuration object
     - `completion` (required, object): Completion configuration
-      - `provider` (required, string): Provider type - either `"openai"` (Kaapi abstraction) or `"openai-native"` (pass-through)
-      - `params` (required, object): Parameters structure depends on provider type (see schema for detailed structure)
+      - `provider` (required, string): Provider type — `"openai"` or `"google"` (Kaapi abstraction), or `"openai-native"` or `"google-native"` (pass-through)
+      - `type` (required, string): Completion type, one of `"text"`, `"stt"`, or `"tts"`; image, PDF, and multimodal inputs are handled under the `"text"` type
+      - `params` (required, object): Parameters structure depends on provider and type (see schema for detailed structure)
  - **Note**
     - When using ad-hoc configuration, do not include `id` and `version` fields
     - When using the Kaapi abstraction, parameters that are not supported by the selected provider or model are automatically suppressed. If any parameters are ignored, a list of warnings is included in the metadata.warnings. For example, the GPT-5 model does not support the temperature parameter, so Kaapi will neither throw an error nor pass this parameter to the model; instead, it will return a warning in the metadata.warnings response.
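
A condensed sketch of why patch 11 can drop `_execute_image`, `_execute_pdf`, and `_execute_multimodal`: after the change, `_execute_text` normalizes every resolved-input shape into the same Gemini `contents` structure, so the dedicated methods were near-identical duplicates. The import path follows the test imports earlier in this series, and `format_parts` is the provider's own helper (its body is not shown here, so it is passed in as a callable); this illustrates the branch logic only, not the full provider.

```python
# Condensed from GoogleAIProvider._execute_text as of patch 11.
from typing import Any, Callable

from app.services.llm.providers.base import ContentPart, MultiModalInput  # per the test imports


def build_contents(
    resolved_input: str | list[ContentPart] | MultiModalInput,
    format_parts: Callable[[list[ContentPart]], list[dict[str, Any]]],
) -> list[dict[str, Any]]:
    if isinstance(resolved_input, MultiModalInput):
        # Mixed text/image/pdf parts wrapped in a MultiModalInput
        parts = format_parts(resolved_input.parts)
    elif isinstance(resolved_input, list):
        # A bare list of ContentPart, e.g. resolved image or PDF content
        parts = format_parts(resolved_input)
    else:
        # Plain string prompt
        parts = [{"text": resolved_input}]
    return [{"role": "user", "parts": parts}]
```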
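
Patch 14 fixes a real API mismatch: Gemini's `contents` accepts only `user` and `model` roles, so appending a `{"role": "system"}` message would be rejected; system prompts belong in `GenerateContentConfig.system_instruction`, which is exactly what the renamed test in patch 15 asserts. A minimal sketch of the resulting call shape with the google-genai SDK; the API key and model name are placeholders.

```python
# Minimal sketch of the call shape after patch 14 (google-genai SDK).
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")  # placeholder key

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[{"role": "user", "parts": [{"text": "hello"}]}],
    # System prompts go here, not into `contents` as a "system" role message.
    config=types.GenerateContentConfig(
        system_instruction="be helpful",
        temperature=0.2,
    ),
)
print(response.text)
```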
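
Finally, a request-level sketch of the multimodal input shape documented in patch 16. The `query.input` and `completion` fields follow the documented schema; the endpoint URL, auth header name, top-level `config` nesting, and model name are illustrative assumptions that this series does not pin down.

```python
# Sketch of a multimodal LLM call per the llm_call.md additions above.
import base64

import requests

with open("chart.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("ascii")

payload = {
    "query": {
        # A list of structured input objects -> multimodal input
        "input": [
            {
                "type": "text",
                "content": {"format": "text", "value": "What does this chart show?"},
            },
            {
                "type": "image",
                # `content` may also be a list of image content objects
                "content": {
                    "format": "base64",
                    "value": image_b64,
                    "mime_type": "image/png",  # defaults to image/png if omitted
                },
            },
        ],
    },
    # Mode 2: ad-hoc configuration (no `id`/`version` fields)
    "config": {
        "blob": {
            "completion": {
                "provider": "google-native",
                "type": "text",  # multimodal inputs are handled under "text"
                "params": {"model": "gemini-2.5-pro", "instructions": "Be concise."},
            }
        }
    },
    "callback_url": "https://example.com/hooks/llm",  # results arrive via callback
}

resp = requests.post(
    "https://kaapi.example.com/api/v1/llm/call",  # hypothetical endpoint path
    json=payload,
    headers={"X-API-Key": "sk-..."},  # hypothetical auth header
    timeout=30,
)
resp.raise_for_status()
```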