Merged
17 commits
e977b26
Enhance multimodal support: Add Image and PDF input types, update pro…
vprashrex Feb 24, 2026
cbfd85f
added integration for multimodal for both providers
vprashrex Feb 25, 2026
1f3f13e
pretify the codes and added support for image, pdf and multimodal to …
vprashrex Feb 25, 2026
e5a9fa8
fixes to function nae and added the input_type
vprashrex Feb 25, 2026
668e3fa
Add support for multimodal input types: Image, PDF, and enhance valid…
vprashrex Feb 25, 2026
74e328e
Enhance multimodal support: Allow None for instructions in Image, PDF…
vprashrex Feb 26, 2026
2f13bd7
Merge branch 'main' into feature/multimodal-integration
vprashrex Feb 26, 2026
42a001d
Refactor multimodal input handling: Introduce ContentPart type for be…
vprashrex Feb 26, 2026
335e59b
Add comprehensive tests for multimodal input validation and processing
vprashrex Feb 26, 2026
56c7a44
removed multimodal, image and pdf llmparams and from both input type …
vprashrex Feb 26, 2026
54076f3
added the table reference for image, pdf and multimodal
vprashrex Feb 26, 2026
2c76a4a
Remove completion_type for image, pdf, and multimodal types in Native…
vprashrex Feb 28, 2026
2ed9af9
Refactor credential patching in LLM provider tests and update reasoni…
vprashrex Mar 1, 2026
2b265db
Add tests for edge cases in multimodal input handling and enhance Ope…
vprashrex Mar 1, 2026
fdb82b0
replaced role: "system"
vprashrex Mar 1, 2026
a00bb6d
Rename test_instructions_appended to test_instructions_passed_to_conf…
vprashrex Mar 1, 2026
ad2e1be
Enhance LLM API documentation to support multimodal input types and c…
vprashrex Mar 2, 2026
14 changes: 11 additions & 3 deletions backend/app/api/docs/llm/llm_call.md
@@ -6,7 +6,14 @@ for processing, and results are delivered via the callback URL when complete.
### Key Parameters

**`query`** (required) - Query parameters for this LLM call:
- `input` (required, string, min 1 char): User question/prompt/query
- `input` (required): User input — accepts one of:
  - A plain **string**, e.g. `"input": "Hello"` (automatically normalized to a structured text input internally)
  - A **structured input object** with `type` and `content` fields, e.g. `"input": {"type": "text", "content": {"format": "text", "value": "Hello"}}`
  - A **list of structured input objects** for multimodal input, e.g. `"input": [{"type": "text", ...}, {"type": "image", ...}]`
- Supported input types: `text`, `audio`, `image`, `pdf`
  - For `image` and `pdf` types, `content` accepts a single object or a list, e.g. `"content": [{"format": "base64", "value": "..."}, ...]`
- Content `format` varies by type: `"text"` for text, `"base64"` for encoded data, `"url"` for image/pdf URLs
- Default MIME types when not specified: `image/png` for images, `application/pdf` for PDFs
- `conversation` (optional, object): Conversation configuration
- `id` (optional, string): Existing conversation ID to continue
- `auto_create` (optional, boolean, default false): Create new conversation if no ID provided
@@ -23,8 +30,9 @@ for processing, and results are delivered via the callback URL when complete.
- **Mode 2: Ad-hoc Configuration**
- `blob` (object): Complete configuration object
- `completion` (required, object): Completion configuration
- `provider` (required, string): Provider type - either `"openai"` (Kaapi abstraction) or `"openai-native"` (pass-through)
- `params` (required, object): Parameters structure depends on provider type (see schema for detailed structure)
- `provider` (required, string): Provider type — `"openai"` or `"google"` (Kaapi abstraction), or `"openai-native"` or `"google-native"` (pass-through)
- `type` (required, string): Completion type — `"text"`, `"stt"`, `"tts"` for Kaapi providers; additionally `"image"`, `"pdf"`, `"multimodal"` for native providers
- `params` (required, object): Parameters structure depends on provider and type (see schema for detailed structure)
- **Note**
- When using ad-hoc configuration, do not include `id` and `version` fields
- When using the Kaapi abstraction, parameters that are not supported by the selected provider or model are automatically suppressed. If any parameters are ignored, a list of warnings is included in the metadata.warnings. For example, the GPT-5 model does not support the temperature parameter, so Kaapi will neither throw an error nor pass this parameter to the model; instead, it will return a warning in the metadata.warnings response.
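Taken together, the accepted `input` shapes can be illustrated with a small payload sketch (the URLs and values below are hypothetical; only the `query.input` structure follows the documentation above):

```python
import json

# Plain string form — normalized to a text input internally.
payload_plain = {"query": {"input": "Hello"}}

# Structured input object form.
payload_structured = {
    "query": {
        "input": {"type": "text", "content": {"format": "text", "value": "Hello"}}
    }
}

# List form for multimodal input, mixing text and an image part.
payload_multimodal = {
    "query": {
        "input": [
            {"type": "text", "content": {"format": "text", "value": "Describe this image"}},
            {
                "type": "image",
                # content may be a single object or a list for image/pdf types
                "content": {"format": "url", "value": "https://example.com/cat.png"},
            },
        ]
    }
}

print(json.dumps(payload_multimodal, indent=2))
```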
15 changes: 14 additions & 1 deletion backend/app/crud/llm.py
@@ -11,6 +11,8 @@
TextInput,
AudioInput,
QueryInput,
ImageInput,
PDFInput,
)

logger = logging.getLogger(__name__)
@@ -73,15 +75,26 @@ def create_llm_call(
else getattr(completion_config.params, "type", "text")
)

input_type: Literal["text", "audio", "image"]
input_type: Literal["text", "audio", "image", "pdf", "multimodal"]
output_type: Literal["text", "audio", "image"] | None

query_input = request.query.input

if completion_type == "stt":
input_type = "audio"
output_type = "text"
elif completion_type == "tts":
input_type = "text"
output_type = "audio"
elif isinstance(query_input, ImageInput):
input_type = "image"
output_type = "text"
elif isinstance(query_input, PDFInput):
input_type = "pdf"
output_type = "text"
elif isinstance(query_input, list):
input_type = "multimodal"
output_type = "text"
else:
input_type = "text"
output_type = "text"
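The branch order in this hunk can be condensed into a standalone sketch; the input classes below are simplified stand-ins for the real Pydantic models, not the actual imports:

```python
from dataclasses import dataclass


# Simplified stand-ins for the real ImageInput/PDFInput models.
@dataclass
class ImageInput:
    content: object = None


@dataclass
class PDFInput:
    content: object = None


def classify_input(completion_type: str, query_input) -> tuple[str, str]:
    """Mirror the input_type/output_type dispatch in create_llm_call."""
    if completion_type == "stt":
        return "audio", "text"
    if completion_type == "tts":
        return "text", "audio"
    if isinstance(query_input, ImageInput):
        return "image", "text"
    if isinstance(query_input, PDFInput):
        return "pdf", "text"
    if isinstance(query_input, list):
        # A list of structured inputs is labeled multimodal internally.
        return "multimodal", "text"
    return "text", "text"


print(classify_input("text", [ImageInput(), PDFInput()]))  # ('multimodal', 'text')
```

Note that the `stt`/`tts` branches are checked first, so the completion type wins over the input shape for audio workflows.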
4 changes: 4 additions & 0 deletions backend/app/models/llm/__init__.py
@@ -9,6 +9,10 @@
LlmCall,
AudioContent,
TextContent,
ImageContent,
PDFContent,
ImageInput,
PDFInput,
)
from app.models.llm.response import (
LLMCallResponse,
49 changes: 43 additions & 6 deletions backend/app/models/llm/request.py
@@ -1,5 +1,5 @@
import sqlalchemy as sa
from typing import Annotated, Any, Literal, Union
from typing import Annotated, Any, List, Literal, Union
from uuid import UUID, uuid4
from pydantic import model_validator, HttpUrl
from datetime import datetime
@@ -56,7 +56,11 @@ class TTSLLMParams(SQLModel):
response_format: Literal["mp3", "wav", "ogg"] | None = "wav"


KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams]
KaapiLLMParams = Union[
**Collaborator:** let this be at the top of the file

**Collaborator (Author):** It can’t be moved to the top because KaapiLLMParams depends on types that are defined above. Moving it up would cause unresolved reference issues.

TextLLMParams,
STTLLMParams,
TTSLLMParams,
]


# Input type models for discriminated union
@@ -75,6 +79,28 @@ class AudioContent(SQLModel):
)


class ImageContent(SQLModel):
format: Literal["base64", "url"] = "base64"
value: str = Field(
..., description="Base64 encoded image or Public URL to the image"
)
    # optional MIME type; defaults to image/png when omitted
mime_type: str | None = Field(
None,
description="MIME type of the image (e.g., image/png, image/jpeg)",
)


class PDFContent(SQLModel):
format: Literal["base64", "url"] = "base64"
value: str = Field(..., description="Base64 encoded PDF or Public URL to the PDF")
    # optional MIME type; defaults to application/pdf when omitted
mime_type: str | None = Field(
None,
description="MIME type of the PDF (e.g., application/pdf)",
)


class TextInput(SQLModel):
type: Literal["text"] = "text"
content: TextContent
@@ -85,9 +111,19 @@ class AudioInput(SQLModel):
content: AudioContent


class ImageInput(SQLModel):
type: Literal["image"] = "image"
content: ImageContent | list[ImageContent]


class PDFInput(SQLModel):
type: Literal["pdf"] = "pdf"
content: PDFContent | list[PDFContent]


# Discriminated union for query input types
QueryInput = Annotated[
Union[TextInput, AudioInput],
Union[TextInput, AudioInput, ImageInput, PDFInput],
Field(discriminator="type"),
]
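The `type` field is the Pydantic discriminator for this union. Conceptually, the dispatch works like the following sketch (plain Python rather than Pydantic, shown only to illustrate the mechanism; real parsing is done by `Field(discriminator="type")`):

```python
# Map of discriminator values to the model each would select.
INPUT_TYPES = {
    "text": "TextInput",
    "audio": "AudioInput",
    "image": "ImageInput",
    "pdf": "PDFInput",
}


def select_model(raw: dict) -> str:
    """Pick the target model name based on the 'type' discriminator."""
    kind = raw.get("type")
    if kind not in INPUT_TYPES:
        raise ValueError(f"unsupported input type: {kind!r}")
    return INPUT_TYPES[kind]


print(select_model({"type": "image", "content": {"format": "url", "value": "..."}}))
```

Because the discriminator is checked before any content validation, an unknown `type` fails fast with a clear error rather than being tried against every branch of the union.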

@@ -122,7 +158,7 @@ def validate_conversation_logic(self):
class QueryParams(SQLModel):
"""Query-specific parameters for each LLM call."""

input: str | QueryInput = Field(
input: str | QueryInput | list[QueryInput] = Field(
...,
description=(
"User input - either a plain string (text) or a structured input object. "
@@ -389,12 +425,13 @@ class LlmCall(SQLModel, table=True):
},
)

input_type: Literal["text", "audio", "image"] = Field(
# NOTE: image, pdf, multimodal are internal labels stored in the table not user facing.
input_type: Literal["text", "audio", "image", "pdf", "multimodal"] = Field(
...,
sa_column=sa.Column(
sa.String,
nullable=False,
comment="Input type: text, audio, image",
comment="Input type: text, audio, image, pdf, multimodal",
),
)

7 changes: 6 additions & 1 deletion backend/app/services/llm/jobs.py
@@ -20,6 +20,8 @@
KaapiCompletionConfig,
TextInput,
AudioInput,
ImageInput,
PDFInput,
)
from app.models.llm.response import TextOutput
from app.services.llm.guardrails import (
@@ -102,13 +104,16 @@ def handle_job_error(


@contextmanager
def resolved_input_context(query_input: TextInput | AudioInput):
def resolved_input_context(
query_input: TextInput | AudioInput | ImageInput | PDFInput | list,
):
"""Context manager for resolving and cleaning up input resources.

Ensures temporary files (e.g., downloaded audio) are cleaned up
even if errors occur during LLM execution.
"""
resolved_input, error = resolve_input(query_input)

if error:
raise ValueError(error)

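The cleanup guarantee described in the docstring follows the standard `contextlib.contextmanager` try/finally pattern. A minimal sketch, with a hypothetical temp-file resolver standing in for `resolve_input`:

```python
import os
import tempfile
from contextlib import contextmanager


@contextmanager
def resolved_file_context(data: bytes):
    """Write data to a temp file and guarantee cleanup, even on error."""
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        yield path
    finally:
        os.unlink(path)  # runs even if the caller's body raises


with resolved_file_context(b"audio-bytes") as p:
    assert os.path.exists(p)
assert not os.path.exists(p)  # cleaned up on exit
```

The `finally` block is what makes the "even if errors occur" claim hold: an exception raised inside the `with` body propagates, but only after the temp file is unlinked.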
11 changes: 6 additions & 5 deletions backend/app/services/llm/mappers.py
@@ -127,17 +127,18 @@ def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]:
response_format = kaapi_params.get("response_format")
if response_format:
google_params["response_format"] = response_format

reasoning = kaapi_params.get("reasoning")
if reasoning:
google_params["reasoning"] = reasoning

# Warn about unsupported parameters
if kaapi_params.get("knowledge_base_ids"):
# TODO: Will take up later, when we add google filesearch tool support
warnings.append(
"Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored."
)

if kaapi_params.get("reasoning") is not None:
warnings.append(
"Parameter 'reasoning' is not applicable for Google AI and was ignored."
)

return google_params, warnings


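The net effect of this hunk is that `reasoning` is now forwarded to Google rather than warned about, while `knowledge_base_ids` still produces a warning. A simplified sketch of the mapper after the change (not the real function body):

```python
def map_kaapi_to_google_params(kaapi_params: dict) -> tuple[dict, list[str]]:
    """Simplified sketch: map Kaapi params to Google params plus warnings."""
    google_params: dict = {}
    warnings: list[str] = []

    if kaapi_params.get("response_format"):
        google_params["response_format"] = kaapi_params["response_format"]

    if kaapi_params.get("reasoning"):
        google_params["reasoning"] = kaapi_params["reasoning"]  # now forwarded

    if kaapi_params.get("knowledge_base_ids"):
        # Still unsupported until Google filesearch tool support lands.
        warnings.append(
            "Parameter 'knowledge_base_ids' is not supported by Google AI and was ignored."
        )

    return google_params, warnings


params, warns = map_kaapi_to_google_params(
    {"reasoning": {"effort": "low"}, "knowledge_base_ids": ["kb1"]}
)
print(params, warns)
```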
20 changes: 19 additions & 1 deletion backend/app/services/llm/providers/base.py
@@ -7,7 +7,25 @@
from abc import ABC, abstractmethod
from typing import Any

from pydantic import model_validator
from sqlmodel import SQLModel

from app.models.llm import NativeCompletionConfig, LLMCallResponse, QueryParams
from app.models.llm.request import TextContent, ImageContent, PDFContent

ContentPart = TextContent | ImageContent | PDFContent


class MultiModalInput(SQLModel):
"""Resolved multimodal input containing a list of content parts."""

parts: list[ContentPart]

@model_validator(mode="after")
def validate_parts(self):
if not self.parts:
raise ValueError("MultiModalInput requires at least one content part")
return self


class BaseProvider(ABC):
@@ -44,7 +62,7 @@ def execute(
self,
completion_config: NativeCompletionConfig,
query: QueryParams,
resolved_input: str,
resolved_input: str | list[ContentPart],
include_provider_raw_response: bool = False,
) -> tuple[LLMCallResponse | None, str | None]:
"""Execute LLM API call.
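The non-empty invariant enforced by `validate_parts` can be demonstrated without SQLModel; here a dataclass with `__post_init__` stands in for the Pydantic `model_validator` (the real class is a SQLModel model, so this is only an illustration of the rule):

```python
from dataclasses import dataclass, field


@dataclass
class MultiModalInputSketch:
    """Dataclass stand-in for MultiModalInput's non-empty validator."""

    parts: list = field(default_factory=list)

    def __post_init__(self):
        if not self.parts:
            raise ValueError("MultiModalInput requires at least one content part")


MultiModalInputSketch(parts=["text-part"])  # accepted
try:
    MultiModalInputSketch(parts=[])  # rejected
except ValueError as e:
    print(e)
```

Validating at construction time means a provider's `execute` can assume `resolved_input`, when it is a list of content parts, is never empty.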