
Commit 7cf20dc

Michal Kulakowski authored and committed
Responses api init
1 parent d1de370 commit 7cf20dc

9 files changed

Lines changed: 1588 additions & 9 deletions


src/http_rest_api_handler.cpp

Lines changed: 1 addition & 1 deletion

@@ -531,7 +531,7 @@ static Status createV3HttpPayload(
         return Status(StatusCode::JSON_INVALID, "model field is not a string");
     }

-    bool isTextGenerationEndpoint = uri.find("completions") != std::string_view::npos;
+    bool isTextGenerationEndpoint = (uri.find("completions") != std::string_view::npos) || (uri.find("responses") != std::string_view::npos);
     if (isTextGenerationEndpoint) {
         auto streamIt = parsedJson->FindMember("stream");
         if (streamIt != parsedJson->MemberEnd()) {
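A minimal, self-contained sketch of the substring check above; the helper name isTextGenerationEndpoint matches the local variable in the diff, while the wrapper function and the test URIs are illustrative. Note that the match is deliberately loose, so both /v3/responses and /v3/v1/responses qualify, as do /v3/completions and /v3/chat/completions.

    #include <cassert>
    #include <string_view>

    // Substring match on the request URI, mirroring the predicate in the diff.
    static bool isTextGenerationEndpoint(std::string_view uri) {
        return uri.find("completions") != std::string_view::npos ||
               uri.find("responses") != std::string_view::npos;
    }

    int main() {
        assert(isTextGenerationEndpoint("/v3/chat/completions"));
        assert(isTextGenerationEndpoint("/v3/v1/responses"));
        assert(!isTextGenerationEndpoint("/v3/tokenize"));
        return 0;
    }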

src/llm/apis/openai_completions.cpp

Lines changed: 790 additions & 0 deletions
Large diffs are not rendered by default.

src/llm/apis/openai_completions.hpp

Lines changed: 5 additions & 0 deletions

@@ -47,6 +47,7 @@ namespace ovms {
 enum class Endpoint {
     CHAT_COMPLETIONS,
     COMPLETIONS,
+    RESPONSES,
     TOKENIZE,
 };

@@ -69,12 +70,16 @@ class OpenAIChatCompletionsHandler {
     std::chrono::time_point<std::chrono::system_clock> created;
     ov::genai::Tokenizer tokenizer;
     size_t processedTokens = 0;  // tracks overall number of tokens processed by the pipeline
+    size_t responsesStreamingSequenceNumber = 0;
+    bool responsesStreamingInitialized = false;
+    std::string responsesStreamingOutputText;

     // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning.
     std::unique_ptr<OutputParser> outputParser = nullptr;

     absl::Status parseCompletionsPart();
     absl::Status parseChatCompletionsPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
+    absl::Status parseResponsesPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
     absl::Status parseCommonPart(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength);

     ParsedOutput parseOutputIfNeeded(const std::vector<int64_t>& generatedIds);
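The three new members look like per-request streaming state for the Responses API: an event sequence counter, a one-shot init flag, and the accumulated output text. Below is a hedged sketch of how they might interact; the actual logic lives in openai_completions.cpp, whose diff is not rendered above, so treat this as an assumption about intent rather than the real implementation.

    #include <cstddef>
    #include <iostream>
    #include <string>

    struct ResponsesStreamState {
        std::size_t sequenceNumber = 0;  // responsesStreamingSequenceNumber
        bool initialized = false;        // responsesStreamingInitialized
        std::string outputText;          // responsesStreamingOutputText

        // Each streamed delta bumps the sequence number and extends the
        // accumulated text; the first call flips the init flag, presumably so
        // the serializer can emit its one-time "created"-style event.
        void onDelta(const std::string& delta) {
            if (!initialized) {
                initialized = true;
            }
            ++sequenceNumber;
            outputText += delta;
        }
    };

    int main() {
        ResponsesStreamState state;
        state.onDelta("Hello");
        state.onDelta(", world");
        std::cout << state.sequenceNumber << " events, text: "
                  << state.outputText << "\n";  // 2 events, text: Hello, world
        return 0;
    }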

src/llm/language_model/continuous_batching/servable.cpp

Lines changed: 14 additions & 1 deletion

@@ -103,6 +103,15 @@ static ov::genai::GenerationOutput prepareEmptyStopReasonOutput() {
     return out;
 }

+static ov::genai::GenerationOutput prepareEmptyNoneReasonOutput() {
+    static ov::genai::GenerationOutput out = {
+        std::vector<int64_t>(),  // generated_ids
+        std::vector<float>(),    // generated_log_probs
+        0.0f,                    // score
+        ov::genai::GenerationFinishReason::NONE};
+    return out;
+}
+
 absl::Status ContinuousBatchingServable::readCompleteExecutionResults(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
     auto cbExecutionContext = std::static_pointer_cast<ContinuousBatchingServableExecutionContext>(executionContext);
     if (cbExecutionContext->payload.client->isDisconnected()) {

@@ -136,7 +145,11 @@ absl::Status ContinuousBatchingServable::readPartialExecutionResults(std::shared
     ov::genai::GenerationOutputs generationOutputs = cbExecutionContext->generationHandle->read();
     RET_CHECK(generationOutputs.size() <= 1);  // TODO: Support multiple generations
     if (generationOutputs.size() == 0) {
-        cbExecutionContext->generationOutputs = {prepareEmptyStopReasonOutput()};
+        if (cbExecutionContext->generationHandle->get_status() == ov::genai::GenerationStatus::RUNNING) {
+            cbExecutionContext->generationOutputs = {prepareEmptyNoneReasonOutput()};
+        } else {
+            cbExecutionContext->generationOutputs = {prepareEmptyStopReasonOutput()};
+        }
     } else {
         cbExecutionContext->generationOutputs = {generationOutputs.begin()->second};
     }
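The new branch distinguishes "no tokens ready yet" from "generation finished" when read() returns nothing. A minimal sketch of that status-dependent choice, with stand-in enums replacing the ov::genai types so the snippet compiles on its own:

    #include <cassert>

    enum class GenerationStatus { RUNNING, FINISHED };
    enum class FinishReason { NONE, STOP };

    // While the handle is still RUNNING, an empty read means "no tokens yet",
    // so the placeholder output carries NONE; once it is no longer running,
    // the placeholder carries STOP so the stream can terminate cleanly.
    static FinishReason emptyOutputReason(GenerationStatus status) {
        return status == GenerationStatus::RUNNING ? FinishReason::NONE
                                                   : FinishReason::STOP;
    }

    int main() {
        assert(emptyOutputReason(GenerationStatus::RUNNING) == FinishReason::NONE);
        assert(emptyOutputReason(GenerationStatus::FINISHED) == FinishReason::STOP);
        return 0;
    }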

src/llm/servable.cpp

Lines changed: 53 additions & 3 deletions

@@ -68,10 +68,12 @@ absl::Status GenAiServable::loadRequest(std::shared_ptr<GenAiServableExecutionCo
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
     } else if (payload.uri == "/v3/completions" || payload.uri == "/v3/v1/completions") {
         executionContext->endpoint = Endpoint::COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions");
+        return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions, /v3/responses, /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();

@@ -204,6 +206,50 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
             }
             break;
         }
+        case Endpoint::RESPONSES: {
+            if (executionContext->apiHandler->getChatHistory().size() > 0) {
+#if (PYTHON_DISABLE == 0)
+                bool success;
+                if (executionContext->apiHandler->getProcessedJson().size() > 0) {
+                    success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
+                } else {
+                    success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText);
+                }
+                if (!success) {
+                    return absl::Status(absl::StatusCode::kInvalidArgument, inputText);
+                }
+#else
+                ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
+                constexpr bool add_generation_prompt = true;
+                auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer();
+                if (!toolsStatus.ok()) {
+                    return toolsStatus.status();
+                }
+                const auto& tools = toolsStatus.value();
+                auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer();
+                if (!chatTemplateKwargsStatus.ok()) {
+                    return chatTemplateKwargsStatus.status();
+                }
+                const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
+                try {
+                    inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools, chatTemplateKwargs);
+                } catch (const std::exception& e) {
+                    SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
+                    return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
+                }
+#endif
+                if (inputText.size() == 0) {
+                    return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
+                }
+            } else {
+                auto prompt = executionContext->apiHandler->getPrompt();
+                if (!prompt.has_value()) {
+                    return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing");
+                }
+                inputText = prompt.value();
+            }
+            break;
+        }
         case Endpoint::COMPLETIONS: {
             inputText = executionContext->apiHandler->getPrompt().value();
             break;

@@ -277,8 +323,12 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr<GenAiServable
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
     }
-    if (executionContext->apiHandler->getStreamOptions().includeUsage)
-        executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk());
+    if (executionContext->apiHandler->getStreamOptions().includeUsage) {
+        std::string usageChunk = executionContext->apiHandler->serializeStreamingUsageChunk();
+        if (!usageChunk.empty()) {
+            executionContext->response += wrapTextInServerSideEventMessage(usageChunk);
+        }
+    }

     executionContext->response += wrapTextInServerSideEventMessage("[DONE]");
src/llm/visual_language_model/continuous_batching/servable.cpp

Lines changed: 4 additions & 2 deletions

@@ -45,10 +45,12 @@ absl::Status VisualLanguageModelServable::loadRequest(std::shared_ptr<GenAiServa
     }
     if (payload.uri == "/v3/chat/completions" || payload.uri == "/v3/v1/chat/completions") {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
+        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();

@@ -67,7 +69,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
     if (vlmExecutionContext->apiHandler == nullptr) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
     }
-    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
+    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
         ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();

         for (size_t i = 0; i < chatHistory.size(); i++) {
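Both VLM servables now route RESPONSES through the same branch as CHAT_COMPLETIONS, since both endpoints build the prompt from chat history. A tiny sketch of that union check; the helper name usesChatHistoryPath is illustrative, as the commit inlines the comparison:

    #include <cassert>

    enum class Endpoint { CHAT_COMPLETIONS, COMPLETIONS, RESPONSES, TOKENIZE };

    // RESPONSES reuses the chat-completions input path (chat history plus
    // chat template), so both endpoints fall into the same branch.
    static bool usesChatHistoryPath(Endpoint e) {
        return e == Endpoint::CHAT_COMPLETIONS || e == Endpoint::RESPONSES;
    }

    int main() {
        assert(usesChatHistoryPath(Endpoint::RESPONSES));
        assert(!usesChatHistoryPath(Endpoint::COMPLETIONS));
        return 0;
    }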

src/llm/visual_language_model/legacy/servable.cpp

Lines changed: 4 additions & 2 deletions

@@ -53,10 +53,12 @@ absl::Status VisualLanguageModelLegacyServable::loadRequest(std::shared_ptr<GenA
     }
     if (payload.uri == "/v3/chat/completions" || payload.uri == "/v3/v1/chat/completions") {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
+        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();

@@ -237,7 +239,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
     if (vlmExecutionContext->apiHandler == nullptr) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
     }
-    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
+    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
         ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();

         for (size_t i = 0; i < chatHistory.size(); i++) {
