From c3db12ceaef4c733b2a8f23b93796e254311a2d0 Mon Sep 17 00:00:00 2001 From: mzegla Date: Wed, 11 Mar 2026 13:11:34 +0100 Subject: [PATCH] init --- src/llm/BUILD | 5 + src/llm/io_processing/base_genai_parser.hpp | 133 +++++++ .../io_processing/genai_parser_adapter.cpp | 197 ++++++++++ .../io_processing/genai_parser_adapter.hpp | 140 ++++++++ .../llama3/genai_tool_parser.cpp | 340 ++++++++++++++++++ .../llama3/genai_tool_parser.hpp | 148 ++++++++ src/llm/io_processing/output_parser.cpp | 7 + .../llama3_output_parser_test.cpp | 48 ++- 8 files changed, 1004 insertions(+), 14 deletions(-) create mode 100644 src/llm/io_processing/base_genai_parser.hpp create mode 100644 src/llm/io_processing/genai_parser_adapter.cpp create mode 100644 src/llm/io_processing/genai_parser_adapter.hpp create mode 100644 src/llm/io_processing/llama3/genai_tool_parser.cpp create mode 100644 src/llm/io_processing/llama3/genai_tool_parser.hpp diff --git a/src/llm/BUILD b/src/llm/BUILD index ae37d936ca..bed4a5f684 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -136,6 +136,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w hdrs = [ "io_processing/hermes3/tool_parser.hpp", "io_processing/llama3/tool_parser.hpp", + "io_processing/llama3/genai_tool_parser.hpp", "io_processing/phi4/tool_parser.hpp", "io_processing/devstral/tool_parser.hpp", "io_processing/mistral/tool_parser.hpp", @@ -143,11 +144,14 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w "io_processing/gptoss/reasoning_parser.hpp", "io_processing/gptoss/tool_parser.hpp", "io_processing/gptoss/harmony.hpp", + "io_processing/base_genai_parser.hpp", + "io_processing/genai_parser_adapter.hpp", "io_processing/output_parser.hpp", ], srcs = [ "io_processing/hermes3/tool_parser.cpp", "io_processing/llama3/tool_parser.cpp", + "io_processing/llama3/genai_tool_parser.cpp", "io_processing/phi4/tool_parser.cpp", "io_processing/devstral/tool_parser.cpp", 
"io_processing/mistral/tool_parser.cpp", @@ -155,6 +159,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w "io_processing/gptoss/reasoning_parser.cpp", "io_processing/gptoss/tool_parser.cpp", "io_processing/gptoss/harmony.cpp", + "io_processing/genai_parser_adapter.cpp", "io_processing/output_parser.cpp", ], deps = [ diff --git a/src/llm/io_processing/base_genai_parser.hpp b/src/llm/io_processing/base_genai_parser.hpp new file mode 100644 index 0000000000..822621baf6 --- /dev/null +++ b/src/llm/io_processing/base_genai_parser.hpp @@ -0,0 +1,133 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include + +#include + +namespace ovms { + +/** + * @brief Extension of ov::genai::Parser that exposes OVMS OutputParser metadata. + * + * Concrete implementations should override the tag getters to tell + * GenAIParserAdapter (and therefore the surrounding OutputParser machinery) + * where parsing segments begin and end in the generated text, and whether + * the text streamer must preserve special tokens. 
+ * + * Defaults: + * - getParsingStartTags() pure virtual — must be overridden + * - getSpecialParsingStartTags() returns empty vector + * - getParsingEndTag() pure virtual — must be overridden + * - requiresStreamingWithSpecialTokens() returns false + */ +class BaseGenAIParser : public ov::genai::Parser { +public: + BaseGenAIParser() = default; + ~BaseGenAIParser() override = default; + + /** + * @brief Tags that trigger the start of a parsed segment. + * + * Used by OutputParser to transition into the phase handled by this parser. + * Return an empty vector if the parser handles all output unconditionally. + */ + virtual const std::vector& getParsingStartTags() const = 0; + + /** + * @brief Tags that trigger parsing only when they appear at the very start of output. + * + * These are considered exclusively during the UNKNOWN phase of OutputParser. + */ + virtual const std::vector& getSpecialParsingStartTags() const { + static const std::vector empty{}; + return empty; + } + + /** + * @brief Tag that marks the end of the segment processed by this parser. + * + * Return an empty string if the parser consumes until end-of-generation. + */ + virtual const std::string& getParsingEndTag() const = 0; + + /** + * @brief Whether the text streamer must include special tokens. + * + * If true, the tokenizer used in the TextStreamer should be configured to + * not skip special tokens so that this parser can use them as boundaries. + */ + virtual bool requiresStreamingWithSpecialTokens() const { + return false; + } +}; + +/** + * @brief Extension of ov::genai::IncrementalParser that exposes OVMS OutputParser metadata. + * + * Mirrors BaseGenAIParser but for the streaming (incremental) variant. + * The same defaults apply. + */ +class BaseGenAIIncrementalParser : public ov::genai::IncrementalParser { +public: + BaseGenAIIncrementalParser() = default; + ~BaseGenAIIncrementalParser() override = default; + + /** + * @brief Tags that trigger the start of a parsed segment. 
+ */ + virtual const std::vector& getParsingStartTags() const = 0; + + /** + * @brief Tags that trigger parsing only when they appear at the very start of output. + */ + virtual const std::vector& getSpecialParsingStartTags() const { + static const std::vector empty{}; + return empty; + } + + /** + * @brief Tag that marks the end of the segment processed by this parser. + */ + virtual const std::string& getParsingEndTag() const = 0; + + /** + * @brief Whether the text streamer must include special tokens. + */ + virtual bool requiresStreamingWithSpecialTokens() const { + return false; + } + + /** + * @brief Flush any internally buffered state after the final chunk. + * + * Called by GenAIParserAdapter::parseChunk when finishReason != NONE, + * immediately after the last parse() call. Implementations that use a + * delay window (or any other look-ahead buffering) must drain the pending + * content here and populate @p delta_message with a final delta if one is + * available. + * + * The default implementation is a no-op. + * + * @param delta_message Populated with a structured delta when pending + * content exists; left untouched otherwise. + */ + virtual void flush(ov::genai::JsonContainer& /*delta_message*/) {} +}; + +} // namespace ovms diff --git a/src/llm/io_processing/genai_parser_adapter.cpp b/src/llm/io_processing/genai_parser_adapter.cpp new file mode 100644 index 0000000000..73553ad889 --- /dev/null +++ b/src/llm/io_processing/genai_parser_adapter.cpp @@ -0,0 +1,197 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "genai_parser_adapter.hpp" + +#include +#include +#include + +namespace ovms { + +GenAIParserAdapter::GenAIParserAdapter( + ov::genai::Tokenizer& tokenizer, + std::shared_ptr parser, + std::shared_ptr incrementalParser) : + BaseOutputParser(tokenizer), + parser(std::move(parser)), + incrementalParser(std::move(incrementalParser)) {} + +// static +void GenAIParserAdapter::extractToolCalls(const ov::genai::JsonContainer& message, ToolCalls_t& toolCalls) { + if (!message.contains("tool_calls") || !message["tool_calls"].is_array()) { + return; + } + const ov::genai::JsonContainer toolCallsArr = message["tool_calls"]; + const size_t count = toolCallsArr.size(); + for (size_t i = 0; i < count; ++i) { + const ov::genai::JsonContainer tc = toolCallsArr[i]; + ToolCall toolCall; + if (tc.contains("id") && tc["id"].is_string()) { + toolCall.id = tc["id"].get_string(); + } + if (tc.contains("function") && tc["function"].is_object()) { + const ov::genai::JsonContainer fn = tc["function"]; + if (fn.contains("name") && fn["name"].is_string()) { + toolCall.name = fn["name"].get_string(); + } + if (fn.contains("arguments") && fn["arguments"].is_string()) { + toolCall.arguments = fn["arguments"].get_string(); + } + } + toolCalls.push_back(std::move(toolCall)); + } +} + +// static +rapidjson::Document GenAIParserAdapter::jsonContainerToDocument(const ov::genai::JsonContainer& container) { + const std::string jsonStr = container.to_json_string(); + rapidjson::Document doc; + 
doc.Parse(jsonStr.c_str(), static_cast(jsonStr.size())); + return doc; +} + +void GenAIParserAdapter::parse(ParsedOutput& parsedOutput, const std::vector& /*generatedTokens*/) { + ov::genai::JsonContainer message({{"content", parsedOutput.content}}); + + parser->parse(message); + + if (message.contains("content") && message["content"].is_string()) { + parsedOutput.content = message["content"].get_string(); + } + + extractToolCalls(message, parsedOutput.toolCalls); + + if (message.contains("reasoning_content") && message["reasoning_content"].is_string()) { + parsedOutput.reasoning = message["reasoning_content"].get_string(); + } +} + +std::optional GenAIParserAdapter::parseChunk( + const std::string& chunkResponse, + ov::genai::GenerationFinishReason finishReason) { + + ov::genai::JsonContainer deltaMessage = ov::genai::JsonContainer::object(); + std::string deltaText = chunkResponse; + + const std::string filteredContent = incrementalParser->parse(deltaMessage, deltaText); + + // On the final chunk, drain any content still buffered in the parser's delay window. + if (finishReason != ov::genai::GenerationFinishReason::NONE) { + ov::genai::JsonContainer flushedDelta = ov::genai::JsonContainer::object(); + incrementalParser->flush(flushedDelta); + + if (!flushedDelta.empty()) { + rapidjson::Document flushedDoc = jsonContainerToDocument(flushedDelta); + if (!flushedDoc.HasParseError() && flushedDoc.IsObject() && !flushedDoc.ObjectEmpty()) { + // If parse() also produced a structured delta, combine argument strings + // so a single document is returned. 
+ if (!deltaMessage.empty()) { + rapidjson::Document primaryDoc = jsonContainerToDocument(deltaMessage); + if (!primaryDoc.HasParseError() && primaryDoc.IsObject() && !primaryDoc.ObjectEmpty()) { + return combineArgumentDeltas(std::move(primaryDoc), std::move(flushedDoc)); + } + } + return flushedDoc; + } + } + } + + // Prefer structured deltas (tool calls, reasoning) if the incremental parser emitted any + if (!deltaMessage.empty()) { + rapidjson::Document doc = jsonContainerToDocument(deltaMessage); + if (!doc.HasParseError() && doc.IsObject() && !doc.ObjectEmpty()) { + return doc; + } + } + + // Fall back to plain content delta + if (!filteredContent.empty()) { + rapidjson::Document doc; + doc.SetObject(); + rapidjson::Value contentVal( + filteredContent.c_str(), + static_cast(filteredContent.size()), + doc.GetAllocator()); + doc.AddMember("content", contentVal, doc.GetAllocator()); + return doc; + } + + return std::nullopt; +} + +// static +rapidjson::Document GenAIParserAdapter::combineArgumentDeltas( + rapidjson::Document primary, + rapidjson::Document flushed) { + // Navigate primary: delta.tool_calls[0].function.arguments + if (!primary.HasMember("delta") || !flushed.HasMember("delta")) { + return primary; + } + auto& pDelta = primary["delta"]; + auto& fDelta = flushed["delta"]; + if (!pDelta.IsObject() || !pDelta.HasMember("tool_calls") || + !fDelta.IsObject() || !fDelta.HasMember("tool_calls")) { + return primary; + } + auto& pTC = pDelta["tool_calls"]; + auto& fTC = fDelta["tool_calls"]; + if (!pTC.IsArray() || pTC.Empty() || !fTC.IsArray() || fTC.Empty()) { + return primary; + } + auto& pEntry = pTC[rapidjson::SizeType(0)]; + auto& fEntry = fTC[rapidjson::SizeType(0)]; + if (!pEntry.IsObject() || !pEntry.HasMember("function") || + !fEntry.IsObject() || !fEntry.HasMember("function")) { + return primary; + } + auto& pFunc = pEntry["function"]; + auto& fFunc = fEntry["function"]; + if (!pFunc.IsObject() || !pFunc.HasMember("arguments") || + 
!fFunc.IsObject() || !fFunc.HasMember("arguments")) { + return primary; + } + if (!pFunc["arguments"].IsString() || !fFunc["arguments"].IsString()) { + return primary; + } + const std::string combined = + std::string(pFunc["arguments"].GetString()) + + std::string(fFunc["arguments"].GetString()); + pFunc["arguments"].SetString( + combined.c_str(), + static_cast(combined.size()), + primary.GetAllocator()); + return primary; +} + +const std::vector& GenAIParserAdapter::getParsingStartTags() const { + return parser->getParsingStartTags(); +} + +const std::vector& GenAIParserAdapter::getSpecialParsingStartTags() const { + return parser->getSpecialParsingStartTags(); +} + +const std::string& GenAIParserAdapter::getParsingEndTag() const { + return parser->getParsingEndTag(); +} + +bool GenAIParserAdapter::requiresStreamingWithSpecialTokens() const { + return parser->requiresStreamingWithSpecialTokens() || + incrementalParser->requiresStreamingWithSpecialTokens(); +} + +} // namespace ovms diff --git a/src/llm/io_processing/genai_parser_adapter.hpp b/src/llm/io_processing/genai_parser_adapter.hpp new file mode 100644 index 0000000000..d39d701562 --- /dev/null +++ b/src/llm/io_processing/genai_parser_adapter.hpp @@ -0,0 +1,140 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "src/llm/io_processing/base_genai_parser.hpp" +#include "src/llm/io_processing/base_output_parser.hpp" +#include "src/port/rapidjson_document.hpp" + +namespace ovms { + +/** + * @brief Adapter that bridges BaseGenAIParser / BaseGenAIIncrementalParser + * with the OVMS BaseOutputParser interface. + * + * GenAIParserAdapter wraps a pair of OVMS-extended GenAI parser variants and + * translates calls to/from the OVMS OutputParser flow: + * - parse() (unary mode) delegates to the wrapped BaseGenAIParser + * - parseChunk() (streaming) delegates to the wrapped BaseGenAIIncrementalParser + * + * Start/end tags and the streaming-with-special-tokens flag are obtained + * directly from the wrapped parsers via the BaseGenAIParser / + * BaseGenAIIncrementalParser interface, keeping the adapter free of + * parser-specific configuration. + */ +class GenAIParserAdapter : public BaseOutputParser { + std::shared_ptr parser; + std::shared_ptr incrementalParser; + + /** + * @brief Extract tool_calls from a GenAI JsonContainer into OVMS ToolCalls_t. + * + * Expects the standard OpenAI tool_calls format: + * [{"id": "...", "type": "function", "function": {"name": "...", "arguments": "..."}}] + */ + static void extractToolCalls(const ov::genai::JsonContainer& message, ToolCalls_t& toolCalls); + + /** + * @brief Convert a GenAI JsonContainer to a rapidjson::Document via JSON serialization. + */ + static rapidjson::Document jsonContainerToDocument(const ov::genai::JsonContainer& container); + +public: + GenAIParserAdapter() = delete; + + /** + * @brief Construct adapter wrapping a unary and an incremental GenAI parser. + * + * Tag configuration and the streaming-with-special-tokens flag are queried + * from the parsers themselves via the BaseGenAIParser / + * BaseGenAIIncrementalParser interface. 
+ * + * @param tokenizer Tokenizer forwarded to BaseOutputParser. + * @param parser OVMS-extended GenAI parser for unary (non-streaming) mode. + * @param incrementalParser OVMS-extended GenAI parser for streaming mode. + */ + explicit GenAIParserAdapter( + ov::genai::Tokenizer& tokenizer, + std::shared_ptr parser, + std::shared_ptr incrementalParser); + + /** + * @brief Parse full (non-streaming) output via the wrapped ov::genai::Parser. + * + * Wraps parsedOutput.content in a GenAI JsonContainer, invokes the parser, + * then maps results back: + * - content -> parsedOutput.content + * - tool_calls -> parsedOutput.toolCalls + * - reasoning_content -> parsedOutput.reasoning + * + * @note generatedTokens is accepted for interface compatibility but not + * forwarded to the GenAI parser. + */ + void parse(ParsedOutput& parsedOutput, const std::vector& generatedTokens) override; + + /** + * @brief Parse a streaming chunk via the wrapped ov::genai::IncrementalParser. + * + * Builds an empty delta JsonContainer and invokes the incremental parser. + * If the parser emits structured data (tool calls, reasoning), the delta + * JsonContainer is converted to a rapidjson::Document and returned. + * If the parser emits plain filtered content, it is wrapped in + * {"content": } and returned. Returns std::nullopt when the chunk + * does not yet produce a meaningful response. + * + * @note When finishReason != NONE the adapter also calls flush() on the + * incremental parser to drain any content held in its delay window. + * The flush delta is combined with the parse delta (argument strings + * concatenated) when both are non-null. 
+ */ + std::optional parseChunk( + const std::string& chunkResponse, + ov::genai::GenerationFinishReason finishReason) override; + + const std::vector& getParsingStartTags() const override; + const std::vector& getSpecialParsingStartTags() const override; + const std::string& getParsingEndTag() const override; + bool requiresStreamingWithSpecialTokens() const override; + +private: + /** + * @brief Combine two wrapDelta documents by concatenating their + * delta.tool_calls[0].function.arguments strings. + * + * Used when both parse() and flush() produce argument deltas for the + * final chunk so that only a single document needs to be returned. + * If the documents cannot be combined (unexpected structure), @p primary + * is returned unchanged. + * + * @param primary Delta from the normal parse() call (moved). + * @param flushed Delta from flush() (moved, used only for its arguments value). + * @return Combined document, or @p primary if combination is not applicable. + */ + static rapidjson::Document combineArgumentDeltas( + rapidjson::Document primary, + rapidjson::Document flushed); +}; + +} // namespace ovms diff --git a/src/llm/io_processing/llama3/genai_tool_parser.cpp b/src/llm/io_processing/llama3/genai_tool_parser.cpp new file mode 100644 index 0000000000..10865e458a --- /dev/null +++ b/src/llm/io_processing/llama3/genai_tool_parser.cpp @@ -0,0 +1,340 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "genai_tool_parser.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +#include "src/port/rapidjson_document.hpp" +#include "src/port/rapidjson_stringbuffer.hpp" +#include "src/port/rapidjson_writer.hpp" + +#include "../../../logging.hpp" +#include "../base_output_parser.hpp" +#include "../utils.hpp" +#include "src/stringutils.hpp" + +namespace ovms { + +// ───────────────────────────────────────────────────────────────────────────── +// Shared helpers +// ───────────────────────────────────────────────────────────────────────────── + +static std::string rapidjsonToString(const rapidjson::Document& doc) { + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + doc.Accept(writer); + return sb.GetString(); +} + +static bool jsonHasArgumentsOrParameters(const rapidjson::Document& json) { + return json.HasMember("arguments") || json.HasMember("parameters"); +} + +static void changeParametersToArguments(rapidjson::Document& json) { + if (json.HasMember("parameters")) { + json.AddMember("arguments", json["parameters"], json.GetAllocator()); + json.RemoveMember("parameters"); + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Llama3GenAIToolParser (unary) +// ───────────────────────────────────────────────────────────────────────────── + +void Llama3GenAIToolParser::parse(ov::genai::JsonContainer& message) { + if (!message.contains("content") || !message["content"].is_string()) { + return; + } + + std::string content = message["content"].get_string(); + std::string toolCallsText; + + const std::string botTagStr(botTag); + const size_t botPos = content.find(botTagStr); + if (botPos != std::string::npos) { + toolCallsText = content.substr(botPos + botTagStr.size()); + content = content.substr(0, 
botPos); + } else if (!content.empty() && content[0] == '{') { + // Model output starts with "{" — treat the whole content as tool calls + toolCallsText = content; + content.clear(); + } else { + return; // No tool calls present + } + + // Split tool call JSON blobs by ";" separator + std::vector toolJsonStrings; + const std::string sep(separator); + size_t start = 0; + size_t end; + while ((end = toolCallsText.find(sep, start)) != std::string::npos) { + std::string part = toolCallsText.substr(start, end - start); + if (!part.empty()) { + toolJsonStrings.push_back(std::move(part)); + } + start = end + sep.size(); + } + std::string lastPart = toolCallsText.substr(start); + if (!lastPart.empty()) { + toolJsonStrings.push_back(std::move(lastPart)); + } + + ov::genai::JsonContainer toolCallsArr = ov::genai::JsonContainer::array(); + + for (const std::string& toolJson : toolJsonStrings) { + rapidjson::Document toolDoc; + toolDoc.Parse(toolJson.c_str()); + if (toolDoc.HasParseError()) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Llama3GenAIToolParser: failed to parse tool call JSON"); + continue; + } + + if (!toolDoc.HasMember("name") || !toolDoc["name"].IsString()) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Llama3GenAIToolParser: tool call missing valid 'name' field"); + continue; + } + const std::string name = toolDoc["name"].GetString(); + + const char* argsKey = nullptr; + if (toolDoc.HasMember("arguments")) { + argsKey = "arguments"; + } else if (toolDoc.HasMember("parameters")) { + argsKey = "parameters"; + } + if (!argsKey) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Llama3GenAIToolParser: tool call missing 'parameters'/'arguments' field"); + continue; + } + + // Serialize the arguments/parameters object to a JSON string + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + toolDoc[argsKey].Accept(writer); + const std::string arguments = sb.GetString(); + + ov::genai::JsonContainer fn = ov::genai::JsonContainer::object(); + fn["name"] = name; 
+ fn["arguments"] = arguments; + + ov::genai::JsonContainer tc = ov::genai::JsonContainer::object(); + tc["id"] = generateRandomId(); + tc["type"] = "function"; + tc["function"] = fn; + + toolCallsArr.push_back(tc); + } + + message["content"] = content; + if (toolCallsArr.size() > 0) { + message["tool_calls"] = toolCallsArr; + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Llama3GenAIIncrementalToolParser (streaming) +// ───────────────────────────────────────────────────────────────────────────── + +void Llama3GenAIIncrementalToolParser::startNextToolCall() { + lastJson.Clear(); + jsonBuilder.clear(); + toolCallIndex++; + argumentsDelayWindow[0].clear(); + argumentsDelayWindow[1].clear(); +} + +void Llama3GenAIIncrementalToolParser::reset() { + lastJson.Clear(); + jsonBuilder.clear(); + toolCallIndex = -1; + argumentsDelayWindow[0].clear(); + argumentsDelayWindow[1].clear(); + escapeLevel = 0; +} + +void Llama3GenAIIncrementalToolParser::flush(ov::genai::JsonContainer& delta_message) { + if (toolCallIndex < 0 || argumentsDelayWindow[1].empty()) { + return; + } + + // Drain the last pending chunk from the delay window with the same + // closing-quote logic that normally fires when ";" is detected. 
+ std::string lastChunk = argumentsDelayWindow[1]; + argumentsDelayWindow[1].clear(); + + const size_t lastBrace = lastChunk.find_last_of('}'); + if (lastBrace != std::string::npos) { + lastChunk.insert(lastBrace, "\""); + } else { + lastChunk += "\""; + } + + rapidjson::Document newJson; + try { + newJson = jsonBuilder.add(lastChunk); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, + "Llama3GenAIIncrementalToolParser::flush: chunk parse failed: {}", e.what()); + return; + } + + if (!lastJson.HasMember("arguments") && !lastJson.HasMember("parameters")) { + return; + } + + changeParametersToArguments(newJson); + rapidjson::Document delta = PartialJsonBuilder::computeDelta(lastJson, newJson); + lastJson.CopyFrom(newJson, lastJson.GetAllocator()); + + if (delta.ObjectEmpty()) { + return; + } + for (auto it = delta.MemberBegin(); it != delta.MemberEnd(); ++it) { + if (it->value.IsNull() || + (it->value.IsString() && std::string(it->value.GetString()).empty())) { + return; + } + } + + const rapidjson::Document doc = BaseOutputParser::wrapDelta(delta, toolCallIndex); + delta_message = ov::genai::JsonContainer::from_json_string(rapidjsonToString(doc)); +} + +std::string Llama3GenAIIncrementalToolParser::parse( + ov::genai::JsonContainer& delta_message, + std::string& delta_text, + const std::optional>& /*delta_tokens*/) { + + std::string chunk = delta_text; + + if (chunk.empty()) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Llama3GenAIIncrementalToolParser: received empty chunk"); + return ""; + } + + // <|python_tag|> signals start of tool calls — consume it, start first tool call + if (chunk.find(botTag) != std::string::npos) { + startNextToolCall(); + return ""; + } + + // If not yet started, start implicitly (bare "{" path) + if (toolCallIndex < 0) { + startNextToolCall(); + } + + bool isCurrentToolCallParsingFinished = false; + + if (jsonHasArgumentsOrParameters(lastJson)) { + std::string modifiedChunk = chunk; + 
escapeSpecialCharacters(modifiedChunk); + + // Starting to collect arguments — force string type by injecting opening quote + if (argumentsDelayWindow[0].empty()) { + const size_t firstNonWs = modifiedChunk.find_first_not_of(" \t\n\r\f\v"); + if (firstNonWs != std::string::npos) { + modifiedChunk.insert(firstNonWs, "\""); + } else { + modifiedChunk.append("\""); + } + argumentsDelayWindow[0] = modifiedChunk; + return ""; + } + + if (!argumentsDelayWindow[1].empty()) { + argumentsDelayWindow[0] = argumentsDelayWindow[1]; + } + + // ";" detected: end of this tool call — close the arguments string + if (modifiedChunk.find(separator) != std::string::npos) { + isCurrentToolCallParsingFinished = true; + const size_t lastBrace = argumentsDelayWindow[0].find_last_of('}'); + if (lastBrace != std::string::npos) { + argumentsDelayWindow[0].insert(lastBrace, "\""); + } + } else { + argumentsDelayWindow[1] = modifiedChunk; + } + } + + rapidjson::Document newJson; + try { + if (!argumentsDelayWindow[0].empty()) { + newJson = jsonBuilder.add(argumentsDelayWindow[0]); + } else { + newJson = jsonBuilder.add(chunk); + } + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Llama3GenAIIncrementalToolParser: chunk parse failed: {}", e.what()); + throw std::runtime_error("Generated tool call structure is not valid"); + } + + // Case 1: arguments/parameters just appeared — emit first delta with function name + if (jsonHasArgumentsOrParameters(newJson) && !jsonHasArgumentsOrParameters(lastJson)) { + std::string functionName; + changeParametersToArguments(newJson); + if (lastJson.HasMember("name") && lastJson["name"].IsString()) { + functionName = lastJson["name"].GetString(); + } else if (newJson.HasMember("name") && newJson["name"].IsString()) { + functionName = newJson["name"].GetString(); + } else { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Llama3GenAIIncrementalToolParser: function name missing when arguments appeared"); + throw 
std::runtime_error("Tool call name is missing in generated output"); + } + const rapidjson::Document doc = BaseOutputParser::wrapFirstDelta(functionName, toolCallIndex); + delta_message = ov::genai::JsonContainer::from_json_string(rapidjsonToString(doc)); + lastJson.CopyFrom(newJson, lastJson.GetAllocator()); + return ""; + + // Case 2: arguments already present — compute and emit incremental delta + } else if (lastJson.HasMember("arguments") || lastJson.HasMember("parameters")) { + changeParametersToArguments(newJson); + rapidjson::Document delta = PartialJsonBuilder::computeDelta(lastJson, newJson); + lastJson.CopyFrom(newJson, lastJson.GetAllocator()); + + if (delta.ObjectEmpty()) { + return ""; + } + for (auto it = delta.MemberBegin(); it != delta.MemberEnd(); ++it) { + if (it->value.IsNull() || (it->value.IsString() && std::string(it->value.GetString()).empty())) { + return ""; + } + } + + const rapidjson::Document doc = BaseOutputParser::wrapDelta(delta, toolCallIndex); + delta_message = ov::genai::JsonContainer::from_json_string(rapidjsonToString(doc)); + + if (isCurrentToolCallParsingFinished) { + startNextToolCall(); + } + return ""; + + // Case 3: still accumulating function name — keep building + } else { + lastJson.CopyFrom(newJson, lastJson.GetAllocator()); + } + + return ""; +} + +} // namespace ovms diff --git a/src/llm/io_processing/llama3/genai_tool_parser.hpp b/src/llm/io_processing/llama3/genai_tool_parser.hpp new file mode 100644 index 0000000000..36b611047b --- /dev/null +++ b/src/llm/io_processing/llama3/genai_tool_parser.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include +#include + +#include + +#include "src/llm/io_processing/base_genai_parser.hpp" +#include "src/llm/io_processing/partial_json_builder.hpp" +#include "src/port/rapidjson_document.hpp" + +namespace ovms { + +/** + * @brief Unary GenAI tool parser for Llama 3 Pythonic / JSON tool call format. + * + * Implements ov::genai::Parser::parse(JsonContainer&) — invoked once at the + * end of generation. Reads message["content"], extracts tool calls delimited + * by "<|python_tag|>" (or a bare "{" at position 0), splits them by ";", + * parses each as JSON, and writes the result back into message["tool_calls"] + * in OpenAI format. + * + * Because this class implements BaseGenAIParser (not BaseOutputParser), it + * does not receive raw token IDs; all detection is text-based. 
+ */
+class Llama3GenAIToolParser : public BaseGenAIParser {
+    static constexpr const char* botTag = "<|python_tag|>";  // tag that opens the tool-call segment
+    static constexpr const char* separator = ";";  // delimiter between consecutive tool calls
+
+public:
+    Llama3GenAIToolParser() = default;
+    ~Llama3GenAIToolParser() override = default;
+
+    void parse(ov::genai::JsonContainer& message) override;  // unary entry point, see class description
+
+    const std::vector<std::string>& getParsingStartTags() const override {  // single entry: botTag
+        static const std::vector<std::string> tags = {botTag};
+        return tags;
+    }
+
+    const std::vector<std::string>& getSpecialParsingStartTags() const override {  // single entry: a bare "{"
+        static const std::vector<std::string> tags = {"{"};
+        return tags;
+    }
+
+    const std::string& getParsingEndTag() const override {  // always the empty string
+        static const std::string empty;
+        return empty;
+    }
+};
+
+/**
+ * @brief Incremental (streaming) GenAI tool parser for Llama 3 format.
+ *
+ * Implements ov::genai::IncrementalParser — invoked per streamed chunk.
+ * Replicates the streaming logic of Llama3ToolParser using the same internal
+ * state (PartialJsonBuilder, delay window, escape tracking) but outputs the
+ * assembled tool call deltas into delta_message as a JsonContainer instead of
+ * returning a rapidjson::Document directly.
+ *
+ * @note End-of-stream argument string closing is handled via flush(), which
+ *       is called by GenAIParserAdapter::parseChunk when finishReason != NONE.
+ *       This drains the last chunk from the delay window and properly closes
+ *       the arguments JSON string.
+ */
+class Llama3GenAIIncrementalToolParser : public BaseGenAIIncrementalParser {
+    static constexpr const char* botTag = "<|python_tag|>";  // tag that opens the tool-call segment
+    static constexpr const char* separator = ";";  // delimiter between consecutive tool calls
+
+    // Internal streaming state — mirrors Llama3ToolParser
+    rapidjson::Document lastJson;  // JSON snapshot kept from the previous chunk
+    PartialJsonBuilder jsonBuilder;
+    // Index of the tool call currently being assembled; -1 = not yet started
+    int toolCallIndex = -1;
+    // Two-slot delay window to defer argument chunks until we can close the string
+    std::array<std::string, 2> argumentsDelayWindow{{"", ""}};
+    int escapeLevel = 0;  // escape tracking state (see class description)
+
+    void startNextToolCall();
+
+public:
+    Llama3GenAIIncrementalToolParser() = default;
+    ~Llama3GenAIIncrementalToolParser() override = default;
+
+    /**
+     * @brief Process one streamed chunk.
+     *
+     * @param delta_message  Populated with the tool-call delta in OpenAI streaming
+     *                       format when a delta is ready to emit; untouched otherwise.
+     * @param delta_text     Incoming text chunk (modified in place to empty string
+     *                       once tool call processing begins, so no raw tool-call
+     *                       syntax leaks into the content stream).
+     * @param delta_tokens   Unused.
+     * @return Empty string — tool call content is not forwarded to the text stream.
+     */
+    std::string parse(
+        ov::genai::JsonContainer& delta_message,
+        std::string& delta_text,
+        const std::optional<std::vector<int64_t>>& delta_tokens = std::nullopt) override;
+
+    void reset() override;
+
+    const std::vector<std::string>& getParsingStartTags() const override {  // single entry: botTag
+        static const std::vector<std::string> tags = {botTag};
+        return tags;
+    }
+
+    const std::vector<std::string>& getSpecialParsingStartTags() const override {  // single entry: a bare "{"
+        static const std::vector<std::string> tags = {"{"};
+        return tags;
+    }
+
+    const std::string& getParsingEndTag() const override {  // always the empty string
+        static const std::string empty;
+        return empty;
+    }
+
+    /**
+     * @brief Flush the pending chunk from the delay window after end-of-generation.
+     *
+     * The streaming parser defers each incoming chunk by one slot to allow
+     * look-ahead for the ";" separator.  When generation ends without a ";"
+     * (the normal case for the last tool call), the final chunk is still
+     * sitting in window[1].  This method drains it, inserts the closing quote
+     * before the last '}', and populates @p delta_message with the resulting
+     * wrapDelta document.
+     */
+    void flush(ov::genai::JsonContainer& delta_message) override;
+};
+
+}  // namespace ovms
diff --git a/src/llm/io_processing/output_parser.cpp b/src/llm/io_processing/output_parser.cpp
index 1c060375df..02183e4b76 100644
--- a/src/llm/io_processing/output_parser.cpp
+++ b/src/llm/io_processing/output_parser.cpp
@@ -21,6 +21,7 @@
 #include "../../stringutils.hpp"
 #include "output_parser.hpp"
 #include "llama3/tool_parser.hpp"
+#include "llama3/genai_tool_parser.hpp"
 #include "hermes3/tool_parser.hpp"
 #include "phi4/tool_parser.hpp"
 #include "mistral/tool_parser.hpp"
@@ -29,6 +30,7 @@
 #include "qwen3coder/qwen3coder_tool_parser.hpp"
 #include "devstral/tool_parser.hpp"
 #include "gptoss/reasoning_parser.hpp"
+#include "genai_parser_adapter.hpp"
 
 namespace ovms {
 OutputParser::TagLookupStatus OutputParser::StreamOutputCache::lookupTag(const std::string& tag) const {
@@ -159,6 +161,11 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string to
     tokenizer(tokenizer) {
     if (toolParserName == "llama3") {
         toolParser = std::make_unique(tokenizer);
+    } else if (toolParserName == "llama3_genai") {
+        toolParser = std::make_unique(
+            tokenizer,
+            std::make_shared(),
+            std::make_shared());
     } else if (toolParserName == "hermes3") {
         toolParser = std::make_unique(tokenizer);
     } else if (toolParserName == "phi4") {
diff --git a/src/test/llm/output_parsers/llama3_output_parser_test.cpp b/src/test/llm/output_parsers/llama3_output_parser_test.cpp
index a26da4703f..bd7e1c9d0f 100644
--- a/src/test/llm/output_parsers/llama3_output_parser_test.cpp
+++ b/src/test/llm/output_parsers/llama3_output_parser_test.cpp
@@ -37,7 +37,15 @@ static std::unique_ptr llama3Tokenizer;
 // Id of the
<|python_tag|> which is a special token used to indicate the start of a tool calls constexpr int64_t botTokenId = 128010; -class Llama3OutputParserTest : public ::testing::Test { +// Parametrized fixture — runs every test with both "llama3" (native) and "llama3_genai" (adapter) parsers. +// +// Known behavioral differences between "llama3" and "llama3_genai": +// +// 1. Content + tool-call in unary mode (ParseToolCallOutputWithContentAndSingleToolCall): +// "llama3" detects the boundary via the raw botTokenId in the token stream. +// "llama3_genai" works on decoded text only; <|python_tag|> is a special token and is +// stripped by the tokenizer, so the boundary is invisible → test is skipped for llama3_genai. +class Llama3OutputParserTest : public ::testing::TestWithParam { protected: std::unique_ptr outputParserWithRegularToolParsing; @@ -56,11 +64,17 @@ class Llama3OutputParserTest : public ::testing::Test { } void SetUp() override { - outputParserWithRegularToolParsing = std::make_unique(*llama3Tokenizer, "llama3", "", EMPTY_TOOLS_SCHEMA); + outputParserWithRegularToolParsing = std::make_unique(*llama3Tokenizer, GetParam(), "", EMPTY_TOOLS_SCHEMA); } }; -TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithSingleToolCall) { +INSTANTIATE_TEST_SUITE_P( + Llama3Parsers, + Llama3OutputParserTest, + ::testing::Values("llama3", "llama3_genai"), + [](const ::testing::TestParamInfo& info) { return info.param; }); + +TEST_P(Llama3OutputParserTest, ParseToolCallOutputWithSingleToolCall) { std::string input = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}}"; auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids; std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); @@ -74,7 +88,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithSingleToolCall) { EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); } 
-TEST_F(Llama3OutputParserTest, ParseToolCallOutputNoToolsInTheRequest) { +TEST_P(Llama3OutputParserTest, ParseToolCallOutputNoToolsInTheRequest) { std::string input = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}}"; auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids; std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); @@ -83,7 +97,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputNoToolsInTheRequest) { EXPECT_EQ(parsedOutput.reasoning, ""); ASSERT_EQ(parsedOutput.toolCalls.size(), 0); } -TEST_F(Llama3OutputParserTest, ParseRegularJsonOutputToolsInTheRequest) { +TEST_P(Llama3OutputParserTest, ParseRegularJsonOutputToolsInTheRequest) { std::string input = "{\"name\": \"Jane Doe\", \"location\": \"unknown\"}"; auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids; std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); @@ -94,7 +108,7 @@ TEST_F(Llama3OutputParserTest, ParseRegularJsonOutputToolsInTheRequest) { } // Tool parser is available, but there are no tools in the request, so all output should be treated as content -TEST_F(Llama3OutputParserTest, ParseRegularJsonOutputNoToolsInTheRequest) { +TEST_P(Llama3OutputParserTest, ParseRegularJsonOutputNoToolsInTheRequest) { std::string input = "{\"name\": \"Jane Doe\", \"location\": \"unknown\"}"; auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids; std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); @@ -103,7 +117,7 @@ TEST_F(Llama3OutputParserTest, ParseRegularJsonOutputNoToolsInTheRequest) { EXPECT_EQ(parsedOutput.reasoning, ""); } -TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithThreeToolCalls) { +TEST_P(Llama3OutputParserTest, ParseToolCallOutputWithThreeToolCalls) 
{ std::string input = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}};" "{\"name\": \"another_tool\", \"parameters\": {\"param1\": \"data\", \"param2\": true}};" "{\"name\": \"third_tool\", \"parameters\": {\"key\": \"value\"}}"; @@ -130,7 +144,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithThreeToolCalls) { EXPECT_NE(secondToolCallId, thirdToolCallId); } -TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) { +TEST_P(Llama3OutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) { std::string input = "This is a regular model response without tool calls."; auto generatedTensor = llama3Tokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids; std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); @@ -140,7 +154,13 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) { EXPECT_EQ(parsedOutput.reasoning, ""); } -TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall) { +TEST_P(Llama3OutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall) { + // llama3_genai works on decoded text only; <|python_tag|> is a special token stripped by + // the tokenizer, so the content/tool-call boundary is invisible to the text-based parser. 
+ if (GetParam() == "llama3_genai") { + GTEST_SKIP() << "llama3_genai cannot detect tool calls preceded by content: " + "<|python_tag|> is a special token and is stripped during decoding."; + } std::string content = "This is a content part and next will be a tool call."; std::string toolCall = "{\"name\": \"example_tool\", \"parameters\": {\"arg1\": \"value1\", \"arg2\": 42}}"; auto generatedContentTensor = llama3Tokenizer->encode(content, ov::genai::add_special_tokens(false)).input_ids; @@ -160,7 +180,7 @@ TEST_F(Llama3OutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall) EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); } -TEST_F(Llama3OutputParserTest, HolisticStreaming) { +TEST_P(Llama3OutputParserTest, HolisticStreaming) { std::vector>> chunkToDeltaVec{ // Tool call phase // Starting first tool. Collecting chunk until full name is received. Don't return until then. @@ -202,7 +222,7 @@ TEST_F(Llama3OutputParserTest, HolisticStreaming) { for (auto lastFinishReason : {ov::genai::GenerationFinishReason::NONE, ov::genai::GenerationFinishReason::STOP, ov::genai::GenerationFinishReason::LENGTH}) { // Need to have new output parser per case to simulate separate request processing - outputParserWithRegularToolParsing = std::make_unique(*llama3Tokenizer, "llama3", "", EMPTY_TOOLS_SCHEMA); + outputParserWithRegularToolParsing = std::make_unique(*llama3Tokenizer, GetParam(), "", EMPTY_TOOLS_SCHEMA); auto chunkToDeltaVecCopy = chunkToDeltaVec; if (lastFinishReason == ov::genai::GenerationFinishReason::NONE) { chunkToDeltaVecCopy.push_back({"Paris\"}}", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"index":1,"function":{"arguments":" \""}}]}})"}); @@ -264,7 +284,7 @@ TEST_F(Llama3OutputParserTest, HolisticStreaming) { } // Positive test for streaming tool calls with complex arguments containing special characters -TEST_F(Llama3OutputParserTest, StreamingToolWithComplexArguments) { +TEST_P(Llama3OutputParserTest, 
StreamingToolWithComplexArguments) { std::vector>> chunkToDeltaVec{ {"{\"", std::nullopt}, {"name", std::nullopt}, @@ -331,7 +351,7 @@ TEST_F(Llama3OutputParserTest, StreamingToolWithComplexArguments) { "{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"nested_value2\"}}]}}"}, }; - auto outputParser = std::make_unique(*llama3Tokenizer, "llama3", "", EMPTY_TOOLS_SCHEMA); + auto outputParser = std::make_unique(*llama3Tokenizer, GetParam(), "", EMPTY_TOOLS_SCHEMA); for (const auto& [chunk, expectedDelta] : chunkToDeltaVec) { std::optional doc = outputParser->parseChunk(chunk, true, ov::genai::GenerationFinishReason::NONE); if (!expectedDelta.has_value() && !doc.has_value()) { @@ -383,7 +403,7 @@ TEST_F(Llama3OutputParserTest, StreamingToolWithComplexArguments) { } } -TEST_F(Llama3OutputParserTest, ToolCallsWithoutToolsInTheRequestStreaming) { +TEST_P(Llama3OutputParserTest, ToolCallsWithoutToolsInTheRequestStreaming) { std::vector>> chunkToDeltaVec{ // Tool parser is available, but tools are not in the request so every chunk is just a regular content {"<|python_tag|>", "{\"delta\":{\"content\":\"<|python_tag|>\"}}"},