diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 6898b51604..b24b545b33 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -700,6 +700,16 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optional
+    // lora_adapter: string; optional - no default
+    // Extension, unsupported by OpenAI API; selects a server-side registered LoRA adapter by name
+    it = doc.FindMember("lora_adapter");
+    if (it != doc.MemberEnd() && !it->value.IsNull()) {
+        if (!it->value.IsString())
+            return absl::InvalidArgumentError("lora_adapter is not a string");
+        SPDLOG_DEBUG("Found lora adapter in request => {}", it->value.GetString());
+        request.loraAdapter = it->value.GetString();
+    }
+
     // ignore_eos: bool; optional - defaults to false
     // Extension, unsupported by OpenAI API, however supported by vLLM and CB lib
     it = doc.FindMember("ignore_eos");
diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp
index de355c12a1..d4add423a3 100644
--- a/src/llm/apis/openai_request.hpp
+++ b/src/llm/apis/openai_request.hpp
@@ -73,6 +73,9 @@ struct OpenAIChatCompletionsRequest {
     std::optional<uint32_t> maxModelLength;
 
+    // LoRA adapter selection
+    std::optional<std::string> loraAdapter{std::nullopt};
+
     // Guided generation specific
     // String representation of response format object
     std::optional<std::string> responseFormat{std::nullopt};
diff --git 
a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp index 27f4f51aee..6857b1ac56 100644 --- a/src/llm/language_model/continuous_batching/servable_initializer.cpp +++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -198,6 +199,37 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr 0) { + SPDLOG_INFO("LoRA adapters will be applied to the model. Number of adapters: {}", nodeOptions.lora_adapter_size()); + for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) { + SPDLOG_INFO("Processing LoRA adapter number {} with model path: {} alpha: {}", i, nodeOptions.lora_adapter(i).model_path(), nodeOptions.lora_adapter(i).alpha()); + const auto& loraAdapterOption = nodeOptions.lora_adapter(i); + auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path()); + std::string loraPath; + if (fsLoraPath.is_relative()) { + loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string(); + } else { + loraPath = fsLoraPath.string(); + } + try { + ov::genai::Adapter adapter(loraPath); + properties->adapterConfig.add(adapter, loraAdapterOption.alpha()); + std::string adapterName = loraAdapterOption.has_name() + ? loraAdapterOption.name() + : std::filesystem::path(loraPath).stem().string(); + properties->adaptersByName.emplace(adapterName, adapter); + SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath); + } catch (const std::exception& e) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what()); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } catch (...) 
{ + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } + } + properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig)); + } + status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig); if (!status.ok()) { SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config()); diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp index b3420553a2..f6f02f95cc 100644 --- a/src/llm/language_model/legacy/servable.cpp +++ b/src/llm/language_model/legacy/servable.cpp @@ -120,6 +120,12 @@ absl::Status LegacyServable::parseRequest(std::shared_ptrgenerationConfigBuilder->unsetStructuredOutputConfig(); } + + auto adapterStatus = applyLoraAdapter(executionContext); + if (!adapterStatus.ok()) { + return adapterStatus; + } + return absl::OkStatus(); } diff --git a/src/llm/language_model/legacy/servable_initializer.cpp b/src/llm/language_model/legacy/servable_initializer.cpp index 4ee7d4820a..e2abc79186 100644 --- a/src/llm/language_model/legacy/servable_initializer.cpp +++ b/src/llm/language_model/legacy/servable_initializer.cpp @@ -19,6 +19,7 @@ #include #include "openvino/genai/llm_pipeline.hpp" +#include #include #include @@ -76,6 +77,35 @@ Status LegacyServableInitializer::initialize(std::shared_ptr& ser return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; } + if (nodeOptions.lora_adapter_size() > 0) { + for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) { + const auto& loraAdapterOption = nodeOptions.lora_adapter(i); + auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path()); + std::string loraPath; + if (fsLoraPath.is_relative()) { + loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string(); + } else { + loraPath = fsLoraPath.string(); + } + try { + ov::genai::Adapter 
adapter(loraPath); + properties->adapterConfig.add(adapter, loraAdapterOption.alpha()); + std::string adapterName = loraAdapterOption.has_name() + ? loraAdapterOption.name() + : std::filesystem::path(loraPath).stem().string(); + properties->adaptersByName.emplace(adapterName, adapter); + SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath); + } catch (const std::exception& e) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what()); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } catch (...) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } + } + properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig)); + } + status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig); if (!status.ok()) { SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config()); diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto index c8edacf88e..1fc60b3825 100644 --- a/src/llm/llm_calculator.proto +++ b/src/llm/llm_calculator.proto @@ -26,6 +26,12 @@ message LLMCalculatorOptions { optional LLMCalculatorOptions ext = 113473750; } + message LoraAdapter { + required string model_path = 1; + optional float alpha = 2 [default = 1]; + optional string name = 3; + } + message KVCrushConfig { enum AnchorPointMode { RANDOM = 0; @@ -135,4 +141,6 @@ message LLMCalculatorOptions { optional bool enable_tool_guided_generation = 23 [default = false]; optional SparseAttentionConfig sparse_attention_config = 24; + + repeated LoraAdapter lora_adapter = 25; } diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 6d9810ae5f..7d8fb0066f 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -153,6 +153,28 @@ absl::Status 
GenAiServable::parseRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
         executionContext->generationConfigBuilder->unsetStructuredOutputConfig();
     }
 
+    auto adapterStatus = applyLoraAdapter(executionContext);
+    if (!adapterStatus.ok()) {
+        return adapterStatus;
+    }
+
+    return absl::OkStatus();
+}
+
+// Resolves the request's optional lora_adapter name against the adapters registered at
+// initialization and attaches the matching adapter (with its configured alpha) to the
+// generation config. No-op when the request does not specify an adapter.
+absl::Status GenAiServable::applyLoraAdapter(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
+    const auto& request = executionContext->apiHandler->getRequest();
+    if (request.loraAdapter.has_value()) {
+        auto props = getProperties();
+        auto it = props->adaptersByName.find(request.loraAdapter.value());
+        if (it == props->adaptersByName.end()) {
+            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Unknown LoRA adapter requested: {}", request.loraAdapter.value());
+            return absl::InvalidArgumentError("Unknown LoRA adapter: " + request.loraAdapter.value());
+        }
+        // Use the alpha configured for this adapter at initialization time, not a hard-coded value.
+        float alpha = props->adapterConfig.get_alpha(it->second);
+        executionContext->generationConfigBuilder->getConfig().adapters =
+            ov::genai::AdapterConfig(it->second, alpha);
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Applying LoRA adapter '{}' with alpha {}", request.loraAdapter.value(), alpha);
+    }
+    return absl::OkStatus();
+}
diff --git a/src/llm/servable.hpp b/src/llm/servable.hpp
index e4a5dd5ee2..314711eed2 100644
--- a/src/llm/servable.hpp
+++ b/src/llm/servable.hpp
@@ -20,6 +20,8 @@
 #include
 #include
+#include <openvino/genai/lora_adapter.hpp>
+
 #pragma warning(push)
 #pragma warning(disable : 4251 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246)
 #pragma GCC diagnostic push
@@ -111,6 +113,9 @@ struct GenAiServableProperties {
     ov::genai::Tokenizer tokenizer;
     // Specific pipeline properties
     bool eagle3Mode = false;
+    // LoRA adapter support
+    ov::genai::AdapterConfig adapterConfig;
+    std::unordered_map<std::string, ov::genai::Adapter> adaptersByName;
 
 #if (PYTHON_DISABLE == 0)
     PyJinjaTemplateProcessor templateProcessor;
@@ -157,6 +162,12 @@ class GenAiServable {
      */
     virtual absl::Status parseRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext);
 
+protected:
+    // Sets per-request LoRA adapter on generationConfigBuilder if lora_adapter is specified in the request
+    absl::Status 
applyLoraAdapter(std::shared_ptr<GenAiServableExecutionContext>& executionContext);
+
+public:
+
     /*
         prepareInputs method implementation MUST fill executionContext inputIds field.
         Base implementation applies chat template to the payload body and encodes it with tokenizer.
diff --git a/src/llm/visual_language_model/legacy/legacy_executor.cpp b/src/llm/visual_language_model/legacy/legacy_executor.cpp
index a21c799cec..854bd925c8 100644
--- a/src/llm/visual_language_model/legacy/legacy_executor.cpp
+++ b/src/llm/visual_language_model/legacy/legacy_executor.cpp
@@ -40,7 +40,11 @@ void VisualLanguageModelLegacyExecutor::processRequest() {
     } else {
         SPDLOG_LOGGER_TRACE(llm_executor_logger, "Generation started");
         try {
-            requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, requestExecutionContext->generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer);
+            requestExecutionContext->results = pipe->generate(
+                requestExecutionContext->inputText,
+                requestExecutionContext->inputImages,
+                requestExecutionContext->generationConfigBuilder->getConfig(),
+                requestExecutionContext->textStreamer);
         } catch (std::exception& e) {
             requestExecutionContext->success = false;
SPDLOG_LOGGER_ERROR(llm_executor_logger, "VLM pipeline generation failed: {}.", e.what()); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 2834072410..4c6a9c3c35 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -130,6 +130,12 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptrgenerationConfigBuilder->unsetStructuredOutputConfig(); } + + auto adapterStatus = applyLoraAdapter(executionContext); + if (!adapterStatus.ok()) { + return adapterStatus; + } + return absl::OkStatus(); } diff --git a/src/llm/visual_language_model/legacy/servable_initializer.cpp b/src/llm/visual_language_model/legacy/servable_initializer.cpp index ec8bfd327a..b1849135fa 100644 --- a/src/llm/visual_language_model/legacy/servable_initializer.cpp +++ b/src/llm/visual_language_model/legacy/servable_initializer.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -75,6 +76,35 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr< return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; } + if (nodeOptions.lora_adapter_size() > 0) { + for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) { + const auto& loraAdapterOption = nodeOptions.lora_adapter(i); + auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path()); + std::string loraPath; + if (fsLoraPath.is_relative()) { + loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string(); + } else { + loraPath = fsLoraPath.string(); + } + try { + ov::genai::Adapter adapter(loraPath); + properties->adapterConfig.add(adapter, loraAdapterOption.alpha()); + std::string adapterName = loraAdapterOption.has_name() + ? 
loraAdapterOption.name() + : std::filesystem::path(loraPath).stem().string(); + properties->adaptersByName.emplace(adapterName, adapter); + SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath); + } catch (const std::exception& e) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what()); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } catch (...) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } + } + properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig)); + } + status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig); if (!status.ok()) { SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config());