This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 2b74824

fix: support ctx_len for model start cli (#1766)
* fix: support ctx_len for model start cli
* chore: docs
* fix: guard max ctx_len
1 parent: 4bf5e75

8 files changed, +49 -17 lines

docs/docs/cli/models/index.mdx

Lines changed: 4 additions & 3 deletions
@@ -159,9 +159,10 @@ This command uses a `model_id` from the model that you have downloaded or availa

 | Option | Description | Required | Default value | Example |
 |---------------------------|---------------------------------------------------------------------------|----------|----------------------------------------------|------------------------|
-| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` |
-| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
-| `-h`, `--help` | Display help information for the command. | No | - | `-h` |
+| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` |
+| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
+| `-h`, `--help` | Display help information for the command. | No | - | `-h` |

 ## `cortex models stop`
 :::info
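Taken together with the defaults above, a quick usage check: `cortex models start mistral --ctx_len 1024` starts the model with a 1024-token context window, while omitting `--ctx_len` falls back to `min(8192, max_model_context_length)`.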

docs/docs/cli/models/start.md

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ cortex models start [model_id]:[engine] [options]
 |---------------------------|----------------------------------------------------------|----------|----------------------------------------------|-------------------|
 | `model_id` | The identifier of the model you want to start. | No | `Prompt to select from the available models` | `mistral` |
 | `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
 | `-h`, `--help` | Display help information for the command. | No | - | `-h` |


docs/docs/cli/run.mdx

Lines changed: 3 additions & 2 deletions
@@ -36,7 +36,8 @@ You can use the `--verbose` flag to display more detailed output of the internal

 | Option | Description | Required | Default value | Example |
 |-----------------------------|-----------------------------------------------------------------------------|----------|----------------------------------------------|------------------------|
-| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` |
-| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` |
+| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
 | `-h`, `--help` | Display help information for the command. | No | - | `-h` |
 <!-- | `-t`, `--thread <thread_id>` | Specify the Thread ID. Defaults to creating a new thread if none specified. | No | - | `-t jan_1717650808` | | `-c` | -->
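The same flag applies to interactive chat, e.g. `cortex run mistral --ctx_len 1024`.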

engine/cli/command_line_parser.cc

Lines changed: 11 additions & 7 deletions
@@ -163,16 +163,18 @@ void CommandLineParser::SetupCommonCommands() {
   run_cmd->usage("Usage:\n" + commands::GetCortexBinary() +
                  " run [options] [model_id]");
   run_cmd->add_option("model_id", cml_data_.model_id, "");
-  run_cmd->add_option("--gpus", hw_activate_opts_["gpus"],
+  run_cmd->add_option("--gpus", run_settings_["gpus"],
                       "List of GPU to activate, for example [0, 1]");
+  run_cmd->add_option("--ctx_len", run_settings_["ctx_len"],
+                      "Maximum context length for inference");
   run_cmd->add_flag("-d,--detach", cml_data_.run_detach, "Detached mode");
   run_cmd->callback([this, run_cmd] {
     if (std::exchange(executed_, true))
       return;
     commands::RunCmd rc(cml_data_.config.apiServerHost,
                         std::stoi(cml_data_.config.apiServerPort),
                         cml_data_.model_id, download_service_);
-    rc.Exec(cml_data_.run_detach, hw_activate_opts_);
+    rc.Exec(cml_data_.run_detach, run_settings_);
   });
 }

@@ -203,8 +205,10 @@ void CommandLineParser::SetupModelCommands() {
   model_start_cmd->usage("Usage:\n" + commands::GetCortexBinary() +
                          " models start [model_id]");
   model_start_cmd->add_option("model_id", cml_data_.model_id, "");
-  model_start_cmd->add_option("--gpus", hw_activate_opts_["gpus"],
+  model_start_cmd->add_option("--gpus", run_settings_["gpus"],
                               "List of GPU to activate, for example [0, 1]");
+  model_start_cmd->add_option("--ctx_len", run_settings_["ctx_len"],
+                              "Maximum context length for inference");
   model_start_cmd->group(kSubcommands);
   model_start_cmd->callback([this, model_start_cmd]() {
     if (std::exchange(executed_, true))
@@ -216,7 +220,7 @@ void CommandLineParser::SetupModelCommands() {
     };
     commands::ModelStartCmd().Exec(cml_data_.config.apiServerHost,
                                    std::stoi(cml_data_.config.apiServerPort),
-                                   cml_data_.model_id, hw_activate_opts_);
+                                   cml_data_.model_id, run_settings_);
   });

   auto stop_model_cmd =
@@ -562,7 +566,7 @@ void CommandLineParser::SetupHardwareCommands() {
   hw_activate_cmd->usage("Usage:\n" + commands::GetCortexBinary() +
                          " hardware activate --gpus [list_gpu]");
   hw_activate_cmd->group(kSubcommands);
-  hw_activate_cmd->add_option("--gpus", hw_activate_opts_["gpus"],
+  hw_activate_cmd->add_option("--gpus", run_settings_["gpus"],
                               "List of GPU to activate, for example [0, 1]");
   hw_activate_cmd->callback([this, hw_activate_cmd]() {
     if (std::exchange(executed_, true))
@@ -572,14 +576,14 @@
       return;
     }

-    if (hw_activate_opts_["gpus"].empty()) {
+    if (run_settings_["gpus"].empty()) {
       CLI_LOG("[list_gpu] is required\n");
       CLI_LOG(hw_activate_cmd->help());
       return;
     }
     commands::HardwareActivateCmd().Exec(
         cml_data_.config.apiServerHost,
-        std::stoi(cml_data_.config.apiServerPort), hw_activate_opts_);
+        std::stoi(cml_data_.config.apiServerPort), run_settings_);
   });
 }
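The parser change is mechanical: the old `hw_activate_opts_` map is renamed to `run_settings_`, and one more string option is registered against it. For orientation, a minimal standalone sketch of this pattern, assuming a CLI11-style API (which the `add_option`/`callback` calls above resemble); the subcommand layout here is illustrative, not cortex's actual wiring:

```cpp
#include <CLI/CLI.hpp>  // assumed parser library (CLI11-style API)
#include <string>
#include <unordered_map>

int main(int argc, char** argv) {
  CLI::App app{"cortex"};
  // A single string map carries every run-time setting (--gpus, --ctx_len);
  // each command later decides per-key how to interpret the raw strings.
  std::unordered_map<std::string, std::string> run_settings;

  auto* start_cmd = app.add_subcommand("start", "Start a model");
  start_cmd->add_option("--gpus", run_settings["gpus"],
                        "List of GPU to activate, for example [0, 1]");
  start_cmd->add_option("--ctx_len", run_settings["ctx_len"],
                        "Maximum context length for inference");
  start_cmd->callback([&run_settings] {
    // An empty mapped value means the flag was never passed; downstream
    // code (ModelStartCmd::Exec in this commit) skips empty values.
  });

  CLI11_PARSE(app, argc, argv);
  return 0;
}
```

Note that binding `run_settings["ctx_len"]` at registration time eagerly inserts the key with an empty value, which is why the command side checks `!v.empty()` rather than key presence.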

engine/cli/command_line_parser.h

Lines changed: 1 addition & 1 deletion
@@ -79,5 +79,5 @@ class CommandLineParser {
   std::unordered_map<std::string, std::string> config_update_opts_;
   bool executed_ = false;
   commands::HarwareOptions hw_opts_;
-  std::unordered_map<std::string, std::string> hw_activate_opts_;
+  std::unordered_map<std::string, std::string> run_settings_;
 };

engine/cli/commands/model_start_cmd.cc

Lines changed: 17 additions & 2 deletions
@@ -30,8 +30,8 @@ bool ModelStartCmd::Exec(

   //
   bool should_activate_hw = false;
-  for (auto const& [_, v] : options) {
-    if (!v.empty()) {
+  for (auto const& [k, v] : options) {
+    if (k == "gpus" && !v.empty()) {
       should_activate_hw = true;
       break;
     }
@@ -57,6 +57,9 @@ bool ModelStartCmd::Exec(

   Json::Value json_data;
   json_data["model"] = model_id.value();
+  for (auto const& [k, v] : options) {
+    UpdateConfig(json_data, k, v);
+  }
   auto data_str = json_data.toStyledString();
   auto res = curl_utils::SimplePostJson(url.ToFullPath(), data_str);
   if (res.has_error()) {
@@ -75,4 +78,16 @@ bool ModelStartCmd::Exec(
   }
   return true;
 }
+
+bool ModelStartCmd::UpdateConfig(Json::Value& data, const std::string& key,
+                                 const std::string& value) {
+  if (key == "ctx_len" && !value.empty()) {
+    try {
+      data["ctx_len"] = std::stoi(value);
+    } catch (const std::exception& e) {
+      CLI_LOG("Failed to parse numeric value for " << key << ": " << e.what());
+    }
+  }
+  return true;
+}
 };  // namespace commands
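The `try`/`catch` around `std::stoi` is the client-side half of the "guard max ctx_len" fix: a non-numeric `--ctx_len` is logged and ignored instead of crashing the CLI or sending garbage to the server. A self-contained sketch of that behavior (`SetCtxLen` is a hypothetical stand-in for `UpdateConfig`, with `CLI_LOG` replaced by `std::cerr`):

```cpp
#include <iostream>
#include <string>

#include <json/json.h>  // jsoncpp, as used by the engine

// Parse a raw CLI string into the request body; bad input is reported
// and skipped, leaving any existing "ctx_len" value untouched.
void SetCtxLen(Json::Value& data, const std::string& value) {
  if (value.empty()) return;  // flag not passed: let the server pick a default
  try {
    data["ctx_len"] = std::stoi(value);
  } catch (const std::exception& e) {
    std::cerr << "Failed to parse numeric value for ctx_len: " << e.what()
              << '\n';
  }
}

int main() {
  Json::Value body;
  body["model"] = "mistral";
  SetCtxLen(body, "1024");  // sets body["ctx_len"] = 1024
  SetCtxLen(body, "abc");   // std::stoi throws; logged, 1024 left in place
  std::cout << body.toStyledString();
  return 0;
}
```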

engine/cli/commands/model_start_cmd.h

Lines changed: 4 additions & 0 deletions
@@ -2,6 +2,7 @@

 #include <string>
 #include <unordered_map>
+#include "json/json.h"

 namespace commands {

@@ -10,5 +11,8 @@ class ModelStartCmd {
   bool Exec(const std::string& host, int port, const std::string& model_handle,
             const std::unordered_map<std::string, std::string>& options,
             bool print_success_log = true);
+ private:
+  bool UpdateConfig(Json::Value& data, const std::string& key,
+                    const std::string& value);
 };
 } // namespace commands

engine/services/model_service.cc

Lines changed: 8 additions & 2 deletions
@@ -702,6 +702,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
   config::YamlHandler yaml_handler;

   try {
+    constexpr const int kDefautlContextLength = 8192;
+    int max_model_context_length = kDefautlContextLength;
     Json::Value json_data;
     // Currently we don't support download vision models, so we need to bypass check
     if (!params_override.bypass_model_check()) {
@@ -732,6 +734,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
       json_data["system_prompt"] = mc.system_template;
       json_data["user_prompt"] = mc.user_template;
       json_data["ai_prompt"] = mc.ai_template;
+      json_data["ctx_len"] = std::min(kDefautlContextLength, mc.ctx_len);
+      max_model_context_length = mc.ctx_len;
     } else {
       bypass_stop_check_set_.insert(model_handle);
     }
@@ -753,12 +757,14 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
     ASSIGN_IF_PRESENT(json_data, params_override, cache_enabled);
     ASSIGN_IF_PRESENT(json_data, params_override, ngl);
     ASSIGN_IF_PRESENT(json_data, params_override, n_parallel);
-    ASSIGN_IF_PRESENT(json_data, params_override, ctx_len);
     ASSIGN_IF_PRESENT(json_data, params_override, cache_type);
     ASSIGN_IF_PRESENT(json_data, params_override, mmproj);
     ASSIGN_IF_PRESENT(json_data, params_override, model_path);
 #undef ASSIGN_IF_PRESENT
-
+    if (params_override.ctx_len) {
+      json_data["ctx_len"] =
+          std::min(params_override.ctx_len.value(), max_model_context_length);
+    }
     CTL_INF(json_data.toStyledString());
     auto may_fallback_res = MayFallbackToCpu(json_data["model_path"].asString(),
                                              json_data["ngl"].asInt(),
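Server-side, the guard reduces to one rule: default to `min(8192, model max)`, and clamp any user override so it can never exceed the model's own maximum context length. A sketch of that rule in isolation (the function name and bare `std::optional` parameter are simplifications of the `params_override` fields above):

```cpp
#include <algorithm>
#include <optional>

constexpr int kDefaultContextLength = 8192;  // spelled kDefautlContextLength upstream

// Pick the ctx_len actually sent to the inference engine.
int EffectiveCtxLen(int max_model_context_length,
                    std::optional<int> ctx_len_override) {
  if (ctx_len_override.has_value()) {
    // Honor the user's --ctx_len, but never exceed the model's limit.
    return std::min(*ctx_len_override, max_model_context_length);
  }
  // No override: conservative default, still capped by the model's limit.
  return std::min(kDefaultContextLength, max_model_context_length);
}

// EffectiveCtxLen(32768, std::nullopt) == 8192  (default, large model)
// EffectiveCtxLen(32768, 1024)        == 1024  (override respected)
// EffectiveCtxLen(4096, 100000)       == 4096  (override clamped)
```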
