This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 5135656

add cpu_threads api + docs

1 parent 0de3d0f

File tree

2 files changed: +7 −2 lines changed


docs/docs/features/load-unload.md

Lines changed: 2 additions & 1 deletion

```diff
@@ -69,7 +69,8 @@ In case you got error while loading models. Please check for the correct model p
 | `ngl` | Integer | The number of GPU layers to use. |
 | `ctx_len` | Integer | The context length for the model operations. |
 | `embedding` | Boolean | Whether to use embedding in the model. |
-| `n_parallel` | Integer | The number of parallel operations. Uses Drogon thread count if not set. |
+| `n_parallel` | Integer | The number of parallel operations.|
+|`cpu_threads`|Integer|The number of threads for CPU inference.|
 | `cont_batching` | Boolean | Whether to use continuous batching. |
 | `user_prompt` | String | The prompt to use for the user. |
 | `ai_prompt` | String | The prompt to use for the AI assistant. |
```
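To illustrate how the new field fits alongside the existing parameters, here is a minimal sketch of a load-model request body including `cpu_threads`. The model path, port, and endpoint URL are illustrative assumptions, not taken from this commit:

```python
import json

# Hypothetical load-model request body; field names follow the table above.
payload = {
    "llama_model_path": "/models/model.gguf",  # assumed path, adjust to your setup
    "ngl": 32,                # number of GPU layers to use
    "ctx_len": 2048,          # context length for model operations
    "embedding": False,       # whether to use embedding in the model
    "n_parallel": 2,          # number of parallel operations
    "cpu_threads": 4,         # new: number of threads for CPU inference
    "cont_batching": True,    # whether to use continuous batching
}
body = json.dumps(payload)

# The body would then be POSTed to the server's load-model endpoint, e.g.:
# requests.post("http://localhost:3928/inferences/llamacpp/loadmodel", data=body)
print(body)
```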

docs/openapi/NitroAPI.yaml

Lines changed: 5 additions & 1 deletion

```diff
@@ -235,6 +235,11 @@ components:
         example: 4
         nullable: true
         description: The number of parallel operations. Only set when enable continuous batching.
+      cpu_threads:
+        type: integer
+        example: 4
+        nullable: true
+        description: The number of threads for CPU-based inference.
       pre_prompt:
         type: string
         default: A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.
@@ -255,7 +260,6 @@ components:
         default: "ASSISTANT:"
         nullable: true
         description: The prefix for assistant prompt.
-
       required:
         - llama_model_path
```
