
Commit 4cfa440

feat: add dynamic loading model through api
1 parent 93ee354 · commit 4cfa440

File tree: 4 files changed, +118 -51 lines

config.json

Lines changed: 1 addition & 7 deletions
@@ -4,11 +4,5 @@
       "address": "127.0.0.1",
       "port": 3928
     }
-  ],
-  "custom_config": {
-    "llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf",
-    "ctx_len": 2048,
-    "ngl": 100,
-    "embedding":true
-  }
+  ]
 }
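
With custom_config removed from config.json, the model settings (llama_model_path, ctx_len, ngl, embedding) are no longer fixed at server startup; this commit moves them into the body of the new loadmodel endpoint. A minimal sketch of that request, assuming the listener from config.json (127.0.0.1:3928) and a /inferences/llamacpp route prefix derived by Drogon from the controller's namespace and class name; the model path and values are illustrative, taken from the removed defaults:

curl -X POST http://127.0.0.1:3928/inferences/llamacpp/loadmodel \
  -H "Content-Type: application/json" \
  -d '{
        "llama_model_path": "/path/to/llama2_7b_chat_uncensored.Q4_0.gguf",
        "ctx_len": 2048,
        "ngl": 100,
        "embedding": true
      }'

On success the handler answers with {"message": "Model loaded successfully"}; if llama.loadModel() fails it returns HTTP 500 with {"message": "Model loaded failed"}.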

controllers/llamaCPP.cc

Lines changed: 67 additions & 0 deletions
@@ -4,6 +4,7 @@
 #include <chrono>
 #include <cstring>
 #include <drogon/HttpResponse.h>
+#include <drogon/HttpTypes.h>
 #include <regex>
 #include <thread>
 
@@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model,
 void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
+  if (!model_loaded) {
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model is not loaded yet";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   const auto &jsonBody = req->getJsonObject();
   std::string formatted_output =
       "Below is a conversation between an AI system named ASSISTANT and USER\n";
@@ -203,6 +213,15 @@ void llamaCPP::chatCompletion(
 void llamaCPP::embedding(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
+  if (!model_loaded) {
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model is not loaded yet";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   auto lock = llama.lock();
 
   const auto &jsonBody = req->getJsonObject();
@@ -225,3 +244,51 @@ void llamaCPP::embedding(
   resp->setContentTypeString("application/json");
   callback(resp);
 }
+
+void llamaCPP::loadModel(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+
+  const auto &jsonBody = req->getJsonObject();
+
+  gpt_params params;
+  if (jsonBody) {
+    params.model = (*jsonBody)["llama_model_path"].asString();
+    params.n_gpu_layers = (*jsonBody)["ngl"].asInt();
+    params.n_ctx = (*jsonBody)["ctx_len"].asInt();
+    params.embedding = (*jsonBody)["embedding"].asBool();
+  }
+#ifdef GGML_USE_CUBLAS
+  LOG_INFO << "Setting up GGML CUBLAS PARAMS";
+  params.mul_mat_q = false;
+#endif // GGML_USE_CUBLAS
+  if (params.model_alias == "unknown") {
+    params.model_alias = params.model;
+  }
+
+  llama_backend_init(params.numa);
+
+  LOG_INFO_LLAMA("build info",
+                 {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
+  LOG_INFO_LLAMA("system info",
+                 {
+                     {"n_threads", params.n_threads},
+                     {"total_threads", std::thread::hardware_concurrency()},
+                     {"system_info", llama_print_system_info()},
+                 });
+
+  // load the model
+  if (!llama.loadModel(params)) {
+    LOG_ERROR << "Error loading the model will exit the program";
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded failed";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+  }
+  Json::Value jsonResp;
+  jsonResp["message"] = "Model loaded successfully";
+  model_loaded = true;
+  auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+  callback(resp);
+}
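
Both inference handlers now check the new model_loaded flag before doing any work, so requests sent before a successful loadmodel call are rejected up front. A quick way to observe the guard, under the same assumed host and route prefix as above (the request body is irrelevant here because the guard runs before the JSON is read):

curl -i -X POST http://127.0.0.1:3928/inferences/llamacpp/chat_completion \
  -H "Content-Type: application/json" \
  -d '{}'

# Expected until loadmodel has succeeded:
# HTTP/1.1 500 Internal Server Error
# {"message":"Model is not loaded yet"}

Once loadModel has run and set model_loaded to true, the same request proceeds into the normal prompt-building path.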

controllers/llamaCPP.h

Lines changed: 38 additions & 33 deletions
@@ -1,3 +1,4 @@
+#include <drogon/HttpTypes.h>
 #if defined(_WIN32)
 #define NOMINMAX
 #endif
@@ -1311,51 +1312,55 @@ namespace inferences {
 class llamaCPP : public drogon::HttpController<llamaCPP> {
 public:
   llamaCPP() {
-    gpt_params params;
-    auto conf = drogon::app().getCustomConfig();
-    params.model = conf["llama_model_path"].asString();
-    params.n_gpu_layers = conf["ngl"].asInt();
-    params.n_ctx = conf["ctx_len"].asInt();
-    params.embedding = conf["embedding"].asBool();
-#ifdef GGML_USE_CUBLAS
-    LOG_INFO << "Setting up GGML CUBLAS PARAMS";
-    params.mul_mat_q = false;
-#endif // GGML_USE_CUBLAS
-    if (params.model_alias == "unknown") {
-      params.model_alias = params.model;
-    }
-
-    llama_backend_init(params.numa);
-
-    LOG_INFO_LLAMA("build info",
-                   {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
-    LOG_INFO_LLAMA("system info",
-                   {
-                       {"n_threads", params.n_threads},
-                       {"total_threads", std::thread::hardware_concurrency()},
-                       {"system_info", llama_print_system_info()},
-                   });
-
-    // load the model
-    if (!llama.loadModel(params)) {
-      LOG_ERROR << "Error loading the model will exit the program";
-      std::terminate();
-    }
-    nitro_utils::nitro_logo();
+    // gpt_params params;
+    // auto conf = drogon::app().getCustomConfig();
+    // params.model = conf["llama_model_path"].asString();
+    // params.n_gpu_layers = conf["ngl"].asInt();
+    // params.n_ctx = conf["ctx_len"].asInt();
+    // params.embedding = conf["embedding"].asBool();
+    //#ifdef GGML_USE_CUBLAS
+    // LOG_INFO << "Setting up GGML CUBLAS PARAMS";
+    // params.mul_mat_q = false;
+    //#endif // GGML_USE_CUBLAS
+    // if (params.model_alias == "unknown") {
+    //   params.model_alias = params.model;
+    // }
+    //
+    // llama_backend_init(params.numa);
+    //
+    // LOG_INFO_LLAMA("build info",
+    //                {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
+    // LOG_INFO_LLAMA("system info",
+    //                {
+    //                    {"n_threads", params.n_threads},
+    //                    {"total_threads", std::thread::hardware_concurrency()},
+    //                    {"system_info", llama_print_system_info()},
+    //                });
+    //
+    // // load the model
+    // if (!llama.loadModel(params)) {
+    //   LOG_ERROR << "Error loading the model will exit the program";
+    //   std::terminate();
+    // }
   }
   METHOD_LIST_BEGIN
   // list path definitions here;
-  METHOD_ADD(llamaCPP::chatCompletion, "chat_completion");
-  METHOD_ADD(llamaCPP::embedding,"embedding");
+  METHOD_ADD(llamaCPP::chatCompletion, "chat_completion",Post);
+  METHOD_ADD(llamaCPP::embedding,"embedding",Post);
+  METHOD_ADD(llamaCPP::loadModel,"loadmodel",Post);
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
   void chatCompletion(const HttpRequestPtr &req,
                       std::function<void(const HttpResponsePtr &)> &&callback);
   void embedding(const HttpRequestPtr &req,
                  std::function<void(const HttpResponsePtr &)> &&callback);
+  void loadModel(const HttpRequestPtr &req,
+                 std::function<void(const HttpResponsePtr &)> &&callback);
+
 
 private:
   llama_server_context llama;
+  bool model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
 };

main.cc

Lines changed: 12 additions & 11 deletions
@@ -1,4 +1,4 @@
-
+#include "controllers/nitro_utils.h"
 #include <climits> // for PATH_MAX
 #include <drogon/drogon.h>
 
@@ -43,16 +43,15 @@ int main() {
   char path[MAX_PATH];
   char dir[MAX_PATH];
   // char dir[MAX_PATH];
-  if(GetModuleFileNameA(NULL, path, sizeof(path))) {
-    char* lastBackslash = strrchr(path, '\\');
-    if (lastBackslash == nullptr) {
-      return 1;
-    }
-    lastBackslash[0] = '\0';
-    strcpy(dir, path);
-    configPath = std::string(dir) + "/config/config.json";
-  }
-  else {
+  if (GetModuleFileNameA(NULL, path, sizeof(path))) {
+    char *lastBackslash = strrchr(path, '\\');
+    if (lastBackslash == nullptr) {
+      return 1;
+    }
+    lastBackslash[0] = '\0';
+    strcpy(dir, path);
+    configPath = std::string(dir) + "/config/config.json";
+  } else {
     LOG_ERROR << "Failed to get binary location!";
     return 1;
   }
@@ -66,6 +65,8 @@ int main() {
   auto app_conf = drogon::app().getCustomConfig();
 
   LOG_INFO << app_conf["llama_model_file"].asString();
+  nitro_utils::nitro_logo();
+  LOG_INFO << "Server started, please load your model";
   // drogon::app().addListener("0.0.0.0", 8080);
   drogon::app().run();
