
Commit ac5c5be

Merge pull request #68 from janhq/66-feat-loadunload-model-with-config-at-runtime
66 feat: load/unload model with config at runtime
2 parents 93ee354 + a7a3818, commit ac5c5be

File tree: 4 files changed, +123 -90 lines

config.json
controllers/llamaCPP.cc
controllers/llamaCPP.h
main.cc

config.json

Lines changed: 1 addition & 7 deletions

@@ -4,11 +4,5 @@
       "address": "127.0.0.1",
       "port": 3928
     }
-  ],
-  "custom_config": {
-    "llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf",
-    "ctx_len": 2048,
-    "ngl": 100,
-    "embedding":true
-  }
+  ]
 }
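
Note: with this commit, config.json only describes the HTTP listener. The model path, context length, ngl, and embedding flag that used to live under custom_config are now supplied at runtime in the body of the new loadmodel request (see the client sketch after the llamaCPP.cc diff below).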

controllers/llamaCPP.cc

Lines changed: 67 additions & 0 deletions

@@ -4,6 +4,7 @@
 #include <chrono>
 #include <cstring>
 #include <drogon/HttpResponse.h>
+#include <drogon/HttpTypes.h>
 #include <regex>
 #include <thread>
 
@@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model,
 void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
+  if (!model_loaded) {
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model is not loaded yet";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   const auto &jsonBody = req->getJsonObject();
   std::string formatted_output =
       "Below is a conversation between an AI system named ASSISTANT and USER\n";
@@ -203,6 +213,15 @@ void llamaCPP::chatCompletion(
 void llamaCPP::embedding(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
+  if (!model_loaded) {
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model is not loaded yet";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   auto lock = llama.lock();
 
   const auto &jsonBody = req->getJsonObject();
@@ -225,3 +244,51 @@ void llamaCPP::embedding(
   resp->setContentTypeString("application/json");
   callback(resp);
 }
+
+void llamaCPP::loadModel(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+
+  const auto &jsonBody = req->getJsonObject();
+
+  gpt_params params;
+  if (jsonBody) {
+    params.model = (*jsonBody)["llama_model_path"].asString();
+    params.n_gpu_layers = (*jsonBody)["ngl"].asInt();
+    params.n_ctx = (*jsonBody)["ctx_len"].asInt();
+    params.embedding = (*jsonBody)["embedding"].asBool();
+  }
+#ifdef GGML_USE_CUBLAS
+  LOG_INFO << "Setting up GGML CUBLAS PARAMS";
+  params.mul_mat_q = false;
+#endif // GGML_USE_CUBLAS
+  if (params.model_alias == "unknown") {
+    params.model_alias = params.model;
+  }
+
+  llama_backend_init(params.numa);
+
+  LOG_INFO_LLAMA("build info",
+                 {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
+  LOG_INFO_LLAMA("system info",
+                 {
+                     {"n_threads", params.n_threads},
+                     {"total_threads", std::thread::hardware_concurrency()},
+                     {"system_info", llama_print_system_info()},
+                 });
+
+  // load the model
+  if (!llama.loadModel(params)) {
+    LOG_ERROR << "Error loading the model will exit the program";
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded failed";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+  }
+  Json::Value jsonResp;
+  jsonResp["message"] = "Model loaded successfully";
+  model_loaded = true;
+  auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+  callback(resp);
+}
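
For reference, here is a minimal client-side sketch (not part of this commit) of how the new endpoint could be exercised with drogon::HttpClient. It assumes Drogon's default route prefixing for the inferences::llamaCPP controller (i.e. /inferences/llamacpp/loadmodel), the default listener from config.json/main.cc, and a placeholder model path; the JSON fields mirror the ones read in loadModel above.

// Illustrative client sketch only; route, address, and model path are assumptions.
#include <drogon/drogon.h>
#include <json/json.h>

int main() {
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  Json::Value body;
  body["llama_model_path"] = "/path/to/model.gguf"; // placeholder path
  body["ctx_len"] = 2048;
  body["ngl"] = 100;
  body["embedding"] = true;

  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/loadmodel"); // assumed controller route

  client->sendRequest(
      req, [](drogon::ReqResult result, const drogon::HttpResponsePtr &resp) {
        if (result == drogon::ReqResult::Ok && resp) {
          // Expect {"message": "Model loaded successfully"} on success.
          LOG_INFO << "loadmodel response: " << std::string(resp->getBody());
        } else {
          LOG_ERROR << "loadmodel request failed";
        }
        drogon::app().quit();
      });

  drogon::app().run(); // the async client needs a running event loop
  return 0;
}

Until such a call succeeds, chat_completion and embedding answer with the 500 "Model is not loaded yet" response added above.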

controllers/llamaCPP.h

Lines changed: 39 additions & 33 deletions

@@ -1,3 +1,4 @@
+#include <drogon/HttpTypes.h>
 #if defined(_WIN32)
 #define NOMINMAX
 #endif
@@ -1311,51 +1312,56 @@ namespace inferences {
 class llamaCPP : public drogon::HttpController<llamaCPP> {
 public:
   llamaCPP() {
-    gpt_params params;
-    auto conf = drogon::app().getCustomConfig();
-    params.model = conf["llama_model_path"].asString();
-    params.n_gpu_layers = conf["ngl"].asInt();
-    params.n_ctx = conf["ctx_len"].asInt();
-    params.embedding = conf["embedding"].asBool();
-#ifdef GGML_USE_CUBLAS
-    LOG_INFO << "Setting up GGML CUBLAS PARAMS";
-    params.mul_mat_q = false;
-#endif // GGML_USE_CUBLAS
-    if (params.model_alias == "unknown") {
-      params.model_alias = params.model;
-    }
-
-    llama_backend_init(params.numa);
-
-    LOG_INFO_LLAMA("build info",
-                   {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
-    LOG_INFO_LLAMA("system info",
-                   {
-                       {"n_threads", params.n_threads},
-                       {"total_threads", std::thread::hardware_concurrency()},
-                       {"system_info", llama_print_system_info()},
-                   });
-
-    // load the model
-    if (!llama.loadModel(params)) {
-      LOG_ERROR << "Error loading the model will exit the program";
-      std::terminate();
-    }
-    nitro_utils::nitro_logo();
+    // gpt_params params;
+    // auto conf = drogon::app().getCustomConfig();
+    // params.model = conf["llama_model_path"].asString();
+    // params.n_gpu_layers = conf["ngl"].asInt();
+    // params.n_ctx = conf["ctx_len"].asInt();
+    // params.embedding = conf["embedding"].asBool();
+    // #ifdef GGML_USE_CUBLAS
+    // LOG_INFO << "Setting up GGML CUBLAS PARAMS";
+    // params.mul_mat_q = false;
+    // #endif // GGML_USE_CUBLAS
+    // if (params.model_alias == "unknown") {
+    //   params.model_alias = params.model;
+    // }
+    //
+    // llama_backend_init(params.numa);
+    //
+    // LOG_INFO_LLAMA("build info",
+    //                {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
+    // LOG_INFO_LLAMA("system info",
+    //                {
+    //                    {"n_threads", params.n_threads},
+    //                    {"total_threads",
+    //                    std::thread::hardware_concurrency()},
+    //                    {"system_info", llama_print_system_info()},
+    //                });
+    //
+    // // load the model
+    // if (!llama.loadModel(params)) {
+    //   LOG_ERROR << "Error loading the model will exit the program";
+    //   std::terminate();
+    // }
+    // deprecate this if find no usecase
   }
   METHOD_LIST_BEGIN
   // list path definitions here;
-  METHOD_ADD(llamaCPP::chatCompletion, "chat_completion");
-  METHOD_ADD(llamaCPP::embedding,"embedding");
+  METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post);
+  METHOD_ADD(llamaCPP::embedding, "embedding", Post);
+  METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post);
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
   void chatCompletion(const HttpRequestPtr &req,
                       std::function<void(const HttpResponsePtr &)> &&callback);
   void embedding(const HttpRequestPtr &req,
                  std::function<void(const HttpResponsePtr &)> &&callback);
+  void loadModel(const HttpRequestPtr &req,
+                 std::function<void(const HttpResponsePtr &)> &&callback);
 
 private:
   llama_server_context llama;
+  bool model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
 };

main.cc

Lines changed: 16 additions & 50 deletions

@@ -1,4 +1,4 @@
-
+#include "controllers/nitro_utils.h"
 #include <climits> // for PATH_MAX
 #include <drogon/drogon.h>
 
@@ -14,59 +14,25 @@
 #error "Unsupported platform!"
 #endif
 
-int main() {
-  std::string configPath;
+int main(int argc, char *argv[]) {
 
-#if defined(__APPLE__) && defined(__MACH__)
-  char path[PATH_MAX];
-  uint32_t size = sizeof(path);
-  if (_NSGetExecutablePath(path, &size) == 0) {
-    path[size] = '\0'; // Null-terminate the string
-    char *dir = dirname(path);
-    configPath = std::string(dir) + "/config/config.json";
-  } else {
-    LOG_ERROR << "Failed to get binary location!";
-    return 1;
-  }
-#elif defined(__linux__)
-  char path[PATH_MAX];
-  ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1);
-  if (len != -1) {
-    path[len] = '\0';
-    char *dir = dirname(path);
-    configPath = std::string(dir) + "/config/config.json";
-  } else {
-    LOG_ERROR << "Failed to get binary location!";
-    return 1;
-  }
-#elif defined(_WIN32)
-  char path[MAX_PATH];
-  char dir[MAX_PATH];
-  // char dir[MAX_PATH];
-  if(GetModuleFileNameA(NULL, path, sizeof(path))) {
-    char* lastBackslash = strrchr(path, '\\');
-    if (lastBackslash == nullptr) {
-      return 1;
-    }
-    lastBackslash[0] = '\0';
-    strcpy(dir, path);
-    configPath = std::string(dir) + "/config/config.json";
-  }
-  else {
-    LOG_ERROR << "Failed to get binary location!";
-    return 1;
+  std::string host = "127.0.0.1";
+  int port = 3928;
+
+  // Check for host argument
+  if (argc > 1) {
+    host = argv[1];
   }
-#else
-  LOG_ERROR << "Unsupported platform!";
-  return 1;
-#endif
 
-  // Set HTTP listener address and port
-  drogon::app().loadConfigFile(configPath);
-  auto app_conf = drogon::app().getCustomConfig();
+  // Check for port argument
+  if (argc > 2) {
+    port = std::atoi(argv[2]); // Convert string argument to int
+  }
 
-  LOG_INFO << app_conf["llama_model_file"].asString();
-  // drogon::app().addListener("0.0.0.0", 8080);
+  nitro_utils::nitro_logo();
+  LOG_INFO << "Server started, listening at: " << host << ":" << port;
+  LOG_INFO << "Please load your model";
+  drogon::app().addListener(host, port);
   drogon::app().run();
 
   return 0;
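
In short, the listener is now configured from the command line instead of the config file: with no arguments the server falls back to 127.0.0.1:3928, while an invocation such as ./nitro 0.0.0.0 8080 (binary name assumed) would listen on all interfaces at port 8080. The model itself is then loaded at runtime through the loadmodel endpoint shown above.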
