| 1 | +#include <drogon/HttpTypes.h> |
1 | 2 | #if defined(_WIN32) |
2 | 3 | #define NOMINMAX |
3 | 4 | #endif |
@@ -1311,51 +1312,55 @@ namespace inferences { |
1311 | 1312 | class llamaCPP : public drogon::HttpController<llamaCPP> { |
1312 | 1313 | public: |
1313 | 1314 | llamaCPP() { |
1314 | | - gpt_params params; |
1315 | | - auto conf = drogon::app().getCustomConfig(); |
1316 | | - params.model = conf["llama_model_path"].asString(); |
1317 | | - params.n_gpu_layers = conf["ngl"].asInt(); |
1318 | | - params.n_ctx = conf["ctx_len"].asInt(); |
1319 | | - params.embedding = conf["embedding"].asBool(); |
1320 | | -#ifdef GGML_USE_CUBLAS |
1321 | | - LOG_INFO << "Setting up GGML CUBLAS PARAMS"; |
1322 | | - params.mul_mat_q = false; |
1323 | | -#endif // GGML_USE_CUBLAS |
1324 | | - if (params.model_alias == "unknown") { |
1325 | | - params.model_alias = params.model; |
1326 | | - } |
1327 | | - |
1328 | | - llama_backend_init(params.numa); |
1329 | | - |
1330 | | - LOG_INFO_LLAMA("build info", |
1331 | | - {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); |
1332 | | - LOG_INFO_LLAMA("system info", |
1333 | | - { |
1334 | | - {"n_threads", params.n_threads}, |
1335 | | - {"total_threads", std::thread::hardware_concurrency()}, |
1336 | | - {"system_info", llama_print_system_info()}, |
1337 | | - }); |
1338 | | - |
1339 | | - // load the model |
1340 | | - if (!llama.loadModel(params)) { |
1341 | | - LOG_ERROR << "Error loading the model will exit the program"; |
1342 | | - std::terminate(); |
1343 | | - } |
1344 | | - nitro_utils::nitro_logo(); |
| 1315 | +// gpt_params params; |
| 1316 | +// auto conf = drogon::app().getCustomConfig(); |
| 1317 | +// params.model = conf["llama_model_path"].asString(); |
| 1318 | +// params.n_gpu_layers = conf["ngl"].asInt(); |
| 1319 | +// params.n_ctx = conf["ctx_len"].asInt(); |
| 1320 | +// params.embedding = conf["embedding"].asBool(); |
| 1321 | +//#ifdef GGML_USE_CUBLAS |
| 1322 | +// LOG_INFO << "Setting up GGML CUBLAS PARAMS"; |
| 1323 | +// params.mul_mat_q = false; |
| 1324 | +//#endif // GGML_USE_CUBLAS |
| 1325 | +// if (params.model_alias == "unknown") { |
| 1326 | +// params.model_alias = params.model; |
| 1327 | +// } |
| 1328 | +// |
| 1329 | +// llama_backend_init(params.numa); |
| 1330 | +// |
| 1331 | +// LOG_INFO_LLAMA("build info", |
| 1332 | +// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); |
| 1333 | +// LOG_INFO_LLAMA("system info", |
| 1334 | +// { |
| 1335 | +// {"n_threads", params.n_threads}, |
| 1336 | +// {"total_threads", std::thread::hardware_concurrency()}, |
| 1337 | +// {"system_info", llama_print_system_info()}, |
| 1338 | +// }); |
| 1339 | +// |
| 1340 | +// // load the model |
| 1341 | +// if (!llama.loadModel(params)) { |
| 1342 | +// LOG_ERROR << "Error loading the model will exit the program"; |
| 1343 | +// std::terminate(); |
| 1344 | +// } |
1345 | 1345 | } |
1346 | 1346 | METHOD_LIST_BEGIN |
1347 | 1347 | // list path definitions here; |
1348 | | - METHOD_ADD(llamaCPP::chatCompletion, "chat_completion"); |
1349 | | - METHOD_ADD(llamaCPP::embedding,"embedding"); |
| 1348 | + METHOD_ADD(llamaCPP::chatCompletion, "chat_completion",Post); |
| 1349 | + METHOD_ADD(llamaCPP::embedding,"embedding",Post); |
| 1350 | + METHOD_ADD(llamaCPP::loadModel,"loadmodel",Post); |
1350 | 1351 | // PATH_ADD("/llama/chat_completion", Post); |
1351 | 1352 | METHOD_LIST_END |
1352 | 1353 | void chatCompletion(const HttpRequestPtr &req, |
1353 | 1354 | std::function<void(const HttpResponsePtr &)> &&callback); |
1354 | 1355 | void embedding(const HttpRequestPtr &req, |
1355 | 1356 | std::function<void(const HttpResponsePtr &)> &&callback); |
| 1357 | + void loadModel(const HttpRequestPtr &req, |
| 1358 | + std::function<void(const HttpResponsePtr &)> &&callback); |
| 1359 | + |
1356 | 1360 |
|
1357 | 1361 | private: |
1358 | 1362 | llama_server_context llama; |
| 1363 | + bool model_loaded = false; |
1359 | 1364 | size_t sent_count = 0; |
1360 | 1365 | size_t sent_token_probs_index = 0; |
1361 | 1366 | }; |
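
The hunk adds only the `loadmodel` route, the `loadModel` declaration, and the `model_loaded` flag; the handler's definition falls outside this diff. Below is a minimal sketch of what such a handler could look like, assuming the POST body carries the same keys the constructor previously read from the Drogon custom config (`llama_model_path`, `ngl`, `ctx_len`, `embedding`) and that the result is reported as a small JSON message. The response fields and the fallback defaults (100 GPU layers, 2048-token context) are illustrative, not taken from the commit.

```cpp
// Hypothetical sketch, not the committed definition.
void llamaCPP::loadModel(
    const HttpRequestPtr &req,
    std::function<void(const HttpResponsePtr &)> &&callback) {
  const auto jsonBody = req->getJsonObject(); // null if the body is not JSON

  gpt_params params;
  if (jsonBody) {
    // Assumed request keys, mirroring the config keys the constructor used.
    params.model = (*jsonBody)["llama_model_path"].asString();
    params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
    params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
    params.embedding = jsonBody->get("embedding", true).asBool();
  }
#ifdef GGML_USE_CUBLAS
  LOG_INFO << "Setting up GGML CUBLAS PARAMS";
  params.mul_mat_q = false;
#endif // GGML_USE_CUBLAS
  if (params.model_alias == "unknown") {
    params.model_alias = params.model;
  }

  llama_backend_init(params.numa);

  Json::Value jsonResp;
  if (llama.loadModel(params)) {
    model_loaded = true; // other handlers can now check this flag
    jsonResp["message"] = "Model loaded successfully";
    callback(HttpResponse::newHttpJsonResponse(jsonResp));
  } else {
    LOG_ERROR << "Error loading the model";
    jsonResp["message"] = "Failed to load model";
    auto resp = HttpResponse::newHttpJsonResponse(jsonResp);
    resp->setStatusCode(k500InternalServerError);
    callback(resp);
  }
}
```

The point of the change is that model weights are no longer loaded in the controller constructor at server startup; loading is deferred to an explicit POST, and `model_loaded` lets the other endpoints refuse work until that call has succeeded.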
|