This repository was archived by the owner on Jul 4, 2025. It is now read-only.
Merged
Changes from 1 commit
3 changes: 2 additions & 1 deletion engine/cli/main.cc
@@ -8,6 +8,7 @@
 #include "utils/file_manager_utils.h"
 #include "utils/logging_utils.h"
 #include "utils/system_info_utils.h"
+#include "utils/widechar_conv.h"

 #if defined(__APPLE__) && defined(__MACH__)
 #include <libgen.h>  // for dirname()
@@ -46,7 +47,7 @@ void SetupLogger(trantor::FileLogger& async_logger, bool verbose) {

   std::filesystem::create_directories(
 #if defined(_WIN32)
-      std::filesystem::u8path(config.logFolderPath) /
+      std::filesystem::path(cortex::wc::Utf8ToWstring(config.logFolderPath)) /
 #else
       std::filesystem::path(config.logFolderPath) /
 #endif
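Note: std::filesystem::u8path is deprecated in C++20, and on Windows constructing the path from a wide string is the reliable way to preserve non-ASCII log-folder paths. The same substitution appears in engine/main.cc below. A minimal sketch of what a helper like cortex::wc::Utf8ToWstring presumably does (the real implementation lives in utils/widechar_conv.h; this stand-in assumes the Win32 MultiByteToWideChar API):

#include <string>
#include <windows.h>

// Hypothetical stand-in for cortex::wc::Utf8ToWstring.
std::wstring Utf8ToWstring(const std::string& utf8) {
  if (utf8.empty()) return std::wstring();
  // First call computes the required length in wide characters.
  int len = MultiByteToWideChar(CP_UTF8, 0, utf8.data(),
                                static_cast<int>(utf8.size()), nullptr, 0);
  std::wstring out(static_cast<size_t>(len), L'\0');
  MultiByteToWideChar(CP_UTF8, 0, utf8.data(), static_cast<int>(utf8.size()),
                      &out[0], len);
  return out;
}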
3 changes: 2 additions & 1 deletion engine/common/message.h
@@ -107,7 +107,8 @@ struct Message : JsonSerializable {
         std::move(root.get("object", "thread.message").asString());
     message.created_at = root["created_at"].asUInt();
     if (message.created_at == 0 && root["created"].asUInt64() != 0) {
-      message.created_at = root["created"].asUInt64() / 1000;
+      message.created_at =
+          static_cast<uint32_t>(root["created"].asUInt64() / 1000);
     }
     message.thread_id = std::move(root["thread_id"].asString());
     message.status = StatusFromString(std::move(root["status"].asString()));
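The cast makes the millisecond-to-second narrowing explicit: "created" arrives as a 64-bit millisecond timestamp while created_at is stored in a 32-bit field, so the bare division drew a narrowing-conversion warning. Illustrative sketch, with field widths assumed from the surrounding code:

#include <cstdint>

// "created" is Unix milliseconds; created_at holds Unix seconds.
uint32_t ToCreatedAt(uint64_t created_ms) {
  return static_cast<uint32_t>(created_ms / 1000);  // intended narrowing
}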
6 changes: 3 additions & 3 deletions engine/config/model_config.h
@@ -35,8 +35,7 @@ struct RemoteModelConfig {

     // Load basic string fields
     model = json.get("model", model).asString();
-    header_template =
-        json.get("header_template", header_template).asString();
+    header_template = json.get("header_template", header_template).asString();
     engine = json.get("engine", engine).asString();
     version = json.get("version", version).asString();
     created =
@@ -405,7 +404,8 @@ struct ModelConfig {
     oss << format_utils::print_comment("END REQUIRED");
     oss << format_utils::print_comment("BEGIN OPTIONAL");

-    oss << format_utils::print_float("size", size);
+    oss << format_utils::print_kv("size", std::to_string(size),
+                                  format_utils::MAGENTA);
     oss << format_utils::print_bool("stream", stream);
     oss << format_utils::print_float("top_p", top_p);
     oss << format_utils::print_float("temperature", temperature);
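Printing size through print_float would push a large byte count through float formatting, which loses precision at that magnitude; routing it through std::to_string keeps the exact integer. A small illustration (the size value is hypothetical):

#include <cstdint>
#include <iostream>
#include <string>

int main() {
  uint64_t size = 4368439584;                     // hypothetical model size in bytes
  std::cout << static_cast<float>(size) << "\n";  // 4.36844e+09 -- precision lost
  std::cout << std::to_string(size) << "\n";      // 4368439584 -- exact
}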
2 changes: 1 addition & 1 deletion engine/main.cc
@@ -105,7 +105,7 @@ void RunServer(std::optional<std::string> host, std::optional<int> port,
   // Create logs/ folder and setup log to file
   std::filesystem::create_directories(
 #if defined(_WIN32)
-      std::filesystem::u8path(config.logFolderPath) /
+      std::filesystem::path(cortex::wc::Utf8ToWstring(config.logFolderPath)) /
 #else
       std::filesystem::path(config.logFolderPath) /
 #endif
8 changes: 4 additions & 4 deletions engine/services/model_service.cc
@@ -315,7 +315,7 @@ cpp::result<DownloadTask, std::string> ModelService::HandleDownloadUrlAsync(

   try {
     std::filesystem::create_directories(local_path.parent_path());
-  } catch (const std::filesystem::filesystem_error& e) {
+  } catch (const std::filesystem::filesystem_error&) {
     // if file exist, remove it
     std::filesystem::remove(local_path.parent_path());
     std::filesystem::create_directories(local_path.parent_path());
@@ -380,7 +380,7 @@ ModelService::EstimateModel(const std::string& model_handle,
   auto mc = yaml_handler.GetModelConfig();
   assert(hw_service_);
   auto hw_info = hw_service_->GetHardwareInfo();
-  auto free_vram_MiB = 0u;
+  int64_t free_vram_MiB = 0;
   for (const auto& gpu : hw_info.gpus) {
     free_vram_MiB += gpu.free_vram;
   }
@@ -444,7 +444,7 @@ cpp::result<std::string, std::string> ModelService::HandleUrl(

   try {
     std::filesystem::create_directories(local_path.parent_path());
-  } catch (const std::filesystem::filesystem_error& e) {
+  } catch (const std::filesystem::filesystem_error&) {
     // if file exist, remove it
     std::filesystem::remove(local_path.parent_path());
     std::filesystem::create_directories(local_path.parent_path());
@@ -1326,7 +1326,7 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
   }
   // If in GPU acceleration mode:
  // We use all visible GPUs, so only need to sum all free vram
-  auto free_vram_MiB = 0u;
+  int64_t free_vram_MiB = 0;
   for (const auto& gpu : hw_info.gpus) {
     free_vram_MiB += gpu.free_vram;
   }
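The accumulator change matters because auto free_vram_MiB = 0u deduces a 32-bit unsigned int: summing per-GPU free VRAM (which the hardware structs presumably report as a wider signed type) can then truncate or wrap, and later comparisons mix signedness. A sketch of the corrected pattern:

#include <cstdint>
#include <vector>

// Illustrative only; gpu.free_vram's real type comes from the hardware info structs.
int64_t SumFreeVramMiB(const std::vector<int64_t>& per_gpu_free_MiB) {
  int64_t total = 0;  // wide signed accumulator: no wrap, no mixed-sign compares
  for (auto v : per_gpu_free_MiB) {
    total += v;
  }
  return total;
}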
3 changes: 2 additions & 1 deletion engine/utils/command_executor.h
@@ -37,7 +37,8 @@ class CommandExecutor {
     std::array<char, 128> buffer;
     std::string result;

-    while (fgets(buffer.data(), buffer.size(), m_pipe.get()) != nullptr) {
+    while (fgets(buffer.data(), static_cast<int>(buffer.size()),
+                 m_pipe.get()) != nullptr) {
      result += buffer.data();
     }

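fgets takes its count parameter as int while std::array::size() returns size_t, so the implicit conversion drew a warning; the cast is safe for a 128-byte buffer. Self-contained version of the loop for reference:

#include <array>
#include <cstdio>
#include <string>

std::string ReadAll(FILE* pipe) {
  std::array<char, 128> buffer;
  std::string result;
  // fgets's count parameter is int, hence the explicit cast from size_t.
  while (fgets(buffer.data(), static_cast<int>(buffer.size()), pipe) != nullptr) {
    result += buffer.data();
  }
  return result;
}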
2 changes: 2 additions & 0 deletions engine/utils/cortex_utils.h
@@ -1,4 +1,6 @@
 #pragma once
+
+
 #include <drogon/HttpClient.h>
 #include <drogon/HttpResponse.h>
 #include <sys/stat.h>
2 changes: 1 addition & 1 deletion engine/utils/format_utils.h
@@ -67,7 +67,7 @@ inline std::string WriteKeyValue(const std::string& key,
         strValue.pop_back();
       }
       out_file << strValue;
-    } catch (const std::exception& e) {
+    } catch (const std::exception&) {
       out_file << value;  // If not a float, write as is
     }
   } else {
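This and the matching catch-site edits in model_service.cc and huggingface_utils.h drop the unused exception name: an anonymous catch still selects the handler by type but avoids unreferenced-variable warnings (e.g. MSVC C4101). Minimal illustration:

#include <exception>

void RunGuarded(void (*fn)()) {
  try {
    fn();
  } catch (const std::exception&) {  // type still matters; the object is never read
    // fall back to a default path
  }
}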
2 changes: 1 addition & 1 deletion engine/utils/hardware/cpu_info.h
@@ -187,7 +187,7 @@ struct CpuInfo {
       return CPU{};
     auto cpu = res[0];
     cortex::cpuid::CpuInfo inst;
-    float usage = GetCPUUsage();
+    auto usage = static_cast<float>(GetCPUUsage());
     return CPU{.cores = cpu.numPhysicalCores(),
                .arch = std::string(GetArch()),
                .model = cpu.modelName(),
22 changes: 11 additions & 11 deletions engine/utils/hardware/gguf/ggml.h
@@ -49,21 +49,21 @@ inline float GetQuantBit(GGMLType gt) {
   switch (gt) {
     case GGML_TYPE_I32:
     case GGML_TYPE_F32:
-      return 32.0;
+      return 32.0f;
     case GGML_TYPE_I16:
     case GGML_TYPE_BF16:
     case GGML_TYPE_F16:
-      return 16.0;
+      return 16.0f;
     case GGML_TYPE_IQ2_S:
     case GGML_TYPE_IQ2_XXS:
     case GGML_TYPE_IQ2_XS:
-      return 2.31;
+      return 2.31f;
     case GGML_TYPE_Q2_K:
-      return 2.5625;
+      return 2.5625f;
     case GGML_TYPE_IQ3_XXS:
     case GGML_TYPE_IQ3_S:
     case GGML_TYPE_Q3_K:
-      return 3.4375;
+      return 3.4375f;
     case GGML_TYPE_Q4_0_4_4:
     case GGML_TYPE_Q4_0_4_8:
     case GGML_TYPE_Q4_0_8_8:
@@ -72,25 +72,25 @@ inline float GetQuantBit(GGMLType gt) {
     case GGML_TYPE_Q4_0:
     case GGML_TYPE_Q4_1:
     case GGML_TYPE_Q4_K:
-      return 4.5;
+      return 4.5f;
     case GGML_TYPE_Q5_0:
     case GGML_TYPE_Q5_1:
     case GGML_TYPE_Q5_K:
-      return 5.5;
+      return 5.5f;
     case GGML_TYPE_Q6_K:
-      return 6.5625;
+      return 6.5625f;
     case GGML_TYPE_I8:
     case GGML_TYPE_Q8_0:
     case GGML_TYPE_Q8_1:
     case GGML_TYPE_Q8_K:
-      return 8.0;
+      return 8.0f;

     case GGML_TYPE_I64:
     case GGML_TYPE_F64:
-      return 64.0;
+      return 64.0f;

     default:
-      return 8.0;
+      return 8.0f;
   }
 }
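The f suffixes keep the literals as float, matching the function's float return type and silencing double-to-float truncation warnings (MSVC C4305); the fractional values are average bits per weight for the block-quantized k-quant and i-quant formats. A sketch of how such a value is presumably consumed (helper name is illustrative):

#include <cstdint>

// Estimate a tensor's storage from element count and average bits per weight.
inline uint64_t TensorBytes(uint64_t n_elements, float quant_bits) {
  return static_cast<uint64_t>(n_elements * quant_bits / 8.0f);
}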
37 changes: 19 additions & 18 deletions engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -6,7 +6,7 @@

 namespace hardware {
 inline uint64_t BytesToMiB(uint64_t b) {
-  return (double)b / 1024 / 1024;
+  return static_cast<uint64_t>((double)b / 1024 / 1024);
 };
 struct RunConfig {
   int ngl;
@@ -91,8 +91,8 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // std::cout << n_vocab << std::endl;

   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in/16 bytes
-  int32_t quant_bit_in = 0;
-  int32_t quant_bit_out = 0;
+  float quant_bit_in = 0;
+  float quant_bit_out = 0;

   for (auto const& ti : (*gf).tensor_infos) {
     if (ti->name == "output.weight") {
@@ -109,16 +109,17 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // std::cout << "n_vocab: " << n_vocab << std::endl;
   // std::cout << "file_size: " << file_size << std::endl;
   // Model weight
-  int64_t token_embeddings_size =
-      n_vocab * embedding_length * 2 * quant_bit_in / 16;
-  int64_t output_layer_size =
-      n_vocab * embedding_length * 2 * quant_bit_out / 16;
+  auto token_embeddings_size =
+      static_cast<int64_t>(n_vocab * embedding_length * 2 * quant_bit_in / 16);
+  auto output_layer_size =
+      static_cast<int64_t>(n_vocab * embedding_length * 2 * quant_bit_out / 16);
   // RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - output_layer_size) : 0 ) (bytes)
   int64_t offload = 0;
   if (total_ngl >= rc.ngl + 1) {
-    offload = output_layer_size +
-              (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) *
-                  (file_size - token_embeddings_size - output_layer_size);
+    offload = static_cast<int64_t>(
+        output_layer_size +
+        (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) *
+            (file_size - token_embeddings_size - output_layer_size));
   }

   int64_t ram_usage = token_embeddings_size + offload;
@@ -133,18 +133,18 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // KV cache
   // kv_cache_size = ctx_len/8192 * hidden_dim/4096 * quant_bit/16 * num_block/33 * 1 (GB)
   auto hidden_dim = embedding_length;
-  int kv_quant_bit =
+  auto kv_quant_bit =
       GetQuantBit(rc.kv_cache_type);  // f16, 8 bits for q8_0, 4.5 bits for q4_0
-  int64_t kv_cache_size = (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 *
-                          hidden_dim / 4096 * kv_quant_bit / 16 * num_block /
-                          33;  //(bytes)
+  auto kv_cache_size = static_cast<int64_t>(
+      (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 * hidden_dim / 4096 *
+      kv_quant_bit / 16 * num_block / 33);  //(bytes)

   // std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl;

   // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB)
-  int64_t preprocessing_buffer_size =
+  auto preprocessing_buffer_size = static_cast<int64_t>(
       (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 *
-      n_vocab / 128256 /*llama3 n_vocab*/;  //(bytes)
+      n_vocab / 128256 /*llama3 n_vocab*/);  //(bytes)
   if (total_ngl != rc.ngl) {
     preprocessing_buffer_size += output_layer_size;
   }
@@ -173,8 +174,8 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) {
     res.gpu_mode.recommend_ngl = total_ngl;
   } else {
-    res.gpu_mode.recommend_ngl =
-        (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl;
+    res.gpu_mode.recommend_ngl = static_cast<int>(
+        (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl);
   }
 #if defined(__APPLE__) && defined(__MACH__)
   res.cpu_mode.ram_MiB = res.gpu_mode.vram_MiB + res.gpu_mode.ram_MiB;
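These casts make each float/double heuristic's landing in an integer byte count explicit instead of implicit. For intuition, the KV-cache formula in the comment is calibrated so that ctx_len 8192, hidden_dim 4096, an f16 cache (16 bits), and 33 blocks yield exactly 1 GiB; a worked check (values assumed for illustration):

#include <cstdint>
#include <iostream>

int main() {
  double ctx_len = 8192, hidden_dim = 4096, kv_quant_bit = 16, num_block = 33;
  auto kv_cache_size = static_cast<int64_t>(
      (double)(1024 * 1024 * 1024) * ctx_len / 8192 * hidden_dim / 4096 *
      kv_quant_bit / 16 * num_block / 33);
  std::cout << kv_cache_size / (1024 * 1024) << " MiB\n";  // prints 1024
}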
16 changes: 10 additions & 6 deletions engine/utils/hardware/gpu/vulkan/vulkan_gpu.h
@@ -433,8 +433,8 @@ class VulkanGpu {
     for (uint32_t i = 0; i < memory_properties.memoryHeapCount; ++i) {
       if (memory_properties.memoryHeaps[i].flags &
           VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
-        gpu_avail_MiB +=
-            memory_properties.memoryHeaps[i].size / (1024ull * 1024ull);
+        gpu_avail_MiB += static_cast<int>(
+            memory_properties.memoryHeaps[i].size / (1024ull * 1024ull));
       }
     }

@@ -449,8 +449,10 @@ class VulkanGpu {
       used_vram_MiB = gpus_usages[device_properties.deviceName];

 #endif
-      int free_vram_MiB =
-          total_vram_MiB > used_vram_MiB ? total_vram_MiB - used_vram_MiB : 0;
+      auto free_vram_MiB =
+          total_vram_MiB > used_vram_MiB
+              ? static_cast<int>(total_vram_MiB - used_vram_MiB)
+              : 0;
       if (device_properties.vendorID == kNvidiaVendor ||
           device_properties.vendorID == kAmdVendor) {
         gpus.emplace_back(cortex::hw::GPU{
@@ -507,8 +509,10 @@ class VulkanGpu {
       total_vram_MiB = gpus_[i].free_vram;
       used_vram_MiB = gpus_usages[gpus_[i].name];
 #endif
-      int free_vram_MiB =
-          total_vram_MiB > used_vram_MiB ? total_vram_MiB - used_vram_MiB : 0;
+      auto free_vram_MiB =
+          total_vram_MiB > used_vram_MiB
+              ? static_cast<int>(total_vram_MiB - used_vram_MiB)
+              : 0;
       gpus_[i].free_vram = free_vram_MiB;
     }

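Beyond satisfying the int-conversion warnings, the ternary guards the unsigned subtraction: if reported usage ever exceeded the heap total, total - used on unsigned operands would wrap to a huge positive value instead of going negative. Illustrative helper:

#include <cstdint>

// Clamp free VRAM at zero so unsigned subtraction can't wrap.
int FreeVramMiB(uint64_t total_MiB, uint64_t used_MiB) {
  return total_MiB > used_MiB ? static_cast<int>(total_MiB - used_MiB) : 0;
}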
4 changes: 2 additions & 2 deletions engine/utils/huggingface_utils.h
@@ -308,7 +308,7 @@ inline std::optional<std::string> GetDefaultBranch(
       return default_branch.as<std::string>();
     }
     return std::nullopt;
-  } catch (const std::exception& e) {
+  } catch (const std::exception&) {
     return std::nullopt;
   }
 }
@@ -328,7 +328,7 @@ inline std::optional<std::string> GetModelAuthorCortexsoHub(
       return author.as<std::string>();
     }
     return std::nullopt;
-  } catch (const std::exception& e) {
+  } catch (const std::exception&) {
     return std::nullopt;
   }
 }
3 changes: 2 additions & 1 deletion engine/utils/url_parser.h
@@ -153,7 +153,8 @@ inline std::string FromUrl(const Url& url) {
     } catch (const std::bad_variant_access& e) {
       // Handle the case where the variant does not match any of the expected types
       // This should not happen if the map was created correctly
-      throw std::runtime_error("Invalid variant type in queries map");
+      throw std::runtime_error(
+          std::string("Invalid variant type in queries map: ") + e.what());
     }
   }

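Appending e.what() preserves the root cause when the wrapped error surfaces far from the throw site. Minimal sketch of the pattern:

#include <stdexcept>
#include <string>
#include <variant>

std::string AsString(const std::variant<int, std::string>& v) {
  try {
    return std::get<std::string>(v);
  } catch (const std::bad_variant_access& e) {
    // Re-throw with the original diagnostic attached.
    throw std::runtime_error(
        std::string("Invalid variant type in queries map: ") + e.what());
  }
}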