This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit a7e8ca2

Merge pull request #394 from janhq/pump-version
chore: version pump and support Kompute
2 parents 07960d2 + cf9af4d commit a7e8ca2

File tree: controllers/llamaCPP.h, llama.cpp, whisper.cpp

3 files changed: +80 -59 lines changed


controllers/llamaCPP.h

Lines changed: 78 additions & 57 deletions
@@ -1919,66 +1919,87 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
          "key+value (default: disabled)\n");
   printf(" not recommended: doubles context memory "
          "required and no measurable increase in quality\n");
-  if (llama_mlock_supported()) {
-    printf(" --mlock force system to keep model in RAM rather "
-           "than swapping or compressing\n");
+  if (llama_supports_mlock()) {
+    printf(" --mlock force system to keep model in RAM "
+           "rather than swapping or compressing\n");
   }
-  if (llama_mmap_supported()) {
-    printf(" --no-mmap do not memory-map model (slower load but "
-           "may reduce pageouts if not using mlock)\n");
+  if (llama_supports_mmap()) {
+    printf(" --no-mmap do not memory-map model (slower load "
+           "but may reduce pageouts if not using mlock)\n");
+  }
+  printf(" --numa attempt optimizations that help on some "
+         "NUMA systems\n");
+  if (llama_supports_gpu_offload()) {
+    printf(" -ngl N, --n-gpu-layers N\n");
+    printf(" number of layers to store in VRAM\n");
+    printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+    printf(" how to split the model across multiple "
+           "GPUs, one of:\n");
+    printf(" - none: use one GPU only\n");
+    printf(" - layer (default): split layers and "
+           "KV across GPUs\n");
+    printf(" - row: split rows across GPUs\n");
+    printf(" -ts SPLIT --tensor-split SPLIT\n");
+    printf(" fraction of the model to offload to "
+           "each GPU, comma-separated list of proportions, e.g. 3,1\n");
+    printf(" -mg i, --main-gpu i the GPU to use for the model (with "
+           "split-mode = none),\n");
+    printf(" or for intermediate results and KV "
+           "(with split-mode = row)\n");
   }
-  printf(" --numa attempt optimizations that help on some NUMA "
-         "systems\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-  printf(" -ngl N, --n-gpu-layers N\n");
-  printf(" number of layers to store in VRAM\n");
-  printf(" -ts SPLIT --tensor-split SPLIT\n");
-  printf(" how to split tensors across multiple GPUs, "
-         "comma-separated list of proportions, e.g. 3,1\n");
-  printf(
-      " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
-  printf(" -nommq, --no-mul-mat-q\n");
-  printf(" use cuBLAS instead of custom mul_mat_q CUDA "
-         "kernels.\n");
-  printf(" Not recommended since this is both slower "
-         "and uses more VRAM.\n");
-#endif
   printf(" -m FNAME, --model FNAME\n");
-  printf(" model path (default: %s)\n",
+  printf(" model path (default: %s)\n",
          params.model.c_str());
   printf(" -a ALIAS, --alias ALIAS\n");
-  printf(" set an alias for the model, will be added as "
-         "`model` field in completion response\n");
-  printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
-  printf(" --lora-base FNAME optional model to use as a base for the "
-         "layers modified by the LoRA adapter\n");
+  printf(" set an alias for the model, will be "
+         "added as `model` field in completion response\n");
   printf(
-      " --host ip address to listen (default (default: %s)\n",
-      sparams.hostname.c_str());
-  printf(" --port PORT port to listen (default (default: %d)\n",
+      " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+  printf(" --lora-base FNAME optional model to use as a base for the "
+         "layers modified by the LoRA adapter\n");
+  printf(" --host ip address to listen (default (default: "
+         "%s)\n",
+         sparams.hostname.c_str());
+  printf(" --port PORT port to listen (default (default: %d)\n",
          sparams.port);
-  printf(" --path PUBLIC_PATH path from which to serve static files "
+  printf(" --path PUBLIC_PATH path from which to serve static files "
          "(default %s)\n",
          sparams.public_path.c_str());
-  printf(" --api-key API_KEY optional api key to enhance server security. "
-         "If set, requests must include this key for access.\n");
-  printf(" -to N, --timeout N server read/write timeout in seconds "
+  printf(" --api-key API_KEY optional api key to enhance server "
+         "security. If set, requests must include this key for access.\n");
+  printf(" --api-key-file FNAME path to file containing api keys "
+         "delimited by new lines. If set, requests must include one of the "
+         "keys for access.\n");
+  printf(" -to N, --timeout N server read/write timeout in seconds "
          "(default: %d)\n",
          sparams.read_timeout);
-  printf(
-      " --embedding enable embedding vector output (default: %s)\n",
-      params.embedding ? "enabled" : "disabled");
-  printf(" -np N, --parallel N number of slots for process requests "
+  printf(" --embedding enable embedding vector output (default: "
+         "%s)\n",
+         params.embedding ? "enabled" : "disabled");
+  printf(" -np N, --parallel N number of slots for process requests "
          "(default: %d)\n",
          params.n_parallel);
-  printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic "
-         "batching) (default: disabled)\n");
-  printf(" -spf FNAME, --system-prompt-file FNAME\n");
-  printf(" Set a file to load a system prompt (initial "
-         "prompt of all slots), this is useful for chat applications.\n");
-  printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for "
+  printf(" -cb, --cont-batching enable continuous batching (a.k.a "
+         "dynamic batching) (default: disabled)\n");
+  printf(" -spf FNAME, --system-prompt-file FNAME\n");
+  printf(
+      " set a file to load a system prompt (initial "
+      "prompt of all slots), this is useful for chat applications.\n");
+  printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for "
          "LLaVA.\n");
-  printf(" --log-disable disables logging to a file.\n");
+  printf(" --log-disable disables logging to a file.\n");
+  printf("\n");
+  printf(" --override-kv KEY=TYPE:VALUE\n");
+  printf(" advanced option to override model "
+         "metadata by key. may be specified multiple times.\n");
+  printf(" types: int, float, bool. example: "
+         "--override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+  printf(" -gan N, --grp-attn-n N set the group attention factor to extend "
+         "context size through self-extend(default: 1=disabled), used together "
+         "with group attention width `--grp-attn-w`");
+  printf(" -gaw N, --grp-attn-w N set the group attention width to extend "
+         "context size through self-extend(default: 512), used together with "
+         "group attention factor `--grp-attn-n`");
   printf("\n");
 }

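The hunk above gates GPU-related usage text on llama.cpp's runtime capability queries (llama_supports_mlock, llama_supports_mmap, llama_supports_gpu_offload) instead of the older llama_*_supported functions and the LLAMA_SUPPORTS_GPU_OFFLOAD macro. A minimal standalone sketch of that pattern follows; print_capability_usage is a hypothetical helper, not part of the commit, and the option text is abbreviated.

// Sketch only: gate help output on runtime capability queries from llama.h.
#include <cstdio>
#include "llama.h"

static void print_capability_usage() {
  if (llama_supports_mlock()) {
    printf("  --mlock      keep the model in RAM (no swapping)\n");
  }
  if (llama_supports_mmap()) {
    printf("  --no-mmap    do not memory-map the model\n");
  }
  if (llama_supports_gpu_offload()) {
    printf("  -ngl N       number of layers to store in VRAM\n");
    printf("  -sm MODE     split mode: none | layer | row\n");
  }
}
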
@@ -2121,15 +2142,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         invalid_param = true;
         break;
       }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-      params.n_gpu_layers = std::stoi(argv[i]);
-#else
-      LOG_WARNING_LLAMA(
-          "Not compiled with GPU offload support, --n-gpu-layers option will "
-          "be ignored. "
-          "See main README.md for information on enabling GPU BLAS support",
-          {{"n_gpu_layers", params.n_gpu_layers}});
-#endif
+      if (llama_supports_gpu_offload()) {
+        params.n_gpu_layers = std::stoi(argv[i]);
+      } else {
+        LOG_WARNING_LLAMA(
+            "Not compiled with GPU offload support, --n-gpu-layers option will "
+            "be ignored. "
+            "See main README.md for information on enabling GPU BLAS support",
+            {{"n_gpu_layers", params.n_gpu_layers}});
+      }
     } else if (arg == "--tensor-split" || arg == "-ts") {
       if (++i >= argc) {
         invalid_param = true;
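The same runtime query replaces the preprocessor guard when parsing -ngl/--n-gpu-layers: the value is applied only if the linked llama.cpp build can offload, otherwise a warning is logged and the option is ignored. A hedged sketch of that control flow, with plain stderr output standing in for the project's LOG_WARNING_LLAMA macro:

// Sketch only: honour --n-gpu-layers just when GPU offload is available.
#include <cstdio>
#include <string>
#include "llama.h"

static void apply_n_gpu_layers(int &n_gpu_layers, const std::string &value) {
  if (llama_supports_gpu_offload()) {
    n_gpu_layers = std::stoi(value);
  } else {
    // The commit uses LOG_WARNING_LLAMA here; stderr stands in for it.
    fprintf(stderr,
            "warning: not compiled with GPU offload support, "
            "--n-gpu-layers will be ignored\n");
  }
}
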
@@ -2143,9 +2164,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
       std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex,
                                     -1};
       std::vector<std::string> split_arg{it, {}};
-      GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+      GGML_ASSERT(split_arg.size() <= llama_max_devices());

-      for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+      for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
         if (i_device < split_arg.size()) {
           params.tensor_split[i_device] = std::stof(split_arg[i_device]);
         } else {
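LLAMA_MAX_DEVICES is no longer available as a compile-time constant, so the loop over --tensor-split proportions is now sized with llama_max_devices() at runtime. A sketch of equivalent parsing; the separator regex and the parse_tensor_split helper are assumptions for illustration, and the caller is expected to provide an array with llama_max_devices() entries.

// Sketch only: spread "-ts 3,1" proportions over llama_max_devices() slots.
#include <regex>
#include <string>
#include <vector>
#include "llama.h"

static void parse_tensor_split(const std::string &arg_next, float *tensor_split) {
  const std::regex regex{R"([,/ ]+)"};  // assumed separator set
  std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
  std::vector<std::string> split_arg{it, {}};
  for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
    tensor_split[i_device] =
        i_device < split_arg.size() ? std::stof(split_arg[i_device]) : 0.0f;
  }
}
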

llama.cpp

whisper.cpp
