@@ -1919,66 +1919,87 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
          "key+value (default: disabled)\n");
   printf("                        not recommended: doubles context memory "
          "required and no measurable increase in quality\n");
-  if (llama_mlock_supported()) {
-    printf("  --mlock               force system to keep model in RAM rather "
-           "than swapping or compressing\n");
+  if (llama_supports_mlock()) {
+    printf("  --mlock               force system to keep model in RAM "
+           "rather than swapping or compressing\n");
   }
-  if (llama_mmap_supported()) {
-    printf("  --no-mmap             do not memory-map model (slower load but "
-           "may reduce pageouts if not using mlock)\n");
+  if (llama_supports_mmap()) {
+    printf("  --no-mmap             do not memory-map model (slower load "
+           "but may reduce pageouts if not using mlock)\n");
+  }
+  printf("  --numa                attempt optimizations that help on some "
+         "NUMA systems\n");
+  if (llama_supports_gpu_offload()) {
+    printf("  -ngl N, --n-gpu-layers N\n");
+    printf("                        number of layers to store in VRAM\n");
+    printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+    printf("                        how to split the model across multiple "
+           "GPUs, one of:\n");
+    printf("                          - none: use one GPU only\n");
+    printf("                          - layer (default): split layers and "
+           "KV across GPUs\n");
+    printf("                          - row: split rows across GPUs\n");
+    printf("  -ts SPLIT --tensor-split SPLIT\n");
+    printf("                        fraction of the model to offload to "
+           "each GPU, comma-separated list of proportions, e.g. 3,1\n");
+    printf("  -mg i, --main-gpu i   the GPU to use for the model (with "
+           "split-mode = none),\n");
+    printf("                        or for intermediate results and KV "
+           "(with split-mode = row)\n");
   }
-  printf("  --numa                attempt optimizations that help on some NUMA "
-         "systems\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-  printf("  -ngl N, --n-gpu-layers N\n");
-  printf("                        number of layers to store in VRAM\n");
-  printf("  -ts SPLIT --tensor-split SPLIT\n");
-  printf("                        how to split tensors across multiple GPUs, "
-         "comma-separated list of proportions, e.g. 3,1\n");
-  printf(
-      "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-  printf("  -nommq, --no-mul-mat-q\n");
-  printf("                        use cuBLAS instead of custom mul_mat_q CUDA "
-         "kernels.\n");
-  printf("                        Not recommended since this is both slower "
-         "and uses more VRAM.\n");
-#endif
   printf("  -m FNAME, --model FNAME\n");
-  printf("                        model path (default: %s)\n",
+  printf("                        model path (default: %s)\n",
          params.model.c_str());
   printf("  -a ALIAS, --alias ALIAS\n");
-  printf("                        set an alias for the model, will be added as "
-         "`model` field in completion response\n");
-  printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-  printf("  --lora-base FNAME     optional model to use as a base for the "
-         "layers modified by the LoRA adapter\n");
+  printf("                        set an alias for the model, will be "
+         "added as `model` field in completion response\n");
   printf(
-      "  --host                ip address to listen (default: %s)\n",
-      sparams.hostname.c_str());
-  printf("  --port PORT           port to listen (default: %d)\n",
+      "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+  printf("  --lora-base FNAME     optional model to use as a base for the "
+         "layers modified by the LoRA adapter\n");
+  printf("  --host                ip address to listen (default: "
+         "%s)\n",
+         sparams.hostname.c_str());
+  printf("  --port PORT           port to listen (default: %d)\n",
          sparams.port);
-  printf("  --path PUBLIC_PATH    path from which to serve static files "
+  printf("  --path PUBLIC_PATH    path from which to serve static files "
          "(default %s)\n",
          sparams.public_path.c_str());
-  printf("  --api-key API_KEY     optional api key to enhance server security. "
-         "If set, requests must include this key for access.\n");
-  printf("  -to N, --timeout N    server read/write timeout in seconds "
+  printf("  --api-key API_KEY     optional api key to enhance server "
+         "security. If set, requests must include this key for access.\n");
+  printf("  --api-key-file FNAME  path to file containing api keys "
+         "delimited by new lines. If set, requests must include one of the "
+         "keys for access.\n");
+  printf("  -to N, --timeout N    server read/write timeout in seconds "
          "(default: %d)\n",
          sparams.read_timeout);
-  printf(
-      "  --embedding           enable embedding vector output (default: %s)\n",
-      params.embedding ? "enabled" : "disabled");
-  printf("  -np N, --parallel N   number of slots for process requests "
+  printf("  --embedding           enable embedding vector output (default: "
+         "%s)\n",
+         params.embedding ? "enabled" : "disabled");
+  printf("  -np N, --parallel N   number of slots for process requests "
          "(default: %d)\n",
          params.n_parallel);
-  printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic "
-         "batching) (default: disabled)\n");
-  printf("  -spf FNAME, --system-prompt-file FNAME\n");
-  printf("                        Set a file to load a system prompt (initial "
-         "prompt of all slots), this is useful for chat applications.\n");
-  printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for "
+  printf("  -cb, --cont-batching  enable continuous batching (a.k.a "
+         "dynamic batching) (default: disabled)\n");
+  printf("  -spf FNAME, --system-prompt-file FNAME\n");
+  printf(
+      "                        set a file to load a system prompt (initial "
+      "prompt of all slots), this is useful for chat applications.\n");
+  printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for "
          "LLaVA.\n");
-  printf("  --log-disable         disables logging to a file.\n");
+  printf("  --log-disable         disables logging to a file.\n");
+  printf("\n");
+  printf("  --override-kv KEY=TYPE:VALUE\n");
+  printf("                        advanced option to override model "
+         "metadata by key. may be specified multiple times.\n");
+  printf("                        types: int, float, bool. example: "
+         "--override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+  printf("  -gan N, --grp-attn-n N set the group attention factor to extend "
+         "context size through self-extend (default: 1=disabled), used "
+         "together with group attention width `--grp-attn-w`\n");
+  printf("  -gaw N, --grp-attn-w N set the group attention width to extend "
+         "context size through self-extend (default: 512), used together "
+         "with group attention factor `--grp-attn-n`\n");
   printf("\n");
 }
 
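Aside from the re-wrapped help text, the substantive change in this hunk is the move from compile-time feature detection (the LLAMA_SUPPORTS_GPU_OFFLOAD macro, llama_mlock_supported(), llama_mmap_supported()) to the runtime llama_supports_*() queries. A minimal standalone sketch of the replacement API, not part of the patch, assuming a llama.h recent enough to declare these functions:

    // probe.cpp -- sketch of the runtime capability checks used above;
    // build by linking against libllama.
    #include <cstdio>
    #include "llama.h"

    int main() {
      // Each call reports how this libllama build was compiled, so callers
      // can branch at runtime instead of behind #ifdef blocks.
      std::printf("mmap:        %s\n", llama_supports_mmap() ? "yes" : "no");
      std::printf("mlock:       %s\n", llama_supports_mlock() ? "yes" : "no");
      std::printf("gpu offload: %s\n", llama_supports_gpu_offload() ? "yes" : "no");
      std::printf("max devices: %zu\n", llama_max_devices());
      return 0;
    }

One practical consequence: the GPU-related usage lines are now printed or suppressed by the same server binary depending on how libllama was built, rather than being compiled out of the server itself.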
@@ -2121,15 +2142,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         invalid_param = true;
         break;
       }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-      params.n_gpu_layers = std::stoi(argv[i]);
-#else
-      LOG_WARNING_LLAMA(
-          "Not compiled with GPU offload support, --n-gpu-layers option will "
-          "be ignored. "
-          "See main README.md for information on enabling GPU BLAS support",
-          {{"n_gpu_layers", params.n_gpu_layers}});
-#endif
+      if (llama_supports_gpu_offload()) {
+        params.n_gpu_layers = std::stoi(argv[i]);
+      } else {
+        LOG_WARNING_LLAMA(
+            "Not compiled with GPU offload support, --n-gpu-layers option will "
+            "be ignored. "
+            "See main README.md for information on enabling GPU BLAS support",
+            {{"n_gpu_layers", params.n_gpu_layers}});
+      }
     } else if (arg == "--tensor-split" || arg == "-ts") {
       if (++i >= argc) {
         invalid_param = true;
@@ -2143,9 +2164,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
       std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex,
                                     -1};
       std::vector<std::string> split_arg{it, {}};
-      GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+      GGML_ASSERT(split_arg.size() <= llama_max_devices());
 
-      for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+      for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
         if (i_device < split_arg.size()) {
           params.tensor_split[i_device] = std::stof(split_arg[i_device]);
         } else {
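LLAMA_MAX_DEVICES used to be a compile-time constant bounding the tensor-split array; llama_max_devices() moves that bound into the library. A sketch of the same "-ts 3,1" parsing as a standalone helper, where parse_tensor_split is a hypothetical name introduced here for illustration:

    // tensor_split.cpp -- hypothetical standalone version of the parsing
    // loop shown in the hunk above; link against libllama.
    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <vector>
    #include "llama.h"

    // Split a comma-separated proportion list into one float per device,
    // leaving devices beyond the list at 0.0f (mirroring the else branch
    // above) and ignoring entries beyond llama_max_devices().
    static std::vector<float> parse_tensor_split(const std::string &arg) {
      std::vector<float> split(llama_max_devices(), 0.0f);
      std::stringstream ss(arg);
      std::string item;
      size_t i = 0;
      while (i < split.size() && std::getline(ss, item, ',')) {
        split[i++] = std::stof(item);
      }
      return split;
    }

    int main() {
      for (float f : parse_tensor_split("3,1")) {
        std::printf("%.1f ", f); // e.g. "3.0 1.0 0.0 ..." for 3+ devices
      }
      std::printf("\n");
      return 0;
    }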