This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 397b3cd

direct fix
1 parent 0ed5e95 commit 397b3cd

File tree

1 file changed: 6 additions, 309 deletions


controllers/llamaCPP.h

Lines changed: 6 additions & 309 deletions
@@ -16,6 +16,7 @@
 #include "clip.h"
 #include "common.h"
 #include "llama.h"
+#include "llava.h"
 
 #include "stb_image.h"
 
@@ -1065,39 +1066,24 @@ struct llama_server_context {
 
     return slot.has_next_token; // continue
   }
-
   bool process_images(llama_client_slot &slot) const {
     for (slot_image &img : slot.images) {
       if (!img.request_encode_image) {
         continue;
       }
-      clip_image_f32 *img_res = clip_image_f32_init();
-      if (!clip_image_preprocess(clp_ctx, img.img_data, img_res,
-                                 /*pad2square =*/true)) {
+
+      if (!llava_image_embed_make_with_clip_img(
+              clp_ctx, params.n_threads, img.img_data, &img.image_embedding,
+              &img.image_tokens)) {
         LOG_TEE("Error processing the given image");
-        clip_free(clp_ctx);
-        return false;
-      }
-      img.image_tokens = clip_n_patches(clp_ctx);
-      img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
-      if (!img.image_embedding) {
-        LOG_TEE("Unable to allocate memory for image embeddings\n");
-        clip_free(clp_ctx);
         return false;
       }
-      LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-      if (!clip_image_encode(clp_ctx, params.n_threads, img_res,
-                             img.image_embedding)) {
-        LOG_TEE("Unable to encode image\n");
-        return false;
-      }
-      clip_image_f32_free(img_res);
+
       img.request_encode_image = false;
     }
 
     return slot.images.size() > 0;
   }
-
   void send_error(task_server &task, std::string error) {
     std::unique_lock<std::mutex> lock(mutex_results);
     task_result res;
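
Note on the hunk above: the manual CLIP pipeline (clip_image_f32_init, clip_image_preprocess, a malloc of clip_embd_nbytes, clip_image_encode, clip_image_f32_free) is collapsed into a single call to llava_image_embed_make_with_clip_img, declared by the llava.h include added in the first hunk. The following is a minimal standalone sketch of that call pattern, not code from this commit; it assumes the upstream llama.cpp llava API (the helper preprocesses and encodes in one step, heap-allocates the embedding, and reports the number of image tokens through the out-parameters), and the wrapper name encode_one_image is hypothetical.

    #include "clip.h"
    #include "llava.h"

    #include <cstdio>

    // Hypothetical wrapper mirroring the new process_images body: one call
    // replaces preprocess + allocate + encode. On success, *embedding points
    // to a buffer allocated by the helper (the owning slot_image frees it
    // later) and *n_image_tokens holds the number of embedding positions the
    // image will occupy in the context.
    static bool encode_one_image(clip_ctx *clp_ctx, int n_threads,
                                 const clip_image_u8 *img_data,
                                 float **embedding, int *n_image_tokens) {
      if (!llava_image_embed_make_with_clip_img(clp_ctx, n_threads, img_data,
                                                embedding, n_image_tokens)) {
        fprintf(stderr, "Error processing the given image\n");
        return false;
      }
      return true;
    }

One visible behavioral difference, grounded in the diff itself: the old error path called clip_free(clp_ctx) before returning, while the new path only logs and returns false.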
@@ -2004,295 +1990,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
          "group attention factor `--grp-attn-n`");
   printf("\n");
 }
-
-static void server_params_parse(int argc, char **argv, server_params &sparams,
-                                gpt_params &params,
-                                llama_server_context &llama) {
-  gpt_params default_params;
-  server_params default_sparams;
-  std::string arg;
-  bool invalid_param = false;
-
-  for (int i = 1; i < argc; i++) {
-    arg = argv[i];
-    if (arg == "--port") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.port = std::stoi(argv[i]);
-    } else if (arg == "--host") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.hostname = argv[i];
-    } else if (arg == "--path") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.public_path = argv[i];
-    } else if (arg == "--api-key") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.api_key = argv[i];
-    } else if (arg == "--timeout" || arg == "-to") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.read_timeout = std::stoi(argv[i]);
-      sparams.write_timeout = std::stoi(argv[i]);
-    } else if (arg == "-m" || arg == "--model") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.model = argv[i];
-    } else if (arg == "-a" || arg == "--alias") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.model_alias = argv[i];
-    } else if (arg == "-h" || arg == "--help") {
-      server_print_usage(argv[0], default_params, default_sparams);
-      exit(0);
-    } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_ctx = std::stoi(argv[i]);
-    } else if (arg == "--rope-scaling") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      std::string value(argv[i]);
-      /**/ if (value == "none") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE;
-      } else if (value == "linear") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
-      } else if (value == "yarn") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;
-      } else {
-        invalid_param = true;
-        break;
-      }
-    } else if (arg == "--rope-freq-base") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.rope_freq_base = std::stof(argv[i]);
-    } else if (arg == "--rope-freq-scale") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.rope_freq_scale = std::stof(argv[i]);
-    } else if (arg == "--yarn-ext-factor") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_ext_factor = std::stof(argv[i]);
-    } else if (arg == "--yarn-attn-factor") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_attn_factor = std::stof(argv[i]);
-    } else if (arg == "--yarn-beta-fast") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_beta_fast = std::stof(argv[i]);
-    } else if (arg == "--yarn-beta-slow") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_beta_slow = std::stof(argv[i]);
-    } else if (arg == "--threads" || arg == "-t") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_threads = std::stoi(argv[i]);
-    } else if (arg == "--threads-batch" || arg == "-tb") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_threads_batch = std::stoi(argv[i]);
-    } else if (arg == "-b" || arg == "--batch-size") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_batch = std::stoi(argv[i]);
-      params.n_batch = std::min(512, params.n_batch);
-    } else if (arg == "--gpu-layers" || arg == "-ngl" ||
-               arg == "--n-gpu-layers") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      if (llama_supports_gpu_offload()) {
-        params.n_gpu_layers = std::stoi(argv[i]);
-      } else {
-        LOG_WARNING_LLAMA(
-            "Not compiled with GPU offload support, --n-gpu-layers option will "
-            "be ignored. "
-            "See main README.md for information on enabling GPU BLAS support",
-            {{"n_gpu_layers", params.n_gpu_layers}});
-      }
-    } else if (arg == "--tensor-split" || arg == "-ts") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef GGML_USE_CUBLAS
-      std::string arg_next = argv[i];
-
-      // split string by , and /
-      const std::regex regex{R"([,/]+)"};
-      std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex,
-                                    -1};
-      std::vector<std::string> split_arg{it, {}};
-      GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
-      for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
-        if (i_device < split_arg.size()) {
-          params.tensor_split[i_device] = std::stof(split_arg[i_device]);
-        } else {
-          params.tensor_split[i_device] = 0.0f;
-        }
-      }
-#else
-      LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
-                        "possible to set a tensor split.\n",
-                        {});
-#endif // GGML_USE_CUBLAS
-    } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
-#ifdef GGML_USE_CUBLAS
-      params.mul_mat_q = false;
-#else
-      LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. "
-                        "Disabling mul_mat_q kernels has no effect.\n",
-                        {});
-#endif // GGML_USE_CUBLAS
-    } else if (arg == "--main-gpu" || arg == "-mg") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef GGML_USE_CUBLAS
-      params.main_gpu = std::stoi(argv[i]);
-#else
-      LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
-                        "possible to set a main GPU.",
-                        {});
-#endif
-    } else if (arg == "--lora") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
-      params.use_mmap = false;
-    } else if (arg == "--lora-scaled") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      const char *lora_adapter = argv[i];
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.lora_adapter.push_back(
-          std::make_tuple(lora_adapter, std::stof(argv[i])));
-      params.use_mmap = false;
-    } else if (arg == "--lora-base") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.lora_base = argv[i];
-    } else if (arg == "-v" || arg == "--verbose") {
-#if SERVER_VERBOSE != 1
-      LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {});
-#else
-      server_verbose = true;
-#endif
-    } else if (arg == "--mlock") {
-      params.use_mlock = true;
-    } else if (arg == "--no-mmap") {
-      params.use_mmap = false;
-    } else if (arg == "--numa") {
-      params.numa = true;
-    } else if (arg == "--embedding") {
-      params.embedding = true;
-    } else if (arg == "-cb" || arg == "--cont-batching") {
-      params.cont_batching = true;
-    } else if (arg == "-np" || arg == "--parallel") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_parallel = std::stoi(argv[i]);
-    } else if (arg == "-n" || arg == "--n-predict") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_predict = std::stoi(argv[i]);
-    } else if (arg == "-spf" || arg == "--system-prompt-file") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      std::ifstream file(argv[i]);
-      if (!file) {
-        fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-        invalid_param = true;
-        break;
-      }
-      std::string systm_content;
-      std::copy(std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(systm_content));
-      llama.process_system_prompt_data(json::parse(systm_content));
-    } else if (arg == "--mmproj") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.mmproj = argv[i];
-    } else if (arg == "--log-disable") {
-      log_set_target(stdout);
-      LOG_INFO_LLAMA("logging to file is disabled.", {});
-    } else {
-      fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-      server_print_usage(argv[0], default_params, default_sparams);
-      exit(1);
-    }
-  }
-
-  if (invalid_param) {
-    fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-    server_print_usage(argv[0], default_params, default_sparams);
-    exit(1);
-  }
-}
-
 static std::string random_string() {
   static const std::string str(
       "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
