|
16 | 16 | #include "clip.h" |
17 | 17 | #include "common.h" |
18 | 18 | #include "llama.h" |
| 19 | +#include "llava.h" |
19 | 20 |
|
20 | 21 | #include "stb_image.h" |
21 | 22 |
|
@@ -1065,39 +1066,24 @@ struct llama_server_context { |
1065 | 1066 |
|
1066 | 1067 | return slot.has_next_token; // continue |
1067 | 1068 | } |
1068 | | - |
1069 | 1069 | bool process_images(llama_client_slot &slot) const { |
1070 | 1070 | for (slot_image &img : slot.images) { |
1071 | 1071 | if (!img.request_encode_image) { |
1072 | 1072 | continue; |
1073 | 1073 | } |
1074 | | - clip_image_f32 *img_res = clip_image_f32_init(); |
1075 | | - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, |
1076 | | - /*pad2square =*/true)) { |
| 1074 | + |
| 1075 | + if (!llava_image_embed_make_with_clip_img( |
| 1076 | + clp_ctx, params.n_threads, img.img_data, &img.image_embedding, |
| 1077 | + &img.image_tokens)) { |
1077 | 1078 | LOG_TEE("Error processing the given image"); |
1078 | | - clip_free(clp_ctx); |
1079 | | - return false; |
1080 | | - } |
1081 | | - img.image_tokens = clip_n_patches(clp_ctx); |
1082 | | - img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); |
1083 | | - if (!img.image_embedding) { |
1084 | | - LOG_TEE("Unable to allocate memory for image embeddings\n"); |
1085 | | - clip_free(clp_ctx); |
1086 | 1079 | return false; |
1087 | 1080 | } |
1088 | | - LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); |
1089 | | - if (!clip_image_encode(clp_ctx, params.n_threads, img_res, |
1090 | | - img.image_embedding)) { |
1091 | | - LOG_TEE("Unable to encode image\n"); |
1092 | | - return false; |
1093 | | - } |
1094 | | - clip_image_f32_free(img_res); |
| 1081 | + |
1095 | 1082 | img.request_encode_image = false; |
1096 | 1083 | } |
1097 | 1084 |
|
1098 | 1085 | return slot.images.size() > 0; |
1099 | 1086 | } |
1100 | | - |
1101 | 1087 | void send_error(task_server &task, std::string error) { |
1102 | 1088 | std::unique_lock<std::mutex> lock(mutex_results); |
1103 | 1089 | task_result res; |
@@ -2004,295 +1990,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, |
2004 | 1990 | "group attention factor `--grp-attn-n`"); |
2005 | 1991 | printf("\n"); |
2006 | 1992 | } |
2007 | | - |
2008 | | -static void server_params_parse(int argc, char **argv, server_params &sparams, |
2009 | | - gpt_params ¶ms, |
2010 | | - llama_server_context &llama) { |
2011 | | - gpt_params default_params; |
2012 | | - server_params default_sparams; |
2013 | | - std::string arg; |
2014 | | - bool invalid_param = false; |
2015 | | - |
2016 | | - for (int i = 1; i < argc; i++) { |
2017 | | - arg = argv[i]; |
2018 | | - if (arg == "--port") { |
2019 | | - if (++i >= argc) { |
2020 | | - invalid_param = true; |
2021 | | - break; |
2022 | | - } |
2023 | | - sparams.port = std::stoi(argv[i]); |
2024 | | - } else if (arg == "--host") { |
2025 | | - if (++i >= argc) { |
2026 | | - invalid_param = true; |
2027 | | - break; |
2028 | | - } |
2029 | | - sparams.hostname = argv[i]; |
2030 | | - } else if (arg == "--path") { |
2031 | | - if (++i >= argc) { |
2032 | | - invalid_param = true; |
2033 | | - break; |
2034 | | - } |
2035 | | - sparams.public_path = argv[i]; |
2036 | | - } else if (arg == "--api-key") { |
2037 | | - if (++i >= argc) { |
2038 | | - invalid_param = true; |
2039 | | - break; |
2040 | | - } |
2041 | | - sparams.api_key = argv[i]; |
2042 | | - } else if (arg == "--timeout" || arg == "-to") { |
2043 | | - if (++i >= argc) { |
2044 | | - invalid_param = true; |
2045 | | - break; |
2046 | | - } |
2047 | | - sparams.read_timeout = std::stoi(argv[i]); |
2048 | | - sparams.write_timeout = std::stoi(argv[i]); |
2049 | | - } else if (arg == "-m" || arg == "--model") { |
2050 | | - if (++i >= argc) { |
2051 | | - invalid_param = true; |
2052 | | - break; |
2053 | | - } |
2054 | | - params.model = argv[i]; |
2055 | | - } else if (arg == "-a" || arg == "--alias") { |
2056 | | - if (++i >= argc) { |
2057 | | - invalid_param = true; |
2058 | | - break; |
2059 | | - } |
2060 | | - params.model_alias = argv[i]; |
2061 | | - } else if (arg == "-h" || arg == "--help") { |
2062 | | - server_print_usage(argv[0], default_params, default_sparams); |
2063 | | - exit(0); |
2064 | | - } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { |
2065 | | - if (++i >= argc) { |
2066 | | - invalid_param = true; |
2067 | | - break; |
2068 | | - } |
2069 | | - params.n_ctx = std::stoi(argv[i]); |
2070 | | - } else if (arg == "--rope-scaling") { |
2071 | | - if (++i >= argc) { |
2072 | | - invalid_param = true; |
2073 | | - break; |
2074 | | - } |
2075 | | - std::string value(argv[i]); |
2076 | | - /**/ if (value == "none") { |
2077 | | - params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; |
2078 | | - } else if (value == "linear") { |
2079 | | - params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; |
2080 | | - } else if (value == "yarn") { |
2081 | | - params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; |
2082 | | - } else { |
2083 | | - invalid_param = true; |
2084 | | - break; |
2085 | | - } |
2086 | | - } else if (arg == "--rope-freq-base") { |
2087 | | - if (++i >= argc) { |
2088 | | - invalid_param = true; |
2089 | | - break; |
2090 | | - } |
2091 | | - params.rope_freq_base = std::stof(argv[i]); |
2092 | | - } else if (arg == "--rope-freq-scale") { |
2093 | | - if (++i >= argc) { |
2094 | | - invalid_param = true; |
2095 | | - break; |
2096 | | - } |
2097 | | - params.rope_freq_scale = std::stof(argv[i]); |
2098 | | - } else if (arg == "--yarn-ext-factor") { |
2099 | | - if (++i >= argc) { |
2100 | | - invalid_param = true; |
2101 | | - break; |
2102 | | - } |
2103 | | - params.yarn_ext_factor = std::stof(argv[i]); |
2104 | | - } else if (arg == "--yarn-attn-factor") { |
2105 | | - if (++i >= argc) { |
2106 | | - invalid_param = true; |
2107 | | - break; |
2108 | | - } |
2109 | | - params.yarn_attn_factor = std::stof(argv[i]); |
2110 | | - } else if (arg == "--yarn-beta-fast") { |
2111 | | - if (++i >= argc) { |
2112 | | - invalid_param = true; |
2113 | | - break; |
2114 | | - } |
2115 | | - params.yarn_beta_fast = std::stof(argv[i]); |
2116 | | - } else if (arg == "--yarn-beta-slow") { |
2117 | | - if (++i >= argc) { |
2118 | | - invalid_param = true; |
2119 | | - break; |
2120 | | - } |
2121 | | - params.yarn_beta_slow = std::stof(argv[i]); |
2122 | | - } else if (arg == "--threads" || arg == "-t") { |
2123 | | - if (++i >= argc) { |
2124 | | - invalid_param = true; |
2125 | | - break; |
2126 | | - } |
2127 | | - params.n_threads = std::stoi(argv[i]); |
2128 | | - } else if (arg == "--threads-batch" || arg == "-tb") { |
2129 | | - if (++i >= argc) { |
2130 | | - invalid_param = true; |
2131 | | - break; |
2132 | | - } |
2133 | | - params.n_threads_batch = std::stoi(argv[i]); |
2134 | | - } else if (arg == "-b" || arg == "--batch-size") { |
2135 | | - if (++i >= argc) { |
2136 | | - invalid_param = true; |
2137 | | - break; |
2138 | | - } |
2139 | | - params.n_batch = std::stoi(argv[i]); |
2140 | | - params.n_batch = std::min(512, params.n_batch); |
2141 | | - } else if (arg == "--gpu-layers" || arg == "-ngl" || |
2142 | | - arg == "--n-gpu-layers") { |
2143 | | - if (++i >= argc) { |
2144 | | - invalid_param = true; |
2145 | | - break; |
2146 | | - } |
2147 | | - if (llama_supports_gpu_offload()) { |
2148 | | - params.n_gpu_layers = std::stoi(argv[i]); |
2149 | | - } else { |
2150 | | - LOG_WARNING_LLAMA( |
2151 | | - "Not compiled with GPU offload support, --n-gpu-layers option will " |
2152 | | - "be ignored. " |
2153 | | - "See main README.md for information on enabling GPU BLAS support", |
2154 | | - {{"n_gpu_layers", params.n_gpu_layers}}); |
2155 | | - } |
2156 | | - } else if (arg == "--tensor-split" || arg == "-ts") { |
2157 | | - if (++i >= argc) { |
2158 | | - invalid_param = true; |
2159 | | - break; |
2160 | | - } |
2161 | | -#ifdef GGML_USE_CUBLAS |
2162 | | - std::string arg_next = argv[i]; |
2163 | | - |
2164 | | - // split string by , and / |
2165 | | - const std::regex regex{R"([,/]+)"}; |
2166 | | - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, |
2167 | | - -1}; |
2168 | | - std::vector<std::string> split_arg{it, {}}; |
2169 | | - GGML_ASSERT(split_arg.size() <= llama_max_devices()); |
2170 | | - |
2171 | | - for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) { |
2172 | | - if (i_device < split_arg.size()) { |
2173 | | - params.tensor_split[i_device] = std::stof(split_arg[i_device]); |
2174 | | - } else { |
2175 | | - params.tensor_split[i_device] = 0.0f; |
2176 | | - } |
2177 | | - } |
2178 | | -#else |
2179 | | - LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not " |
2180 | | - "possible to set a tensor split.\n", |
2181 | | - {}); |
2182 | | -#endif // GGML_USE_CUBLAS |
2183 | | - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { |
2184 | | -#ifdef GGML_USE_CUBLAS |
2185 | | - params.mul_mat_q = false; |
2186 | | -#else |
2187 | | - LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. " |
2188 | | - "Disabling mul_mat_q kernels has no effect.\n", |
2189 | | - {}); |
2190 | | -#endif // GGML_USE_CUBLAS |
2191 | | - } else if (arg == "--main-gpu" || arg == "-mg") { |
2192 | | - if (++i >= argc) { |
2193 | | - invalid_param = true; |
2194 | | - break; |
2195 | | - } |
2196 | | -#ifdef GGML_USE_CUBLAS |
2197 | | - params.main_gpu = std::stoi(argv[i]); |
2198 | | -#else |
2199 | | - LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not " |
2200 | | - "possible to set a main GPU.", |
2201 | | - {}); |
2202 | | -#endif |
2203 | | - } else if (arg == "--lora") { |
2204 | | - if (++i >= argc) { |
2205 | | - invalid_param = true; |
2206 | | - break; |
2207 | | - } |
2208 | | - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); |
2209 | | - params.use_mmap = false; |
2210 | | - } else if (arg == "--lora-scaled") { |
2211 | | - if (++i >= argc) { |
2212 | | - invalid_param = true; |
2213 | | - break; |
2214 | | - } |
2215 | | - const char *lora_adapter = argv[i]; |
2216 | | - if (++i >= argc) { |
2217 | | - invalid_param = true; |
2218 | | - break; |
2219 | | - } |
2220 | | - params.lora_adapter.push_back( |
2221 | | - std::make_tuple(lora_adapter, std::stof(argv[i]))); |
2222 | | - params.use_mmap = false; |
2223 | | - } else if (arg == "--lora-base") { |
2224 | | - if (++i >= argc) { |
2225 | | - invalid_param = true; |
2226 | | - break; |
2227 | | - } |
2228 | | - params.lora_base = argv[i]; |
2229 | | - } else if (arg == "-v" || arg == "--verbose") { |
2230 | | -#if SERVER_VERBOSE != 1 |
2231 | | - LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {}); |
2232 | | -#else |
2233 | | - server_verbose = true; |
2234 | | -#endif |
2235 | | - } else if (arg == "--mlock") { |
2236 | | - params.use_mlock = true; |
2237 | | - } else if (arg == "--no-mmap") { |
2238 | | - params.use_mmap = false; |
2239 | | - } else if (arg == "--numa") { |
2240 | | - params.numa = true; |
2241 | | - } else if (arg == "--embedding") { |
2242 | | - params.embedding = true; |
2243 | | - } else if (arg == "-cb" || arg == "--cont-batching") { |
2244 | | - params.cont_batching = true; |
2245 | | - } else if (arg == "-np" || arg == "--parallel") { |
2246 | | - if (++i >= argc) { |
2247 | | - invalid_param = true; |
2248 | | - break; |
2249 | | - } |
2250 | | - params.n_parallel = std::stoi(argv[i]); |
2251 | | - } else if (arg == "-n" || arg == "--n-predict") { |
2252 | | - if (++i >= argc) { |
2253 | | - invalid_param = true; |
2254 | | - break; |
2255 | | - } |
2256 | | - params.n_predict = std::stoi(argv[i]); |
2257 | | - } else if (arg == "-spf" || arg == "--system-prompt-file") { |
2258 | | - if (++i >= argc) { |
2259 | | - invalid_param = true; |
2260 | | - break; |
2261 | | - } |
2262 | | - std::ifstream file(argv[i]); |
2263 | | - if (!file) { |
2264 | | - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); |
2265 | | - invalid_param = true; |
2266 | | - break; |
2267 | | - } |
2268 | | - std::string systm_content; |
2269 | | - std::copy(std::istreambuf_iterator<char>(file), |
2270 | | - std::istreambuf_iterator<char>(), |
2271 | | - std::back_inserter(systm_content)); |
2272 | | - llama.process_system_prompt_data(json::parse(systm_content)); |
2273 | | - } else if (arg == "--mmproj") { |
2274 | | - if (++i >= argc) { |
2275 | | - invalid_param = true; |
2276 | | - break; |
2277 | | - } |
2278 | | - params.mmproj = argv[i]; |
2279 | | - } else if (arg == "--log-disable") { |
2280 | | - log_set_target(stdout); |
2281 | | - LOG_INFO_LLAMA("logging to file is disabled.", {}); |
2282 | | - } else { |
2283 | | - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); |
2284 | | - server_print_usage(argv[0], default_params, default_sparams); |
2285 | | - exit(1); |
2286 | | - } |
2287 | | - } |
2288 | | - |
2289 | | - if (invalid_param) { |
2290 | | - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); |
2291 | | - server_print_usage(argv[0], default_params, default_sparams); |
2292 | | - exit(1); |
2293 | | - } |
2294 | | -} |
2295 | | - |
2296 | 1993 | static std::string random_string() { |
2297 | 1994 | static const std::string str( |
2298 | 1995 | "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); |
|
0 commit comments