diff --git a/android/src/main/jniLibs/arm64-v8a/libcactus.a b/android/src/main/jniLibs/arm64-v8a/libcactus.a index 621821f..0a9fc58 100644 Binary files a/android/src/main/jniLibs/arm64-v8a/libcactus.a and b/android/src/main/jniLibs/arm64-v8a/libcactus.a differ diff --git a/cpp/HybridCactus.cpp b/cpp/HybridCactus.cpp index 3e66a3a..6a35bb2 100644 --- a/cpp/HybridCactus.cpp +++ b/cpp/HybridCactus.cpp @@ -212,6 +212,54 @@ std::shared_ptr> HybridCactus::transcribe( }); } +std::shared_ptr> HybridCactus::detectLanguage( + const std::variant, std::string> &audio, + double responseBufferSize, + const std::optional &optionsJson) { + return Promise::async( + [this, audio, optionsJson, responseBufferSize]() -> std::string { + std::lock_guard lock(this->_modelMutex); + + if (!this->_model) { + throw std::runtime_error("Cactus model is not initialized"); + } + + std::string responseBuffer; + responseBuffer.resize(responseBufferSize); + + int result; + if (std::holds_alternative(audio)) { + result = cactus_detect_language( + this->_model, std::get(audio).c_str(), + responseBuffer.data(), responseBufferSize, + optionsJson ? optionsJson->c_str() : nullptr, nullptr, 0); + } else { + const auto &audioDoubles = std::get>(audio); + + std::vector audioBytes; + audioBytes.reserve(audioDoubles.size()); + + for (double d : audioDoubles) { + d = std::clamp(d, 0.0, 255.0); + audioBytes.emplace_back(static_cast(d)); + } + + result = cactus_detect_language( + this->_model, nullptr, responseBuffer.data(), responseBufferSize, + optionsJson ? optionsJson->c_str() : nullptr, audioBytes.data(), + audioBytes.size()); + } + + if (result < 0) { + throw std::runtime_error("Cactus detect language failed: " + + std::string(cactus_get_last_error())); + } + + responseBuffer.resize(strlen(responseBuffer.c_str())); + return responseBuffer; + }); +} + std::shared_ptr> HybridCactus::streamTranscribeStart( const std::optional &optionsJson) { return Promise::async([this, optionsJson]() -> void { @@ -477,7 +525,7 @@ std::shared_ptr> HybridCactus::destroy() { std::shared_ptr> HybridCactus::setTelemetryEnvironment(const std::string &cacheDir) { return Promise::async([cacheDir]() -> void { - cactus_set_telemetry_environment("react-native-v1.7", cacheDir.c_str()); + cactus_set_telemetry_environment("react-native", cacheDir.c_str(), "1.10.0"); }); } diff --git a/cpp/HybridCactus.hpp b/cpp/HybridCactus.hpp index 9e79306..2c5db1d 100644 --- a/cpp/HybridCactus.hpp +++ b/cpp/HybridCactus.hpp @@ -39,6 +39,11 @@ class HybridCactus : public HybridCactusSpec { double /* tokenId */)>> &callback) override; + std::shared_ptr> + detectLanguage(const std::variant, std::string> &audio, + double responseBufferSize, + const std::optional &optionsJson) override; + std::shared_ptr> streamTranscribeStart(const std::optional &optionsJson) override; diff --git a/cpp/cactus_ffi.h b/cpp/cactus_ffi.h index c627a13..aa72986 100644 --- a/cpp/cactus_ffi.h +++ b/cpp/cactus_ffi.h @@ -76,6 +76,16 @@ CACTUS_FFI_EXPORT int cactus_transcribe( size_t pcm_buffer_size ); +CACTUS_FFI_EXPORT int cactus_detect_language( + cactus_model_t model, + const char* audio_file_path, // NULL if using pcm_buffer + char* response_buffer, + size_t buffer_size, + const char* options_json, // optional + const uint8_t* pcm_buffer, // NULL if using audio_file_path + size_t pcm_buffer_size +); + CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_start( cactus_model_t model, const char* options_json // optional @@ -189,7 +199,10 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index); CACTUS_FFI_EXPORT const char* cactus_get_last_error(void); -CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location); +CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version); +CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id); +CACTUS_FFI_EXPORT void cactus_telemetry_flush(void); +CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void); #ifdef __cplusplus } diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 4fce007..d6f7e96 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -1,6 +1,6 @@ PODS: - boost (1.84.0) - - Cactus (1.7.0): + - Cactus (1.10.0): - boost - DoubleConversion - fast_float @@ -2643,7 +2643,7 @@ EXTERNAL SOURCES: SPEC CHECKSUMS: boost: 7e761d76ca2ce687f7cc98e698152abd03a18f90 - Cactus: d549ac2651ab939a9b5bbcfd6827a1a4e7fa2d81 + Cactus: 88585f8a152312dcb391526d839133d72d054031 DoubleConversion: cb417026b2400c8f53ae97020b2be961b59470cb fast_float: b32c788ed9c6a8c584d114d0047beda9664e7cc6 FBLazyVector: b8f1312d48447cca7b4abc21ed155db14742bd03 diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h new file mode 100644 index 0000000..e61841d --- /dev/null +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h @@ -0,0 +1,48 @@ +#ifndef CACTUS_CLOUD_H +#define CACTUS_CLOUD_H + +#include "cactus_utils.h" +#include +#include + +namespace cactus { +namespace ffi { + +struct CloudResponse { + std::string transcript; + std::string api_key_hash; + bool used_cloud = false; + std::string error; +}; + +struct CloudCompletionRequest { + std::vector messages; + std::vector tools; + std::string local_output; + std::vector local_function_calls; + bool has_images = false; + std::string cloud_key; +}; + +struct CloudCompletionResult { + bool ok = false; + bool used_cloud = false; + std::string response; + std::vector function_calls; + std::string error; +}; + +std::string cloud_base64_encode(const uint8_t* data, size_t len); +std::vector cloud_build_wav(const uint8_t* pcm, size_t pcm_bytes); +std::string resolve_cloud_api_key(const char* cloud_key_param); +CloudResponse cloud_transcribe_request(const std::string& audio_b64, + const std::string& fallback_text, + long timeout_seconds = 15L, + const char* cloud_key = nullptr); +CloudCompletionResult cloud_complete_request(const CloudCompletionRequest& request, + long timeout_ms); + +} // namespace ffi +} // namespace cactus + +#endif // CACTUS_CLOUD_H diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h index c627a13..aa72986 100644 --- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h @@ -76,6 +76,16 @@ CACTUS_FFI_EXPORT int cactus_transcribe( size_t pcm_buffer_size ); +CACTUS_FFI_EXPORT int cactus_detect_language( + cactus_model_t model, + const char* audio_file_path, // NULL if using pcm_buffer + char* response_buffer, + size_t buffer_size, + const char* options_json, // optional + const uint8_t* pcm_buffer, // NULL if using audio_file_path + size_t pcm_buffer_size +); + CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_start( cactus_model_t model, const char* options_json // optional @@ -189,7 +199,10 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index); CACTUS_FFI_EXPORT const char* cactus_get_last_error(void); -CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location); +CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version); +CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id); +CACTUS_FFI_EXPORT void cactus_telemetry_flush(void); +CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void); #ifdef __cplusplus } diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h index 5f360bd..3b5d97f 100644 --- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h @@ -2,6 +2,7 @@ #define CACTUS_UTILS_H #include "../engine/engine.h" +#include "../models/model.h" #include #include #include @@ -12,6 +13,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -101,12 +105,92 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_whisper_spectrogram return cfg; } +inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogram_config() { + cactus::engine::AudioProcessor::SpectrogramConfig cfg{}; + cfg.n_fft = 512; + cfg.frame_length = 400; + cfg.hop_length = 160; + cfg.power = 2.0f; + cfg.center = true; + cfg.pad_mode = "constant"; + cfg.onesided = true; + cfg.dither = 0.0f; + cfg.mel_floor = 5.960464477539063e-08f; // 2^-24 guard value used by HF Parakeet. + cfg.log_mel = "log"; + cfg.reference = 1.0f; + cfg.min_value = 1e-10f; + cfg.remove_dc_offset = false; + cfg.hann_periodic = false; + return cfg; +} + +inline void apply_preemphasis(std::vector& waveform, float coefficient = 0.97f) { + if (waveform.size() < 2 || coefficient == 0.0f) { + return; + } + for (size_t i = waveform.size() - 1; i > 0; --i) { + waveform[i] -= coefficient * waveform[i - 1]; + } +} + +inline void normalize_parakeet_log_mel(std::vector& mel, size_t num_mels, float epsilon = 1e-5f) { + if (mel.empty() || num_mels == 0 || (mel.size() % num_mels) != 0) { + return; + } + const size_t num_frames = mel.size() / num_mels; + if (num_frames == 0) { + return; + } + + for (size_t m = 0; m < num_mels; ++m) { + const size_t base = m * num_frames; + float mean = 0.0f; + for (size_t t = 0; t < num_frames; ++t) { + mean += mel[base + t]; + } + mean /= static_cast(num_frames); + + float variance = 0.0f; + for (size_t t = 0; t < num_frames; ++t) { + const float d = mel[base + t] - mean; + variance += d * d; + } + const float denom = static_cast(std::max(1, num_frames - 1)); + const float inv_std = 1.0f / std::sqrt((variance / denom) + epsilon); + for (size_t t = 0; t < num_frames; ++t) { + mel[base + t] = (mel[base + t] - mean) * inv_std; + } + } +} + +inline void trim_mel_frames(std::vector& mel, size_t num_mels, size_t valid_frames) { + if (mel.empty() || num_mels == 0 || (mel.size() % num_mels) != 0) { + return; + } + size_t total_frames = mel.size() / num_mels; + if (valid_frames == 0 || valid_frames >= total_frames) { + return; + } + std::vector trimmed(num_mels * valid_frames); + for (size_t m = 0; m < num_mels; ++m) { + const float* src = &mel[m * total_frames]; + float* dst = &trimmed[m * valid_frames]; + std::copy(src, src + valid_frames, dst); + } + mel.swap(trimmed); +} + } // namespace audio } // namespace cactus namespace cactus { namespace ffi { +inline bool env_flag_enabled(const char* key) { + const char* value = std::getenv(key); + return value && value[0] != '\0' && !(value[0] == '0' && value[1] == '\0'); +} + inline std::string generateUUID() { #ifdef __APPLE__ uuid_t uuid; @@ -114,6 +198,25 @@ inline std::string generateUUID() { char uuid_str[37]; uuid_unparse_lower(uuid, uuid_str); return std::string(uuid_str); +#else + static std::random_device rd; + static std::mt19937 gen(rd()); + static std::uniform_int_distribution<> dis(0, 15); + static std::uniform_int_distribution<> dis2(8, 11); + + std::stringstream ss; + ss << std::hex; + for (int i = 0; i < 8; i++) ss << dis(gen); + ss << "-"; + for (int i = 0; i < 4; i++) ss << dis(gen); + ss << "-4"; + for (int i = 0; i < 3; i++) ss << dis(gen); + ss << "-"; + ss << dis2(gen); + for (int i = 0; i < 3; i++) ss << dis(gen); + ss << "-"; + for (int i = 0; i < 12; i++) ss << dis(gen); + return ss.str(); #endif } @@ -150,6 +253,130 @@ inline std::string escape_json_string(const std::string& s) { return o.str(); } + +inline std::string trim_string(const std::string& s) { + size_t start = 0; + while (start < s.size() && std::isspace(static_cast(s[start]))) ++start; + size_t end = s.size(); + while (end > start && std::isspace(static_cast(s[end - 1]))) --end; + return s.substr(start, end - start); +} + +inline std::string env_or_default(const char* key, const char* fallback) { + const char* v = std::getenv(key); + if (v && v[0] != '\0') return std::string(v); + return std::string(fallback); +} + +inline std::string json_string_field(const std::string& json, const std::string& key) { + std::string pattern = "\"" + key + "\":"; + size_t pos = json.find(pattern); + if (pos == std::string::npos) return {}; + + size_t i = pos + pattern.size(); + while (i < json.size() && std::isspace(static_cast(json[i]))) i++; + if (i >= json.size() || json[i] != '"') return {}; + ++i; + + std::string out; + out.reserve(128); + while (i < json.size()) { + char c = json[i++]; + if (c == '"') return out; + if (c == '\\' && i < json.size()) { + char e = json[i++]; + switch (e) { + case '"': out.push_back('"'); break; + case '\\': out.push_back('\\'); break; + case '/': out.push_back('/'); break; + case 'b': out.push_back('\b'); break; + case 'f': out.push_back('\f'); break; + case 'n': out.push_back('\n'); break; + case 'r': out.push_back('\r'); break; + case 't': out.push_back('\t'); break; + default: out.push_back(e); break; + } + continue; + } + out.push_back(c); + } + return {}; +} + +inline std::string json_array_field(const std::string& json, const std::string& key) { + std::string pattern = "\"" + key + "\":"; + size_t pos = json.find(pattern); + if (pos == std::string::npos) return "[]"; + size_t start = pos + pattern.size(); + while (start < json.size() && std::isspace(static_cast(json[start]))) ++start; + if (start >= json.size() || json[start] != '[') return "[]"; + + int depth = 1; + size_t end = start + 1; + while (end < json.size() && depth > 0) { + if (json[end] == '[') depth++; + else if (json[end] == ']') depth--; + end++; + } + return json.substr(start, end - start); +} + +inline std::vector split_json_array(const std::string& array_json) { + std::vector out; + if (array_json.size() < 2 || array_json.front() != '[' || array_json.back() != ']') return out; + + size_t i = 1; + while (i + 1 < array_json.size()) { + while (i + 1 < array_json.size() && + (std::isspace(static_cast(array_json[i])) || array_json[i] == ',')) i++; + if (i + 1 >= array_json.size() || array_json[i] != '{') break; + + size_t start = i; + int depth = 0; + bool in_str = false; + bool esc = false; + for (; i < array_json.size(); ++i) { + char c = array_json[i]; + if (in_str) { + if (esc) esc = false; + else if (c == '\\') esc = true; + else if (c == '"') in_str = false; + continue; + } + if (c == '"') { in_str = true; continue; } + if (c == '{') depth++; + if (c == '}') { + depth--; + if (depth == 0) { + out.push_back(array_json.substr(start, i - start + 1)); + i++; + break; + } + } + } + } + return out; +} + +inline std::string serialize_tools_json(const std::vector& tools) { + if (tools.empty()) return ""; + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < tools.size(); ++i) { + if (i > 0) oss << ","; + oss << "{\"type\":\"function\",\"function\":{"; + oss << "\"name\":\"" << escape_json_string(tools[i].name) << "\","; + oss << "\"description\":\"" << escape_json_string(tools[i].description) << "\""; + auto it = tools[i].parameters.find("schema"); + if (it != tools[i].parameters.end()) { + oss << ",\"parameters\":" << it->second; + } + oss << "}}"; + } + oss << "]"; + return oss.str(); +} + inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) { std::ostringstream json; json << "{"; @@ -324,7 +551,10 @@ inline void parse_options_json(const std::string& json, float& confidence_threshold, bool& include_stop_sequences, bool& use_vad, - bool& telemetry_enabled) { + bool& telemetry_enabled, + bool* auto_handoff = nullptr, + size_t* cloud_timeout_ms = nullptr, + bool* handoff_with_images = nullptr) { temperature = 0.0f; top_p = 0.0f; top_k = 0; @@ -335,6 +565,9 @@ inline void parse_options_json(const std::string& json, include_stop_sequences = false; use_vad = true; telemetry_enabled = true; + if (auto_handoff) *auto_handoff = true; + if (cloud_timeout_ms) *cloud_timeout_ms = 15000; + if (handoff_with_images) *handoff_with_images = true; stop_sequences.clear(); if (json.empty()) return; @@ -403,6 +636,32 @@ inline void parse_options_json(const std::string& json, telemetry_enabled = (json.substr(pos, 4) == "true"); } + if (auto_handoff) { + pos = json.find("\"auto_handoff\""); + if (pos != std::string::npos) { + pos = json.find(':', pos) + 1; + while (pos < json.length() && std::isspace(json[pos])) pos++; + *auto_handoff = (json.substr(pos, 4) == "true"); + } + } + + if (cloud_timeout_ms) { + pos = json.find("\"cloud_timeout_ms\""); + if (pos != std::string::npos) { + pos = json.find(':', pos) + 1; + *cloud_timeout_ms = std::stoul(json.substr(pos)); + } + } + + if (handoff_with_images) { + pos = json.find("\"handoff_with_images\""); + if (pos != std::string::npos) { + pos = json.find(':', pos) + 1; + while (pos < json.length() && std::isspace(json[pos])) pos++; + *handoff_with_images = (json.substr(pos, 4) == "true"); + } + } + pos = json.find("\"stop_sequences\""); if (pos != std::string::npos) { pos = json.find('[', pos); @@ -422,31 +681,8 @@ inline void parse_options_json(const std::string& json, } } -inline std::string format_tools_for_prompt(const std::vector& tools) { - if (tools.empty()) return ""; - std::string formatted_tools_json; - for (size_t i = 0; i < tools.size(); i++) { - if (i > 0) formatted_tools_json += "\n"; - formatted_tools_json += "{\"type\":\"function\",\"function\":{\"name\":\"" - + tools[i].name - + "\",\"description\":\"" - + tools[i].description + "\""; - if (tools[i].parameters.find("schema") != tools[i].parameters.end()) { - formatted_tools_json += ",\"parameters\":" + tools[i].parameters.at("schema"); - } - formatted_tools_json += "}}"; - } - return formatted_tools_json; -} - static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) { - while (begin < end && std::isspace(static_cast(value[begin]))) { - begin++; - } - while (end > begin && std::isspace(static_cast(value[end - 1]))) { - end--; - } - return value.substr(begin, end - begin); + return trim_string(value.substr(begin, end - begin)); } static inline void append_lfm2_call(const std::string& entry, @@ -577,23 +813,49 @@ inline void parse_function_calls_from_response(const std::string& response_text, if (!content.empty() && content.front() == '[' && content.back() == ']') { std::string inner = content.substr(1, content.size() - 2); - size_t start = 0; - int paren_depth = 0; - - for (size_t i = 0; i < inner.size(); ++i) { - char c = inner[i]; - if (c == '(') { - paren_depth++; - } else if (c == ')' && paren_depth > 0) { - paren_depth--; - } else if (c == ',' && paren_depth == 0) { - append_lfm2_call(inner.substr(start, i - start), function_calls); - start = i + 1; + + size_t inner_first = inner.find_first_not_of(" \t\n\r"); + if (inner_first != std::string::npos && inner[inner_first] == '{') { + size_t pos = inner_first; + while (pos < inner.size()) { + if (inner[pos] == '{') { + int brace_depth = 1; + size_t obj_start = pos; + pos++; + while (pos < inner.size() && brace_depth > 0) { + if (inner[pos] == '{') brace_depth++; + else if (inner[pos] == '}') brace_depth--; + pos++; + } + if (brace_depth == 0) { + std::string json_obj = inner.substr(obj_start, pos - obj_start); + if (json_obj.find("\"name\"") != std::string::npos) { + function_calls.push_back(json_obj); + } + } + } else { + pos++; + } + } + } else { + size_t start = 0; + int paren_depth = 0; + + for (size_t i = 0; i < inner.size(); ++i) { + char c = inner[i]; + if (c == '(') { + paren_depth++; + } else if (c == ')' && paren_depth > 0) { + paren_depth--; + } else if (c == ',' && paren_depth == 0) { + append_lfm2_call(inner.substr(start, i - start), function_calls); + start = i + 1; + } } - } - if (start < inner.size()) { - append_lfm2_call(inner.substr(start), function_calls); + if (start < inner.size()) { + append_lfm2_call(inner.substr(start), function_calls); + } } } else if (!content.empty()) { append_lfm2_call(content, function_calls); @@ -648,7 +910,7 @@ inline std::string construct_response_json(const std::string& regular_response, bool cloud_handoff = false) { std::ostringstream json; json << "{"; - json << "\"success\":" << (cloud_handoff ? "false" : "true") << ","; + json << "\"success\":true,"; json << "\"error\":null,"; json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ","; json << "\"response\":\"" << escape_json_string(regular_response) << "\","; @@ -671,30 +933,6 @@ inline std::string construct_response_json(const std::string& regular_response, return json.str(); } -inline std::string construct_cloud_handoff_json(float confidence, - double time_to_first_token, - double prefill_tps, - size_t prompt_tokens) { - std::ostringstream json; - json << "{"; - json << "\"success\":false,"; - json << "\"error\":null,"; - json << "\"cloud_handoff\":true,"; - json << "\"response\":null,"; - json << "\"function_calls\":[],"; - json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ","; - json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ","; - json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ","; - json << "\"prefill_tps\":" << std::fixed << std::setprecision(2) << prefill_tps << ","; - json << "\"decode_tps\":0.0,"; - json << "\"ram_usage_mb\":" << std::fixed << std::setprecision(2) << get_ram_usage_mb() << ","; - json << "\"prefill_tokens\":" << prompt_tokens << ","; - json << "\"decode_tokens\":0,"; - json << "\"total_tokens\":" << prompt_tokens; - json << "}"; - return json.str(); -} - inline std::string serialize_function_calls(const std::vector& calls) { if (calls.empty()) return "[]"; std::ostringstream oss; @@ -720,4 +958,4 @@ const char* cactus_get_last_error(); } #endif -#endif // CACTUS_UTILS_H +#endif // CACTUS_UTILS_H \ No newline at end of file diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h index 620fab6..c8bf34a 100644 --- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h @@ -56,6 +56,12 @@ struct Config { uint32_t num_shared_experts = 0; uint32_t num_top_experts = 0; uint32_t moe_every_n_layers = 0; + uint32_t moe_intermediate_dim = 0; + uint32_t num_dense_layers = 0; + uint32_t num_experts_per_tok = 0; + bool norm_topk_prob = false; + bool use_expert_bias = false; + float routed_scaling_factor = 1.0f; bool tie_word_embeddings = true; uint32_t vision_hidden_dim = 0; @@ -93,8 +99,22 @@ struct Config { uint32_t num_encoder_layers = 0; uint32_t num_decoder_layers = 0; float partial_rotary_factor = 0.0f; - - enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9}; + uint32_t pad_token_id = 0; + uint32_t conv_kernel_size = 0; + uint32_t subsampling_conv_kernel_size = 0; + uint32_t subsampling_conv_stride = 0; + uint32_t subsampling_conv_channels = 0; + uint32_t subsampling_factor = 0; + uint32_t num_mel_bins = 80; + std::string encoder_hidden_act = "silu"; + uint32_t predictor_hidden_dim = 0; + uint32_t predictor_num_layers = 0; + uint32_t tdt_joint_dim = 0; + uint32_t tdt_num_durations = 0; + uint32_t tdt_blank_id = 0; + std::vector tdt_durations; + + enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11}; ModelType model_type = ModelType::QWEN; enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3}; @@ -168,7 +188,7 @@ class Tokenizer { uint32_t get_global_img_token_id() const { return global_img_token_id_; } protected: - enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER}; + enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET}; ModelType model_type_ = ModelType::UNKNOWN; enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG}; ModelVariant model_variant_ = ModelVariant::DEFAULT; @@ -366,7 +386,6 @@ struct KVCache { size_t num_tokens, size_t kv_heads, size_t head_dim); bool is_empty() const { return current_seq_len == 0; } - bool is_int8() const { return precision == Precision::INT8; } void* get_key_ptr(size_t layer); void* get_value_ptr(size_t layer); @@ -684,6 +703,8 @@ class AudioProcessor { float reference = 1.0f; float min_value = 1e-10f; bool remove_dc_offset = false; + float preemphasis = 0.0f; + bool hann_periodic = true; }; AudioProcessor(); @@ -696,6 +717,11 @@ class AudioProcessor { const std::vector& waveform, const SpectrogramConfig& config); + static std::vector compute_irfft( + const std::vector& complex_input, + size_t n, + const char* norm = "backward"); + const std::vector& get_mel_filters() const { return mel_filters_; } size_t get_num_mel_filters() const { return num_mel_filters_; } @@ -721,6 +747,8 @@ namespace index { struct QueryResult { int doc_id; float score; + + QueryResult(int doc_id, float score) : doc_id(doc_id), score(score) {} }; struct QueryOptions { diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h index 255d83c..01b7b2f 100644 --- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -109,23 +110,33 @@ enum class ComputeBackend { NPU }; +enum class Activation { + SILU, + GELU, + GELU_ERF, + RELU, + SIGMOID, + TANH +}; + enum class OpType { INPUT, PRECISION_CAST, ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE, MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING, BILINEAR_INTERPOLATION, SUM, MEAN, VARIANCE, MIN, MAX, - RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, - SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, + RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM, + SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG, RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH, SAMPLE, CONCAT, SCATTER_TOPK, TOPK, LAYERNORM, GROUPNORM, + MOE_LAYER, INDEX, PERSISTENT, QUANTIZE_ACTIVATIONS, LSTM_CELL, - STFT_MAGNITUDE + STFT }; struct PrecisionTraits { @@ -141,11 +152,20 @@ struct PrecisionTraits { static constexpr size_t packed_size_of(Precision prec, size_t count) { switch (prec) { - case Precision::INT4: return (count + 1) / 2; + case Precision::INT4: return (count + 1) / 2; default: return count * size_of(prec); } } + static size_t byte_offset_of(Precision prec, size_t element_offset) { + switch (prec) { + case Precision::INT4: + assert(element_offset % 32 == 0 && "INT4 byte offset must be group-aligned (multiple of 32)"); + return element_offset / 2; + default: return element_offset * size_of(prec); + } + } + static constexpr bool is_integer(Precision prec) { switch (prec) { case Precision::INT8: return true; @@ -181,7 +201,6 @@ struct TensorConfig { Precision compute_precision = Precision::INT8; Precision output_precision = Precision::INT8; bool auto_mixed_precision = false; - bool enable_int4_packing = true; static TensorConfig& global(); }; @@ -243,6 +262,10 @@ struct BufferDesc { return precision == Precision::INT8 && group_size > 0; } + bool is_grouped_int4() const { + return precision == Precision::INT4 && group_size > 0; + } + void set_grouped_scales(size_t gs, size_t ng, void* scales_ptr) { group_size = gs; num_groups = ng; @@ -291,6 +314,7 @@ struct OpParams { size_t slice_length = 0; size_t window_size = 0; bool is_causal = true; + bool attention_mask_is_additive = false; std::vector new_shape; std::vector permutation; Precision output_precision = Precision::INT8; @@ -309,6 +333,11 @@ struct OpParams { size_t num_groups = 0; size_t dst_height = 0; size_t dst_width = 0; + bool normalize_routing = false; + size_t num_experts = 0; + size_t num_experts_per_tok = 0; + bool moe_gated = true; + Activation activation = Activation::SILU; std::vector bias_values; std::vector bias_indices; @@ -356,7 +385,6 @@ void compute_index_node(GraphNode& node, const std::vector>& nodes, const std::unordered_map& node_index_map); void shrink_thread_local_buffers(); - class BufferPool { public: BufferPool() = default; @@ -418,6 +446,7 @@ class CactusGraph { size_t scalar_sqrt(size_t input); size_t scalar_cos(size_t input); size_t scalar_sin(size_t input); + size_t scalar_log(size_t input); size_t relu(size_t input); size_t silu(size_t input); @@ -425,6 +454,7 @@ class CactusGraph { size_t gelu_erf(size_t input); size_t sigmoid(size_t input); size_t tanh(size_t input); + size_t glu(size_t input, int axis = -1); size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU); size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU); @@ -455,7 +485,30 @@ class CactusGraph { size_t layernorm(size_t input, size_t weight, size_t bias, float epsilon = 1e-5f); size_t layernorm(size_t input, size_t weight, float epsilon = 1e-5f); // No bias version size_t groupnorm(size_t input, size_t weight, size_t bias, size_t num_groups = 32, float epsilon = 1e-5f); + size_t batchnorm(size_t input, size_t weight, size_t bias, size_t running_mean, size_t running_var, int axis = 1, float epsilon = 1e-5f); size_t topk(size_t input, size_t k); + size_t moe_layer(size_t hidden, + size_t routing_probs, + size_t topk_indices, + const std::vector& w1_weights, + const std::vector& w3_weights, + const std::vector& w2_weights, + size_t num_experts, + size_t num_experts_per_tok, + bool normalize_routing, + float epsilon, + float routed_scaling_factor); + size_t moe_layer(size_t hidden, + size_t routing_probs, + size_t topk_indices, + const std::vector& w1_weights, + const std::vector& w2_weights, + size_t num_experts, + size_t num_experts_per_tok, + bool normalize_routing, + float epsilon, + float routed_scaling_factor, + Activation activation); size_t rms_norm(size_t input, size_t weight, float epsilon = 1e-5f); size_t rope(size_t input, float theta, size_t position_offset = 0, ComputeBackend backend = ComputeBackend::CPU); size_t rope_gptj(size_t input, float theta, size_t position_offset = 0, size_t rot_dim = 0, ComputeBackend backend = ComputeBackend::CPU); @@ -463,6 +516,10 @@ class CactusGraph { size_t attention(size_t query, size_t key, size_t value, float scale, bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU); size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, ComputeBackend backend = ComputeBackend::CPU); size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU); + size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale, + bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU, + bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0); + size_t rel_pos_bias(size_t query, size_t relative_key, float scale); size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset, const int8_t* cached_keys, const int8_t* cached_values, @@ -474,9 +531,19 @@ class CactusGraph { size_t conv1d_k7s3(size_t input, size_t weight, size_t bias); size_t conv1d(size_t input, size_t weight, size_t stride); size_t conv1d(size_t input, size_t weight, size_t bias, size_t stride); + size_t conv1d_same_depthwise_k9(size_t input, size_t weight); + size_t conv1d_same_depthwise_k9(size_t input, size_t weight, size_t bias); + size_t conv1d_pointwise(size_t input, size_t weight); + size_t conv1d_pointwise(size_t input, size_t weight, size_t bias); + size_t conv2d_k3s2p1(size_t input, size_t weight); + size_t conv2d_k3s2p1(size_t input, size_t weight, size_t bias); + size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight); + size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight, size_t bias); + size_t conv2d_pointwise_1x1(size_t input, size_t weight); + size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias); size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh); - size_t stft_magnitude(size_t input, size_t weight, size_t stride, size_t num_fft_bins); + size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins); size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20, const std::unordered_map& logit_bias = {}); @@ -581,12 +648,9 @@ namespace GraphFile { bool is_interleaved_ = false; size_t original_N_ = 0; - std::unique_ptr unpacked_data_; - void parse_header(); void apply_madvise_hints(); - void unpack_int4_data(); }; } -#endif +#endif \ No newline at end of file diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h index 17acd36..0ec7265 100644 --- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h @@ -4,6 +4,8 @@ #include #include +enum class Precision; + enum class ScalarOpType { ADD, SUBTRACT, @@ -12,7 +14,8 @@ enum class ScalarOpType { EXP, SQRT, COS, - SIN + SIN, + LOG }; constexpr size_t KV_QUANT_GROUP_SIZE = 32; @@ -21,6 +24,7 @@ void cactus_add_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num void cactus_add_f16_clipped(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); void cactus_subtract_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); void cactus_multiply_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); +void cactus_add_scaled_f16(const __fp16* base, const __fp16* src, __fp16* output, size_t num_elements, float scale); void cactus_divide_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); void cactus_add_broadcast_f16(const __fp16* a, const __fp16* b, __fp16* output, @@ -50,6 +54,23 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales, const int8_t* B, const __fp16* B_scales, __fp16* C, size_t M, size_t K, size_t N, size_t group_size); +void cactus_gemv_int4(const int8_t* A, float A_scale, + const int8_t* B_packed, const __fp16* B_scales, + __fp16* C, size_t K, size_t N, size_t group_size); + +void cactus_gemm_int4(const int8_t* A, const float* A_scales, + const int8_t* B_packed, const __fp16* B_scales, + __fp16* C, size_t M, size_t K, size_t N, size_t group_size); + +void cactus_matmul_int4(const int8_t* A, const float* A_scales, + const int8_t* B_packed, const __fp16* B_scales, + __fp16* C, size_t M, size_t K, size_t N, size_t group_size); + +void cactus_matmul_integer(Precision precision, + const int8_t* A, const float* A_scales, + const int8_t* B, const __fp16* B_scales, + __fp16* C, size_t M, size_t K, size_t N, size_t group_size); + void cactus_matmul_f16(const __fp16* a, const __fp16* b_transposed, __fp16* c, size_t M, size_t K, size_t N); @@ -97,10 +118,52 @@ void cactus_sigmoid_f16(const __fp16* input, __fp16* output, size_t num_elements void cactus_tanh_f16(const __fp16* input, __fp16* output, size_t num_elements); +void cactus_glu_f16( + const __fp16* input, + __fp16* output, + size_t outer_size, + size_t split_size, + size_t inner_size +); + +void cactus_glu_f32( + const float* input, + float* output, + size_t outer_size, + size_t split_size, + size_t inner_size +); + +void cactus_batchnorm_f16( + const __fp16* input, + const float* weight, + const float* bias, + const float* running_mean, + const float* running_var, + __fp16* output, + size_t outer_size, + size_t channels, + size_t inner_size, + float epsilon +); + +void cactus_batchnorm_f32( + const float* input, + const float* weight, + const float* bias, + const float* running_mean, + const float* running_var, + float* output, + size_t outer_size, + size_t channels, + size_t inner_size, + float epsilon +); + void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output, size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads, size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0, - bool is_causal = true); + bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false); void cactus_attention_hybrid_int8_fp16( const __fp16* queries, @@ -150,7 +213,7 @@ void cactus_conv1d_f16( size_t stride ); -void cactus_stft_magnitude_f16( +void cactus_stft_f16( const __fp16* input, const __fp16* weight, __fp16* output, @@ -171,6 +234,62 @@ void cactus_conv1d_f16_k7s3_oc8( size_t C_out ); +void cactus_conv1d_same_depthwise_f16_k9( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t L, + size_t C +); + +void cactus_conv2d_f16_k3s2p1_nchw( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t C_in, + size_t H, + size_t W, + size_t C_out +); + +void cactus_conv2d_depthwise_f16_k3s2p1_nchw( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t C, + size_t H, + size_t W +); + +void cactus_conv2d_pointwise_f16_1x1_nchw_gemm( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t C_in, + size_t H, + size_t W, + size_t C_out +); + +void cactus_conv1d_pointwise_f16_gemm( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t L, + size_t C_in, + size_t C_out +); + void cactus_bilinear_interpolation_f16(const __fp16* input, __fp16* output, size_t src_height, size_t src_width, size_t embed_dim, size_t dst_height, size_t dst_width); @@ -224,4 +343,4 @@ void cactus_lstm_cell_f16( size_t hidden_size ); -#endif +#endif \ No newline at end of file diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h index ac49d05..118c85c 100644 --- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h @@ -44,6 +44,34 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) { #endif } +inline bool cpu_has_sme2() { +#if defined(__aarch64__) + static std::once_flag once; + static bool has = false; + + std::call_once(once, []() { + +#if defined(__APPLE__) + int ret = 0; + size_t size = sizeof(ret); + if (sysctlbyname("hw.optional.arm.FEAT_SME2", &ret, &size, nullptr, 0) == 0) { + has = ret == 1; + } + +#elif defined(__ANDROID__) + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#ifdef HWCAP2_SME2 + has = (hwcap2 & HWCAP2_SME2) != 0; +#endif + +#endif + }); + + return has; +#else + return false; +#endif +} inline float32x4_t fast_exp_f32x4(float32x4_t x) { const float32x4_t log2e = vdupq_n_f32(1.4426950408889634f); @@ -102,6 +130,12 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) { return result; } +inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) { + int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr)); + high_decoded = vshrq_n_s8(packed, 4); + low_decoded = vshrq_n_s8(vshlq_n_s8(packed, 4), 4); +} + namespace CactusThreading { class ThreadPool { @@ -297,7 +331,7 @@ namespace CactusThreading { } static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) { if (N_blocks < GEMV_MIN_N_BLOCKS) return 1; - return std::min(pool_size, static_cast(2)); + return std::min(pool_size, static_cast(3)); } #else static constexpr size_t GEMV_MIN_N_BLOCKS = 256; @@ -308,7 +342,7 @@ namespace CactusThreading { static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) { if (N_blocks < GEMV_MIN_N_BLOCKS) return 1; if (N_blocks < 512) return std::min(pool_size, static_cast(2)); - return std::min(pool_size, static_cast(4)); + return std::min(pool_size, static_cast(5)); } #endif }; @@ -465,4 +499,4 @@ namespace CactusThreading { } -#endif // KERNEL_UTILS_H \ No newline at end of file +#endif // KERNEL_UTILS_H diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist index 32b19c0..bce5a16 100644 Binary files a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist and b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist differ diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources index b48eb36..143e71b 100644 --- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources +++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources @@ -6,7 +6,7 @@ Info.plist - yMSW0g+AKq/xXqUOrMbK43roF5I= + cN36qyYdB+mdJFxX4r84gFZ7SS4= files2 diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus index 818966e..b1ee86e 100755 Binary files a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus and b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus differ diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h new file mode 100644 index 0000000..e61841d --- /dev/null +++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h @@ -0,0 +1,48 @@ +#ifndef CACTUS_CLOUD_H +#define CACTUS_CLOUD_H + +#include "cactus_utils.h" +#include +#include + +namespace cactus { +namespace ffi { + +struct CloudResponse { + std::string transcript; + std::string api_key_hash; + bool used_cloud = false; + std::string error; +}; + +struct CloudCompletionRequest { + std::vector messages; + std::vector tools; + std::string local_output; + std::vector local_function_calls; + bool has_images = false; + std::string cloud_key; +}; + +struct CloudCompletionResult { + bool ok = false; + bool used_cloud = false; + std::string response; + std::vector function_calls; + std::string error; +}; + +std::string cloud_base64_encode(const uint8_t* data, size_t len); +std::vector cloud_build_wav(const uint8_t* pcm, size_t pcm_bytes); +std::string resolve_cloud_api_key(const char* cloud_key_param); +CloudResponse cloud_transcribe_request(const std::string& audio_b64, + const std::string& fallback_text, + long timeout_seconds = 15L, + const char* cloud_key = nullptr); +CloudCompletionResult cloud_complete_request(const CloudCompletionRequest& request, + long timeout_ms); + +} // namespace ffi +} // namespace cactus + +#endif // CACTUS_CLOUD_H diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h index c627a13..aa72986 100644 --- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h @@ -76,6 +76,16 @@ CACTUS_FFI_EXPORT int cactus_transcribe( size_t pcm_buffer_size ); +CACTUS_FFI_EXPORT int cactus_detect_language( + cactus_model_t model, + const char* audio_file_path, // NULL if using pcm_buffer + char* response_buffer, + size_t buffer_size, + const char* options_json, // optional + const uint8_t* pcm_buffer, // NULL if using audio_file_path + size_t pcm_buffer_size +); + CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_start( cactus_model_t model, const char* options_json // optional @@ -189,7 +199,10 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index); CACTUS_FFI_EXPORT const char* cactus_get_last_error(void); -CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location); +CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version); +CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id); +CACTUS_FFI_EXPORT void cactus_telemetry_flush(void); +CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void); #ifdef __cplusplus } diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h index 5f360bd..3b5d97f 100644 --- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h +++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h @@ -2,6 +2,7 @@ #define CACTUS_UTILS_H #include "../engine/engine.h" +#include "../models/model.h" #include #include #include @@ -12,6 +13,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -101,12 +105,92 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_whisper_spectrogram return cfg; } +inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogram_config() { + cactus::engine::AudioProcessor::SpectrogramConfig cfg{}; + cfg.n_fft = 512; + cfg.frame_length = 400; + cfg.hop_length = 160; + cfg.power = 2.0f; + cfg.center = true; + cfg.pad_mode = "constant"; + cfg.onesided = true; + cfg.dither = 0.0f; + cfg.mel_floor = 5.960464477539063e-08f; // 2^-24 guard value used by HF Parakeet. + cfg.log_mel = "log"; + cfg.reference = 1.0f; + cfg.min_value = 1e-10f; + cfg.remove_dc_offset = false; + cfg.hann_periodic = false; + return cfg; +} + +inline void apply_preemphasis(std::vector& waveform, float coefficient = 0.97f) { + if (waveform.size() < 2 || coefficient == 0.0f) { + return; + } + for (size_t i = waveform.size() - 1; i > 0; --i) { + waveform[i] -= coefficient * waveform[i - 1]; + } +} + +inline void normalize_parakeet_log_mel(std::vector& mel, size_t num_mels, float epsilon = 1e-5f) { + if (mel.empty() || num_mels == 0 || (mel.size() % num_mels) != 0) { + return; + } + const size_t num_frames = mel.size() / num_mels; + if (num_frames == 0) { + return; + } + + for (size_t m = 0; m < num_mels; ++m) { + const size_t base = m * num_frames; + float mean = 0.0f; + for (size_t t = 0; t < num_frames; ++t) { + mean += mel[base + t]; + } + mean /= static_cast(num_frames); + + float variance = 0.0f; + for (size_t t = 0; t < num_frames; ++t) { + const float d = mel[base + t] - mean; + variance += d * d; + } + const float denom = static_cast(std::max(1, num_frames - 1)); + const float inv_std = 1.0f / std::sqrt((variance / denom) + epsilon); + for (size_t t = 0; t < num_frames; ++t) { + mel[base + t] = (mel[base + t] - mean) * inv_std; + } + } +} + +inline void trim_mel_frames(std::vector& mel, size_t num_mels, size_t valid_frames) { + if (mel.empty() || num_mels == 0 || (mel.size() % num_mels) != 0) { + return; + } + size_t total_frames = mel.size() / num_mels; + if (valid_frames == 0 || valid_frames >= total_frames) { + return; + } + std::vector trimmed(num_mels * valid_frames); + for (size_t m = 0; m < num_mels; ++m) { + const float* src = &mel[m * total_frames]; + float* dst = &trimmed[m * valid_frames]; + std::copy(src, src + valid_frames, dst); + } + mel.swap(trimmed); +} + } // namespace audio } // namespace cactus namespace cactus { namespace ffi { +inline bool env_flag_enabled(const char* key) { + const char* value = std::getenv(key); + return value && value[0] != '\0' && !(value[0] == '0' && value[1] == '\0'); +} + inline std::string generateUUID() { #ifdef __APPLE__ uuid_t uuid; @@ -114,6 +198,25 @@ inline std::string generateUUID() { char uuid_str[37]; uuid_unparse_lower(uuid, uuid_str); return std::string(uuid_str); +#else + static std::random_device rd; + static std::mt19937 gen(rd()); + static std::uniform_int_distribution<> dis(0, 15); + static std::uniform_int_distribution<> dis2(8, 11); + + std::stringstream ss; + ss << std::hex; + for (int i = 0; i < 8; i++) ss << dis(gen); + ss << "-"; + for (int i = 0; i < 4; i++) ss << dis(gen); + ss << "-4"; + for (int i = 0; i < 3; i++) ss << dis(gen); + ss << "-"; + ss << dis2(gen); + for (int i = 0; i < 3; i++) ss << dis(gen); + ss << "-"; + for (int i = 0; i < 12; i++) ss << dis(gen); + return ss.str(); #endif } @@ -150,6 +253,130 @@ inline std::string escape_json_string(const std::string& s) { return o.str(); } + +inline std::string trim_string(const std::string& s) { + size_t start = 0; + while (start < s.size() && std::isspace(static_cast(s[start]))) ++start; + size_t end = s.size(); + while (end > start && std::isspace(static_cast(s[end - 1]))) --end; + return s.substr(start, end - start); +} + +inline std::string env_or_default(const char* key, const char* fallback) { + const char* v = std::getenv(key); + if (v && v[0] != '\0') return std::string(v); + return std::string(fallback); +} + +inline std::string json_string_field(const std::string& json, const std::string& key) { + std::string pattern = "\"" + key + "\":"; + size_t pos = json.find(pattern); + if (pos == std::string::npos) return {}; + + size_t i = pos + pattern.size(); + while (i < json.size() && std::isspace(static_cast(json[i]))) i++; + if (i >= json.size() || json[i] != '"') return {}; + ++i; + + std::string out; + out.reserve(128); + while (i < json.size()) { + char c = json[i++]; + if (c == '"') return out; + if (c == '\\' && i < json.size()) { + char e = json[i++]; + switch (e) { + case '"': out.push_back('"'); break; + case '\\': out.push_back('\\'); break; + case '/': out.push_back('/'); break; + case 'b': out.push_back('\b'); break; + case 'f': out.push_back('\f'); break; + case 'n': out.push_back('\n'); break; + case 'r': out.push_back('\r'); break; + case 't': out.push_back('\t'); break; + default: out.push_back(e); break; + } + continue; + } + out.push_back(c); + } + return {}; +} + +inline std::string json_array_field(const std::string& json, const std::string& key) { + std::string pattern = "\"" + key + "\":"; + size_t pos = json.find(pattern); + if (pos == std::string::npos) return "[]"; + size_t start = pos + pattern.size(); + while (start < json.size() && std::isspace(static_cast(json[start]))) ++start; + if (start >= json.size() || json[start] != '[') return "[]"; + + int depth = 1; + size_t end = start + 1; + while (end < json.size() && depth > 0) { + if (json[end] == '[') depth++; + else if (json[end] == ']') depth--; + end++; + } + return json.substr(start, end - start); +} + +inline std::vector split_json_array(const std::string& array_json) { + std::vector out; + if (array_json.size() < 2 || array_json.front() != '[' || array_json.back() != ']') return out; + + size_t i = 1; + while (i + 1 < array_json.size()) { + while (i + 1 < array_json.size() && + (std::isspace(static_cast(array_json[i])) || array_json[i] == ',')) i++; + if (i + 1 >= array_json.size() || array_json[i] != '{') break; + + size_t start = i; + int depth = 0; + bool in_str = false; + bool esc = false; + for (; i < array_json.size(); ++i) { + char c = array_json[i]; + if (in_str) { + if (esc) esc = false; + else if (c == '\\') esc = true; + else if (c == '"') in_str = false; + continue; + } + if (c == '"') { in_str = true; continue; } + if (c == '{') depth++; + if (c == '}') { + depth--; + if (depth == 0) { + out.push_back(array_json.substr(start, i - start + 1)); + i++; + break; + } + } + } + } + return out; +} + +inline std::string serialize_tools_json(const std::vector& tools) { + if (tools.empty()) return ""; + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < tools.size(); ++i) { + if (i > 0) oss << ","; + oss << "{\"type\":\"function\",\"function\":{"; + oss << "\"name\":\"" << escape_json_string(tools[i].name) << "\","; + oss << "\"description\":\"" << escape_json_string(tools[i].description) << "\""; + auto it = tools[i].parameters.find("schema"); + if (it != tools[i].parameters.end()) { + oss << ",\"parameters\":" << it->second; + } + oss << "}}"; + } + oss << "]"; + return oss.str(); +} + inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) { std::ostringstream json; json << "{"; @@ -324,7 +551,10 @@ inline void parse_options_json(const std::string& json, float& confidence_threshold, bool& include_stop_sequences, bool& use_vad, - bool& telemetry_enabled) { + bool& telemetry_enabled, + bool* auto_handoff = nullptr, + size_t* cloud_timeout_ms = nullptr, + bool* handoff_with_images = nullptr) { temperature = 0.0f; top_p = 0.0f; top_k = 0; @@ -335,6 +565,9 @@ inline void parse_options_json(const std::string& json, include_stop_sequences = false; use_vad = true; telemetry_enabled = true; + if (auto_handoff) *auto_handoff = true; + if (cloud_timeout_ms) *cloud_timeout_ms = 15000; + if (handoff_with_images) *handoff_with_images = true; stop_sequences.clear(); if (json.empty()) return; @@ -403,6 +636,32 @@ inline void parse_options_json(const std::string& json, telemetry_enabled = (json.substr(pos, 4) == "true"); } + if (auto_handoff) { + pos = json.find("\"auto_handoff\""); + if (pos != std::string::npos) { + pos = json.find(':', pos) + 1; + while (pos < json.length() && std::isspace(json[pos])) pos++; + *auto_handoff = (json.substr(pos, 4) == "true"); + } + } + + if (cloud_timeout_ms) { + pos = json.find("\"cloud_timeout_ms\""); + if (pos != std::string::npos) { + pos = json.find(':', pos) + 1; + *cloud_timeout_ms = std::stoul(json.substr(pos)); + } + } + + if (handoff_with_images) { + pos = json.find("\"handoff_with_images\""); + if (pos != std::string::npos) { + pos = json.find(':', pos) + 1; + while (pos < json.length() && std::isspace(json[pos])) pos++; + *handoff_with_images = (json.substr(pos, 4) == "true"); + } + } + pos = json.find("\"stop_sequences\""); if (pos != std::string::npos) { pos = json.find('[', pos); @@ -422,31 +681,8 @@ inline void parse_options_json(const std::string& json, } } -inline std::string format_tools_for_prompt(const std::vector& tools) { - if (tools.empty()) return ""; - std::string formatted_tools_json; - for (size_t i = 0; i < tools.size(); i++) { - if (i > 0) formatted_tools_json += "\n"; - formatted_tools_json += "{\"type\":\"function\",\"function\":{\"name\":\"" - + tools[i].name - + "\",\"description\":\"" - + tools[i].description + "\""; - if (tools[i].parameters.find("schema") != tools[i].parameters.end()) { - formatted_tools_json += ",\"parameters\":" + tools[i].parameters.at("schema"); - } - formatted_tools_json += "}}"; - } - return formatted_tools_json; -} - static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) { - while (begin < end && std::isspace(static_cast(value[begin]))) { - begin++; - } - while (end > begin && std::isspace(static_cast(value[end - 1]))) { - end--; - } - return value.substr(begin, end - begin); + return trim_string(value.substr(begin, end - begin)); } static inline void append_lfm2_call(const std::string& entry, @@ -577,23 +813,49 @@ inline void parse_function_calls_from_response(const std::string& response_text, if (!content.empty() && content.front() == '[' && content.back() == ']') { std::string inner = content.substr(1, content.size() - 2); - size_t start = 0; - int paren_depth = 0; - - for (size_t i = 0; i < inner.size(); ++i) { - char c = inner[i]; - if (c == '(') { - paren_depth++; - } else if (c == ')' && paren_depth > 0) { - paren_depth--; - } else if (c == ',' && paren_depth == 0) { - append_lfm2_call(inner.substr(start, i - start), function_calls); - start = i + 1; + + size_t inner_first = inner.find_first_not_of(" \t\n\r"); + if (inner_first != std::string::npos && inner[inner_first] == '{') { + size_t pos = inner_first; + while (pos < inner.size()) { + if (inner[pos] == '{') { + int brace_depth = 1; + size_t obj_start = pos; + pos++; + while (pos < inner.size() && brace_depth > 0) { + if (inner[pos] == '{') brace_depth++; + else if (inner[pos] == '}') brace_depth--; + pos++; + } + if (brace_depth == 0) { + std::string json_obj = inner.substr(obj_start, pos - obj_start); + if (json_obj.find("\"name\"") != std::string::npos) { + function_calls.push_back(json_obj); + } + } + } else { + pos++; + } + } + } else { + size_t start = 0; + int paren_depth = 0; + + for (size_t i = 0; i < inner.size(); ++i) { + char c = inner[i]; + if (c == '(') { + paren_depth++; + } else if (c == ')' && paren_depth > 0) { + paren_depth--; + } else if (c == ',' && paren_depth == 0) { + append_lfm2_call(inner.substr(start, i - start), function_calls); + start = i + 1; + } } - } - if (start < inner.size()) { - append_lfm2_call(inner.substr(start), function_calls); + if (start < inner.size()) { + append_lfm2_call(inner.substr(start), function_calls); + } } } else if (!content.empty()) { append_lfm2_call(content, function_calls); @@ -648,7 +910,7 @@ inline std::string construct_response_json(const std::string& regular_response, bool cloud_handoff = false) { std::ostringstream json; json << "{"; - json << "\"success\":" << (cloud_handoff ? "false" : "true") << ","; + json << "\"success\":true,"; json << "\"error\":null,"; json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ","; json << "\"response\":\"" << escape_json_string(regular_response) << "\","; @@ -671,30 +933,6 @@ inline std::string construct_response_json(const std::string& regular_response, return json.str(); } -inline std::string construct_cloud_handoff_json(float confidence, - double time_to_first_token, - double prefill_tps, - size_t prompt_tokens) { - std::ostringstream json; - json << "{"; - json << "\"success\":false,"; - json << "\"error\":null,"; - json << "\"cloud_handoff\":true,"; - json << "\"response\":null,"; - json << "\"function_calls\":[],"; - json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ","; - json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ","; - json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ","; - json << "\"prefill_tps\":" << std::fixed << std::setprecision(2) << prefill_tps << ","; - json << "\"decode_tps\":0.0,"; - json << "\"ram_usage_mb\":" << std::fixed << std::setprecision(2) << get_ram_usage_mb() << ","; - json << "\"prefill_tokens\":" << prompt_tokens << ","; - json << "\"decode_tokens\":0,"; - json << "\"total_tokens\":" << prompt_tokens; - json << "}"; - return json.str(); -} - inline std::string serialize_function_calls(const std::vector& calls) { if (calls.empty()) return "[]"; std::ostringstream oss; @@ -720,4 +958,4 @@ const char* cactus_get_last_error(); } #endif -#endif // CACTUS_UTILS_H +#endif // CACTUS_UTILS_H \ No newline at end of file diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h index 620fab6..c8bf34a 100644 --- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h @@ -56,6 +56,12 @@ struct Config { uint32_t num_shared_experts = 0; uint32_t num_top_experts = 0; uint32_t moe_every_n_layers = 0; + uint32_t moe_intermediate_dim = 0; + uint32_t num_dense_layers = 0; + uint32_t num_experts_per_tok = 0; + bool norm_topk_prob = false; + bool use_expert_bias = false; + float routed_scaling_factor = 1.0f; bool tie_word_embeddings = true; uint32_t vision_hidden_dim = 0; @@ -93,8 +99,22 @@ struct Config { uint32_t num_encoder_layers = 0; uint32_t num_decoder_layers = 0; float partial_rotary_factor = 0.0f; - - enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9}; + uint32_t pad_token_id = 0; + uint32_t conv_kernel_size = 0; + uint32_t subsampling_conv_kernel_size = 0; + uint32_t subsampling_conv_stride = 0; + uint32_t subsampling_conv_channels = 0; + uint32_t subsampling_factor = 0; + uint32_t num_mel_bins = 80; + std::string encoder_hidden_act = "silu"; + uint32_t predictor_hidden_dim = 0; + uint32_t predictor_num_layers = 0; + uint32_t tdt_joint_dim = 0; + uint32_t tdt_num_durations = 0; + uint32_t tdt_blank_id = 0; + std::vector tdt_durations; + + enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11}; ModelType model_type = ModelType::QWEN; enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3}; @@ -168,7 +188,7 @@ class Tokenizer { uint32_t get_global_img_token_id() const { return global_img_token_id_; } protected: - enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER}; + enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET}; ModelType model_type_ = ModelType::UNKNOWN; enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG}; ModelVariant model_variant_ = ModelVariant::DEFAULT; @@ -366,7 +386,6 @@ struct KVCache { size_t num_tokens, size_t kv_heads, size_t head_dim); bool is_empty() const { return current_seq_len == 0; } - bool is_int8() const { return precision == Precision::INT8; } void* get_key_ptr(size_t layer); void* get_value_ptr(size_t layer); @@ -684,6 +703,8 @@ class AudioProcessor { float reference = 1.0f; float min_value = 1e-10f; bool remove_dc_offset = false; + float preemphasis = 0.0f; + bool hann_periodic = true; }; AudioProcessor(); @@ -696,6 +717,11 @@ class AudioProcessor { const std::vector& waveform, const SpectrogramConfig& config); + static std::vector compute_irfft( + const std::vector& complex_input, + size_t n, + const char* norm = "backward"); + const std::vector& get_mel_filters() const { return mel_filters_; } size_t get_num_mel_filters() const { return num_mel_filters_; } @@ -721,6 +747,8 @@ namespace index { struct QueryResult { int doc_id; float score; + + QueryResult(int doc_id, float score) : doc_id(doc_id), score(score) {} }; struct QueryOptions { diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h index 255d83c..01b7b2f 100644 --- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -109,23 +110,33 @@ enum class ComputeBackend { NPU }; +enum class Activation { + SILU, + GELU, + GELU_ERF, + RELU, + SIGMOID, + TANH +}; + enum class OpType { INPUT, PRECISION_CAST, ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE, MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING, BILINEAR_INTERPOLATION, SUM, MEAN, VARIANCE, MIN, MAX, - RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, - SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, + RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM, + SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG, RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH, SAMPLE, CONCAT, SCATTER_TOPK, TOPK, LAYERNORM, GROUPNORM, + MOE_LAYER, INDEX, PERSISTENT, QUANTIZE_ACTIVATIONS, LSTM_CELL, - STFT_MAGNITUDE + STFT }; struct PrecisionTraits { @@ -141,11 +152,20 @@ struct PrecisionTraits { static constexpr size_t packed_size_of(Precision prec, size_t count) { switch (prec) { - case Precision::INT4: return (count + 1) / 2; + case Precision::INT4: return (count + 1) / 2; default: return count * size_of(prec); } } + static size_t byte_offset_of(Precision prec, size_t element_offset) { + switch (prec) { + case Precision::INT4: + assert(element_offset % 32 == 0 && "INT4 byte offset must be group-aligned (multiple of 32)"); + return element_offset / 2; + default: return element_offset * size_of(prec); + } + } + static constexpr bool is_integer(Precision prec) { switch (prec) { case Precision::INT8: return true; @@ -181,7 +201,6 @@ struct TensorConfig { Precision compute_precision = Precision::INT8; Precision output_precision = Precision::INT8; bool auto_mixed_precision = false; - bool enable_int4_packing = true; static TensorConfig& global(); }; @@ -243,6 +262,10 @@ struct BufferDesc { return precision == Precision::INT8 && group_size > 0; } + bool is_grouped_int4() const { + return precision == Precision::INT4 && group_size > 0; + } + void set_grouped_scales(size_t gs, size_t ng, void* scales_ptr) { group_size = gs; num_groups = ng; @@ -291,6 +314,7 @@ struct OpParams { size_t slice_length = 0; size_t window_size = 0; bool is_causal = true; + bool attention_mask_is_additive = false; std::vector new_shape; std::vector permutation; Precision output_precision = Precision::INT8; @@ -309,6 +333,11 @@ struct OpParams { size_t num_groups = 0; size_t dst_height = 0; size_t dst_width = 0; + bool normalize_routing = false; + size_t num_experts = 0; + size_t num_experts_per_tok = 0; + bool moe_gated = true; + Activation activation = Activation::SILU; std::vector bias_values; std::vector bias_indices; @@ -356,7 +385,6 @@ void compute_index_node(GraphNode& node, const std::vector>& nodes, const std::unordered_map& node_index_map); void shrink_thread_local_buffers(); - class BufferPool { public: BufferPool() = default; @@ -418,6 +446,7 @@ class CactusGraph { size_t scalar_sqrt(size_t input); size_t scalar_cos(size_t input); size_t scalar_sin(size_t input); + size_t scalar_log(size_t input); size_t relu(size_t input); size_t silu(size_t input); @@ -425,6 +454,7 @@ class CactusGraph { size_t gelu_erf(size_t input); size_t sigmoid(size_t input); size_t tanh(size_t input); + size_t glu(size_t input, int axis = -1); size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU); size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU); @@ -455,7 +485,30 @@ class CactusGraph { size_t layernorm(size_t input, size_t weight, size_t bias, float epsilon = 1e-5f); size_t layernorm(size_t input, size_t weight, float epsilon = 1e-5f); // No bias version size_t groupnorm(size_t input, size_t weight, size_t bias, size_t num_groups = 32, float epsilon = 1e-5f); + size_t batchnorm(size_t input, size_t weight, size_t bias, size_t running_mean, size_t running_var, int axis = 1, float epsilon = 1e-5f); size_t topk(size_t input, size_t k); + size_t moe_layer(size_t hidden, + size_t routing_probs, + size_t topk_indices, + const std::vector& w1_weights, + const std::vector& w3_weights, + const std::vector& w2_weights, + size_t num_experts, + size_t num_experts_per_tok, + bool normalize_routing, + float epsilon, + float routed_scaling_factor); + size_t moe_layer(size_t hidden, + size_t routing_probs, + size_t topk_indices, + const std::vector& w1_weights, + const std::vector& w2_weights, + size_t num_experts, + size_t num_experts_per_tok, + bool normalize_routing, + float epsilon, + float routed_scaling_factor, + Activation activation); size_t rms_norm(size_t input, size_t weight, float epsilon = 1e-5f); size_t rope(size_t input, float theta, size_t position_offset = 0, ComputeBackend backend = ComputeBackend::CPU); size_t rope_gptj(size_t input, float theta, size_t position_offset = 0, size_t rot_dim = 0, ComputeBackend backend = ComputeBackend::CPU); @@ -463,6 +516,10 @@ class CactusGraph { size_t attention(size_t query, size_t key, size_t value, float scale, bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU); size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, ComputeBackend backend = ComputeBackend::CPU); size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU); + size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale, + bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU, + bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0); + size_t rel_pos_bias(size_t query, size_t relative_key, float scale); size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset, const int8_t* cached_keys, const int8_t* cached_values, @@ -474,9 +531,19 @@ class CactusGraph { size_t conv1d_k7s3(size_t input, size_t weight, size_t bias); size_t conv1d(size_t input, size_t weight, size_t stride); size_t conv1d(size_t input, size_t weight, size_t bias, size_t stride); + size_t conv1d_same_depthwise_k9(size_t input, size_t weight); + size_t conv1d_same_depthwise_k9(size_t input, size_t weight, size_t bias); + size_t conv1d_pointwise(size_t input, size_t weight); + size_t conv1d_pointwise(size_t input, size_t weight, size_t bias); + size_t conv2d_k3s2p1(size_t input, size_t weight); + size_t conv2d_k3s2p1(size_t input, size_t weight, size_t bias); + size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight); + size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight, size_t bias); + size_t conv2d_pointwise_1x1(size_t input, size_t weight); + size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias); size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh); - size_t stft_magnitude(size_t input, size_t weight, size_t stride, size_t num_fft_bins); + size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins); size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20, const std::unordered_map& logit_bias = {}); @@ -581,12 +648,9 @@ namespace GraphFile { bool is_interleaved_ = false; size_t original_N_ = 0; - std::unique_ptr unpacked_data_; - void parse_header(); void apply_madvise_hints(); - void unpack_int4_data(); }; } -#endif +#endif \ No newline at end of file diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h index 17acd36..0ec7265 100644 --- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h @@ -4,6 +4,8 @@ #include #include +enum class Precision; + enum class ScalarOpType { ADD, SUBTRACT, @@ -12,7 +14,8 @@ enum class ScalarOpType { EXP, SQRT, COS, - SIN + SIN, + LOG }; constexpr size_t KV_QUANT_GROUP_SIZE = 32; @@ -21,6 +24,7 @@ void cactus_add_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num void cactus_add_f16_clipped(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); void cactus_subtract_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); void cactus_multiply_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); +void cactus_add_scaled_f16(const __fp16* base, const __fp16* src, __fp16* output, size_t num_elements, float scale); void cactus_divide_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements); void cactus_add_broadcast_f16(const __fp16* a, const __fp16* b, __fp16* output, @@ -50,6 +54,23 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales, const int8_t* B, const __fp16* B_scales, __fp16* C, size_t M, size_t K, size_t N, size_t group_size); +void cactus_gemv_int4(const int8_t* A, float A_scale, + const int8_t* B_packed, const __fp16* B_scales, + __fp16* C, size_t K, size_t N, size_t group_size); + +void cactus_gemm_int4(const int8_t* A, const float* A_scales, + const int8_t* B_packed, const __fp16* B_scales, + __fp16* C, size_t M, size_t K, size_t N, size_t group_size); + +void cactus_matmul_int4(const int8_t* A, const float* A_scales, + const int8_t* B_packed, const __fp16* B_scales, + __fp16* C, size_t M, size_t K, size_t N, size_t group_size); + +void cactus_matmul_integer(Precision precision, + const int8_t* A, const float* A_scales, + const int8_t* B, const __fp16* B_scales, + __fp16* C, size_t M, size_t K, size_t N, size_t group_size); + void cactus_matmul_f16(const __fp16* a, const __fp16* b_transposed, __fp16* c, size_t M, size_t K, size_t N); @@ -97,10 +118,52 @@ void cactus_sigmoid_f16(const __fp16* input, __fp16* output, size_t num_elements void cactus_tanh_f16(const __fp16* input, __fp16* output, size_t num_elements); +void cactus_glu_f16( + const __fp16* input, + __fp16* output, + size_t outer_size, + size_t split_size, + size_t inner_size +); + +void cactus_glu_f32( + const float* input, + float* output, + size_t outer_size, + size_t split_size, + size_t inner_size +); + +void cactus_batchnorm_f16( + const __fp16* input, + const float* weight, + const float* bias, + const float* running_mean, + const float* running_var, + __fp16* output, + size_t outer_size, + size_t channels, + size_t inner_size, + float epsilon +); + +void cactus_batchnorm_f32( + const float* input, + const float* weight, + const float* bias, + const float* running_mean, + const float* running_var, + float* output, + size_t outer_size, + size_t channels, + size_t inner_size, + float epsilon +); + void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output, size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads, size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0, - bool is_causal = true); + bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false); void cactus_attention_hybrid_int8_fp16( const __fp16* queries, @@ -150,7 +213,7 @@ void cactus_conv1d_f16( size_t stride ); -void cactus_stft_magnitude_f16( +void cactus_stft_f16( const __fp16* input, const __fp16* weight, __fp16* output, @@ -171,6 +234,62 @@ void cactus_conv1d_f16_k7s3_oc8( size_t C_out ); +void cactus_conv1d_same_depthwise_f16_k9( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t L, + size_t C +); + +void cactus_conv2d_f16_k3s2p1_nchw( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t C_in, + size_t H, + size_t W, + size_t C_out +); + +void cactus_conv2d_depthwise_f16_k3s2p1_nchw( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t C, + size_t H, + size_t W +); + +void cactus_conv2d_pointwise_f16_1x1_nchw_gemm( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t C_in, + size_t H, + size_t W, + size_t C_out +); + +void cactus_conv1d_pointwise_f16_gemm( + const __fp16* input, + const __fp16* weight, + const __fp16* bias, + __fp16* output, + size_t N, + size_t L, + size_t C_in, + size_t C_out +); + void cactus_bilinear_interpolation_f16(const __fp16* input, __fp16* output, size_t src_height, size_t src_width, size_t embed_dim, size_t dst_height, size_t dst_width); @@ -224,4 +343,4 @@ void cactus_lstm_cell_f16( size_t hidden_size ); -#endif +#endif \ No newline at end of file diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h index ac49d05..118c85c 100644 --- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h +++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h @@ -44,6 +44,34 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) { #endif } +inline bool cpu_has_sme2() { +#if defined(__aarch64__) + static std::once_flag once; + static bool has = false; + + std::call_once(once, []() { + +#if defined(__APPLE__) + int ret = 0; + size_t size = sizeof(ret); + if (sysctlbyname("hw.optional.arm.FEAT_SME2", &ret, &size, nullptr, 0) == 0) { + has = ret == 1; + } + +#elif defined(__ANDROID__) + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#ifdef HWCAP2_SME2 + has = (hwcap2 & HWCAP2_SME2) != 0; +#endif + +#endif + }); + + return has; +#else + return false; +#endif +} inline float32x4_t fast_exp_f32x4(float32x4_t x) { const float32x4_t log2e = vdupq_n_f32(1.4426950408889634f); @@ -102,6 +130,12 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) { return result; } +inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) { + int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr)); + high_decoded = vshrq_n_s8(packed, 4); + low_decoded = vshrq_n_s8(vshlq_n_s8(packed, 4), 4); +} + namespace CactusThreading { class ThreadPool { @@ -297,7 +331,7 @@ namespace CactusThreading { } static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) { if (N_blocks < GEMV_MIN_N_BLOCKS) return 1; - return std::min(pool_size, static_cast(2)); + return std::min(pool_size, static_cast(3)); } #else static constexpr size_t GEMV_MIN_N_BLOCKS = 256; @@ -308,7 +342,7 @@ namespace CactusThreading { static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) { if (N_blocks < GEMV_MIN_N_BLOCKS) return 1; if (N_blocks < 512) return std::min(pool_size, static_cast(2)); - return std::min(pool_size, static_cast(4)); + return std::min(pool_size, static_cast(5)); } #endif }; @@ -465,4 +499,4 @@ namespace CactusThreading { } -#endif // KERNEL_UTILS_H \ No newline at end of file +#endif // KERNEL_UTILS_H diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist b/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist index db7b528..ba87ea8 100644 Binary files a/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist and b/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist differ diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus b/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus index fdd1651..e521921 100755 Binary files a/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus and b/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus differ diff --git a/nitrogen/generated/shared/c++/HybridCactusSpec.cpp b/nitrogen/generated/shared/c++/HybridCactusSpec.cpp index 775ceb8..e3b9763 100644 --- a/nitrogen/generated/shared/c++/HybridCactusSpec.cpp +++ b/nitrogen/generated/shared/c++/HybridCactusSpec.cpp @@ -19,6 +19,7 @@ namespace margelo::nitro::cactus { prototype.registerHybridMethod("tokenize", &HybridCactusSpec::tokenize); prototype.registerHybridMethod("scoreWindow", &HybridCactusSpec::scoreWindow); prototype.registerHybridMethod("transcribe", &HybridCactusSpec::transcribe); + prototype.registerHybridMethod("detectLanguage", &HybridCactusSpec::detectLanguage); prototype.registerHybridMethod("streamTranscribeStart", &HybridCactusSpec::streamTranscribeStart); prototype.registerHybridMethod("streamTranscribeProcess", &HybridCactusSpec::streamTranscribeProcess); prototype.registerHybridMethod("streamTranscribeStop", &HybridCactusSpec::streamTranscribeStop); diff --git a/nitrogen/generated/shared/c++/HybridCactusSpec.hpp b/nitrogen/generated/shared/c++/HybridCactusSpec.hpp index 9ebee13..f44ffbe 100644 --- a/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +++ b/nitrogen/generated/shared/c++/HybridCactusSpec.hpp @@ -58,6 +58,7 @@ namespace margelo::nitro::cactus { virtual std::shared_ptr>> tokenize(const std::string& text) = 0; virtual std::shared_ptr> scoreWindow(const std::vector& tokens, double start, double end, double context) = 0; virtual std::shared_ptr> transcribe(const std::variant, std::string>& audio, const std::string& prompt, double responseBufferSize, const std::optional& optionsJson, const std::optional>& callback) = 0; + virtual std::shared_ptr> detectLanguage(const std::variant, std::string>& audio, double responseBufferSize, const std::optional& optionsJson) = 0; virtual std::shared_ptr> streamTranscribeStart(const std::optional& optionsJson) = 0; virtual std::shared_ptr> streamTranscribeProcess(const std::vector& audio) = 0; virtual std::shared_ptr> streamTranscribeStop() = 0; diff --git a/package.json b/package.json index f628d38..9707d4b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "cactus-react-native", - "version": "1.7.0", + "version": "1.10.0", "description": "Run AI models locally on mobile devices", "main": "./lib/module/index.js", "types": "./lib/typescript/src/index.d.ts", diff --git a/src/classes/CactusSTT.ts b/src/classes/CactusSTT.ts index ac9c9a4..8dbdeb6 100644 --- a/src/classes/CactusSTT.ts +++ b/src/classes/CactusSTT.ts @@ -10,6 +10,8 @@ import type { CactusSTTStreamTranscribeProcessParams, CactusSTTStreamTranscribeProcessResult, CactusSTTStreamTranscribeStopResult, + CactusSTTDetectLanguageParams, + CactusSTTDetectLanguageResult, } from '../types/CactusSTT'; import { getRegistry } from '../modelRegistry'; import type { CactusModel } from '../types/common'; @@ -179,6 +181,24 @@ export class CactusSTT { } } + public async detectLanguage({ + audio, + options, + }: CactusSTTDetectLanguageParams): Promise { + if (this.isGenerating) { + throw new Error('CactusSTT is already generating'); + } + + await this.init(); + + this.isGenerating = true; + try { + return await this.cactus.detectLanguage(audio, options); + } finally { + this.isGenerating = false; + } + } + public async audioEmbed({ audioPath, }: CactusSTTAudioEmbedParams): Promise { diff --git a/src/index.tsx b/src/index.tsx index ea4435d..113e341 100644 --- a/src/index.tsx +++ b/src/index.tsx @@ -41,6 +41,9 @@ export type { CactusSTTStreamTranscribeProcessParams, CactusSTTStreamTranscribeProcessResult, CactusSTTStreamTranscribeStopResult, + CactusSTTDetectLanguageOptions, + CactusSTTDetectLanguageParams, + CactusSTTDetectLanguageResult, } from './types/CactusSTT'; export type { CactusVADParams, diff --git a/src/native/Cactus.ts b/src/native/Cactus.ts index e41a88a..4608b12 100644 --- a/src/native/Cactus.ts +++ b/src/native/Cactus.ts @@ -13,6 +13,8 @@ import type { CactusSTTStreamTranscribeStartOptions, CactusSTTStreamTranscribeProcessResult, CactusSTTStreamTranscribeStopResult, + CactusSTTDetectLanguageOptions, + CactusSTTDetectLanguageResult, } from '../types/CactusSTT'; import type { CactusVADOptions, CactusVADResult } from '../types/CactusVAD'; @@ -228,6 +230,36 @@ export class Cactus { } } + public async detectLanguage( + audio: string | number[], + options?: CactusSTTDetectLanguageOptions + ): Promise { + if (typeof audio === 'string') { + audio = audio.replace('file://', ''); + } + + const optionsJson = options + ? JSON.stringify({ use_vad: options.useVad }) + : undefined; + + const response = await this.hybridCactus.detectLanguage( + audio, + 1024, + optionsJson + ); + + try { + const parsed = JSON.parse(response); + + return { + language: parsed.language, + confidence: parsed.confidence, + }; + } catch { + throw new Error('Unable to parse detect language response'); + } + } + public async streamTranscribeStop(): Promise { const response = await this.hybridCactus.streamTranscribeStop(); try { diff --git a/src/specs/Cactus.nitro.ts b/src/specs/Cactus.nitro.ts index 8a36be0..ea5daa6 100644 --- a/src/specs/Cactus.nitro.ts +++ b/src/specs/Cactus.nitro.ts @@ -27,6 +27,11 @@ export interface Cactus extends HybridObject<{ ios: 'c++'; android: 'c++' }> { optionsJson?: string, callback?: (token: string, tokenId: number) => void ): Promise; + detectLanguage( + audio: string | number[], + responseBufferSize: number, + optionsJson?: string + ): Promise; streamTranscribeStart(optionsJson?: string): Promise; streamTranscribeProcess(audio: number[]): Promise; streamTranscribeStop(): Promise; diff --git a/src/types/CactusSTT.ts b/src/types/CactusSTT.ts index 28fe275..6686045 100644 --- a/src/types/CactusSTT.ts +++ b/src/types/CactusSTT.ts @@ -86,3 +86,17 @@ export interface CactusSTTStreamTranscribeStopResult { success: boolean; confirmed: string; } + +export interface CactusSTTDetectLanguageOptions { + useVad?: boolean; +} + +export interface CactusSTTDetectLanguageParams { + audio: string | number[]; + options?: CactusSTTDetectLanguageOptions; +} + +export interface CactusSTTDetectLanguageResult { + language: string; + confidence?: number; +}