diff --git a/android/src/main/jniLibs/arm64-v8a/libcactus.a b/android/src/main/jniLibs/arm64-v8a/libcactus.a
index 621821f..0a9fc58 100644
Binary files a/android/src/main/jniLibs/arm64-v8a/libcactus.a and b/android/src/main/jniLibs/arm64-v8a/libcactus.a differ
diff --git a/cpp/HybridCactus.cpp b/cpp/HybridCactus.cpp
index 3e66a3a..6a35bb2 100644
--- a/cpp/HybridCactus.cpp
+++ b/cpp/HybridCactus.cpp
@@ -212,6 +212,54 @@ std::shared_ptr<Promise<std::string>> HybridCactus::transcribe(
   });
 }
 
+/// Runs cactus_detect_language on `audio` (a file path, or PCM bytes carried as
+/// doubles). Rejects with std::runtime_error when no model is loaded, when
+/// `responseBufferSize` is not a usable size, or when the native call fails.
+std::shared_ptr<Promise<std::string>> HybridCactus::detectLanguage(
+    const std::variant<std::vector<double>, std::string> &audio,
+    double responseBufferSize,
+    const std::optional<std::string> &optionsJson) {
+  return Promise<std::string>::async(
+      [this, audio, optionsJson, responseBufferSize]() -> std::string {
+        std::lock_guard<std::mutex> lock(this->_modelMutex);
+        if (!this->_model) {
+          throw std::runtime_error("Cactus model is not initialized");
+        }
+        // Casting NaN, a negative or an oversized double to size_t is undefined
+        // behaviour, so validate the caller-supplied size before using it.
+        std::string response;
+        if (!(responseBufferSize >= 1.0) ||
+            responseBufferSize >= static_cast<double>(response.max_size())) {
+          throw std::runtime_error("Cactus detect language failed: "
+                                   "invalid responseBufferSize");
+        }
+        const auto bufferSize = static_cast<size_t>(responseBufferSize);
+        response.resize(bufferSize);
+        std::vector<uint8_t> pcm;
+        const auto *path = std::get_if<std::string>(&audio);
+        if (!path) {
+          const auto &samples = std::get<std::vector<double>>(audio);
+          pcm.reserve(samples.size());
+          for (double d : samples) {
+            // NaN fails `d >= 0.0` and becomes 0 instead of reaching the cast.
+            pcm.push_back(d >= 0.0 ? static_cast<uint8_t>(std::min(d, 255.0))
+                                   : uint8_t{0});
+          }
+        }
+        const int result = cactus_detect_language(
+            this->_model, path ? path->c_str() : nullptr, response.data(),
+            bufferSize, optionsJson ? optionsJson->c_str() : nullptr,
+            path ? nullptr : pcm.data(), pcm.size());
+        if (result < 0) {
+          const char *err = cactus_get_last_error();
+          throw std::runtime_error("Cactus detect language failed: " +
+                                   std::string(err ? err : "unknown error"));
+        }
+        response.resize(strlen(response.c_str()));
+        return response;
+      });
+}
+
 std::shared_ptr<Promise<void>> HybridCactus::streamTranscribeStart(
     const std::optional<std::string> &optionsJson) {
   return Promise<void>::async([this, optionsJson]() -> void {
@@ -477,7 +525,7 @@ std::shared_ptr<Promise<void>> HybridCactus::destroy() {
 std::shared_ptr<Promise<void>>
 HybridCactus::setTelemetryEnvironment(const std::string &cacheDir) {
   return Promise<void>::async([cacheDir]() -> void {
-    cactus_set_telemetry_environment("react-native-v1.7", cacheDir.c_str());
+    cactus_set_telemetry_environment("react-native", cacheDir.c_str(), "1.10.0"); // args: framework, cache_location, version
   });
 }
 
diff --git a/cpp/HybridCactus.hpp b/cpp/HybridCactus.hpp
index 9e79306..2c5db1d 100644
--- a/cpp/HybridCactus.hpp
+++ b/cpp/HybridCactus.hpp
@@ -39,6 +39,11 @@ class HybridCactus : public HybridCactusSpec {
                                              double /* tokenId */)>> &callback)
       override;
 
+  std::shared_ptr<Promise<std::string>>
+  detectLanguage(const std::variant<std::vector<double>, std::string> &audio,
+                 double responseBufferSize,
+                 const std::optional<std::string> &optionsJson) override;
+
   std::shared_ptr<Promise<void>>
   streamTranscribeStart(const std::optional<std::string> &optionsJson) override;
 
diff --git a/cpp/cactus_ffi.h b/cpp/cactus_ffi.h
index c627a13..aa72986 100644
--- a/cpp/cactus_ffi.h
+++ b/cpp/cactus_ffi.h
@@ -76,6 +76,16 @@ CACTUS_FFI_EXPORT int cactus_transcribe(
     size_t pcm_buffer_size
 );
 
+CACTUS_FFI_EXPORT int cactus_detect_language(
+    cactus_model_t model,
+    const char* audio_file_path,            // NULL if using pcm_buffer
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,               // optional
+    const uint8_t* pcm_buffer,              // NULL if using audio_file_path
+    size_t pcm_buffer_size
+);
+
 CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_start(
     cactus_model_t model,
     const char* options_json                // optional
@@ -189,7 +199,10 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index);
 
 CACTUS_FFI_EXPORT const char* cactus_get_last_error(void);
 
-CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location);
+CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version);
+CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id);
+CACTUS_FFI_EXPORT void cactus_telemetry_flush(void);
+CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void);
 
 #ifdef __cplusplus
 }
diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock
index 4fce007..d6f7e96 100644
--- a/example/ios/Podfile.lock
+++ b/example/ios/Podfile.lock
@@ -1,6 +1,6 @@
 PODS:
   - boost (1.84.0)
-  - Cactus (1.7.0):
+  - Cactus (1.10.0):
     - boost
     - DoubleConversion
     - fast_float
@@ -2643,7 +2643,7 @@ EXTERNAL SOURCES:
 
 SPEC CHECKSUMS:
   boost: 7e761d76ca2ce687f7cc98e698152abd03a18f90
-  Cactus: d549ac2651ab939a9b5bbcfd6827a1a4e7fa2d81
+  Cactus: 88585f8a152312dcb391526d839133d72d054031
   DoubleConversion: cb417026b2400c8f53ae97020b2be961b59470cb
   fast_float: b32c788ed9c6a8c584d114d0047beda9664e7cc6
   FBLazyVector: b8f1312d48447cca7b4abc21ed155db14742bd03
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h
new file mode 100644
index 0000000..e61841d
--- /dev/null
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h
@@ -0,0 +1,48 @@
+#ifndef CACTUS_CLOUD_H
+#define CACTUS_CLOUD_H
+
+#include "cactus_utils.h"
+#include <string>
+#include <vector>
+
+namespace cactus {
+namespace ffi {
+
+struct CloudResponse {  // return type of cloud_transcribe_request()
+    std::string transcript;
+    std::string api_key_hash;
+    bool used_cloud = false;
+    std::string error;  // NOTE(review): presumably empty when the request succeeded; confirm in the implementation
+};
+
+struct CloudCompletionRequest {  // argument type of cloud_complete_request()
+    std::vector<cactus::engine::ChatMessage> messages;
+    std::vector<ToolFunction> tools;
+    std::string local_output;
+    std::vector<std::string> local_function_calls;
+    bool has_images = false;
+    std::string cloud_key;  // NOTE(review): possibly resolved through resolve_cloud_api_key(); confirm
+};
+
+struct CloudCompletionResult {  // return type of cloud_complete_request()
+    bool ok = false;
+    bool used_cloud = false;
+    std::string response;
+    std::vector<std::string> function_calls;
+    std::string error;  // NOTE(review): presumably only meaningful when ok is false; confirm
+};
+
+std::string cloud_base64_encode(const uint8_t* data, size_t len);
+std::vector<uint8_t> cloud_build_wav(const uint8_t* pcm, size_t pcm_bytes);
+std::string resolve_cloud_api_key(const char* cloud_key_param);
+CloudResponse cloud_transcribe_request(const std::string& audio_b64,
+                                       const std::string& fallback_text,
+                                       long timeout_seconds = 15L,
+                                       const char* cloud_key = nullptr);
+CloudCompletionResult cloud_complete_request(const CloudCompletionRequest& request,
+                                             long timeout_ms);
+
+} // namespace ffi
+} // namespace cactus
+
+#endif // CACTUS_CLOUD_H
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
index c627a13..aa72986 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h
@@ -76,6 +76,16 @@ CACTUS_FFI_EXPORT int cactus_transcribe(
     size_t pcm_buffer_size
 );
 
+CACTUS_FFI_EXPORT int cactus_detect_language(
+    cactus_model_t model,
+    const char* audio_file_path,            // NULL if using pcm_buffer
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,               // optional
+    const uint8_t* pcm_buffer,              // NULL if using audio_file_path
+    size_t pcm_buffer_size
+);
+
 CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_start(
     cactus_model_t model,
     const char* options_json                // optional
@@ -189,7 +199,10 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index);
 
 CACTUS_FFI_EXPORT const char* cactus_get_last_error(void);
 
-CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location);
+CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version);
+CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id);
+CACTUS_FFI_EXPORT void cactus_telemetry_flush(void);
+CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void);
 
 #ifdef __cplusplus
 }
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
index 5f360bd..3b5d97f 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h
@@ -2,6 +2,7 @@
 #define CACTUS_UTILS_H
 
 #include "../engine/engine.h"
+#include "../models/model.h"
 #include <string>
 #include <vector>
 #include <unordered_map>
@@ -12,6 +13,9 @@
 #include <iostream>
 #include <filesystem>
 #include <cctype>
+#include <algorithm>
+#include <cmath>
+#include <limits>
 #include <memory>
 #include <atomic>
 #include <mutex>
@@ -101,12 +105,92 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_whisper_spectrogram
     return cfg;
 }
 
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogram_config() {  // spectrogram settings for Parakeet
+    cactus::engine::AudioProcessor::SpectrogramConfig cfg{};  // fields not assigned below keep their in-class defaults
+    cfg.n_fft        = 512;
+    cfg.frame_length = 400;
+    cfg.hop_length   = 160;
+    cfg.power        = 2.0f;
+    cfg.center       = true;
+    cfg.pad_mode     = "constant";
+    cfg.onesided     = true;
+    cfg.dither       = 0.0f;
+    cfg.mel_floor    = 5.960464477539063e-08f; // 2^-24 guard value used by HF Parakeet.
+    cfg.log_mel      = "log";
+    cfg.reference    = 1.0f;
+    cfg.min_value    = 1e-10f;
+    cfg.remove_dc_offset = false;
+    cfg.hann_periodic = false;  // differs from the in-class default (true) declared in engine.h
+    return cfg;
+}
+
+// In-place pre-emphasis: y[i] = x[i] - coefficient * x[i - 1]; x[0] is kept as is.
+// Runs back-to-front so every step still reads the unmodified predecessor.
+inline void apply_preemphasis(std::vector<float>& waveform, float coefficient = 0.97f) {
+    if (coefficient == 0.0f) return;
+    for (size_t next = waveform.size(); next > 1; --next) {
+        waveform[next - 1] -= coefficient * waveform[next - 2];
+    }
+}
+
+// Per-mel-bin normalisation of a [num_mels x num_frames] row-major buffer, in place:
+// each row is shifted to zero mean and scaled by 1/sqrt(sample_variance + epsilon).
+// Buffers that are empty or not a whole number of rows are left untouched.
+inline void normalize_parakeet_log_mel(std::vector<float>& mel, size_t num_mels, float epsilon = 1e-5f) {
+    const bool well_formed = !mel.empty() && num_mels != 0 && (mel.size() % num_mels) == 0;
+    if (!well_formed) {
+        return;
+    }
+    const size_t frames_per_bin = mel.size() / num_mels;
+    // Sample variance divides by (n - 1); a single frame falls back to dividing by 1.
+    const float denom = static_cast<float>(std::max<size_t>(1, frames_per_bin - 1));
+
+    for (size_t bin = 0; bin < num_mels; ++bin) {
+        float* const row = mel.data() + bin * frames_per_bin;
+        float* const row_end = row + frames_per_bin;
+
+        float mean = 0.0f;
+        for (const float* p = row; p != row_end; ++p) mean += *p;
+        mean /= static_cast<float>(frames_per_bin);
+
+        float variance = 0.0f;
+        for (const float* p = row; p != row_end; ++p) {
+            const float d = *p - mean;
+            variance += d * d;
+        }
+        const float inv_std = 1.0f / std::sqrt((variance / denom) + epsilon);
+        for (float* p = row; p != row_end; ++p) *p = (*p - mean) * inv_std;
+    }
+}
+
+// Keeps only the first `valid_frames` columns of every row in a row-major
+// [num_mels x total_frames] buffer. Malformed buffers, valid_frames == 0 and
+// valid_frames >= total_frames all leave `mel` unchanged.
+inline void trim_mel_frames(std::vector<float>& mel, size_t num_mels, size_t valid_frames) {
+    if (mel.empty() || num_mels == 0 || valid_frames == 0) return;
+    if ((mel.size() % num_mels) != 0) return;
+    const size_t total_frames = mel.size() / num_mels;
+    if (valid_frames >= total_frames) return;
+    std::vector<float> kept;
+    kept.reserve(num_mels * valid_frames);
+    for (size_t row = 0; row < num_mels; ++row) {
+        const float* row_begin = mel.data() + row * total_frames;
+        kept.insert(kept.end(), row_begin, row_begin + valid_frames);
+    }
+    mel.swap(kept);
+}
+
 } // namespace audio
 } // namespace cactus
 
 namespace cactus {
 namespace ffi {
 
+inline bool env_flag_enabled(const char* key) {  // true unless the variable is unset, empty, or exactly "0"
+    const char* raw = std::getenv(key);
+    return raw != nullptr && *raw != '\0' && (raw[0] != '0' || raw[1] != '\0');
+}
+
 inline std::string generateUUID() {
 #ifdef __APPLE__
     uuid_t uuid;
@@ -114,6 +198,25 @@ inline std::string generateUUID() {
     char uuid_str[37];
     uuid_unparse_lower(uuid, uuid_str);
     return std::string(uuid_str);
+#else
+    static std::random_device rd;
+    static std::mt19937 gen(rd());
+    static std::uniform_int_distribution<> dis(0, 15);
+    static std::uniform_int_distribution<> dis2(8, 11);
+
+    std::stringstream ss;
+    ss << std::hex;
+    for (int i = 0; i < 8; i++) ss << dis(gen);
+    ss << "-";
+    for (int i = 0; i < 4; i++) ss << dis(gen);
+    ss << "-4";
+    for (int i = 0; i < 3; i++) ss << dis(gen);
+    ss << "-";
+    ss << dis2(gen);
+    for (int i = 0; i < 3; i++) ss << dis(gen);
+    ss << "-";
+    for (int i = 0; i < 12; i++) ss << dis(gen);
+    return ss.str();
 #endif
 }
 
@@ -150,6 +253,130 @@ inline std::string escape_json_string(const std::string& s) {
     return o.str();
 }
 
+
+// Copy of `s` without the leading and trailing characters that std::isspace accepts.
+inline std::string trim_string(const std::string& s) {
+    const auto is_blank = [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) != 0; };
+    const auto first = std::find_if_not(s.begin(), s.end(), is_blank);
+    const auto last = std::find_if_not(s.rbegin(), std::string::const_reverse_iterator(first), is_blank).base();
+    return std::string(first, last);
+}
+
+// Returns env var `key` if set and non-empty, else `fallback`; a null key counts as unset and a null fallback yields "".
+inline std::string env_or_default(const char* key, const char* fallback) {
+    const char* v = key ? std::getenv(key) : nullptr;
+    return (v && v[0] != '\0') ? std::string(v) : std::string(fallback ? fallback : "");
+}
+
+// Returns the decoded string value that follows the first `"key":` in `json`.
+// Yields "" when the key is missing, the value does not start with a quote, or the
+// closing quote is never found. Only single-character escapes are decoded; a Unicode
+// escape (backslash, 'u', four hex digits) has its 'u' and digits copied through.
+inline std::string json_string_field(const std::string& json, const std::string& key) {
+    const std::string needle = "\"" + key + "\":";
+    const size_t hit = json.find(needle);
+    if (hit == std::string::npos) return {};
+
+    const size_t len = json.size();
+    size_t cur = hit + needle.size();
+    while (cur < len && std::isspace(static_cast<unsigned char>(json[cur]))) ++cur;
+    if (cur >= len || json[cur] != '"') return {};
+
+    std::string value;
+    value.reserve(128);
+    for (++cur; cur < len; ++cur) {
+        const char ch = json[cur];
+        if (ch == '"') return value;
+        if (ch != '\\' || cur + 1 >= len) {
+            value.push_back(ch);
+            continue;
+        }
+        const char esc = json[++cur];
+        if (esc == 'b') value.push_back('\b');
+        else if (esc == 'f') value.push_back('\f');
+        else if (esc == 'n') value.push_back('\n');
+        else if (esc == 'r') value.push_back('\r');
+        else if (esc == 't') value.push_back('\t');
+        else value.push_back(esc);  // quote, backslash, slash and any unknown escape map to themselves
+    }
+    // Ran off the end of the input without seeing the closing quote.
+    return {};
+}
+
+// Returns the raw `[...]` text after the first `"key":` in `json`, or "[]" when the key is missing,
+// the value is not an array, or no matching `]` exists. Brackets inside string literals are ignored.
+inline std::string json_array_field(const std::string& json, const std::string& key) {
+    const std::string pattern = "\"" + key + "\":";
+    size_t start = json.find(pattern);
+    if (start != std::string::npos) start += pattern.size();
+    while (start < json.size() && std::isspace(static_cast<unsigned char>(json[start]))) ++start;
+    if (start >= json.size() || json[start] != '[') return "[]";
+    int depth = 0;
+    for (size_t end = start; end < json.size(); ++end) {
+        if (json[end] == '[') ++depth;
+        else if (json[end] == ']' && --depth == 0) return json.substr(start, end - start + 1);
+        else if (json[end] == '"')  // skip the literal, stepping over backslash-escaped characters
+            for (++end; end < json.size() && json[end] != '"'; ++end) if (json[end] == '\\') ++end;
+    }
+    return "[]";  // no matching ']' was found
+}
+
+// Splits the text of a JSON array into the raw text of its top-level `{...}` objects.
+// Scanning stops at the first element that is not an object; input that is not wrapped
+// in `[` ... `]` yields an empty vector. Braces inside string literals are ignored.
+inline std::vector<std::string> split_json_array(const std::string& array_json) {
+    std::vector<std::string> objects;
+    const size_t n = array_json.size();
+    if (n < 2 || array_json.front() != '[' || array_json.back() != ']') return objects;
+    const auto is_separator = [](char ch) {
+        return ch == ',' || std::isspace(static_cast<unsigned char>(ch)) != 0;
+    };
+    size_t cursor = 1;
+    while (cursor + 1 < n) {
+        while (cursor + 1 < n && is_separator(array_json[cursor])) ++cursor;
+        if (cursor + 1 >= n || array_json[cursor] != '{') break;
+        const size_t object_begin = cursor;
+        int depth = 0;
+        bool in_string = false;
+        bool escaped = false;
+        while (cursor < n) {
+            const char ch = array_json[cursor++];
+            if (in_string) {
+                if (escaped) escaped = false;
+                else if (ch == '\\') escaped = true;
+                else if (ch == '"') in_string = false;
+            } else if (ch == '"') {
+                in_string = true;
+            } else if (ch == '{') {
+                ++depth;
+            } else if (ch == '}' && --depth == 0) {
+                objects.push_back(array_json.substr(object_begin, cursor - object_begin));
+                break;
+            }
+        }
+    }
+    return objects;
+}
+
+// Renders `tools` as a JSON array of {"type":"function","function":{...}} entries ("" when empty).
+// Name and description go through escape_json_string; a "schema" parameter is emitted verbatim.
+inline std::string serialize_tools_json(const std::vector<ToolFunction>& tools) {
+    if (tools.empty()) return "";
+    std::ostringstream json;
+    const char* separator = "[";
+    for (const auto& tool : tools) {
+        json << separator << "{\"type\":\"function\",\"function\":{";
+        json << "\"name\":\"" << escape_json_string(tool.name) << "\",";
+        json << "\"description\":\"" << escape_json_string(tool.description) << "\"";
+        const auto schema = tool.parameters.find("schema");
+        if (schema != tool.parameters.end()) json << ",\"parameters\":" << schema->second;
+        json << "}}";
+        separator = ",";
+    }
+    json << "]";
+    return json.str();
+}
+
 inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
     std::ostringstream json;
     json << "{";
@@ -324,7 +551,10 @@ inline void parse_options_json(const std::string& json,
                                float& confidence_threshold,
                                bool& include_stop_sequences,
                                bool& use_vad,
-                               bool& telemetry_enabled) {
+                               bool& telemetry_enabled,
+                               bool* auto_handoff = nullptr,
+                               size_t* cloud_timeout_ms = nullptr,
+                               bool* handoff_with_images = nullptr) {
     temperature = 0.0f;
     top_p = 0.0f;
     top_k = 0;
@@ -335,6 +565,9 @@ inline void parse_options_json(const std::string& json,
     include_stop_sequences = false;
     use_vad = true;
     telemetry_enabled = true;
+    if (auto_handoff) *auto_handoff = true;
+    if (cloud_timeout_ms) *cloud_timeout_ms = 15000;
+    if (handoff_with_images) *handoff_with_images = true;
     stop_sequences.clear();
 
     if (json.empty()) return;
@@ -403,6 +636,32 @@ inline void parse_options_json(const std::string& json,
         telemetry_enabled = (json.substr(pos, 4) == "true");
     }
 
+    if (auto_handoff) {
+        pos = json.find("\"auto_handoff\"");
+        if (pos != std::string::npos) {
+            pos = json.find(':', pos) + 1;
+            while (pos < json.length() && std::isspace(json[pos])) pos++;
+            *auto_handoff = (json.substr(pos, 4) == "true");
+        }
+    }
+
+    if (cloud_timeout_ms) {
+        pos = json.find("\"cloud_timeout_ms\"");
+        if (pos != std::string::npos) {
+            pos = json.find(':', pos) + 1;
+            *cloud_timeout_ms = std::stoul(json.substr(pos));
+        }
+    }
+
+    if (handoff_with_images) {
+        pos = json.find("\"handoff_with_images\"");
+        if (pos != std::string::npos) {
+            pos = json.find(':', pos) + 1;
+            while (pos < json.length() && std::isspace(json[pos])) pos++;
+            *handoff_with_images = (json.substr(pos, 4) == "true");
+        }
+    }
+
     pos = json.find("\"stop_sequences\"");
     if (pos != std::string::npos) {
         pos = json.find('[', pos);
@@ -422,31 +681,8 @@ inline void parse_options_json(const std::string& json,
     }
 }
 
-inline std::string format_tools_for_prompt(const std::vector<ToolFunction>& tools) {
-    if (tools.empty()) return "";
-    std::string formatted_tools_json;
-    for (size_t i = 0; i < tools.size(); i++) {
-        if (i > 0) formatted_tools_json += "\n";
-        formatted_tools_json += "{\"type\":\"function\",\"function\":{\"name\":\""
-                              + tools[i].name
-                              + "\",\"description\":\""
-                              + tools[i].description + "\"";
-        if (tools[i].parameters.find("schema") != tools[i].parameters.end()) {
-            formatted_tools_json += ",\"parameters\":" + tools[i].parameters.at("schema");
-        }
-        formatted_tools_json += "}}";
-    }
-    return formatted_tools_json;
-}
-
 static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) {
-    while (begin < end && std::isspace(static_cast<unsigned char>(value[begin]))) {
-        begin++;
-    }
-    while (end > begin && std::isspace(static_cast<unsigned char>(value[end - 1]))) {
-        end--;
-    }
-    return value.substr(begin, end - begin);
+    return trim_string(value.substr(begin, end - begin));
 }
 
 static inline void append_lfm2_call(const std::string& entry,
@@ -577,23 +813,49 @@ inline void parse_function_calls_from_response(const std::string& response_text,
 
             if (!content.empty() && content.front() == '[' && content.back() == ']') {
                 std::string inner = content.substr(1, content.size() - 2);
-                size_t start = 0;
-                int paren_depth = 0;
-
-                for (size_t i = 0; i < inner.size(); ++i) {
-                    char c = inner[i];
-                    if (c == '(') {
-                        paren_depth++;
-                    } else if (c == ')' && paren_depth > 0) {
-                        paren_depth--;
-                    } else if (c == ',' && paren_depth == 0) {
-                        append_lfm2_call(inner.substr(start, i - start), function_calls);
-                        start = i + 1;
+
+                size_t inner_first = inner.find_first_not_of(" \t\n\r");
+                if (inner_first != std::string::npos && inner[inner_first] == '{') {
+                    size_t pos = inner_first;
+                    while (pos < inner.size()) {
+                        if (inner[pos] == '{') {
+                            int brace_depth = 1;
+                            size_t obj_start = pos;
+                            pos++;
+                            while (pos < inner.size() && brace_depth > 0) {
+                                if (inner[pos] == '{') brace_depth++;
+                                else if (inner[pos] == '}') brace_depth--;
+                                pos++;
+                            }
+                            if (brace_depth == 0) {
+                                std::string json_obj = inner.substr(obj_start, pos - obj_start);
+                                if (json_obj.find("\"name\"") != std::string::npos) {
+                                    function_calls.push_back(json_obj);
+                                }
+                            }
+                        } else {
+                            pos++;
+                        }
+                    }
+                } else {
+                    size_t start = 0;
+                    int paren_depth = 0;
+
+                    for (size_t i = 0; i < inner.size(); ++i) {
+                        char c = inner[i];
+                        if (c == '(') {
+                            paren_depth++;
+                        } else if (c == ')' && paren_depth > 0) {
+                            paren_depth--;
+                        } else if (c == ',' && paren_depth == 0) {
+                            append_lfm2_call(inner.substr(start, i - start), function_calls);
+                            start = i + 1;
+                        }
                     }
-                }
 
-                if (start < inner.size()) {
-                    append_lfm2_call(inner.substr(start), function_calls);
+                    if (start < inner.size()) {
+                        append_lfm2_call(inner.substr(start), function_calls);
+                    }
                 }
             } else if (!content.empty()) {
                 append_lfm2_call(content, function_calls);
@@ -648,7 +910,7 @@ inline std::string construct_response_json(const std::string& regular_response,
                                            bool cloud_handoff = false) {
     std::ostringstream json;
     json << "{";
-    json << "\"success\":" << (cloud_handoff ? "false" : "true") << ",";
+    json << "\"success\":true,";
     json << "\"error\":null,";
     json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ",";
     json << "\"response\":\"" << escape_json_string(regular_response) << "\",";
@@ -671,30 +933,6 @@ inline std::string construct_response_json(const std::string& regular_response,
     return json.str();
 }
 
-inline std::string construct_cloud_handoff_json(float confidence,
-                                                 double time_to_first_token,
-                                                 double prefill_tps,
-                                                 size_t prompt_tokens) {
-    std::ostringstream json;
-    json << "{";
-    json << "\"success\":false,";
-    json << "\"error\":null,";
-    json << "\"cloud_handoff\":true,";
-    json << "\"response\":null,";
-    json << "\"function_calls\":[],";
-    json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ",";
-    json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
-    json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
-    json << "\"prefill_tps\":" << std::fixed << std::setprecision(2) << prefill_tps << ",";
-    json << "\"decode_tps\":0.0,";
-    json << "\"ram_usage_mb\":" << std::fixed << std::setprecision(2) << get_ram_usage_mb() << ",";
-    json << "\"prefill_tokens\":" << prompt_tokens << ",";
-    json << "\"decode_tokens\":0,";
-    json << "\"total_tokens\":" << prompt_tokens;
-    json << "}";
-    return json.str();
-}
-
 inline std::string serialize_function_calls(const std::vector<std::string>& calls) {
     if (calls.empty()) return "[]";
     std::ostringstream oss;
@@ -720,4 +958,4 @@ const char* cactus_get_last_error();
 }
 #endif
 
-#endif // CACTUS_UTILS_H
+#endif // CACTUS_UTILS_H
\ No newline at end of file
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
index 620fab6..c8bf34a 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h
@@ -56,6 +56,12 @@ struct Config {
     uint32_t num_shared_experts = 0;
     uint32_t num_top_experts = 0;
     uint32_t moe_every_n_layers = 0;
+    uint32_t moe_intermediate_dim = 0;
+    uint32_t num_dense_layers = 0;
+    uint32_t num_experts_per_tok = 0;
+    bool norm_topk_prob = false;
+    bool use_expert_bias = false;
+    float routed_scaling_factor = 1.0f;
     bool tie_word_embeddings = true;
 
     uint32_t vision_hidden_dim = 0;
@@ -93,8 +99,22 @@ struct Config {
     uint32_t num_encoder_layers = 0;
     uint32_t num_decoder_layers = 0;
     float partial_rotary_factor = 0.0f;
-
-    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9};
+    uint32_t pad_token_id = 0;
+    uint32_t conv_kernel_size = 0;
+    uint32_t subsampling_conv_kernel_size = 0;
+    uint32_t subsampling_conv_stride = 0;
+    uint32_t subsampling_conv_channels = 0;
+    uint32_t subsampling_factor = 0;
+    uint32_t num_mel_bins = 80;
+    std::string encoder_hidden_act = "silu";
+    uint32_t predictor_hidden_dim = 0;
+    uint32_t predictor_num_layers = 0;
+    uint32_t tdt_joint_dim = 0;
+    uint32_t tdt_num_durations = 0;
+    uint32_t tdt_blank_id = 0;
+    std::vector<uint32_t> tdt_durations;
+
+    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11};
     ModelType model_type = ModelType::QWEN;
 
     enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -168,7 +188,7 @@ class Tokenizer {
     uint32_t get_global_img_token_id() const { return global_img_token_id_; }
 
 protected:
-    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER};
+    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET};
     ModelType model_type_ = ModelType::UNKNOWN;
     enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
     ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -366,7 +386,6 @@ struct KVCache {
                          size_t num_tokens, size_t kv_heads, size_t head_dim);
 
     bool is_empty() const { return current_seq_len == 0; }
-    bool is_int8() const { return precision == Precision::INT8; }
     void* get_key_ptr(size_t layer);
     void* get_value_ptr(size_t layer);
 
@@ -684,6 +703,8 @@ class AudioProcessor {
         float reference = 1.0f;
         float min_value = 1e-10f;
         bool remove_dc_offset = false;
+        float preemphasis = 0.0f;
+        bool hann_periodic = true;
     };
 
     AudioProcessor();
@@ -696,6 +717,11 @@ class AudioProcessor {
         const std::vector<float>& waveform,
         const SpectrogramConfig& config);
 
+    static std::vector<float> compute_irfft(
+        const std::vector<float>& complex_input,
+        size_t n,
+        const char* norm = "backward");
+
     const std::vector<float>& get_mel_filters() const { return mel_filters_; }
 
     size_t get_num_mel_filters() const { return num_mel_filters_; }
@@ -721,6 +747,8 @@ namespace index {
     struct QueryResult {
         int doc_id;
         float score;
+
+        QueryResult(int doc_id, float score) : doc_id(doc_id), score(score) {}
     };
 
     struct QueryOptions {
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h
index 255d83c..01b7b2f 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h
@@ -6,6 +6,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <functional>
+#include <cassert>
 #include <cstring>
 #include <stdexcept>
 #include <string>
@@ -109,23 +110,33 @@ enum class ComputeBackend {
     NPU
 };
 
+enum class Activation {  // same six names as the RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH entries of OpType
+    SILU,
+    GELU,
+    GELU_ERF,
+    RELU,
+    SIGMOID,
+    TANH
+};
+
 enum class OpType {
     INPUT, PRECISION_CAST,
     ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
     MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
     BILINEAR_INTERPOLATION,
     SUM, MEAN, VARIANCE, MIN, MAX,
-    RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D,
-    SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN,
+    RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM,
+    SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG,
     RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH,
     SAMPLE, CONCAT,
     SCATTER_TOPK,
     TOPK, LAYERNORM, GROUPNORM,
+    MOE_LAYER,
     INDEX,
     PERSISTENT,
     QUANTIZE_ACTIVATIONS,
     LSTM_CELL,
-    STFT_MAGNITUDE
+    STFT
 };
 
 struct PrecisionTraits {
@@ -141,11 +152,20 @@ struct PrecisionTraits {
 
     static constexpr size_t packed_size_of(Precision prec, size_t count) {
         switch (prec) {
-            case Precision::INT4: return (count + 1) / 2;  
+            case Precision::INT4: return (count + 1) / 2;
             default: return count * size_of(prec);
         }
     }
 
+    // Converts an offset counted in elements into an offset counted in bytes for `prec`.
+    // INT4 stores two elements per byte; its offsets are asserted to be a multiple of 32
+    // (debug builds only). Every other precision uses size_of(prec) bytes per element.
+    static size_t byte_offset_of(Precision prec, size_t element_offset) {
+        if (prec == Precision::INT4) {
+            assert(element_offset % 32 == 0 && "INT4 byte offset must be group-aligned (multiple of 32)");
+            return element_offset / 2;
+        }
+        return element_offset * size_of(prec);
+    }
+
     static constexpr bool is_integer(Precision prec) {
         switch (prec) {
             case Precision::INT8: return true;
@@ -181,7 +201,6 @@ struct TensorConfig {
     Precision compute_precision = Precision::INT8;
     Precision output_precision = Precision::INT8;
     bool auto_mixed_precision = false;
-    bool enable_int4_packing = true;
     
     static TensorConfig& global();
 };
@@ -243,6 +262,10 @@ struct BufferDesc {
         return precision == Precision::INT8 && group_size > 0;
     }
 
+    // True when this buffer holds INT4 data and a non-zero group size has been set.
+    bool is_grouped_int4() const {
+        return group_size > 0 && precision == Precision::INT4;
+    }
+
     void set_grouped_scales(size_t gs, size_t ng, void* scales_ptr) {
         group_size = gs;
         num_groups = ng;
@@ -291,6 +314,7 @@ struct OpParams {
     size_t slice_length = 0;
     size_t window_size = 0;
     bool is_causal = true;  
+    bool attention_mask_is_additive = false;
     std::vector<size_t> new_shape;
     std::vector<size_t> permutation;
     Precision output_precision = Precision::INT8;
@@ -309,6 +333,11 @@ struct OpParams {
     size_t num_groups = 0;
     size_t dst_height = 0;
     size_t dst_width = 0;
+    bool normalize_routing = false;
+    size_t num_experts = 0;
+    size_t num_experts_per_tok = 0;
+    bool moe_gated = true; 
+    Activation activation = Activation::SILU;
 
     std::vector<float> bias_values;
     std::vector<uint32_t> bias_indices;
@@ -356,7 +385,6 @@ void compute_index_node(GraphNode& node, const std::vector<std::unique_ptr<Graph
 void compute_lstm_cell_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 
 void shrink_thread_local_buffers();
-
 class BufferPool {
 public:
     BufferPool() = default;
@@ -418,6 +446,7 @@ class CactusGraph {
     size_t scalar_sqrt(size_t input);
     size_t scalar_cos(size_t input);
     size_t scalar_sin(size_t input);
+    size_t scalar_log(size_t input);
     
     size_t relu(size_t input);
     size_t silu(size_t input);
@@ -425,6 +454,7 @@ class CactusGraph {
     size_t gelu_erf(size_t input);
     size_t sigmoid(size_t input);
     size_t tanh(size_t input);
+    size_t glu(size_t input, int axis = -1);
     
     size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
     size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
@@ -455,7 +485,30 @@ class CactusGraph {
     size_t layernorm(size_t input, size_t weight, size_t bias, float epsilon = 1e-5f);
     size_t layernorm(size_t input, size_t weight, float epsilon = 1e-5f);  // No bias version
     size_t groupnorm(size_t input, size_t weight, size_t bias, size_t num_groups = 32, float epsilon = 1e-5f);
+    size_t batchnorm(size_t input, size_t weight, size_t bias, size_t running_mean, size_t running_var, int axis = 1, float epsilon = 1e-5f);
     size_t topk(size_t input, size_t k);
+    size_t moe_layer(size_t hidden,
+                     size_t routing_probs,
+                     size_t topk_indices,
+                     const std::vector<size_t>& w1_weights,
+                     const std::vector<size_t>& w3_weights,
+                     const std::vector<size_t>& w2_weights,
+                     size_t num_experts,
+                     size_t num_experts_per_tok,
+                     bool normalize_routing,
+                     float epsilon,
+                     float routed_scaling_factor);
+    size_t moe_layer(size_t hidden,
+                     size_t routing_probs,
+                     size_t topk_indices,
+                     const std::vector<size_t>& w1_weights,
+                     const std::vector<size_t>& w2_weights,
+                     size_t num_experts,
+                     size_t num_experts_per_tok,
+                     bool normalize_routing,
+                     float epsilon,
+                     float routed_scaling_factor,
+                     Activation activation);
     size_t rms_norm(size_t input, size_t weight, float epsilon = 1e-5f);
     size_t rope(size_t input, float theta, size_t position_offset = 0, ComputeBackend backend = ComputeBackend::CPU);
     size_t rope_gptj(size_t input, float theta, size_t position_offset = 0, size_t rot_dim = 0, ComputeBackend backend = ComputeBackend::CPU);
@@ -463,6 +516,10 @@ class CactusGraph {
     size_t attention(size_t query, size_t key, size_t value, float scale, bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
+    size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale,
+                            bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU,
+                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0);
+    size_t rel_pos_bias(size_t query, size_t relative_key, float scale);
 
     size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset,
                                  const int8_t* cached_keys, const int8_t* cached_values,
@@ -474,9 +531,19 @@ class CactusGraph {
     size_t conv1d_k7s3(size_t input, size_t weight, size_t bias);
     size_t conv1d(size_t input, size_t weight, size_t stride);
     size_t conv1d(size_t input, size_t weight, size_t bias, size_t stride);
+    size_t conv1d_same_depthwise_k9(size_t input, size_t weight);
+    size_t conv1d_same_depthwise_k9(size_t input, size_t weight, size_t bias);
+    size_t conv1d_pointwise(size_t input, size_t weight);
+    size_t conv1d_pointwise(size_t input, size_t weight, size_t bias);
+    size_t conv2d_k3s2p1(size_t input, size_t weight);
+    size_t conv2d_k3s2p1(size_t input, size_t weight, size_t bias);
+    size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight);
+    size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight, size_t bias);
+    size_t conv2d_pointwise_1x1(size_t input, size_t weight);
+    size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias);
 
     size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh);
-    size_t stft_magnitude(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
+    size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
 
     size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20,
                   const std::unordered_map<uint32_t, float>& logit_bias = {});
@@ -581,12 +648,9 @@ namespace GraphFile {
         bool is_interleaved_ = false;
         size_t original_N_ = 0;
 
-        std::unique_ptr<int8_t[]> unpacked_data_;  
-
         void parse_header();
         void apply_madvise_hints();
-        void unpack_int4_data();
     };
 }
 
-#endif 
+#endif
\ No newline at end of file
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h
index 17acd36..0ec7265 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h
@@ -4,6 +4,8 @@
 #include <cstddef>
 #include <arm_neon.h>
 
+enum class Precision;
+
 enum class ScalarOpType {
     ADD,
     SUBTRACT,
@@ -12,7 +14,8 @@ enum class ScalarOpType {
     EXP,
     SQRT,
     COS,
-    SIN
+    SIN,
+    LOG
 };
 
 constexpr size_t KV_QUANT_GROUP_SIZE = 32;
@@ -21,6 +24,7 @@ void cactus_add_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num
 void cactus_add_f16_clipped(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
 void cactus_subtract_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
 void cactus_multiply_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
+void cactus_add_scaled_f16(const __fp16* base, const __fp16* src, __fp16* output, size_t num_elements, float scale);
 void cactus_divide_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
 
 void cactus_add_broadcast_f16(const __fp16* a, const __fp16* b, __fp16* output,
@@ -50,6 +54,23 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales,
                         const int8_t* B, const __fp16* B_scales,
                         __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
 
+void cactus_gemv_int4(const int8_t* A, float A_scale,
+                      const int8_t* B_packed, const __fp16* B_scales,
+                      __fp16* C, size_t K, size_t N, size_t group_size);
+
+void cactus_gemm_int4(const int8_t* A, const float* A_scales,
+                      const int8_t* B_packed, const __fp16* B_scales,
+                      __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
+void cactus_matmul_int4(const int8_t* A, const float* A_scales,
+                        const int8_t* B_packed, const __fp16* B_scales,
+                        __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
+void cactus_matmul_integer(Precision precision,
+                            const int8_t* A, const float* A_scales,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
 void cactus_matmul_f16(const __fp16* a, const __fp16* b_transposed, __fp16* c,
                        size_t M, size_t K, size_t N);
 
@@ -97,10 +118,52 @@ void cactus_sigmoid_f16(const __fp16* input, __fp16* output, size_t num_elements
 
 void cactus_tanh_f16(const __fp16* input, __fp16* output, size_t num_elements);
 
+void cactus_glu_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t outer_size,
+    size_t split_size,
+    size_t inner_size
+);
+
+void cactus_glu_f32(
+    const float* input,
+    float* output,
+    size_t outer_size,
+    size_t split_size,
+    size_t inner_size
+);
+
+void cactus_batchnorm_f16(
+    const __fp16* input,
+    const float* weight,
+    const float* bias,
+    const float* running_mean,
+    const float* running_var,
+    __fp16* output,
+    size_t outer_size,
+    size_t channels,
+    size_t inner_size,
+    float epsilon
+);
+
+void cactus_batchnorm_f32(
+    const float* input,
+    const float* weight,
+    const float* bias,
+    const float* running_mean,
+    const float* running_var,
+    float* output,
+    size_t outer_size,
+    size_t channels,
+    size_t inner_size,
+    float epsilon
+);
+
 void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output,
                           size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads,
                           size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0,
-                          bool is_causal = true);
+                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false);
 
 void cactus_attention_hybrid_int8_fp16(
     const __fp16* queries,        
@@ -150,7 +213,7 @@ void cactus_conv1d_f16(
     size_t stride
 );
 
-void cactus_stft_magnitude_f16(
+void cactus_stft_f16(
     const __fp16* input,
     const __fp16* weight,
     __fp16* output,
@@ -171,6 +234,62 @@ void cactus_conv1d_f16_k7s3_oc8(
     size_t C_out
 );
 
+void cactus_conv1d_same_depthwise_f16_k9(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t L,
+    size_t C
+);
+
+void cactus_conv2d_f16_k3s2p1_nchw(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C_in,
+    size_t H,
+    size_t W,
+    size_t C_out
+);
+
+void cactus_conv2d_depthwise_f16_k3s2p1_nchw(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C,
+    size_t H,
+    size_t W
+);
+
+void cactus_conv2d_pointwise_f16_1x1_nchw_gemm(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C_in,
+    size_t H,
+    size_t W,
+    size_t C_out
+);
+
+void cactus_conv1d_pointwise_f16_gemm(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t L,
+    size_t C_in,
+    size_t C_out
+);
+
 void cactus_bilinear_interpolation_f16(const __fp16* input, __fp16* output, size_t src_height, size_t src_width, size_t embed_dim,
                                        size_t dst_height, size_t dst_width);
 
@@ -224,4 +343,4 @@ void cactus_lstm_cell_f16(
     size_t hidden_size
 );
 
-#endif
+#endif
\ No newline at end of file
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h
index ac49d05..118c85c 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h
@@ -44,6 +44,34 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
 #endif
 }
 
+// Reports whether the running CPU advertises the Arm SME2 extension.
+//
+// The probe runs once, on the first call, and the result is cached for the
+// lifetime of the process:
+//   - Apple:   reads the "hw.optional.arm.FEAT_SME2" sysctl; a failed query counts as "no".
+//   - Android: tests HWCAP2_SME2 in getauxval(AT_HWCAP2), but only when the
+//              toolchain headers define that bit.
+//   - Any other platform, or a non-AArch64 build: always false.
+inline bool cpu_has_sme2() {
+#if defined(__aarch64__)
+	// A function-local static is initialised exactly once and thread-safely
+	// (C++11), so no separate once_flag is needed.
+	static const bool has = []() -> bool {
+#if defined(__APPLE__)
+		int ret = 0;
+		size_t size = sizeof(ret);
+		if (sysctlbyname("hw.optional.arm.FEAT_SME2", &ret, &size, nullptr, 0) == 0) {
+			return ret == 1;
+		}
+		return false;
+#elif defined(__ANDROID__) && defined(HWCAP2_SME2)
+		// Only query the aux vector when the bit is known; this avoids an
+		// unused local (and a pointless call) on older NDK headers.
+		return (getauxval(AT_HWCAP2) & HWCAP2_SME2) != 0;
+#else
+		return false;
+#endif
+	}();
+
+	return has;
+#else
+	return false;
+#endif
+}
 
 inline float32x4_t fast_exp_f32x4(float32x4_t x) {
     const float32x4_t log2e = vdupq_n_f32(1.4426950408889634f);
@@ -102,6 +130,12 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
     return result;
 }
 
+// Loads 16 bytes from `ptr` and splits every byte into its two 4-bit halves,
+// each sign-extended to int8: `high_decoded` receives the upper nibbles and
+// `low_decoded` the lower nibbles.
+inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) {
+    const int8x16_t raw_bytes = vreinterpretq_s8_u8(vld1q_u8(ptr));
+    // Moving the low nibble into the top four bits first lets the signed
+    // right shift below sign-extend it.
+    const int8x16_t low_in_top_bits = vshlq_n_s8(raw_bytes, 4);
+    high_decoded = vshrq_n_s8(raw_bytes, 4);
+    low_decoded = vshrq_n_s8(low_in_top_bits, 4);
+}
+
 namespace CactusThreading {
 
     class ThreadPool {
@@ -297,7 +331,7 @@ namespace CactusThreading {
         }
         static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
             if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
-            return std::min(pool_size, static_cast<size_t>(2));
+            return std::min(pool_size, static_cast<size_t>(3));
         }
         #else 
         static constexpr size_t GEMV_MIN_N_BLOCKS = 256;  
@@ -308,7 +342,7 @@ namespace CactusThreading {
         static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
             if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
             if (N_blocks < 512) return std::min(pool_size, static_cast<size_t>(2));
-            return std::min(pool_size, static_cast<size_t>(4));
+            return std::min(pool_size, static_cast<size_t>(5));
         }
         #endif
     };
@@ -465,4 +499,4 @@ namespace CactusThreading {
 }
 
 
-#endif // KERNEL_UTILS_H 
\ No newline at end of file
+#endif // KERNEL_UTILS_H 
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist
index 32b19c0..bce5a16 100644
Binary files a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist and b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist differ
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources
index b48eb36..143e71b 100644
--- a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources
+++ b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources
@@ -6,7 +6,7 @@
 	<dict>
 		<key>Info.plist</key>
 		<data>
-		yMSW0g+AKq/xXqUOrMbK43roF5I=
+		cN36qyYdB+mdJFxX4r84gFZ7SS4=
 		</data>
 	</dict>
 	<key>files2</key>
diff --git a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus
index 818966e..b1ee86e 100755
Binary files a/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus and b/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus differ
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h
new file mode 100644
index 0000000..e61841d
--- /dev/null
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h
@@ -0,0 +1,48 @@
+#ifndef CACTUS_CLOUD_H
+#define CACTUS_CLOUD_H
+
+#include "cactus_utils.h"
+#include <string>
+#include <vector>
+
+namespace cactus {
+namespace ffi {
+
+struct CloudResponse {  // Value returned by cloud_transcribe_request().
+    std::string transcript;
+    std::string api_key_hash;
+    bool used_cloud = false;  // Defaults to false.
+    std::string error;  // NOTE(review): presumably empty on success; confirm against the implementation.
+};
+
+struct CloudCompletionRequest {  // Argument bundle taken by cloud_complete_request().
+    std::vector<cactus::engine::ChatMessage> messages;
+    std::vector<ToolFunction> tools;
+    std::string local_output;
+    std::vector<std::string> local_function_calls;
+    bool has_images = false;  // Defaults to false.
+    std::string cloud_key;  // NOTE(review): presumably an API key override; confirm against resolve_cloud_api_key().
+};
+
+struct CloudCompletionResult {  // Value returned by cloud_complete_request().
+    bool ok = false;  // Defaults to false.
+    bool used_cloud = false;  // Defaults to false.
+    std::string response;
+    std::vector<std::string> function_calls;
+    std::string error;  // NOTE(review): presumably set only when ok is false; confirm against the implementation.
+};
+
+std::string cloud_base64_encode(const uint8_t* data, size_t len);
+std::vector<uint8_t> cloud_build_wav(const uint8_t* pcm, size_t pcm_bytes);
+std::string resolve_cloud_api_key(const char* cloud_key_param);
+CloudResponse cloud_transcribe_request(const std::string& audio_b64,
+                                       const std::string& fallback_text,
+                                       long timeout_seconds = 15L,
+                                       const char* cloud_key = nullptr);
+CloudCompletionResult cloud_complete_request(const CloudCompletionRequest& request,
+                                             long timeout_ms);
+
+} // namespace ffi
+} // namespace cactus
+
+#endif // CACTUS_CLOUD_H
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h
index c627a13..aa72986 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h
@@ -76,6 +76,16 @@ CACTUS_FFI_EXPORT int cactus_transcribe(
     size_t pcm_buffer_size
 );
 
+CACTUS_FFI_EXPORT int cactus_detect_language(
+    cactus_model_t model,
+    const char* audio_file_path,            // NULL if using pcm_buffer
+    char* response_buffer,
+    size_t buffer_size,
+    const char* options_json,               // optional
+    const uint8_t* pcm_buffer,              // NULL if using audio_file_path
+    size_t pcm_buffer_size
+);
+
 CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_start(
     cactus_model_t model,
     const char* options_json                // optional
@@ -189,7 +199,10 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index);
 
 CACTUS_FFI_EXPORT const char* cactus_get_last_error(void);
 
-CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location);
+CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version);
+CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id);
+CACTUS_FFI_EXPORT void cactus_telemetry_flush(void);
+CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void);
 
 #ifdef __cplusplus
 }
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h
index 5f360bd..3b5d97f 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h
@@ -2,6 +2,7 @@
 #define CACTUS_UTILS_H
 
 #include "../engine/engine.h"
+#include "../models/model.h"
 #include <string>
 #include <vector>
 #include <unordered_map>
@@ -12,6 +13,9 @@
 #include <iostream>
 #include <filesystem>
 #include <cctype>
+#include <algorithm>
+#include <cmath>
+#include <limits>
 #include <memory>
 #include <atomic>
 #include <mutex>
@@ -101,12 +105,92 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_whisper_spectrogram
     return cfg;
 }
 
+// Returns a value-initialised SpectrogramConfig with the Parakeet-specific
+// fields below filled in; fields not listed keep their value-initialised state.
+inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogram_config() {
+    cactus::engine::AudioProcessor::SpectrogramConfig config{};
+
+    // Framing.
+    config.n_fft = 512;
+    config.frame_length = 400;
+    config.hop_length = 160;
+    config.center = true;
+    config.pad_mode = "constant";
+    config.hann_periodic = false;
+
+    // Spectrum.
+    config.power = 2.0f;
+    config.onesided = true;
+    config.dither = 0.0f;
+    config.remove_dc_offset = false;
+
+    // Mel / log scaling.
+    config.mel_floor = 5.960464477539063e-08f; // 2^-24 guard value used by HF Parakeet.
+    config.log_mel = "log";
+    config.reference = 1.0f;
+    config.min_value = 1e-10f;
+
+    return config;
+}
+
+// Applies the in-place filter waveform[i] -= coefficient * waveform[i - 1] to
+// every sample except the first. Samples are visited from last to first so each
+// update reads the still-unmodified predecessor.
+// Inputs with fewer than two samples, or a zero coefficient, are left untouched.
+inline void apply_preemphasis(std::vector<float>& waveform, float coefficient = 0.97f) {
+    const size_t sample_count = waveform.size();
+    if (coefficient != 0.0f && sample_count >= 2) {
+        size_t idx = sample_count;
+        while (--idx > 0) {
+            waveform[idx] -= coefficient * waveform[idx - 1];
+        }
+    }
+}
+
+// Normalises `mel` in place, one band at a time, to zero mean and unit standard deviation.
+//
+// `mel` is read as `num_mels` contiguous rows of mel.size() / num_mels values each.
+// For every row the mean is subtracted and the result is scaled by
+// 1 / sqrt(variance + epsilon), where the variance divisor is max(1, frames - 1).
+// Nothing happens when `mel` is empty, `num_mels` is zero, or the size is not a
+// multiple of `num_mels`.
+inline void normalize_parakeet_log_mel(std::vector<float>& mel, size_t num_mels, float epsilon = 1e-5f) {
+    const bool malformed = mel.empty() || num_mels == 0 || (mel.size() % num_mels) != 0;
+    if (malformed) return;
+
+    // Non-empty and evenly divisible, so each row holds at least one frame.
+    const size_t frames_per_band = mel.size() / num_mels;
+    const size_t degrees_of_freedom = frames_per_band > 1 ? frames_per_band - 1 : 1;
+
+    for (size_t band = 0; band < num_mels; ++band) {
+        float* const row = mel.data() + band * frames_per_band;
+        float* const row_end = row + frames_per_band;
+
+        float mean = 0.0f;
+        for (const float* p = row; p != row_end; ++p) mean += *p;
+        mean /= static_cast<float>(frames_per_band);
+
+        float variance = 0.0f;
+        for (const float* p = row; p != row_end; ++p) {
+            const float d = *p - mean;
+            variance += d * d;
+        }
+
+        const float denom = static_cast<float>(degrees_of_freedom);
+        const float inv_std = 1.0f / std::sqrt((variance / denom) + epsilon);
+        for (float* p = row; p != row_end; ++p) *p = (*p - mean) * inv_std;
+    }
+}
+
+// Shortens every band of `mel` to its first `valid_frames` values.
+//
+// `mel` is read as `num_mels` contiguous rows of equal length. The call is a
+// no-op when `mel` is empty, `num_mels` is zero, the size is not a multiple of
+// `num_mels`, `valid_frames` is zero, or `valid_frames` is not smaller than the
+// current row length.
+inline void trim_mel_frames(std::vector<float>& mel, size_t num_mels, size_t valid_frames) {
+    if (mel.empty() || num_mels == 0 || (mel.size() % num_mels) != 0) return;
+
+    const size_t frames_per_band = mel.size() / num_mels;
+    const bool nothing_to_trim = valid_frames == 0 || valid_frames >= frames_per_band;
+    if (nothing_to_trim) return;
+
+    std::vector<float> kept;
+    kept.reserve(num_mels * valid_frames);
+    for (size_t band = 0; band < num_mels; ++band) {
+        const float* row = mel.data() + band * frames_per_band;
+        kept.insert(kept.end(), row, row + valid_frames);
+    }
+    mel.swap(kept);
+}
+
 } // namespace audio
 } // namespace cactus
 
 namespace cactus {
 namespace ffi {
 
+// Treats environment variable `key` as a boolean switch: it is "on" when the
+// variable is set to anything other than the empty string or the exact text "0".
+inline bool env_flag_enabled(const char* key) {
+    const char* raw = std::getenv(key);
+    if (raw == nullptr || raw[0] == '\0') {
+        return false;
+    }
+    const bool is_literal_zero = raw[0] == '0' && raw[1] == '\0';
+    return !is_literal_zero;
+}
+
 inline std::string generateUUID() {
 #ifdef __APPLE__
     uuid_t uuid;
@@ -114,6 +198,25 @@ inline std::string generateUUID() {
     char uuid_str[37];
     uuid_unparse_lower(uuid, uuid_str);
     return std::string(uuid_str);
+#else
+    static std::random_device rd;
+    static std::mt19937 gen(rd());
+    static std::uniform_int_distribution<> dis(0, 15);
+    static std::uniform_int_distribution<> dis2(8, 11);
+
+    std::stringstream ss;
+    ss << std::hex;
+    for (int i = 0; i < 8; i++) ss << dis(gen);
+    ss << "-";
+    for (int i = 0; i < 4; i++) ss << dis(gen);
+    ss << "-4";
+    for (int i = 0; i < 3; i++) ss << dis(gen);
+    ss << "-";
+    ss << dis2(gen);
+    for (int i = 0; i < 3; i++) ss << dis(gen);
+    ss << "-";
+    for (int i = 0; i < 12; i++) ss << dis(gen);
+    return ss.str();
 #endif
 }
 
@@ -150,6 +253,130 @@ inline std::string escape_json_string(const std::string& s) {
     return o.str();
 }
 
+
+// Returns a copy of `s` with leading and trailing whitespace (as classified by
+// std::isspace) removed. An all-whitespace or empty input yields "".
+inline std::string trim_string(const std::string& s) {
+    // isspace requires a value representable as unsigned char, hence the cast.
+    const auto is_ws = [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) != 0; };
+
+    size_t first = 0;
+    size_t last = s.size();
+    for (; first < last && is_ws(s[first]); ++first) {}
+    for (; last > first && is_ws(s[last - 1]); --last) {}
+    return s.substr(first, last - first);
+}
+
+// Returns the value of environment variable `key` when it is set and non-empty,
+// otherwise `fallback`.
+//
+// Null pointers are tolerated: a null `key` is treated as "not set" and a null
+// `fallback` as the empty string, because both std::getenv(nullptr) and
+// std::string(nullptr) are undefined behaviour.
+inline std::string env_or_default(const char* key, const char* fallback) {
+    const char* v = key ? std::getenv(key) : nullptr;
+    if (v && v[0] != '\0') return std::string(v);
+    return fallback ? std::string(fallback) : std::string();
+}
+
+// Extracts the string value of the first `"key":` occurrence in `json`.
+//
+// This is a lightweight scanner, not a full JSON parser: it searches for the
+// literal text `"<key>":` (no whitespace before the colon), skips whitespace
+// after it, and decodes the string literal that follows.
+//
+// Supported escapes: \" \\ \/ \b \f \n \r \t and \uXXXX. \uXXXX is decoded to
+// UTF-8, UTF-16 surrogate pairs are combined, and a lone surrogate becomes
+// U+FFFD. A malformed \u (fewer than four hex digits) and any unknown escape
+// emit the escaped character itself.
+//
+// Returns an empty string when the key is absent, the value is not a string,
+// or the string literal is unterminated.
+inline std::string json_string_field(const std::string& json, const std::string& key) {
+    std::string pattern = "\"" + key + "\":";
+    size_t pos = json.find(pattern);
+    if (pos == std::string::npos) return {};
+
+    size_t i = pos + pattern.size();
+    while (i < json.size() && std::isspace(static_cast<unsigned char>(json[i]))) i++;
+    if (i >= json.size() || json[i] != '"') return {};
+    ++i;
+
+    // Parses exactly four hex digits starting at `at`; false if any is missing or invalid.
+    const auto read_hex4 = [&json](size_t at, unsigned& value) -> bool {
+        if (at + 4 > json.size()) return false;
+        unsigned acc = 0;
+        for (size_t k = 0; k < 4; ++k) {
+            const char h = json[at + k];
+            unsigned digit = 0;
+            if (h >= '0' && h <= '9') digit = static_cast<unsigned>(h - '0');
+            else if (h >= 'a' && h <= 'f') digit = 10u + static_cast<unsigned>(h - 'a');
+            else if (h >= 'A' && h <= 'F') digit = 10u + static_cast<unsigned>(h - 'A');
+            else return false;
+            acc = (acc << 4) | digit;
+        }
+        value = acc;
+        return true;
+    };
+
+    // Appends code point `cp` to `dst` as UTF-8.
+    const auto append_utf8 = [](std::string& dst, unsigned cp) {
+        if (cp < 0x80u) {
+            dst.push_back(static_cast<char>(cp));
+        } else if (cp < 0x800u) {
+            dst.push_back(static_cast<char>(0xC0u | (cp >> 6)));
+            dst.push_back(static_cast<char>(0x80u | (cp & 0x3Fu)));
+        } else if (cp < 0x10000u) {
+            dst.push_back(static_cast<char>(0xE0u | (cp >> 12)));
+            dst.push_back(static_cast<char>(0x80u | ((cp >> 6) & 0x3Fu)));
+            dst.push_back(static_cast<char>(0x80u | (cp & 0x3Fu)));
+        } else {
+            dst.push_back(static_cast<char>(0xF0u | (cp >> 18)));
+            dst.push_back(static_cast<char>(0x80u | ((cp >> 12) & 0x3Fu)));
+            dst.push_back(static_cast<char>(0x80u | ((cp >> 6) & 0x3Fu)));
+            dst.push_back(static_cast<char>(0x80u | (cp & 0x3Fu)));
+        }
+    };
+
+    std::string out;
+    out.reserve(128);
+    while (i < json.size()) {
+        char c = json[i++];
+        if (c == '"') return out;
+        if (c == '\\' && i < json.size()) {
+            char e = json[i++];
+            switch (e) {
+                case '"':  out.push_back('"');  break;
+                case '\\': out.push_back('\\'); break;
+                case '/':  out.push_back('/');  break;
+                case 'b':  out.push_back('\b'); break;
+                case 'f':  out.push_back('\f'); break;
+                case 'n':  out.push_back('\n'); break;
+                case 'r':  out.push_back('\r'); break;
+                case 't':  out.push_back('\t'); break;
+                case 'u': {
+                    unsigned cp = 0;
+                    if (!read_hex4(i, cp)) {
+                        // Malformed escape: keep the previous lenient behaviour.
+                        out.push_back(e);
+                        break;
+                    }
+                    i += 4;
+                    // A high surrogate must be followed by "\uDC00".."\uDFFF" to form one code point.
+                    if (cp >= 0xD800u && cp <= 0xDBFFu &&
+                        i + 1 < json.size() && json[i] == '\\' && json[i + 1] == 'u') {
+                        unsigned low = 0;
+                        if (read_hex4(i + 2, low) && low >= 0xDC00u && low <= 0xDFFFu) {
+                            cp = 0x10000u + ((cp - 0xD800u) << 10) + (low - 0xDC00u);
+                            i += 6;
+                        }
+                    }
+                    // Unpaired surrogates are not valid scalar values.
+                    if (cp >= 0xD800u && cp <= 0xDFFFu) cp = 0xFFFDu;
+                    append_utf8(out, cp);
+                    break;
+                }
+                default:   out.push_back(e);    break;
+            }
+            continue;
+        }
+        out.push_back(c);
+    }
+    return {};
+}
+
+// Returns the raw text of the array value of the first `"key":` occurrence in
+// `json`, from the opening '[' through its matching ']'.
+//
+// Brackets inside JSON string literals, including strings with escaped quotes,
+// are ignored while matching, so a value such as ["a]b"] is returned whole.
+//
+// Returns "[]" when the key is absent or its value does not start with '['.
+// If the array is never closed, the text from '[' to the end of `json` is
+// returned unchanged.
+inline std::string json_array_field(const std::string& json, const std::string& key) {
+    const std::string pattern = "\"" + key + "\":";
+    const size_t pos = json.find(pattern);
+    if (pos == std::string::npos) return "[]";
+    size_t start = pos + pattern.size();
+    while (start < json.size() && std::isspace(static_cast<unsigned char>(json[start]))) ++start;
+    if (start >= json.size() || json[start] != '[') return "[]";
+
+    int depth = 0;
+    bool in_str = false;
+    bool esc = false;
+    size_t end = start;
+    for (; end < json.size(); ++end) {
+        const char c = json[end];
+        if (in_str) {
+            // Inside a string only an unescaped quote matters.
+            if (esc) esc = false;
+            else if (c == '\\') esc = true;
+            else if (c == '"') in_str = false;
+            continue;
+        }
+        if (c == '"') {
+            in_str = true;
+        } else if (c == '[') {
+            ++depth;
+        } else if (c == ']' && --depth == 0) {
+            ++end;  // include the closing bracket
+            break;
+        }
+    }
+    return json.substr(start, end - start);
+}
+
+// Splits the text of a JSON array of objects into one string per top-level object.
+//
+// `array_json` must begin with '[' and end with ']'; otherwise the result is
+// empty. Whitespace and commas between elements are skipped. Scanning stops at
+// the first element that does not start with '{' or at an object that is never
+// closed; objects collected before that point are still returned. Braces inside
+// string literals do not affect nesting.
+inline std::vector<std::string> split_json_array(const std::string& array_json) {
+    std::vector<std::string> out;
+    const size_t len = array_json.size();
+    if (len < 2 || array_json.front() != '[' || array_json.back() != ']') return out;
+
+    // Index of the '}' closing the object that opens at `open`, or npos if it never closes.
+    const auto matching_brace = [&array_json, len](size_t open) -> size_t {
+        int depth = 0;
+        bool in_str = false;
+        bool esc = false;
+        for (size_t k = open; k < len; ++k) {
+            const char c = array_json[k];
+            if (in_str) {
+                if (esc) esc = false;
+                else if (c == '\\') esc = true;
+                else if (c == '"') in_str = false;
+            } else if (c == '"') {
+                in_str = true;
+            } else if (c == '{') {
+                ++depth;
+            } else if (c == '}') {
+                if (--depth == 0) return k;
+            }
+        }
+        return std::string::npos;
+    };
+
+    size_t cursor = 1;
+    while (cursor + 1 < len) {
+        // Skip separators, never stepping onto the final ']'.
+        while (cursor + 1 < len &&
+               (std::isspace(static_cast<unsigned char>(array_json[cursor])) || array_json[cursor] == ',')) {
+            ++cursor;
+        }
+        if (cursor + 1 >= len || array_json[cursor] != '{') break;
+
+        const size_t close = matching_brace(cursor);
+        if (close == std::string::npos) break;
+
+        out.push_back(array_json.substr(cursor, close - cursor + 1));
+        cursor = close + 1;
+    }
+    return out;
+}
+
+// Serialises `tools` as a JSON array of {"type":"function","function":{...}} entries.
+//
+// Each entry carries the tool's JSON-escaped name and description. When the
+// tool's parameters map has a "schema" entry, that value is written verbatim
+// (not escaped) as the "parameters" member. An empty tool list yields "" rather
+// than "[]".
+inline std::string serialize_tools_json(const std::vector<ToolFunction>& tools) {
+    if (tools.empty()) return "";
+
+    std::ostringstream buf;
+    buf << "[";
+    const char* separator = "";
+    for (const auto& tool : tools) {
+        buf << separator;
+        buf << "{\"type\":\"function\",\"function\":{";
+        buf << "\"name\":\"" << escape_json_string(tool.name) << "\",";
+        buf << "\"description\":\"" << escape_json_string(tool.description) << "\"";
+
+        const auto schema = tool.parameters.find("schema");
+        if (schema != tool.parameters.end()) {
+            buf << ",\"parameters\":" << schema->second;
+        }
+        buf << "}}";
+        separator = ",";
+    }
+    buf << "]";
+    return buf.str();
+}
+
 inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
     std::ostringstream json;
     json << "{";
@@ -324,7 +551,10 @@ inline void parse_options_json(const std::string& json,
                                float& confidence_threshold,
                                bool& include_stop_sequences,
                                bool& use_vad,
-                               bool& telemetry_enabled) {
+                               bool& telemetry_enabled,
+                               bool* auto_handoff = nullptr,
+                               size_t* cloud_timeout_ms = nullptr,
+                               bool* handoff_with_images = nullptr) {
     temperature = 0.0f;
     top_p = 0.0f;
     top_k = 0;
@@ -335,6 +565,9 @@ inline void parse_options_json(const std::string& json,
     include_stop_sequences = false;
     use_vad = true;
     telemetry_enabled = true;
+    if (auto_handoff) *auto_handoff = true;
+    if (cloud_timeout_ms) *cloud_timeout_ms = 15000;
+    if (handoff_with_images) *handoff_with_images = true;
     stop_sequences.clear();
 
     if (json.empty()) return;
@@ -403,6 +636,32 @@ inline void parse_options_json(const std::string& json,
         telemetry_enabled = (json.substr(pos, 4) == "true");
     }
 
+    if (auto_handoff) {
+        pos = json.find("\"auto_handoff\"");
+        if (pos != std::string::npos) {
+            pos = json.find(':', pos) + 1;
+            while (pos < json.length() && std::isspace(json[pos])) pos++;
+            *auto_handoff = (json.substr(pos, 4) == "true");
+        }
+    }
+
+    if (cloud_timeout_ms) {
+        pos = json.find("\"cloud_timeout_ms\"");
+        if (pos != std::string::npos) {
+            pos = json.find(':', pos) + 1;
+            *cloud_timeout_ms = std::stoul(json.substr(pos));
+        }
+    }
+
+    if (handoff_with_images) {
+        pos = json.find("\"handoff_with_images\"");
+        if (pos != std::string::npos) {
+            pos = json.find(':', pos) + 1;
+            while (pos < json.length() && std::isspace(json[pos])) pos++;
+            *handoff_with_images = (json.substr(pos, 4) == "true");
+        }
+    }
+
     pos = json.find("\"stop_sequences\"");
     if (pos != std::string::npos) {
         pos = json.find('[', pos);
@@ -422,31 +681,8 @@ inline void parse_options_json(const std::string& json,
     }
 }
 
-inline std::string format_tools_for_prompt(const std::vector<ToolFunction>& tools) {
-    if (tools.empty()) return "";
-    std::string formatted_tools_json;
-    for (size_t i = 0; i < tools.size(); i++) {
-        if (i > 0) formatted_tools_json += "\n";
-        formatted_tools_json += "{\"type\":\"function\",\"function\":{\"name\":\""
-                              + tools[i].name
-                              + "\",\"description\":\""
-                              + tools[i].description + "\"";
-        if (tools[i].parameters.find("schema") != tools[i].parameters.end()) {
-            formatted_tools_json += ",\"parameters\":" + tools[i].parameters.at("schema");
-        }
-        formatted_tools_json += "}}";
-    }
-    return formatted_tools_json;
-}
-
 static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) {
-    while (begin < end && std::isspace(static_cast<unsigned char>(value[begin]))) {
-        begin++;
-    }
-    while (end > begin && std::isspace(static_cast<unsigned char>(value[end - 1]))) {
-        end--;
-    }
-    return value.substr(begin, end - begin);
+    return trim_string(value.substr(begin, end - begin));  // copy [begin, end) out of value, then pass it to trim_string (defined elsewhere; presumably strips surrounding whitespace like the removed loops - TODO confirm)
 }
 
 static inline void append_lfm2_call(const std::string& entry,
@@ -577,23 +813,49 @@ inline void parse_function_calls_from_response(const std::string& response_text,
 
             if (!content.empty() && content.front() == '[' && content.back() == ']') {
                 std::string inner = content.substr(1, content.size() - 2);
-                size_t start = 0;
-                int paren_depth = 0;
-
-                for (size_t i = 0; i < inner.size(); ++i) {
-                    char c = inner[i];
-                    if (c == '(') {
-                        paren_depth++;
-                    } else if (c == ')' && paren_depth > 0) {
-                        paren_depth--;
-                    } else if (c == ',' && paren_depth == 0) {
-                        append_lfm2_call(inner.substr(start, i - start), function_calls);
-                        start = i + 1;
+
+                size_t inner_first = inner.find_first_not_of(" \t\n\r");
+                if (inner_first != std::string::npos && inner[inner_first] == '{') {
+                    size_t pos = inner_first;
+                    while (pos < inner.size()) {
+                        if (inner[pos] == '{') {
+                            int brace_depth = 1;
+                            size_t obj_start = pos;
+                            pos++;
+                            while (pos < inner.size() && brace_depth > 0) {
+                                if (inner[pos] == '{') brace_depth++;
+                                else if (inner[pos] == '}') brace_depth--;
+                                pos++;
+                            }
+                            if (brace_depth == 0) {
+                                std::string json_obj = inner.substr(obj_start, pos - obj_start);
+                                if (json_obj.find("\"name\"") != std::string::npos) {
+                                    function_calls.push_back(json_obj);
+                                }
+                            }
+                        } else {
+                            pos++;
+                        }
+                    }
+                } else {
+                    size_t start = 0;
+                    int paren_depth = 0;
+
+                    for (size_t i = 0; i < inner.size(); ++i) {
+                        char c = inner[i];
+                        if (c == '(') {
+                            paren_depth++;
+                        } else if (c == ')' && paren_depth > 0) {
+                            paren_depth--;
+                        } else if (c == ',' && paren_depth == 0) {
+                            append_lfm2_call(inner.substr(start, i - start), function_calls);
+                            start = i + 1;
+                        }
                     }
-                }
 
-                if (start < inner.size()) {
-                    append_lfm2_call(inner.substr(start), function_calls);
+                    if (start < inner.size()) {
+                        append_lfm2_call(inner.substr(start), function_calls);
+                    }
                 }
             } else if (!content.empty()) {
                 append_lfm2_call(content, function_calls);
@@ -648,7 +910,7 @@ inline std::string construct_response_json(const std::string& regular_response,
                                            bool cloud_handoff = false) {
     std::ostringstream json;
     json << "{";
-    json << "\"success\":" << (cloud_handoff ? "false" : "true") << ",";
+    json << "\"success\":true,";
     json << "\"error\":null,";
     json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ",";
     json << "\"response\":\"" << escape_json_string(regular_response) << "\",";
@@ -671,30 +933,6 @@ inline std::string construct_response_json(const std::string& regular_response,
     return json.str();
 }
 
-inline std::string construct_cloud_handoff_json(float confidence,
-                                                 double time_to_first_token,
-                                                 double prefill_tps,
-                                                 size_t prompt_tokens) {
-    std::ostringstream json;
-    json << "{";
-    json << "\"success\":false,";
-    json << "\"error\":null,";
-    json << "\"cloud_handoff\":true,";
-    json << "\"response\":null,";
-    json << "\"function_calls\":[],";
-    json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ",";
-    json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
-    json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
-    json << "\"prefill_tps\":" << std::fixed << std::setprecision(2) << prefill_tps << ",";
-    json << "\"decode_tps\":0.0,";
-    json << "\"ram_usage_mb\":" << std::fixed << std::setprecision(2) << get_ram_usage_mb() << ",";
-    json << "\"prefill_tokens\":" << prompt_tokens << ",";
-    json << "\"decode_tokens\":0,";
-    json << "\"total_tokens\":" << prompt_tokens;
-    json << "}";
-    return json.str();
-}
-
 inline std::string serialize_function_calls(const std::vector<std::string>& calls) {
     if (calls.empty()) return "[]";
     std::ostringstream oss;
@@ -720,4 +958,4 @@ const char* cactus_get_last_error();
 }
 #endif
 
-#endif // CACTUS_UTILS_H
+#endif // CACTUS_UTILS_H
\ No newline at end of file
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h
index 620fab6..c8bf34a 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h
@@ -56,6 +56,12 @@ struct Config {
     uint32_t num_shared_experts = 0;
     uint32_t num_top_experts = 0;
     uint32_t moe_every_n_layers = 0;
+    uint32_t moe_intermediate_dim = 0;
+    uint32_t num_dense_layers = 0;
+    uint32_t num_experts_per_tok = 0;
+    bool norm_topk_prob = false;
+    bool use_expert_bias = false;
+    float routed_scaling_factor = 1.0f;
     bool tie_word_embeddings = true;
 
     uint32_t vision_hidden_dim = 0;
@@ -93,8 +99,22 @@ struct Config {
     uint32_t num_encoder_layers = 0;
     uint32_t num_decoder_layers = 0;
     float partial_rotary_factor = 0.0f;
-
-    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9};
+    uint32_t pad_token_id = 0;
+    uint32_t conv_kernel_size = 0;
+    uint32_t subsampling_conv_kernel_size = 0;
+    uint32_t subsampling_conv_stride = 0;
+    uint32_t subsampling_conv_channels = 0;
+    uint32_t subsampling_factor = 0;
+    uint32_t num_mel_bins = 80;
+    std::string encoder_hidden_act = "silu";
+    uint32_t predictor_hidden_dim = 0;
+    uint32_t predictor_num_layers = 0;
+    uint32_t tdt_joint_dim = 0;
+    uint32_t tdt_num_durations = 0;
+    uint32_t tdt_blank_id = 0;
+    std::vector<uint32_t> tdt_durations;
+
+    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11};
     ModelType model_type = ModelType::QWEN;
 
     enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -168,7 +188,7 @@ class Tokenizer {
     uint32_t get_global_img_token_id() const { return global_img_token_id_; }
 
 protected:
-    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER};
+    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET};
     ModelType model_type_ = ModelType::UNKNOWN;
     enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
     ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -366,7 +386,6 @@ struct KVCache {
                          size_t num_tokens, size_t kv_heads, size_t head_dim);
 
     bool is_empty() const { return current_seq_len == 0; }
-    bool is_int8() const { return precision == Precision::INT8; }
     void* get_key_ptr(size_t layer);
     void* get_value_ptr(size_t layer);
 
@@ -684,6 +703,8 @@ class AudioProcessor {
         float reference = 1.0f;
         float min_value = 1e-10f;
         bool remove_dc_offset = false;
+        float preemphasis = 0.0f;
+        bool hann_periodic = true;
     };
 
     AudioProcessor();
@@ -696,6 +717,11 @@ class AudioProcessor {
         const std::vector<float>& waveform,
         const SpectrogramConfig& config);
 
+    static std::vector<float> compute_irfft(
+        const std::vector<float>& complex_input,
+        size_t n,
+        const char* norm = "backward");
+
     const std::vector<float>& get_mel_filters() const { return mel_filters_; }
 
     size_t get_num_mel_filters() const { return num_mel_filters_; }
@@ -721,6 +747,8 @@ namespace index {
     struct QueryResult {
         int doc_id;
         float score;
+
+        QueryResult(int doc_id, float score) : doc_id(doc_id), score(score) {}  // value ctor: parameters shadow the members, the init-list targets are the members; declaring it removes aggregate init and the implicit default ctor
     };
 
     struct QueryOptions {
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h
index 255d83c..01b7b2f 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h
@@ -6,6 +6,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <functional>
+#include <cassert>
 #include <cstring>
 #include <stdexcept>
 #include <string>
@@ -109,23 +110,33 @@ enum class ComputeBackend {
     NPU
 };
 
+enum class Activation {  // activation selector; stored in OpParams::activation (default SILU) and taken by the CactusGraph::moe_layer overload that has no w3 weights
+    SILU,
+    GELU,
+    GELU_ERF,
+    RELU,
+    SIGMOID,
+    TANH
+};
+
 enum class OpType {
     INPUT, PRECISION_CAST,
     ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
     MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
     BILINEAR_INTERPOLATION,
     SUM, MEAN, VARIANCE, MIN, MAX,
-    RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D,
-    SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN,
+    RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM,
+    SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG,
     RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH,
     SAMPLE, CONCAT,
     SCATTER_TOPK,
     TOPK, LAYERNORM, GROUPNORM,
+    MOE_LAYER,
     INDEX,
     PERSISTENT,
     QUANTIZE_ACTIVATIONS,
     LSTM_CELL,
-    STFT_MAGNITUDE
+    STFT
 };
 
 struct PrecisionTraits {
@@ -141,11 +152,20 @@ struct PrecisionTraits {
 
     static constexpr size_t packed_size_of(Precision prec, size_t count) {
         switch (prec) {
-            case Precision::INT4: return (count + 1) / 2;  
+            case Precision::INT4: return (count + 1) / 2;
             default: return count * size_of(prec);
         }
     }
 
+    static size_t byte_offset_of(Precision prec, size_t element_offset) {  // converts an element offset into a byte offset for the given precision
+        switch (prec) {
+            case Precision::INT4:  // two 4-bit elements share one byte, matching packed_size_of
+                assert(element_offset % 32 == 0 && "INT4 byte offset must be group-aligned (multiple of 32)");  // debug-only check; compiled out when NDEBUG is defined
+                return element_offset / 2;
+            default: return element_offset * size_of(prec);  // every other precision scales by size_of(prec)
+        }
+    }
+
     static constexpr bool is_integer(Precision prec) {
         switch (prec) {
             case Precision::INT8: return true;
@@ -181,7 +201,6 @@ struct TensorConfig {
     Precision compute_precision = Precision::INT8;
     Precision output_precision = Precision::INT8;
     bool auto_mixed_precision = false;
-    bool enable_int4_packing = true;
     
     static TensorConfig& global();
 };
@@ -243,6 +262,10 @@ struct BufferDesc {
         return precision == Precision::INT8 && group_size > 0;
     }
 
+    bool is_grouped_int4() const {  // true when the buffer is INT4 and a non-zero group size has been set
+        return precision == Precision::INT4 && group_size > 0;
+    }
+
     void set_grouped_scales(size_t gs, size_t ng, void* scales_ptr) {
         group_size = gs;
         num_groups = ng;
@@ -291,6 +314,7 @@ struct OpParams {
     size_t slice_length = 0;
     size_t window_size = 0;
     bool is_causal = true;  
+    bool attention_mask_is_additive = false;
     std::vector<size_t> new_shape;
     std::vector<size_t> permutation;
     Precision output_precision = Precision::INT8;
@@ -309,6 +333,11 @@ struct OpParams {
     size_t num_groups = 0;
     size_t dst_height = 0;
     size_t dst_width = 0;
+    bool normalize_routing = false;
+    size_t num_experts = 0;
+    size_t num_experts_per_tok = 0;
+    bool moe_gated = true; 
+    Activation activation = Activation::SILU;
 
     std::vector<float> bias_values;
     std::vector<uint32_t> bias_indices;
@@ -356,7 +385,6 @@ void compute_index_node(GraphNode& node, const std::vector<std::unique_ptr<Graph
 void compute_lstm_cell_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 
 void shrink_thread_local_buffers();
-
 class BufferPool {
 public:
     BufferPool() = default;
@@ -418,6 +446,7 @@ class CactusGraph {
     size_t scalar_sqrt(size_t input);
     size_t scalar_cos(size_t input);
     size_t scalar_sin(size_t input);
+    size_t scalar_log(size_t input);
     
     size_t relu(size_t input);
     size_t silu(size_t input);
@@ -425,6 +454,7 @@ class CactusGraph {
     size_t gelu_erf(size_t input);
     size_t sigmoid(size_t input);
     size_t tanh(size_t input);
+    size_t glu(size_t input, int axis = -1);
     
     size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
     size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
@@ -455,7 +485,30 @@ class CactusGraph {
     size_t layernorm(size_t input, size_t weight, size_t bias, float epsilon = 1e-5f);
     size_t layernorm(size_t input, size_t weight, float epsilon = 1e-5f);  // No bias version
     size_t groupnorm(size_t input, size_t weight, size_t bias, size_t num_groups = 32, float epsilon = 1e-5f);
+    size_t batchnorm(size_t input, size_t weight, size_t bias, size_t running_mean, size_t running_var, int axis = 1, float epsilon = 1e-5f);
     size_t topk(size_t input, size_t k);
+    size_t moe_layer(size_t hidden,
+                     size_t routing_probs,
+                     size_t topk_indices,
+                     const std::vector<size_t>& w1_weights,
+                     const std::vector<size_t>& w3_weights,
+                     const std::vector<size_t>& w2_weights,
+                     size_t num_experts,
+                     size_t num_experts_per_tok,
+                     bool normalize_routing,
+                     float epsilon,
+                     float routed_scaling_factor);
+    size_t moe_layer(size_t hidden,
+                     size_t routing_probs,
+                     size_t topk_indices,
+                     const std::vector<size_t>& w1_weights,
+                     const std::vector<size_t>& w2_weights,
+                     size_t num_experts,
+                     size_t num_experts_per_tok,
+                     bool normalize_routing,
+                     float epsilon,
+                     float routed_scaling_factor,
+                     Activation activation);
     size_t rms_norm(size_t input, size_t weight, float epsilon = 1e-5f);
     size_t rope(size_t input, float theta, size_t position_offset = 0, ComputeBackend backend = ComputeBackend::CPU);
     size_t rope_gptj(size_t input, float theta, size_t position_offset = 0, size_t rot_dim = 0, ComputeBackend backend = ComputeBackend::CPU);
@@ -463,6 +516,10 @@ class CactusGraph {
     size_t attention(size_t query, size_t key, size_t value, float scale, bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
+    size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale,
+                            bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU,
+                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0);
+    size_t rel_pos_bias(size_t query, size_t relative_key, float scale);
 
     size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset,
                                  const int8_t* cached_keys, const int8_t* cached_values,
@@ -474,9 +531,19 @@ class CactusGraph {
     size_t conv1d_k7s3(size_t input, size_t weight, size_t bias);
     size_t conv1d(size_t input, size_t weight, size_t stride);
     size_t conv1d(size_t input, size_t weight, size_t bias, size_t stride);
+    size_t conv1d_same_depthwise_k9(size_t input, size_t weight);
+    size_t conv1d_same_depthwise_k9(size_t input, size_t weight, size_t bias);
+    size_t conv1d_pointwise(size_t input, size_t weight);
+    size_t conv1d_pointwise(size_t input, size_t weight, size_t bias);
+    size_t conv2d_k3s2p1(size_t input, size_t weight);
+    size_t conv2d_k3s2p1(size_t input, size_t weight, size_t bias);
+    size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight);
+    size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight, size_t bias);
+    size_t conv2d_pointwise_1x1(size_t input, size_t weight);
+    size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias);
 
     size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh);
-    size_t stft_magnitude(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
+    size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
 
     size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20,
                   const std::unordered_map<uint32_t, float>& logit_bias = {});
@@ -581,12 +648,9 @@ namespace GraphFile {
         bool is_interleaved_ = false;
         size_t original_N_ = 0;
 
-        std::unique_ptr<int8_t[]> unpacked_data_;  
-
         void parse_header();
         void apply_madvise_hints();
-        void unpack_int4_data();
     };
 }
 
-#endif 
+#endif
\ No newline at end of file
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h
index 17acd36..0ec7265 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h
@@ -4,6 +4,8 @@
 #include <cstddef>
 #include <arm_neon.h>
 
+enum class Precision;
+
 enum class ScalarOpType {
     ADD,
     SUBTRACT,
@@ -12,7 +14,8 @@ enum class ScalarOpType {
     EXP,
     SQRT,
     COS,
-    SIN
+    SIN,
+    LOG
 };
 
 constexpr size_t KV_QUANT_GROUP_SIZE = 32;
@@ -21,6 +24,7 @@ void cactus_add_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num
 void cactus_add_f16_clipped(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
 void cactus_subtract_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
 void cactus_multiply_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
+void cactus_add_scaled_f16(const __fp16* base, const __fp16* src, __fp16* output, size_t num_elements, float scale);
 void cactus_divide_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
 
 void cactus_add_broadcast_f16(const __fp16* a, const __fp16* b, __fp16* output,
@@ -50,6 +54,23 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales,
                         const int8_t* B, const __fp16* B_scales,
                         __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
 
+void cactus_gemv_int4(const int8_t* A, float A_scale,
+                      const int8_t* B_packed, const __fp16* B_scales,
+                      __fp16* C, size_t K, size_t N, size_t group_size);
+
+void cactus_gemm_int4(const int8_t* A, const float* A_scales,
+                      const int8_t* B_packed, const __fp16* B_scales,
+                      __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
+void cactus_matmul_int4(const int8_t* A, const float* A_scales,
+                        const int8_t* B_packed, const __fp16* B_scales,
+                        __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
+void cactus_matmul_integer(Precision precision,
+                            const int8_t* A, const float* A_scales,
+                            const int8_t* B, const __fp16* B_scales,
+                            __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
+
 void cactus_matmul_f16(const __fp16* a, const __fp16* b_transposed, __fp16* c,
                        size_t M, size_t K, size_t N);
 
@@ -97,10 +118,52 @@ void cactus_sigmoid_f16(const __fp16* input, __fp16* output, size_t num_elements
 
 void cactus_tanh_f16(const __fp16* input, __fp16* output, size_t num_elements);
 
+void cactus_glu_f16(
+    const __fp16* input,
+    __fp16* output,
+    size_t outer_size,
+    size_t split_size,
+    size_t inner_size
+);
+
+void cactus_glu_f32(
+    const float* input,
+    float* output,
+    size_t outer_size,
+    size_t split_size,
+    size_t inner_size
+);
+
+void cactus_batchnorm_f16(
+    const __fp16* input,
+    const float* weight,
+    const float* bias,
+    const float* running_mean,
+    const float* running_var,
+    __fp16* output,
+    size_t outer_size,
+    size_t channels,
+    size_t inner_size,
+    float epsilon
+);
+
+void cactus_batchnorm_f32(
+    const float* input,
+    const float* weight,
+    const float* bias,
+    const float* running_mean,
+    const float* running_var,
+    float* output,
+    size_t outer_size,
+    size_t channels,
+    size_t inner_size,
+    float epsilon
+);
+
 void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output,
                           size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads,
                           size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0,
-                          bool is_causal = true);
+                          bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false);
 
 void cactus_attention_hybrid_int8_fp16(
     const __fp16* queries,        
@@ -150,7 +213,7 @@ void cactus_conv1d_f16(
     size_t stride
 );
 
-void cactus_stft_magnitude_f16(
+void cactus_stft_f16(
     const __fp16* input,
     const __fp16* weight,
     __fp16* output,
@@ -171,6 +234,62 @@ void cactus_conv1d_f16_k7s3_oc8(
     size_t C_out
 );
 
+void cactus_conv1d_same_depthwise_f16_k9(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t L,
+    size_t C
+);
+
+void cactus_conv2d_f16_k3s2p1_nchw(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C_in,
+    size_t H,
+    size_t W,
+    size_t C_out
+);
+
+void cactus_conv2d_depthwise_f16_k3s2p1_nchw(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C,
+    size_t H,
+    size_t W
+);
+
+void cactus_conv2d_pointwise_f16_1x1_nchw_gemm(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t C_in,
+    size_t H,
+    size_t W,
+    size_t C_out
+);
+
+void cactus_conv1d_pointwise_f16_gemm(
+    const __fp16* input,
+    const __fp16* weight,
+    const __fp16* bias,
+    __fp16* output,
+    size_t N,
+    size_t L,
+    size_t C_in,
+    size_t C_out
+);
+
 void cactus_bilinear_interpolation_f16(const __fp16* input, __fp16* output, size_t src_height, size_t src_width, size_t embed_dim,
                                        size_t dst_height, size_t dst_width);
 
@@ -224,4 +343,4 @@ void cactus_lstm_cell_f16(
     size_t hidden_size
 );
 
-#endif
+#endif
\ No newline at end of file
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h
index ac49d05..118c85c 100644
--- a/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h
+++ b/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h
@@ -44,6 +44,34 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
 #endif
 }
 
+inline bool cpu_has_sme2() {  // runtime probe for the Arm SME2 feature; the probe runs once and its result is cached in a function-local static
+#if defined(__aarch64__)
+	static std::once_flag once;  // guards the one-time probe below
+	static bool has = false;  // stays false unless one of the platform probes sets it
+	
+	std::call_once(once, []() {
+
+#if defined(__APPLE__)
+	int ret = 0;
+	size_t size = sizeof(ret);
+	if (sysctlbyname("hw.optional.arm.FEAT_SME2", &ret, &size, nullptr, 0) == 0) {  // Apple: query the sysctl; if the call fails 'has' keeps its default
+		has = ret == 1;
+	}
+
+#elif defined(__ANDROID__)
+	unsigned long hwcap2 = getauxval(AT_HWCAP2);  // Android: read the AT_HWCAP2 auxiliary-vector entry
+#ifdef HWCAP2_SME2
+	has = (hwcap2 & HWCAP2_SME2) != 0;  // only tested when the toolchain headers define HWCAP2_SME2
+#endif
+
+#endif
+	});
+	
+	return has;  // AArch64 targets that are neither Apple nor Android run an empty probe and report false
+#else
+	return false;  // non-AArch64 builds never report SME2
+#endif
+}
 
 inline float32x4_t fast_exp_f32x4(float32x4_t x) {
     const float32x4_t log2e = vdupq_n_f32(1.4426950408889634f);
@@ -102,6 +130,12 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
     return result;
 }
 
+inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) {  // expands 16 packed bytes at ptr into two int8x16 vectors holding the sign-extended high and low nibbles
+    int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr));  // load 16 bytes and reinterpret them as signed lanes
+    high_decoded = vshrq_n_s8(packed, 4);  // arithmetic shift right keeps the upper nibble, sign-extended
+    low_decoded = vshrq_n_s8(vshlq_n_s8(packed, 4), 4);  // move the lower nibble to the top, then shift back down to sign-extend it
+}
+
 namespace CactusThreading {
 
     class ThreadPool {
@@ -297,7 +331,7 @@ namespace CactusThreading {
         }
         static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
             if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
-            return std::min(pool_size, static_cast<size_t>(2));
+            return std::min(pool_size, static_cast<size_t>(3));
         }
         #else 
         static constexpr size_t GEMV_MIN_N_BLOCKS = 256;  
@@ -308,7 +342,7 @@ namespace CactusThreading {
         static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
             if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
             if (N_blocks < 512) return std::min(pool_size, static_cast<size_t>(2));
-            return std::min(pool_size, static_cast<size_t>(4));
+            return std::min(pool_size, static_cast<size_t>(5));
         }
         #endif
     };
@@ -465,4 +499,4 @@ namespace CactusThreading {
 }
 
 
-#endif // KERNEL_UTILS_H 
\ No newline at end of file
+#endif // KERNEL_UTILS_H 
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist b/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist
index db7b528..ba87ea8 100644
Binary files a/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist and b/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist differ
diff --git a/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus b/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus
index fdd1651..e521921 100755
Binary files a/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus and b/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus differ
diff --git a/nitrogen/generated/shared/c++/HybridCactusSpec.cpp b/nitrogen/generated/shared/c++/HybridCactusSpec.cpp
index 775ceb8..e3b9763 100644
--- a/nitrogen/generated/shared/c++/HybridCactusSpec.cpp
+++ b/nitrogen/generated/shared/c++/HybridCactusSpec.cpp
@@ -19,6 +19,7 @@ namespace margelo::nitro::cactus {
       prototype.registerHybridMethod("tokenize", &HybridCactusSpec::tokenize);
       prototype.registerHybridMethod("scoreWindow", &HybridCactusSpec::scoreWindow);
       prototype.registerHybridMethod("transcribe", &HybridCactusSpec::transcribe);
+      prototype.registerHybridMethod("detectLanguage", &HybridCactusSpec::detectLanguage);
       prototype.registerHybridMethod("streamTranscribeStart", &HybridCactusSpec::streamTranscribeStart);
       prototype.registerHybridMethod("streamTranscribeProcess", &HybridCactusSpec::streamTranscribeProcess);
       prototype.registerHybridMethod("streamTranscribeStop", &HybridCactusSpec::streamTranscribeStop);
diff --git a/nitrogen/generated/shared/c++/HybridCactusSpec.hpp b/nitrogen/generated/shared/c++/HybridCactusSpec.hpp
index 9ebee13..f44ffbe 100644
--- a/nitrogen/generated/shared/c++/HybridCactusSpec.hpp
+++ b/nitrogen/generated/shared/c++/HybridCactusSpec.hpp
@@ -58,6 +58,7 @@ namespace margelo::nitro::cactus {
       virtual std::shared_ptr<Promise<std::vector<double>>> tokenize(const std::string& text) = 0;
       virtual std::shared_ptr<Promise<std::string>> scoreWindow(const std::vector<double>& tokens, double start, double end, double context) = 0;
       virtual std::shared_ptr<Promise<std::string>> transcribe(const std::variant<std::vector<double>, std::string>& audio, const std::string& prompt, double responseBufferSize, const std::optional<std::string>& optionsJson, const std::optional<std::function<void(const std::string& /* token */, double /* tokenId */)>>& callback) = 0;
+      virtual std::shared_ptr<Promise<std::string>> detectLanguage(const std::variant<std::vector<double>, std::string>& audio, double responseBufferSize, const std::optional<std::string>& optionsJson) = 0;
       virtual std::shared_ptr<Promise<void>> streamTranscribeStart(const std::optional<std::string>& optionsJson) = 0;
       virtual std::shared_ptr<Promise<std::string>> streamTranscribeProcess(const std::vector<double>& audio) = 0;
       virtual std::shared_ptr<Promise<std::string>> streamTranscribeStop() = 0;
diff --git a/package.json b/package.json
index f628d38..9707d4b 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "cactus-react-native",
-  "version": "1.7.0",
+  "version": "1.10.0",
   "description": "Run AI models locally on mobile devices",
   "main": "./lib/module/index.js",
   "types": "./lib/typescript/src/index.d.ts",
diff --git a/src/classes/CactusSTT.ts b/src/classes/CactusSTT.ts
index ac9c9a4..8dbdeb6 100644
--- a/src/classes/CactusSTT.ts
+++ b/src/classes/CactusSTT.ts
@@ -10,6 +10,8 @@ import type {
   CactusSTTStreamTranscribeProcessParams,
   CactusSTTStreamTranscribeProcessResult,
   CactusSTTStreamTranscribeStopResult,
+  CactusSTTDetectLanguageParams,
+  CactusSTTDetectLanguageResult,
 } from '../types/CactusSTT';
 import { getRegistry } from '../modelRegistry';
 import type { CactusModel } from '../types/common';
@@ -179,6 +181,37 @@ export class CactusSTT {
     }
   }
 
+  /**
+   * Detects the language of the supplied audio.
+   *
+   * Throws if the instance is already flagged as generating. Otherwise it
+   * awaits `init()`, then forwards `audio` and `options` to
+   * `this.cactus.detectLanguage` while `isGenerating` is held true; the flag
+   * is cleared in `finally` whether the call resolves or rejects.
+   *
+   * NOTE(review): the guard is checked before `await this.init()` but the flag
+   * is only set afterwards, so overlapping calls can both pass the guard.
+   */
+  public async detectLanguage(
+    params: CactusSTTDetectLanguageParams
+  ): Promise<CactusSTTDetectLanguageResult> {
+    const { audio, options } = params;
+
+    if (this.isGenerating) {
+      throw new Error('CactusSTT is already generating');
+    }
+
+    await this.init();
+
+    this.isGenerating = true;
+    try {
+      const detected = await this.cactus.detectLanguage(audio, options);
+      return detected;
+    } finally {
+      this.isGenerating = false;
+    }
+  }
+
   public async audioEmbed({
     audioPath,
   }: CactusSTTAudioEmbedParams): Promise<CactusSTTAudioEmbedResult> {
diff --git a/src/index.tsx b/src/index.tsx
index ea4435d..113e341 100644
--- a/src/index.tsx
+++ b/src/index.tsx
@@ -41,6 +41,9 @@ export type {
   CactusSTTStreamTranscribeProcessParams,
   CactusSTTStreamTranscribeProcessResult,
   CactusSTTStreamTranscribeStopResult,
+  CactusSTTDetectLanguageOptions,
+  CactusSTTDetectLanguageParams,
+  CactusSTTDetectLanguageResult,
 } from './types/CactusSTT';
 export type {
   CactusVADParams,
diff --git a/src/native/Cactus.ts b/src/native/Cactus.ts
index e41a88a..4608b12 100644
--- a/src/native/Cactus.ts
+++ b/src/native/Cactus.ts
@@ -13,6 +13,8 @@ import type {
   CactusSTTStreamTranscribeStartOptions,
   CactusSTTStreamTranscribeProcessResult,
   CactusSTTStreamTranscribeStopResult,
+  CactusSTTDetectLanguageOptions,
+  CactusSTTDetectLanguageResult,
 } from '../types/CactusSTT';
 import type { CactusVADOptions, CactusVADResult } from '../types/CactusVAD';
 
@@ -228,6 +230,53 @@ export class Cactus {
     }
   }
 
+  /**
+   * Runs native language detection and returns the parsed result.
+   *
+   * @param audio Either a string, from which a leading `file://` scheme is
+   *   stripped before the native call, or an array of numbers that is passed
+   *   through unchanged.
+   * @param options Optional settings; `useVad` is serialised as `use_vad`.
+   *   When omitted, no options JSON is sent.
+   * @returns The `language` and `confidence` fields of the native JSON
+   *   response.
+   * @throws Error `Unable to parse detect language response` when parsing the
+   *   native response fails. Rejections from the native call itself propagate
+   *   unchanged.
+   */
+  public async detectLanguage(
+    audio: string | number[],
+    options?: CactusSTTDetectLanguageOptions
+  ): Promise<CactusSTTDetectLanguageResult> {
+    if (typeof audio === 'string') {
+      // Strip the URI scheme only when it is a prefix; a plain string
+      // `replace` would also remove a `file://` occurring later in the path.
+      audio = audio.replace(/^file:\/\//, '');
+    }
+
+    const optionsJson = options
+      ? JSON.stringify({ use_vad: options.useVad })
+      : undefined;
+
+    // 1024 is the size of the native response buffer.
+    const response = await this.hybridCactus.detectLanguage(
+      audio,
+      1024,
+      optionsJson
+    );
+
+    try {
+      const parsed = JSON.parse(response);
+
+      return {
+        language: parsed.language,
+        confidence: parsed.confidence,
+      };
+    } catch {
+      throw new Error('Unable to parse detect language response');
+    }
+  }
+
   public async streamTranscribeStop(): Promise<CactusSTTStreamTranscribeStopResult> {
     const response = await this.hybridCactus.streamTranscribeStop();
     try {
diff --git a/src/specs/Cactus.nitro.ts b/src/specs/Cactus.nitro.ts
index 8a36be0..ea5daa6 100644
--- a/src/specs/Cactus.nitro.ts
+++ b/src/specs/Cactus.nitro.ts
@@ -27,6 +27,19 @@ export interface Cactus extends HybridObject<{ ios: 'c++'; android: 'c++' }> {
     optionsJson?: string,
     callback?: (token: string, tokenId: number) => void
   ): Promise<string>;
+  /**
+   * Detects the language of `audio`; resolves with the native response as a
+   * raw string (the JS wrapper parses it as JSON).
+   *
+   * `audio` is either a string or an array of numbers. `responseBufferSize`
+   * sizes the buffer the native side writes its response into, and
+   * `optionsJson` is an optional JSON-encoded options string.
+   */
+  detectLanguage(
+    audio: string | number[],
+    responseBufferSize: number,
+    optionsJson?: string
+  ): Promise<string>;
   streamTranscribeStart(optionsJson?: string): Promise<void>;
   streamTranscribeProcess(audio: number[]): Promise<string>;
   streamTranscribeStop(): Promise<string>;
diff --git a/src/types/CactusSTT.ts b/src/types/CactusSTT.ts
index 28fe275..6686045 100644
--- a/src/types/CactusSTT.ts
+++ b/src/types/CactusSTT.ts
@@ -86,3 +86,31 @@ export interface CactusSTTStreamTranscribeStopResult {
   success: boolean;
   confirmed: string;
 }
+
+/** Optional settings for language detection. */
+export interface CactusSTTDetectLanguageOptions {
+  /**
+   * Serialised as `use_vad` in the options JSON sent to the native layer.
+   * Presumably toggles voice-activity detection - TODO confirm.
+   */
+  useVad?: boolean;
+}
+
+/** Arguments accepted by `CactusSTT.detectLanguage`. */
+export interface CactusSTTDetectLanguageParams {
+  /**
+   * Audio input: either a string (`file://` is stripped from it before the
+   * native call) or an array of numbers.
+   */
+  audio: string | number[];
+  /** Detection settings; may be omitted. */
+  options?: CactusSTTDetectLanguageOptions;
+}
+
+/** Result of language detection, read from the native JSON response. */
+export interface CactusSTTDetectLanguageResult {
+  /** The response's `language` field. */
+  language: string;
+  /** The response's `confidence` field; may be absent. */
+  confidence?: number;
+}