From df228cad6189a4e004e9691499b64efe94f4e872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Mon, 22 Jun 2026 14:01:40 +0200 Subject: [PATCH 1/2] feat(nlp): add tokenizer pipeline (worklet host object) [#1248] --- .../android/CMakeLists.txt | 9 + .../cpp/RnExecutorch.cpp | 2 + .../cpp/extensions/nlp/install.cpp | 8 + .../cpp/extensions/nlp/install.h | 7 + .../cpp/extensions/nlp/tokenizer.cpp | 265 ++++++++++++++++++ .../cpp/extensions/nlp/tokenizer.h | 29 ++ .../react-native-executorch.podspec | 7 + .../src/extensions/nlp/index.ts | 2 + .../src/extensions/nlp/ops/tokenizer.ts | 52 ++++ .../src/extensions/nlp/tasks/tokenizer.ts | 74 +++++ .../src/hooks/useTokenizer.ts | 43 +++ packages/react-native-executorch/src/index.ts | 3 + .../react-native-executorch/src/models.ts | 11 + .../third-party/README.md | 5 +- 14 files changed, 516 insertions(+), 1 deletion(-) create mode 100644 packages/react-native-executorch/cpp/extensions/nlp/install.cpp create mode 100644 packages/react-native-executorch/cpp/extensions/nlp/install.h create mode 100644 packages/react-native-executorch/cpp/extensions/nlp/tokenizer.cpp create mode 100644 packages/react-native-executorch/cpp/extensions/nlp/tokenizer.h create mode 100644 packages/react-native-executorch/src/extensions/nlp/index.ts create mode 100644 packages/react-native-executorch/src/extensions/nlp/ops/tokenizer.ts create mode 100644 packages/react-native-executorch/src/extensions/nlp/tasks/tokenizer.ts create mode 100644 packages/react-native-executorch/src/hooks/useTokenizer.ts diff --git a/packages/react-native-executorch/android/CMakeLists.txt b/packages/react-native-executorch/android/CMakeLists.txt index 14a06d7546..9fe4ff2f7f 100644 --- a/packages/react-native-executorch/android/CMakeLists.txt +++ b/packages/react-native-executorch/android/CMakeLists.txt @@ -18,12 +18,14 @@ find_package(ReactAndroid REQUIRED CONFIG) file(GLOB CORE_SOURCES ../cpp/core/*.cpp) file(GLOB MATH_SOURCES 
../cpp/extensions/math/*.cpp) file(GLOB CV_SOURCES ../cpp/extensions/cv/*.cpp) +file(GLOB NLP_SOURCES ../cpp/extensions/nlp/*.cpp) add_library(${CMAKE_PROJECT_NAME} SHARED ../cpp/RnExecutorch.cpp ${CORE_SOURCES} ${CV_SOURCES} ${MATH_SOURCES} + ${NLP_SOURCES} cpp-adapter.cpp ) @@ -35,6 +37,13 @@ target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ../cpp ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include + # pytorch/tokenizers headers (and the third-party libs they pull in: + # nlohmann/json, re2 and its abseil dependency) ship inside the ExecuTorch + # llm extension bundle + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include/executorch/extension/llm/tokenizers/include + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include/executorch/extension/llm/tokenizers/third-party/json/include + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include/executorch/extension/llm/tokenizers/third-party/re2 + ${CMAKE_CURRENT_SOURCE_DIR}/../third-party/include/executorch/extension/llm/tokenizers/third-party/abseil-cpp ) # Ensure executorch is linked with WHOLE_ARCHIVE so all static initializers are included diff --git a/packages/react-native-executorch/cpp/RnExecutorch.cpp b/packages/react-native-executorch/cpp/RnExecutorch.cpp index e9724e5820..3b71b61775 100644 --- a/packages/react-native-executorch/cpp/RnExecutorch.cpp +++ b/packages/react-native-executorch/cpp/RnExecutorch.cpp @@ -3,6 +3,7 @@ #include "core/install.h" #include "extensions/cv/install.h" #include "extensions/math/install.h" +#include "extensions/nlp/install.h" using namespace facebook; @@ -13,6 +14,7 @@ void install(jsi::Runtime &jsiRuntime) { rnexecutorch::core::install(jsiRuntime, module); rnexecutorch::extensions::cv::install(jsiRuntime, module); rnexecutorch::extensions::math::install(jsiRuntime, module); + rnexecutorch::extensions::nlp::install(jsiRuntime, module); jsiRuntime.global().setProperty(jsiRuntime, "__rnexecutorch_jsi__", std::move(module)); } 
diff --git a/packages/react-native-executorch/cpp/extensions/nlp/install.cpp b/packages/react-native-executorch/cpp/extensions/nlp/install.cpp new file mode 100644 index 0000000000..760520c180 --- /dev/null +++ b/packages/react-native-executorch/cpp/extensions/nlp/install.cpp @@ -0,0 +1,8 @@ +#include "install.h" +#include "tokenizer.h" + +namespace rnexecutorch::extensions::nlp { +void install(facebook::jsi::Runtime &rt, facebook::jsi::Object &module) { + tokenizer::install_loadTokenizer(rt, module); +} +} // namespace rnexecutorch::extensions::nlp diff --git a/packages/react-native-executorch/cpp/extensions/nlp/install.h b/packages/react-native-executorch/cpp/extensions/nlp/install.h new file mode 100644 index 0000000000..e2afefd727 --- /dev/null +++ b/packages/react-native-executorch/cpp/extensions/nlp/install.h @@ -0,0 +1,7 @@ +#pragma once + +#include + +namespace rnexecutorch::extensions::nlp { +void install(facebook::jsi::Runtime &rt, facebook::jsi::Object &module); +} // namespace rnexecutorch::extensions::nlp diff --git a/packages/react-native-executorch/cpp/extensions/nlp/tokenizer.cpp b/packages/react-native-executorch/cpp/extensions/nlp/tokenizer.cpp new file mode 100644 index 0000000000..8513cad2cb --- /dev/null +++ b/packages/react-native-executorch/cpp/extensions/nlp/tokenizer.cpp @@ -0,0 +1,265 @@ +#include "tokenizer.h" + +#include +#include + +#include + +namespace rnexecutorch::extensions::nlp::tokenizer { +namespace jsi = facebook::jsi; + +namespace { +// Number of BOS/EOS tokens to add on top of what the tokenizer.json defines. +// Keeping these at 0 means encoding follows the tokenizer's own post_processor +// (i.e. special tokens are added exactly as configured in tokenizer.json). 
+constexpr uint64_t kNumAddedBosTokens = 0; +constexpr uint64_t kNumAddedEosTokens = 0; +} // namespace + +TokenizerHostObject::TokenizerHostObject(const std::string &tokenizerPath) + : tokenizerPath_(tokenizerPath), + tokenizer_(std::make_unique<tokenizers::HFTokenizer>()) { + auto error = tokenizer_->load(tokenizerPath_); + if (error != tokenizers::Error::Ok) { + throw std::runtime_error("Failed to load tokenizer from '" + tokenizerPath_ + + "': error " + std::to_string(static_cast(error))); + } +} + +jsi::Value TokenizerHostObject::get(jsi::Runtime &rt, const jsi::PropNameID &name) { + auto nameStr = name.utf8(rt); + + if (nameStr == "path") { + return jsi::String::createFromUtf8(rt, tokenizerPath_); + } + + if (nameStr == "encode") { + auto self = shared_from_this(); + auto fnBody = [self](jsi::Runtime &rt, const jsi::Value &thisVal, const jsi::Value *args, size_t count) -> jsi::Value { + if (count != 1) { + throw jsi::JSError(rt, "encode: Usage: encode(text)"); + } + + if (!args[0].isString()) { + throw jsi::JSError(rt, "encode: Expected arg0 to be a string"); + } + + std::unique_lock lock(self->mutex_, std::try_to_lock); + if (!lock.owns_lock()) { + throw jsi::JSError(rt, "encode: Tokenizer is currently in use"); + } + + if (!self->tokenizer_) { + throw jsi::JSError(rt, "encode: Tokenizer has been disposed"); + } + + auto text = args[0].asString(rt).utf8(rt); + auto result = self->tokenizer_->encode(text, kNumAddedBosTokens, kNumAddedEosTokens); + if (!result.ok()) { + throw jsi::JSError(rt, "encode: Failed to encode input: error " + + std::to_string(static_cast(result.error()))); + } + + const auto &ids = result.get(); + auto jsArray = jsi::Array(rt, ids.size()); + for (size_t i = 0; i < ids.size(); ++i) { + jsArray.setValueAtIndex(rt, i, static_cast(ids[i])); + } + + return jsArray; + }; + return jsi::Function::createFromHostFunction(rt, jsi::PropNameID::forAscii(rt, "encode"), 1, fnBody); + } + + if (nameStr == "decode") { + auto self = shared_from_this(); + auto fnBody = 
[self](jsi::Runtime &rt, const jsi::Value &thisVal, const jsi::Value *args, size_t count) -> jsi::Value { + if (count != 2) { + throw jsi::JSError(rt, "decode: Usage: decode(tokens, skipSpecialTokens)"); + } + + if (!args[0].isObject() || !args[0].asObject(rt).isArray(rt)) { + throw jsi::JSError(rt, "decode: Expected arg0 to be an array"); + } + + if (!args[1].isBool()) { + throw jsi::JSError(rt, "decode: Expected arg1 to be a boolean"); + } + + std::unique_lock lock(self->mutex_, std::try_to_lock); + if (!lock.owns_lock()) { + throw jsi::JSError(rt, "decode: Tokenizer is currently in use"); + } + + if (!self->tokenizer_) { + throw jsi::JSError(rt, "decode: Tokenizer has been disposed"); + } + + auto tokensArray = args[0].asObject(rt).asArray(rt); + auto skipSpecialTokens = args[1].asBool(); + + std::vector tokens; + tokens.reserve(tokensArray.size(rt)); + for (size_t i = 0; i < tokensArray.size(rt); ++i) { + auto val = tokensArray.getValueAtIndex(rt, i); + if (!val.isNumber()) { + throw jsi::JSError(rt, "decode: Expected tokens[" + std::to_string(i) + "] to be a number"); + } + tokens.push_back(static_cast(val.asNumber())); + } + + auto result = self->tokenizer_->decode(tokens, skipSpecialTokens); + if (!result.ok()) { + throw jsi::JSError(rt, "decode: Failed to decode tokens: error " + + std::to_string(static_cast(result.error()))); + } + + return jsi::String::createFromUtf8(rt, result.get()); + }; + return jsi::Function::createFromHostFunction(rt, jsi::PropNameID::forAscii(rt, "decode"), 2, fnBody); + } + + if (nameStr == "getVocabSize") { + auto self = shared_from_this(); + auto fnBody = [self](jsi::Runtime &rt, const jsi::Value &thisVal, const jsi::Value *args, size_t count) -> jsi::Value { + if (count != 0) { + throw jsi::JSError(rt, "getVocabSize: Usage: getVocabSize()"); + } + + std::unique_lock lock(self->mutex_, std::try_to_lock); + if (!lock.owns_lock()) { + throw jsi::JSError(rt, "getVocabSize: Tokenizer is currently in use"); + } + + if 
(!self->tokenizer_) { + throw jsi::JSError(rt, "getVocabSize: Tokenizer has been disposed"); + } + + return static_cast(self->tokenizer_->vocab_size()); + }; + return jsi::Function::createFromHostFunction(rt, jsi::PropNameID::forAscii(rt, "getVocabSize"), 0, fnBody); + } + + if (nameStr == "idToToken") { + auto self = shared_from_this(); + auto fnBody = [self](jsi::Runtime &rt, const jsi::Value &thisVal, const jsi::Value *args, size_t count) -> jsi::Value { + if (count != 1) { + throw jsi::JSError(rt, "idToToken: Usage: idToToken(id)"); + } + + if (!args[0].isNumber()) { + throw jsi::JSError(rt, "idToToken: Expected arg0 to be a number"); + } + + std::unique_lock lock(self->mutex_, std::try_to_lock); + if (!lock.owns_lock()) { + throw jsi::JSError(rt, "idToToken: Tokenizer is currently in use"); + } + + if (!self->tokenizer_) { + throw jsi::JSError(rt, "idToToken: Tokenizer has been disposed"); + } + + auto tokenId = static_cast(args[0].asNumber()); + auto result = self->tokenizer_->id_to_piece(tokenId); + if (!result.ok()) { + throw jsi::JSError(rt, "idToToken: Failed to convert id to token: error " + + std::to_string(static_cast(result.error()))); + } + + return jsi::String::createFromUtf8(rt, result.get()); + }; + return jsi::Function::createFromHostFunction(rt, jsi::PropNameID::forAscii(rt, "idToToken"), 1, fnBody); + } + + if (nameStr == "tokenToId") { + auto self = shared_from_this(); + auto fnBody = [self](jsi::Runtime &rt, const jsi::Value &thisVal, const jsi::Value *args, size_t count) -> jsi::Value { + if (count != 1) { + throw jsi::JSError(rt, "tokenToId: Usage: tokenToId(token)"); + } + + if (!args[0].isString()) { + throw jsi::JSError(rt, "tokenToId: Expected arg0 to be a string"); + } + + std::unique_lock lock(self->mutex_, std::try_to_lock); + if (!lock.owns_lock()) { + throw jsi::JSError(rt, "tokenToId: Tokenizer is currently in use"); + } + + if (!self->tokenizer_) { + throw jsi::JSError(rt, "tokenToId: Tokenizer has been disposed"); + } + + auto 
token = args[0].asString(rt).utf8(rt); + auto result = self->tokenizer_->piece_to_id(token); + if (!result.ok()) { + throw jsi::JSError(rt, "tokenToId: Failed to convert token to id: error " + + std::to_string(static_cast(result.error()))); + } + + return static_cast(result.get()); + }; + return jsi::Function::createFromHostFunction(rt, jsi::PropNameID::forAscii(rt, "tokenToId"), 1, fnBody); + } + + if (nameStr == "dispose") { + auto self = shared_from_this(); + auto fnBody = [self](jsi::Runtime &rt, const jsi::Value &thisVal, const jsi::Value *args, size_t count) -> jsi::Value { + if (count != 0) { + throw jsi::JSError(rt, "dispose: Usage: dispose()"); + } + + std::unique_lock lock(self->mutex_); + + if (!self->tokenizer_) { + throw jsi::JSError(rt, "dispose: Tokenizer has already been disposed"); + } + + self->tokenizer_.reset(); + + return jsi::Value::undefined(); + }; + return jsi::Function::createFromHostFunction(rt, jsi::PropNameID::forAscii(rt, "dispose"), 0, fnBody); + } + + return jsi::Value::undefined(); +} + +std::vector<jsi::PropNameID> TokenizerHostObject::getPropertyNames(jsi::Runtime &rt) { + std::vector<jsi::PropNameID> properties; + properties.push_back(jsi::PropNameID::forAscii(rt, "path")); + properties.push_back(jsi::PropNameID::forAscii(rt, "encode")); + properties.push_back(jsi::PropNameID::forAscii(rt, "decode")); + properties.push_back(jsi::PropNameID::forAscii(rt, "getVocabSize")); + properties.push_back(jsi::PropNameID::forAscii(rt, "idToToken")); + properties.push_back(jsi::PropNameID::forAscii(rt, "tokenToId")); + properties.push_back(jsi::PropNameID::forAscii(rt, "dispose")); + return properties; +} + +void install_loadTokenizer(jsi::Runtime &rt, jsi::Object &module) { + auto name = "loadTokenizer"; + auto fnBody = [](jsi::Runtime &rt, const jsi::Value &thisVal, const jsi::Value *args, size_t count) -> jsi::Value { + if (count != 1) { + throw jsi::JSError(rt, "loadTokenizer: Usage: loadTokenizer(arg0)"); + } + + if (!args[0].isString()) { + throw jsi::JSError(rt, 
"loadTokenizer: Expected arg0 to be a string"); + } + + auto tokenizerPath = args[0].asString(rt).utf8(rt); + try { + auto tokenizerInstance = std::make_shared<TokenizerHostObject>(tokenizerPath); + return jsi::Object::createFromHostObject(rt, tokenizerInstance); + } catch (const std::exception &e) { + throw jsi::JSError(rt, std::string("loadTokenizer: ") + e.what()); + } + }; + auto fn = jsi::Function::createFromHostFunction(rt, jsi::PropNameID::forAscii(rt, name), 1, fnBody); + + module.setProperty(rt, name, fn); +} +} // namespace rnexecutorch::extensions::nlp::tokenizer diff --git a/packages/react-native-executorch/cpp/extensions/nlp/tokenizer.h b/packages/react-native-executorch/cpp/extensions/nlp/tokenizer.h new file mode 100644 index 0000000000..c85a804bb0 --- /dev/null +++ b/packages/react-native-executorch/cpp/extensions/nlp/tokenizer.h @@ -0,0 +1,29 @@ +#pragma once + +#include <memory> +#include <mutex> +#include <string> +#include <vector> + +#include <jsi/jsi.h> + +#include <pytorch/tokenizers/hf_tokenizer.h> + +namespace rnexecutorch::extensions::nlp::tokenizer { +class TokenizerHostObject : public facebook::jsi::HostObject, + public std::enable_shared_from_this<TokenizerHostObject> { +public: + // Loads the tokenizer from `tokenizerPath`; throws std::runtime_error on failure. 
+ explicit TokenizerHostObject(const std::string &tokenizerPath); + + facebook::jsi::Value get(facebook::jsi::Runtime &rt, const facebook::jsi::PropNameID &name) override; + std::vector<facebook::jsi::PropNameID> getPropertyNames(facebook::jsi::Runtime &rt) override; + +private: + std::string tokenizerPath_; + std::unique_ptr<tokenizers::HFTokenizer> tokenizer_; + std::mutex mutex_; +}; + +void install_loadTokenizer(facebook::jsi::Runtime &rt, facebook::jsi::Object &module); +} // namespace rnexecutorch::extensions::nlp::tokenizer diff --git a/packages/react-native-executorch/react-native-executorch.podspec b/packages/react-native-executorch/react-native-executorch.podspec index a7b7b43882..00e3b167ee 100644 --- a/packages/react-native-executorch/react-native-executorch.podspec +++ b/packages/react-native-executorch/react-native-executorch.podspec @@ -32,6 +32,13 @@ Pod::Spec.new do |s| "HEADER_SEARCH_PATHS" => [ "\"$(PODS_TARGET_SRCROOT)/cpp\"", "\"$(PODS_TARGET_SRCROOT)/third-party/include\"", + # pytorch/tokenizers headers (and the third-party libs they pull in: + # nlohmann/json, re2 and its abseil dependency) ship inside the ExecuTorch + # llm extension bundle + "\"$(PODS_TARGET_SRCROOT)/third-party/include/executorch/extension/llm/tokenizers/include\"", + "\"$(PODS_TARGET_SRCROOT)/third-party/include/executorch/extension/llm/tokenizers/third-party/json/include\"", + "\"$(PODS_TARGET_SRCROOT)/third-party/include/executorch/extension/llm/tokenizers/third-party/re2\"", + "\"$(PODS_TARGET_SRCROOT)/third-party/include/executorch/extension/llm/tokenizers/third-party/abseil-cpp\"", ].join(' '), "WARNING_CFLAGS" => "-Wno-documentation" diff --git a/packages/react-native-executorch/src/extensions/nlp/index.ts b/packages/react-native-executorch/src/extensions/nlp/index.ts new file mode 100644 index 0000000000..e6038624ef --- /dev/null +++ b/packages/react-native-executorch/src/extensions/nlp/index.ts @@ -0,0 +1,2 @@ +export * from './ops/tokenizer'; +export * from './tasks/tokenizer'; diff --git 
a/packages/react-native-executorch/src/extensions/nlp/ops/tokenizer.ts b/packages/react-native-executorch/src/extensions/nlp/ops/tokenizer.ts new file mode 100644 index 0000000000..464a7d2ff7 --- /dev/null +++ b/packages/react-native-executorch/src/extensions/nlp/ops/tokenizer.ts @@ -0,0 +1,52 @@ +import { rnexecutorchJsi } from '../../../native/bridge'; + +declare const tokenizerBrand: unique symbol; + +/** + * A native HuggingFace-compatible tokenizer instance, backed by a JSI host + * object living on the worklet runtime it was loaded on. + * + * All methods are synchronous and worklet-callable, mirroring the {@link Model} + * and {@link Tensor} primitives. For app-level usage prefer the asynchronous + * {@link createTokenizer} factory or the `useTokenizer` hook, which marshal + * these calls onto the worklet runtime for you. + * @category Types + */ +export type Tokenizer = { + readonly path: string; + + /** Encodes a string into an array of token ids (special tokens follow the tokenizer.json post_processor). */ + encode(text: string): number[]; + /** Decodes an array of token ids back into a string. */ + decode(tokens: number[], skipSpecialTokens: boolean): string; + /** Returns the size of the tokenizer's vocabulary. */ + getVocabSize(): number; + /** Returns the token string associated with the given id. */ + idToToken(id: number): string; + /** Returns the id associated with the given token string. */ + tokenToId(token: string): number; + /** Frees the native tokenizer. The instance must not be used afterwards. */ + dispose(): void; + + /** + * Prevents plain JS objects from being cast as Tokenizers. Tokenizers should + * only be created via the `loadTokenizer` function exported from this module. + * @internal + */ + readonly [tokenizerBrand]: never; +}; + +/** + * Loads a HuggingFace tokenizer from a local `tokenizer.json` file. + * + * This is a worklet-compatible primitive — it must be invoked on the worklet + * runtime (e.g. 
via {@link wrapAsync}) and returns a {@link Tokenizer} host + * object bound to that runtime. + * @category Typescript API + * @param tokenizerPath Absolute local path to a `tokenizer.json` file. + * @returns The loaded tokenizer. + */ +export function loadTokenizer(tokenizerPath: string): Tokenizer { + 'worklet'; + return rnexecutorchJsi.loadTokenizer(tokenizerPath) as Tokenizer; +} diff --git a/packages/react-native-executorch/src/extensions/nlp/tasks/tokenizer.ts b/packages/react-native-executorch/src/extensions/nlp/tasks/tokenizer.ts new file mode 100644 index 0000000000..8e2c857bc2 --- /dev/null +++ b/packages/react-native-executorch/src/extensions/nlp/tasks/tokenizer.ts @@ -0,0 +1,74 @@ +import type { WorkletRuntime } from 'react-native-worklets'; + +import { wrapAsync } from '../../../core/runtime'; +import { loadTokenizer } from '../ops/tokenizer'; + +/** + * Model configuration required to instantiate a tokenizer task runner. + * @category Types + */ +export type TokenizerConfig = { + readonly tokenizerPath: string; +}; + +/** + * Creates a tokenizer runner around a local `tokenizer.json` file. + * + * The native tokenizer is loaded on the provided worklet runtime. Each exposed + * method comes in two flavours: an asynchronous variant (default) that marshals + * the call onto the worklet runtime, and a `*Worklet` variant for synchronous + * use inside other worklets (e.g. when composing a text-embeddings pipeline). + * @category Typescript API + * @param config Tokenizer configuration containing the local path. + * @param runtime Optional worklet runtime thread on which to run the tokenizer. + * @returns A promise resolving to an object containing tokenization and disposal + * controls. 
+ */ +export async function createTokenizer(config: TokenizerConfig, runtime?: WorkletRuntime) { + const { tokenizerPath } = config; + const tokenizer = await wrapAsync(loadTokenizer, runtime)(tokenizerPath); + + const encodeWorklet = (text: string): number[] => { + 'worklet'; + return tokenizer.encode(text); + }; + + const decodeWorklet = (tokens: number[], skipSpecialTokens: boolean = true): string => { + 'worklet'; + if (tokens.length === 0) { + return ''; + } + return tokenizer.decode(tokens, skipSpecialTokens); + }; + + const getVocabSizeWorklet = (): number => { + 'worklet'; + return tokenizer.getVocabSize(); + }; + + const idToTokenWorklet = (id: number): string => { + 'worklet'; + return tokenizer.idToToken(id); + }; + + const tokenToIdWorklet = (token: string): number => { + 'worklet'; + return tokenizer.tokenToId(token); + }; + + const dispose = () => tokenizer.dispose(); + + return { + encode: wrapAsync(encodeWorklet, runtime), + decode: wrapAsync(decodeWorklet, runtime), + getVocabSize: wrapAsync(getVocabSizeWorklet, runtime), + idToToken: wrapAsync(idToTokenWorklet, runtime), + tokenToId: wrapAsync(tokenToIdWorklet, runtime), + encodeWorklet, + decodeWorklet, + getVocabSizeWorklet, + idToTokenWorklet, + tokenToIdWorklet, + dispose, + }; +} diff --git a/packages/react-native-executorch/src/hooks/useTokenizer.ts b/packages/react-native-executorch/src/hooks/useTokenizer.ts new file mode 100644 index 0000000000..d62ae2abe8 --- /dev/null +++ b/packages/react-native-executorch/src/hooks/useTokenizer.ts @@ -0,0 +1,43 @@ +import { useModel } from './useModel'; +import { useResourceDownload } from './useResourceDownload'; +import { createTokenizer, type TokenizerConfig } from '../extensions/nlp/tasks/tokenizer'; + +/** + * React hook to load and use a HuggingFace tokenizer. 
+ * + * This hook manages downloading the `tokenizer.json` file (if it's a remote + * URL), loading it natively, tracking download progress and load errors, and + * cleaning up native memory when the component unmounts or configuration + * changes. + * @category Hooks + * @param config The tokenizer configuration containing `tokenizerPath` (a + * remote URL or local path to a `tokenizer.json` file). + * @param options Hook options. + * @param options.preventLoad If true, prevents downloading and loading the + * tokenizer. + * @returns An object containing the tokenizer's loading state, error, download + * progress, and tokenization functions. + */ +export function useTokenizer(config: TokenizerConfig, options?: { preventLoad?: boolean }) { + const { localPath, downloadProgress, downloadError } = useResourceDownload( + config.tokenizerPath, + options?.preventLoad + ); + const { model, error } = useModel( + createTokenizer, + localPath ? { tokenizerPath: localPath } : null, + [localPath] + ); + + return { + isReady: !!model, + error: downloadError || error, + downloadProgress, + localPath, + encode: model?.encode, + decode: model?.decode, + getVocabSize: model?.getVocabSize, + idToToken: model?.idToToken, + tokenToId: model?.tokenToId, + }; +} diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index 8f709b8433..5eadb56665 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -1,5 +1,6 @@ // Hooks — primary API for app developers export * from './hooks/useClassifier'; +export * from './hooks/useTokenizer'; export * from './hooks/useResourceDownload'; export * from './hooks/useModel'; @@ -9,6 +10,7 @@ export * as constants from './constants'; // Task APIs — for developers needing manual lifetime/disposal control export * from './extensions/cv/tasks/classification'; +export * from './extensions/nlp/tasks/tokenizer'; // Core primitives — for library 
builders and power users export { tensor } from './core/tensor'; @@ -31,6 +33,7 @@ export { defaultWorkletRuntime, wrapAsync } from './core/runtime'; export * as math from './extensions/math'; export * as cv from './extensions/cv'; +export * as nlp from './extensions/nlp'; // Utils export * from './utils'; diff --git a/packages/react-native-executorch/src/models.ts b/packages/react-native-executorch/src/models.ts index 66a88e937b..59f647736b 100644 --- a/packages/react-native-executorch/src/models.ts +++ b/packages/react-native-executorch/src/models.ts @@ -1,4 +1,5 @@ import type { ClassifierModel } from './extensions/cv/tasks/classification'; +import type { TokenizerConfig } from './extensions/nlp/tasks/tokenizer'; import { IMAGENET1K_LABELS, type ImageNet1KLabel } from './constants'; const BASE_URL = 'https://huggingface.co/software-mansion/react-native-executorch'; @@ -27,6 +28,13 @@ const EFFICIENTNET_V2_S_COREML_FP16: ClassifierModel = { classifierOpts: EFFICIENTNET_V2_S_OPTS, }; +// ============================================================================= +// Tokenizers +// ============================================================================= +const ALL_MINILM_L6_V2_TOKENIZER: TokenizerConfig = { + tokenizerPath: `${BASE_URL}-all-MiniLM-L6-v2/${VERSION_TAG}/tokenizer.json`, +}; + /** * Registry of pre-configured ExecuTorch models. * @@ -44,4 +52,7 @@ export const models = { COREML_FP16: EFFICIENTNET_V2_S_COREML_FP16, }, }, + tokenizer: { + ALL_MINILM_L6_V2: ALL_MINILM_L6_V2_TOKENIZER, + }, }; diff --git a/packages/react-native-executorch/third-party/README.md b/packages/react-native-executorch/third-party/README.md index cf3f2e0bb2..b05f958921 100644 --- a/packages/react-native-executorch/third-party/README.md +++ b/packages/react-native-executorch/third-party/README.md @@ -5,7 +5,10 @@ Native ExecuTorch binaries and headers are **not** committed to this branch. 
The core package's `android/CMakeLists.txt` and `react-native-executorch.podspec` expect ExecuTorch artifacts under this directory: -- `include/` — ExecuTorch + c10 + torch headers +- `include/` — ExecuTorch + c10 + torch headers, including the `pytorch/tokenizers` + headers under `include/executorch/extension/llm/tokenizers/include` (used by the + nlp/tokenizer extension; the `tokenizers::HFTokenizer` symbols resolve from + `libexecutorch`, which is built with the llm/tokenizers extension) - `android/jniLibs/<abi>/libexecutorch.so`, `android/libs/executorch.jar` - `ios/Frameworks/ExecutorchLib.xcframework` From 81d7ab1866853630454bdf5afe25a7edf5612571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Mon, 22 Jun 2026 20:46:52 +0200 Subject: [PATCH 2/2] feat(nlp): add tokenizer demo to a dedicated nlp example app --- apps/nlp/app.json | 45 ++++ apps/nlp/app/_layout.tsx | 32 +++ apps/nlp/app/index.tsx | 51 +++++ apps/nlp/app/tokenizer/index.tsx | 272 ++++++++++++++++++++++++ apps/nlp/assets/icons/adaptive-icon.png | Bin 0 -> 17547 bytes apps/nlp/assets/icons/executorch.svg | 9 + apps/nlp/assets/icons/favicon.png | Bin 0 -> 1466 bytes apps/nlp/assets/icons/icon.png | Bin 0 -> 22380 bytes apps/nlp/assets/icons/splash.png | Bin 0 -> 47346 bytes apps/nlp/babel.config.js | 7 + apps/nlp/components/Button.tsx | 102 +++++++++ apps/nlp/components/ModelStatus.tsx | 69 ++++++ apps/nlp/components/ScreenWrapper.tsx | 8 + apps/nlp/declarations.d.ts | 5 + apps/nlp/index.ts | 8 + apps/nlp/metro.config.js | 21 ++ apps/nlp/package.json | 42 ++++ apps/nlp/theme.ts | 76 +++++++ apps/nlp/tsconfig.json | 16 ++ yarn.lock | 30 +++ 20 files changed, 793 insertions(+) create mode 100644 apps/nlp/app.json create mode 100644 apps/nlp/app/_layout.tsx create mode 100644 apps/nlp/app/index.tsx create mode 100644 apps/nlp/app/tokenizer/index.tsx create mode 100644 apps/nlp/assets/icons/adaptive-icon.png create mode 100644 apps/nlp/assets/icons/executorch.svg create mode 100644 
apps/nlp/assets/icons/favicon.png create mode 100644 apps/nlp/assets/icons/icon.png create mode 100644 apps/nlp/assets/icons/splash.png create mode 100644 apps/nlp/babel.config.js create mode 100644 apps/nlp/components/Button.tsx create mode 100644 apps/nlp/components/ModelStatus.tsx create mode 100644 apps/nlp/components/ScreenWrapper.tsx create mode 100644 apps/nlp/declarations.d.ts create mode 100644 apps/nlp/index.ts create mode 100644 apps/nlp/metro.config.js create mode 100644 apps/nlp/package.json create mode 100644 apps/nlp/theme.ts create mode 100644 apps/nlp/tsconfig.json diff --git a/apps/nlp/app.json b/apps/nlp/app.json new file mode 100644 index 0000000000..929d222914 --- /dev/null +++ b/apps/nlp/app.json @@ -0,0 +1,45 @@ +{ + "expo": { + "name": "nlp", + "slug": "nlp", + "version": "1.0.0", + "orientation": "portrait", + "icon": "./assets/icons/icon.png", + "userInterfaceStyle": "light", + "newArchEnabled": true, + "scheme": "rne-nlp", + "splash": { + "image": "./assets/icons/splash.png", + "resizeMode": "contain", + "backgroundColor": "#ffffff" + }, + "ios": { + "supportsTablet": true, + "bundleIdentifier": "com.anonymous.nlp" + }, + "android": { + "adaptiveIcon": { + "foregroundImage": "./assets/icons/adaptive-icon.png", + "backgroundColor": "#ffffff" + }, + "package": "com.anonymous.nlp" + }, + "web": { + "favicon": "./assets/icons/favicon.png" + }, + "plugins": [ + "expo-router", + [ + "expo-build-properties", + { + "android": { + "minSdkVersion": 26 + }, + "ios": { + "deploymentTarget": "17.0" + } + } + ] + ] + } +} diff --git a/apps/nlp/app/_layout.tsx b/apps/nlp/app/_layout.tsx new file mode 100644 index 0000000000..bdcfc39660 --- /dev/null +++ b/apps/nlp/app/_layout.tsx @@ -0,0 +1,32 @@ +import { Drawer } from 'expo-router/drawer'; +import { ColorPalette } from '../theme'; +import React from 'react'; + +export default function Layout() { + return ( + + null, + title: 'Main Menu', + drawerItemStyle: { display: 'none' }, + }} + /> + + + ); +} 
diff --git a/apps/nlp/app/index.tsx b/apps/nlp/app/index.tsx new file mode 100644 index 0000000000..98ff59ab97 --- /dev/null +++ b/apps/nlp/app/index.tsx @@ -0,0 +1,51 @@ +import { useRouter } from 'expo-router'; +import { View, Text, StyleSheet, TouchableOpacity } from 'react-native'; +import { ColorPalette } from '../theme'; +import ExecutorchLogo from '../assets/icons/executorch.svg'; + +export default function Home() { + const router = useRouter(); + + return ( + + + Select a demo + + router.navigate('tokenizer/')}> + Tokenizer + + + + ); +} + +const styles = StyleSheet.create({ + container: { + flex: 1, + justifyContent: 'center', + alignItems: 'center', + backgroundColor: '#fff', + }, + headerText: { + fontSize: 18, + color: ColorPalette.strongPrimary, + margin: 20, + }, + buttonContainer: { + width: '80%', + justifyContent: 'space-evenly', + marginBottom: 20, + }, + button: { + backgroundColor: ColorPalette.strongPrimary, + borderRadius: 8, + padding: 14, + alignItems: 'center', + marginBottom: 12, + }, + buttonText: { + color: 'white', + fontSize: 16, + fontWeight: '600', + }, +}); diff --git a/apps/nlp/app/tokenizer/index.tsx b/apps/nlp/app/tokenizer/index.tsx new file mode 100644 index 0000000000..ac201437d3 --- /dev/null +++ b/apps/nlp/app/tokenizer/index.tsx @@ -0,0 +1,272 @@ +import React, { useEffect, useRef, useState } from 'react'; +import { View, Text, TextInput, ScrollView, StyleSheet } from 'react-native'; +import { useTokenizer, models } from 'react-native-executorch'; +import ScreenWrapper from '../../components/ScreenWrapper'; +import { ModelStatus } from '../../components/ModelStatus'; +import { Button } from '../../components/Button'; +import { theme } from '../../theme'; + +type Check = { label: string; detail: string; pass: boolean }; + +function TokenizerContent() { + const { isReady, downloadProgress, error, encode, decode, getVocabSize, idToToken, tokenToId } = + useTokenizer(models.tokenizer.ALL_MINILM_L6_V2); + + const [text, setText] 
= useState('Hello world'); + const [running, setRunning] = useState(false); + const [runError, setRunError] = useState(null); + const [ids, setIds] = useState(null); + const [roundTrip, setRoundTrip] = useState(null); + const [vocabSize, setVocabSize] = useState(null); + const [checks, setChecks] = useState([]); + + const ready = isReady && encode && decode && getVocabSize && idToToken && tokenToId; + + const run = async () => { + if (!ready) return; + setRunning(true); + setRunError(null); + setIds(null); + setRoundTrip(null); + setVocabSize(null); + setChecks([]); + try { + const tokenIds = await encode(text); + const decoded = await decode(tokenIds, true); + const vocab = await getVocabSize(); + + // Self-consistent inverse check on a token from the actual output + // (HFTokenizer adds special tokens per the tokenizer.json post_processor). + const sampleId = tokenIds[Math.min(1, tokenIds.length - 1)]!; + const sampleToken = await idToToken(sampleId); + const sampleIdBack = await tokenToId(sampleToken); + + const nextChecks: Check[] = [ + { + label: 'Round-trip decode(encode(text))', + detail: `"${decoded}" vs "${text.toLowerCase()}"`, + // all-MiniLM-L6-v2 is an uncased BERT WordPiece tokenizer + pass: decoded.trim() === text.trim().toLowerCase(), + }, + { + label: 'Vocabulary size', + detail: `${vocab} (expected 30522 for bert-base-uncased)`, + pass: vocab === 30522, + }, + { + label: 'Inverse tokenToId(idToToken(id))', + detail: `${sampleId} → "${sampleToken}" → ${sampleIdBack}`, + pass: sampleIdBack === sampleId, + }, + ]; + + setIds(tokenIds); + setRoundTrip(decoded); + setVocabSize(vocab); + setChecks(nextChecks); + + // Structured log so the result is verifiable from device/Metro logs. 
+ console.log( + '[TokenizerTest]', + JSON.stringify({ + allPass: nextChecks.every((c) => c.pass), + input: text, + ids: tokenIds, + decoded, + vocab, + checks: nextChecks.map((c) => ({ label: c.label, pass: c.pass, detail: c.detail })), + }) + ); + } catch (e: any) { + console.log('[TokenizerTest] ERROR', e?.message ?? String(e)); + setRunError(e?.message ?? String(e)); + } finally { + setRunning(false); + } + }; + + // Auto-run once as soon as the tokenizer is ready, so the demo doubles as a + // self-checking smoke test (results logged under "[TokenizerTest]"). + const autoRan = useRef(false); + useEffect(() => { + if (ready && !autoRan.current) { + autoRan.current = true; + run(); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [ready]); + + return ( + + + Tokenizer + + Loads the all-MiniLM-L6-v2 tokenizer and proves encode / decode / getVocabSize / idToToken + / tokenToId work end-to-end against the native HFTokenizer. + + + + + + + +