Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,6 @@
[submodule "thirdparty/limonp/limonp-v1.0.2"]
path = thirdparty/limonp/limonp-v1.0.2
url = https://github.com/yanyiwu/limonp.git
[submodule "thirdparty/snowball/snowball-3.1.1"]
path = thirdparty/snowball/snowball-3.1.1
url = https://github.com/snowballstem/snowball.git
1 change: 1 addition & 0 deletions src/db/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ cc_library(
libprotobuf
FastPFOR
cppjieba
snowball
Arrow::arrow_static
Arrow::parquet_static
Arrow::arrow_compute
Expand Down
1 change: 1 addition & 0 deletions src/db/index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ cc_library(
Arrow::arrow_compute
Arrow::arrow_dataset
cppjieba
snowball
FastPFOR
INCS . ${PROJECT_ROOT_DIR}/src
VERSION "${PROXIMA_ZVEC_VERSION}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2025-present the zvec project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "stemmer_token_filter.h"
#include <unordered_map>
#include <zvec/ailego/logger/logger.h>

extern "C" {
#include <libstemmer.h>
}

namespace zvec::fts {

/*! Cache of Snowball stemmer handles keyed by language name.
 *
 * The cache owns every handle stored in `stemmers` and releases them with
 * sb_stemmer_delete() on destruction. It is used as a `thread_local` object
 * by StemmerTokenFilter::filter(), so each thread works on its own handles.
 *
 * Copying is disabled: the map holds raw owning pointers, so a copy would
 * make two caches delete the same handles.
 */
struct ThreadLocalStemmerCache {
  std::unordered_map<std::string, struct sb_stemmer *> stemmers;

  ThreadLocalStemmerCache() = default;

  // Raw owning pointers: a copied cache would double-free on destruction.
  ThreadLocalStemmerCache(const ThreadLocalStemmerCache &) = delete;
  ThreadLocalStemmerCache &operator=(const ThreadLocalStemmerCache &) = delete;

  //! Release every cached stemmer handle.
  ~ThreadLocalStemmerCache() {
    for (auto &[_, s] : stemmers) {
      sb_stemmer_delete(s);
    }
  }

  /*! Return the cached stemmer for `lang`, creating it on first use.
   * \param lang Snowball algorithm name passed to sb_stemmer_new().
   * \return the stemmer handle (still owned by the cache), or nullptr when
   *         sb_stemmer_new() fails. Failures are not cached, so a later
   *         call with the same name tries again.
   */
  struct sb_stemmer *get(const std::string &lang) {
    auto it = stemmers.find(lang);
    if (it != stemmers.end()) {
      return it->second;
    }
    // A null encoding selects the library default encoding.
    auto *s = sb_stemmer_new(lang.c_str(), nullptr);
    if (s) {
      stemmers[lang] = s;
    }
    return s;
  }
};

/*! Configure the stemming language and check that it is supported.
 * \param config filter configuration; a non-empty "stemmer_lang" string
 *        replaces the current language, otherwise it is left untouched.
 * \return true when a stemmer can be created for the selected language,
 *         false (after logging an error) otherwise.
 */
bool StemmerTokenFilter::init(const ailego::JsonObject &config) {
  std::string requested;
  const bool has_override =
      config.get("stemmer_lang", &requested) && !requested.empty();
  if (has_override) {
    language_ = requested;
  }

  // Create a throw-away stemmer purely to validate the language name; the
  // handles used for real work are created lazily per thread in filter().
  struct sb_stemmer *probe = sb_stemmer_new(language_.c_str(), nullptr);
  if (probe == nullptr) {
    LOG_ERROR("[StemmerTokenFilter] failed to create stemmer for language: %s",
              language_.c_str());
    return false;
  }

  sb_stemmer_delete(probe);
  return true;
}

std::vector<Token> StemmerTokenFilter::filter(std::vector<Token> tokens) const {
static thread_local ThreadLocalStemmerCache tls_cache;
auto *stemmer = tls_cache.get(language_);
if (!stemmer) {
return tokens;
}
for (auto &token : tokens) {
const auto *result = sb_stemmer_stem(
stemmer, reinterpret_cast<const unsigned char *>(token.text.data()),
static_cast<int>(token.text.size()));
if (result) {
int len = sb_stemmer_length(stemmer);
token.text.assign(reinterpret_cast<const char *>(result), len);
}
}
return tokens;
}

} // namespace zvec::fts
42 changes: 42 additions & 0 deletions src/db/index/column/fts_column/tokenizer/stemmer_token_filter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2025-present the zvec project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <string>
#include <vector>
#include "token_filter.h"

namespace zvec::fts {

/*! Token filter that rewrites each token to its Snowball stem.
 *
 * The language defaults to "english" and can be overridden through init().
 * Instances are non-copyable.
 */
class StemmerTokenFilter : public TokenFilter {
 public:
  StemmerTokenFilter() = default;
  StemmerTokenFilter(const StemmerTokenFilter &) = delete;
  StemmerTokenFilter &operator=(const StemmerTokenFilter &) = delete;
  ~StemmerTokenFilter() override = default;

  //! Name under which this filter is registered.
  const char *name() const override {
    return "stemmer";
  }

  //! Read the optional language override and validate it.
  bool init(const ailego::JsonObject &config) override;

  //! Stem every token in the list and return the result.
  std::vector<Token> filter(std::vector<Token> tokens) const override;

 private:
  //! Snowball algorithm name used to create stemmers.
  std::string language_ = "english";
};

} // namespace zvec::fts
10 changes: 10 additions & 0 deletions src/db/index/column/fts_column/tokenizer/token_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <memory>
#include <string>
#include <vector>
#include <zvec/ailego/encoding/json/mod_json_plus.h>
#include "tokenizer.h"

namespace zvec::fts {
Expand All @@ -29,6 +30,15 @@ class TokenFilter {
public:
virtual ~TokenFilter() = default;

/*! Initialise the filter from a JSON configuration object.
* Must be called once before filter().
*
* The default implementation ignores the configuration and always
* succeeds, so filters without any settings do not need to override it.
*
* \param config JSON object containing filter-specific parameters.
* \return true on success, false on failure.
*/
virtual bool init(const ailego::JsonObject & /*config*/) {
return true;
}

/*! Filter/transform a list of tokens.
* \param tokens input token list (may be modified in place)
* \return processed token list
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <zvec/ailego/logger/logger.h>
#include "jieba_tokenizer.h"
#include "standard_tokenizer.h"
#include "stemmer_token_filter.h"
#include "whitespace_tokenizer.h"

namespace zvec::fts {
Expand Down Expand Up @@ -55,6 +56,11 @@ TokenizerPipelinePtr TokenizerFactory::create(const FtsIndexParams &params) {
filter_name.c_str());
return nullptr;
}
if (!filter->init(extra_json)) {
LOG_ERROR("[TokenizerFactory] failed to init filter: %s",
filter_name.c_str());
return nullptr;
}
filters.push_back(std::move(filter));
}

Expand Down Expand Up @@ -96,6 +102,8 @@ TokenizerPtr TokenizerFactory::create_tokenizer(
TokenFilterPtr TokenizerFactory::create_filter(const std::string &filter_name) {
if (filter_name == "lowercase") {
return std::make_shared<LowercaseTokenFilter>();
} else if (filter_name == "stemmer") {
return std::make_shared<StemmerTokenFilter>();
}
LOG_ERROR("[TokenizerFactory] unknown filter name: %s", filter_name.c_str());
return nullptr;
Expand Down
86 changes: 86 additions & 0 deletions tests/db/index/column/fts_column/fts_column_indexer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1832,3 +1832,89 @@ TEST_F(FtsColumnIndexerTest, FilterPushdownNullFilterUnchanged) {
EXPECT_FLOAT_EQ(baseline[i].score, with_null[i].score);
}
}

// ============================================================
// Stemmer token filter end-to-end tests
// ============================================================

// Builds the query-side tokenizer pipeline used by the stemmer tests:
// "standard" tokenizer followed by the "lowercase" and "stemmer" filters.
// Returns whatever TokenizerFactory::create() yields for that setup.
static zvec::fts::TokenizerPipelinePtr make_stemmer_pipeline() {
  zvec::fts::FtsIndexParams pipeline_params;
  pipeline_params.filters = {"lowercase", "stemmer"};
  pipeline_params.tokenizer_name = "standard";
  return zvec::fts::TokenizerFactory::create(pipeline_params);
}

// Fixture for the stemmer end-to-end tests. Reuses the storage set up by
// FtsColumnIndexerTest and adds a helper that opens an indexer whose
// pipeline is "standard" + {"lowercase", "stemmer"}.
class FtsStemmerIndexerTest : public FtsColumnIndexerTest {
 protected:
  // Opens an indexer for `field_name` with the stemmer filter enabled.
  // A failed open is reported as a non-fatal expectation; the indexer is
  // returned either way.
  std::unique_ptr<FtsColumnIndexer> make_stemmer_indexer(
      const std::string &field_name = "content") {
    auto stemmer_params = std::make_shared<zvec::FtsIndexParams>(
        "standard", std::vector<std::string>{"lowercase", "stemmer"}, "");
    auto meta = make_test_field_meta(field_name, stemmer_params);

    auto stemmer_indexer = std::make_unique<FtsColumnIndexer>();
    auto open_result =
        stemmer_indexer->open(meta, &db_, postings_cf_, positions_cf_,
                              term_freq_cf_, max_tf_cf_, doc_len_cf_, stat_cf_);
    EXPECT_TRUE(open_result.has_value());
    return stemmer_indexer;
  }
};

// Inflected query terms must match documents containing other inflections
// of the same stem.
TEST_F(FtsStemmerIndexerTest, StemmedTermMatchesMorphologicalVariants) {
  auto indexer = make_stemmer_indexer();
  EXPECT_TRUE(indexer->insert(0, "the cats are running quickly").has_value());
  EXPECT_TRUE(indexer->insert(1, "a dog runs slowly").has_value());
  EXPECT_TRUE(indexer->insert(2, "birds fly high").has_value());

  auto pipeline = make_stemmer_pipeline();

  // "running" stems to "run", matches doc 0 ("running") and doc 1 ("runs")
  std::vector<FtsResult> results;
  EXPECT_TRUE(search_ok(*indexer, "running", 10, &results, pipeline));
  EXPECT_EQ(results.size(), 2u);

  // "cats" stems to "cat", matches only doc 0
  results.clear();
  EXPECT_TRUE(search_ok(*indexer, "cats", 10, &results, pipeline));
  // Fatal check: results[0] below must not be read from an empty vector.
  ASSERT_EQ(results.size(), 1u);
  EXPECT_EQ(results[0].doc_id, 0ull);
}

// A query that is already in stem form must match documents that contain
// inflected variants of it.
TEST_F(FtsStemmerIndexerTest, QueryWithBaseFormMatchesVariants) {
  auto stemmed_indexer = make_stemmer_indexer();
  EXPECT_TRUE(stemmed_indexer->insert(0, "connected connections").has_value());
  EXPECT_TRUE(stemmed_indexer->insert(1, "connecting wires").has_value());
  EXPECT_TRUE(stemmed_indexer->insert(2, "unrelated text").has_value());

  auto query_pipeline = make_stemmer_pipeline();
  std::vector<FtsResult> hits;

  // "connect" is the stem shared by doc 0 and doc 1; doc 2 is unrelated.
  EXPECT_TRUE(search_ok(*stemmed_indexer, "connect", 10, &hits,
                        query_pipeline));
  EXPECT_EQ(hits.size(), 2u);
}

// Stemming must be applied to every operand of a boolean AND query.
TEST_F(FtsStemmerIndexerTest, StemmerWithAndQuery) {
  auto indexer = make_stemmer_indexer();
  EXPECT_TRUE(indexer->insert(0, "dogs running fast").has_value());
  EXPECT_TRUE(indexer->insert(1, "cats running slow").has_value());
  EXPECT_TRUE(indexer->insert(2, "dogs sleeping").has_value());

  auto pipeline = make_stemmer_pipeline();

  // "dogs AND running" -> stems to "dog AND run" -> doc 0 only
  std::vector<FtsResult> results;
  EXPECT_TRUE(search_ok(*indexer, "dogs AND running", 10, &results, pipeline));
  // Fatal check: results[0] below must not be read from an empty vector.
  ASSERT_EQ(results.size(), 1u);
  EXPECT_EQ(results[0].doc_id, 0ull);
}

// A query term that matches no indexed term after stemming must succeed
// and produce an empty result set.
TEST_F(FtsStemmerIndexerTest, StemmerNoMatchAfterStemming) {
  auto stemmed_indexer = make_stemmer_indexer();
  EXPECT_TRUE(stemmed_indexer->insert(0, "hello world").has_value());

  auto query_pipeline = make_stemmer_pipeline();
  std::vector<FtsResult> hits;

  EXPECT_TRUE(search_ok(*stemmed_indexer, "nonexistent", 10, &hits,
                        query_pipeline));
  EXPECT_TRUE(hits.empty());
}
Loading
Loading