From fe35f17692c3c20dbf46b9d296c2a97df76197eb Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 17 Oct 2025 22:53:29 +0200 Subject: [PATCH 01/23] add cpp linter workflow --- .github/workflows/cpp-linter.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/cpp-linter.yml diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml new file mode 100644 index 000000000..f7d65a25b --- /dev/null +++ b/.github/workflows/cpp-linter.yml @@ -0,0 +1,30 @@ +name: cpp-linter +on: + pull_request: + paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake'] + +jobs: + cpp-linter: + runs-on: ubuntu-latest + permissions: + pull-requests: write + steps: + - name: install Linux deps + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y libsnappy-dev libzzip-dev zlib1g-dev libboost-all-dev + + - uses: actions/checkout@v5 + + - uses: cpp-linter/cpp-linter-action@v2 + id: linter + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + version: 21 + extra-args: '-std=c++17' + style: '' # disable clang-format checks. + tidy-checks: '' # Use .clang-tidy config file. + # only 'update' a single comment in a pull request's thread. + thread-comments: ${{ github.event_name == 'pull_request' && 'update' }} From 9937c7380abdddd905e2f2c5adf158b571550687 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 17 Oct 2025 22:57:50 +0200 Subject: [PATCH 02/23] trigger on workflow itself --- .github/workflows/cpp-linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index f7d65a25b..07c28ddd4 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -1,7 +1,7 @@ name: cpp-linter on: pull_request: - paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake'] + paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake', '.github/workflows/cpp-linter.yml'] jobs: cpp-linter: From 075ed3ac2a001de17479b197f61ead2d2d83a119 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 17 Oct 2025 23:16:49 +0200 Subject: [PATCH 03/23] pin version --- .github/workflows/cpp-linter.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 07c28ddd4..352786f01 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -17,7 +17,8 @@ jobs: - uses: actions/checkout@v5 - - uses: cpp-linter/cpp-linter-action@v2 + - name: C/C++ Linter + uses: cpp-linter/cpp-linter-action@v2.16.5 id: linter env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 0009b248bea431f8133b36da666620b6a7cc26cb Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 17 Oct 2025 23:19:19 +0200 Subject: [PATCH 04/23] add some files for testing --- keyvi/include/keyvi/dictionary/fsa/automata.h | 16 ++--- .../fsa/comparable_state_traverser.h | 38 ++++++------ .../fsa/internal/lru_generation_cache.h | 16 ++--- .../fsa/internal/minimization_hash.h | 12 ++-- .../fsa/internal/sparse_array_persistence.h | 4 +- .../keyvi/dictionary/fsa/state_traverser.h | 16 ++--- .../dictionary/fsa/zip_state_traverser.h | 58 +++++++++---------- 7 files changed, 75 insertions(+), 85 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index 7a4a94b28..b61ce3eff 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -406,17 +406,11 @@ class Automata final { return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm); } - std::string GetStatistics() const { - return dictionary_properties_->GetStatistics(); - } + std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } - const std::string& GetManifest() const { - return dictionary_properties_->GetManifest(); - } + const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } - const uint64_t GetVersion() const { - return dictionary_properties_->GetVersion(); - } + const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); } private: dictionary_properties_t dictionary_properties_; @@ -478,9 +472,7 @@ class Automata final { friend class keyvi::dictionary::SecondaryKeyDictionary; - const dictionary_properties_t& GetDictionaryProperties() const { - return dictionary_properties_; - } + const dictionary_properties_t& GetDictionaryProperties() const { return dictionary_properties_; } }; // shared pointer diff --git a/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h b/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h index a9ac8b6fa..33cdc14fe 100644 --- a/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h +++ b/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h @@ -65,7 +65,7 @@ class ComparableStateTraverser final { using label_t = typename innerTraverserType::label_t; using transition_t = typename innerTraverserType::transition_t; - explicit ComparableStateTraverser(const innerTraverserType &&traverser, const bool advance = true, + explicit ComparableStateTraverser(const innerTraverserType&& traverser, const bool advance = true, const size_t order = 0) : state_traverser_(std::move(traverser)), order_(order) { if (advance) { @@ -85,7 +85,7 @@ class ComparableStateTraverser final { : ComparableStateTraverser(f, f->GetStartState(), advance, order) {} explicit ComparableStateTraverser(const automata_t f, const uint64_t start_state, - traversal::TraversalPayload &&payload, const bool advance = true, + traversal::TraversalPayload&& payload, const bool advance = true, const size_t order = 0) : state_traverser_(f, start_state, std::move(payload), false), order_(order) { if (advance) { @@ -94,13 +94,13 @@ class ComparableStateTraverser final { } ComparableStateTraverser() = delete; - ComparableStateTraverser &operator=(ComparableStateTraverser const &) = delete; - ComparableStateTraverser(const ComparableStateTraverser &that) = delete; + ComparableStateTraverser& operator=(ComparableStateTraverser const&) = delete; + ComparableStateTraverser(const ComparableStateTraverser& that) = delete; /** * Comparison of the state traverser for the purpose of ordering them */ - bool operator<(const ComparableStateTraverser &rhs) const { + bool operator<(const ComparableStateTraverser& rhs) const { int compare = std::memcmp(label_stack_.data(), rhs.label_stack_.data(), std::min(label_stack_.size(), rhs.label_stack_.size()) * sizeof(label_t)); if (compare != 0) { @@ -114,16 +114,16 @@ class ComparableStateTraverser final { return order_ > rhs.order_; } - bool operator>(const ComparableStateTraverser &rhs) const { return rhs.operator<(*this); } + bool operator>(const ComparableStateTraverser& rhs) const { return rhs.operator<(*this); } - bool operator<=(const ComparableStateTraverser &rhs) const { return !operator>(rhs); } + bool operator<=(const ComparableStateTraverser& rhs) const { return !operator>(rhs); } - bool operator>=(const ComparableStateTraverser &rhs) const { return !operator<(rhs); } + bool operator>=(const ComparableStateTraverser& rhs) const { return !operator<(rhs); } /** * Compare traverser with another one, _ignoring_ the order value */ - bool operator==(const ComparableStateTraverser &rhs) const { + bool operator==(const ComparableStateTraverser& rhs) const { if (label_stack_.size() != rhs.label_stack_.size()) { return false; } @@ -131,7 +131,7 @@ class ComparableStateTraverser final { return std::memcmp(label_stack_.data(), rhs.label_stack_.data(), label_stack_.size() * sizeof(label_t)) == 0; } - bool operator!=(const ComparableStateTraverser &rhs) const { return !operator==(rhs); } + bool operator!=(const ComparableStateTraverser& rhs) const { return !operator==(rhs); } operator bool() const { return state_traverser_; } @@ -162,7 +162,7 @@ class ComparableStateTraverser final { label_t GetStateLabel() const { return state_traverser_.GetStateLabel(); } - const std::vector &GetStateLabels() const { return label_stack_; } + const std::vector& GetStateLabels() const { return label_stack_; } size_t GetOrder() const { return order_; } @@ -186,22 +186,22 @@ class ComparableStateTraverser final { template friend class matching::NearMatching; - traversal::TraversalState &GetStates() { return state_traverser_.GetStates(); } + traversal::TraversalState& GetStates() { return state_traverser_.GetStates(); } - traversal::TraversalPayload &GetTraversalPayload() { return state_traverser_.GetTraversalPayload(); } + traversal::TraversalPayload& GetTraversalPayload() { return state_traverser_.GetTraversalPayload(); } - const traversal::TraversalPayload &GetTraversalPayload() const { + const traversal::TraversalPayload& GetTraversalPayload() const { return state_traverser_.GetTraversalPayload(); } }; -inline bool CompareWeights(const traversal::TraversalState &i, - const traversal::TraversalState &j) { +inline bool CompareWeights(const traversal::TraversalState& i, + const traversal::TraversalState& j) { return i.GetNextInnerWeight() == j.GetNextInnerWeight(); } template <> -inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser &rhs) const { +inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser& rhs) const { TRACE("operator< (weighted state specialization)"); TRACE("depth %ld %ld", state_traverser_.GetDepth(), rhs.state_traverser_.GetDepth()); @@ -231,7 +231,7 @@ inline bool ComparableStateTraverser::operator<(const Co } template <> -inline bool ComparableStateTraverser::operator==(const ComparableStateTraverser &rhs) const { +inline bool ComparableStateTraverser::operator==(const ComparableStateTraverser& rhs) const { if (label_stack_.size() != rhs.label_stack_.size()) { return false; } @@ -244,7 +244,7 @@ inline bool ComparableStateTraverser::operator==(const Compa } template <> -inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser &rhs) const { +inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser& rhs) const { TRACE("operator< (near state specialization)"); if (GetTraversalPayload().exact != rhs.GetTraversalPayload().exact) { diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h b/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h index 613051f3f..e62d31912 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h @@ -66,21 +66,21 @@ class LeastRecentlyUsedGenerationsCache final { ~LeastRecentlyUsedGenerationsCache() { delete current_generation_; - for (MinimizationHash *generation : generations_) { + for (MinimizationHash* generation : generations_) { delete generation; } } LeastRecentlyUsedGenerationsCache() = delete; - LeastRecentlyUsedGenerationsCache &operator=(LeastRecentlyUsedGenerationsCache const &) = delete; - LeastRecentlyUsedGenerationsCache(const LeastRecentlyUsedGenerationsCache &that) = delete; + LeastRecentlyUsedGenerationsCache& operator=(LeastRecentlyUsedGenerationsCache const&) = delete; + LeastRecentlyUsedGenerationsCache(const LeastRecentlyUsedGenerationsCache& that) = delete; /** Add this object. * @param key The key to add */ void Add(EntryT key) { if (current_generation_->Size() >= size_per_generation_) { - MinimizationHash *newGeneration = nullptr; + MinimizationHash* newGeneration = nullptr; if (generations_.size() + 1 == max_number_of_generations_) { // remove(free) the first generation newGeneration = generations_[0]; @@ -101,7 +101,7 @@ class LeastRecentlyUsedGenerationsCache final { } template - const EntryT Get(EqualityType &key) { // NOLINT + const EntryT Get(EqualityType& key) { // NOLINT EntryT state = current_generation_->Get(key); if (!state.IsEmpty()) { @@ -126,7 +126,7 @@ class LeastRecentlyUsedGenerationsCache final { */ void Clear() { current_generation_->Clear(); - for (MinimizationHash *generation : generations_) { + for (MinimizationHash* generation : generations_) { delete generation; } generations_.clear(); @@ -148,8 +148,8 @@ class LeastRecentlyUsedGenerationsCache final { private: size_t size_per_generation_; size_t max_number_of_generations_; - MinimizationHash *current_generation_; - std::vector *> generations_; + MinimizationHash* current_generation_; + std::vector*> generations_; }; } /* namespace internal */ diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h b/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h index 26253a490..18886aa2a 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h @@ -180,7 +180,7 @@ class MinimizationHash final { * @return the equal state or an empty value */ template - inline const T Get(EqualityType &key) const { // NOLINT + inline const T Get(EqualityType& key) const { // NOLINT size_t hash = key.GetHashcode() & 0x7fffffff; size_t bucket = hash % hash_size_; @@ -209,7 +209,7 @@ class MinimizationHash final { * @return the equal state or an empty value */ template - inline const T GetAndMove(EqualityType &key, MinimizationHash *other) { // NOLINT + inline const T GetAndMove(EqualityType& key, MinimizationHash* other) { // NOLINT size_t hash = key.GetHashcode() & 0x7fffffff; size_t bucket = hash % hash_size_; T entry = entries_[bucket]; @@ -325,10 +325,10 @@ class MinimizationHash final { size_t rehash_limit_ = 0; /// the actual data storage - T *entries_ = 0; + T* entries_ = 0; /// overflow data storage for colliding entries - T *overflow_entries_ = 0; + T* overflow_entries_ = 0; /// number of items in the data size_t count_ = 0; @@ -399,10 +399,10 @@ class MinimizationHash final { hash_size_ = hash_size_step_table_[hash_size_step_]; rehash_limit_ = static_cast(hash_size_ * load_factor_); - T *old_entries = entries_; + T* old_entries = entries_; entries_ = new T[hash_size_]; - T *old_overflow_entries = overflow_entries_; + T* old_overflow_entries = overflow_entries_; overflow_entries_size_ = std::min(hash_size_ >> 2, max_cookie_size_); overflow_entries_ = new T[overflow_entries_size_]; diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h b/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h index 4a4a6b947..d930be301 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h @@ -200,9 +200,7 @@ class SparseArrayPersistence final { TRACE("Wrote Transitions, stream at %d", stream.tellp()); } - size_t GetChunkSizeExternalTransitions() const { - return transitions_extern_->GetChunkSize(); - } + size_t GetChunkSizeExternalTransitions() const { return transitions_extern_->GetChunkSize(); } uint32_t GetVersion() const; diff --git a/keyvi/include/keyvi/dictionary/fsa/state_traverser.h b/keyvi/include/keyvi/dictionary/fsa/state_traverser.h index 667f3ca82..04016c60a 100644 --- a/keyvi/include/keyvi/dictionary/fsa/state_traverser.h +++ b/keyvi/include/keyvi/dictionary/fsa/state_traverser.h @@ -55,7 +55,7 @@ class StateTraverser final { this->operator++(0); } - StateTraverser(automata_t f, const uint64_t start_state, traversal::TraversalPayload &&payload, + StateTraverser(automata_t f, const uint64_t start_state, traversal::TraversalPayload&& payload, const bool advance = true) : fsa_(f), current_weight_(0), current_label_(0), stack_(std::move(payload)) { current_state_ = start_state; @@ -81,10 +81,10 @@ class StateTraverser final { } StateTraverser() = delete; - StateTraverser &operator=(StateTraverser const &) = delete; - StateTraverser(const StateTraverser &that) = delete; + StateTraverser& operator=(StateTraverser const&) = delete; + StateTraverser(const StateTraverser& that) = delete; - StateTraverser(StateTraverser &&other) + StateTraverser(StateTraverser&& other) : fsa_(other.fsa_), current_state_(other.current_state_), current_weight_(other.current_weight_), @@ -182,13 +182,13 @@ class StateTraverser final { template friend class ComparableStateTraverser; - const traversal::TraversalStack &GetStack() const { return stack_; } + const traversal::TraversalStack& GetStack() const { return stack_; } - traversal::TraversalState &GetStates() { return stack_.GetStates(); } + traversal::TraversalState& GetStates() { return stack_.GetStates(); } - traversal::TraversalPayload &GetTraversalPayload() { return stack_.traversal_stack_payload; } + traversal::TraversalPayload& GetTraversalPayload() { return stack_.traversal_stack_payload; } - const traversal::TraversalPayload &GetTraversalPayload() const { return stack_.traversal_stack_payload; } + const traversal::TraversalPayload& GetTraversalPayload() const { return stack_.traversal_stack_payload; } }; /** diff --git a/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h b/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h index f787e4023..406a2db4a 100644 --- a/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h +++ b/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h @@ -66,7 +66,7 @@ class ZipStateTraverser final { using traverser_t = std::shared_ptr>; struct TraverserCompare { - bool operator()(const traverser_t &t1, const traverser_t &t2) const { return *t1 > *t2; } + bool operator()(const traverser_t& t1, const traverser_t& t2) const { return *t1 > *t2; } }; public: @@ -74,9 +74,9 @@ class ZipStateTraverser final { using transition_t = typename innerTraverserType::transition_t; using heap_t = boost::heap::skew_heap, boost::heap::mutable_>; - explicit ZipStateTraverser(const std::vector &fsas, const bool advance = true) { + explicit ZipStateTraverser(const std::vector& fsas, const bool advance = true) { size_t order = 0; - for (const automata_t &f : fsas) { + for (const automata_t& f : fsas) { traverser_t traverser = std::make_shared>(f, advance, order++); // the traverser could be exhausted after it has been advanced if (*traverser) { @@ -98,7 +98,7 @@ class ZipStateTraverser final { FillInValues(); } - explicit ZipStateTraverser(const std::vector> &fsa_start_state_pairs, + explicit ZipStateTraverser(const std::vector>& fsa_start_state_pairs, const bool advance = true) { size_t order = 0; for (auto f : fsa_start_state_pairs) { @@ -114,12 +114,12 @@ class ZipStateTraverser final { FillInValues(); } - explicit ZipStateTraverser(std::vector>> - &&fsa_start_state_payloads, + explicit ZipStateTraverser(std::vector>>&& + fsa_start_state_payloads, const bool advance = true) { size_t order = 0; - for (auto &f : fsa_start_state_payloads) { + for (auto& f : fsa_start_state_payloads) { if (std::get<1>(f) > 0) { traverser_t traverser = std::make_shared>( std::get<0>(f), std::get<1>(f), std::move(std::get<2>(f)), advance, order++); @@ -134,10 +134,10 @@ class ZipStateTraverser final { } ZipStateTraverser() = delete; - ZipStateTraverser &operator=(ZipStateTraverser const &) = delete; - ZipStateTraverser(const ZipStateTraverser &that) = delete; + ZipStateTraverser& operator=(ZipStateTraverser const&) = delete; + ZipStateTraverser(const ZipStateTraverser& that) = delete; - ZipStateTraverser(ZipStateTraverser &&other) + ZipStateTraverser(ZipStateTraverser&& other) : traverser_queue_(std::move(other.traverser_queue_)), final_(other.final_), depth_(other.depth_), @@ -212,7 +212,7 @@ class ZipStateTraverser final { label_t GetStateLabel() const { return state_label_; } - const std::vector &GetStateLabels() const { return traverser_queue_.top()->GetStateLabels(); } + const std::vector& GetStateLabels() const { return traverser_queue_.top()->GetStateLabels(); } /** * Set the minimum weight states must be greater or equal to. @@ -245,7 +245,7 @@ class ZipStateTraverser final { pruned = false; if (!traverser_queue_.empty()) { - const traverser_t &t = traverser_queue_.top(); + const traverser_t& t = traverser_queue_.top(); TRACE("take values from traverser %lu", t->GetOrder()); final_ = t->IsFinalState(); @@ -295,7 +295,7 @@ class ZipStateTraverser final { template friend class matching::NearMatching; - const traversal::TraversalPayload &GetTraversalPayload() const { + const traversal::TraversalPayload& GetTraversalPayload() const { return traverser_queue_.top()->GetTraversalPayload(); } }; @@ -322,7 +322,7 @@ inline void ZipStateTraverser::PreIncrement() { size_t steps = equal_states_; while (steps > 0) { - for (const transition_t &transition : (*it)->GetStates().traversal_state_payload.transitions) { + for (const transition_t& transition : (*it)->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } @@ -335,7 +335,7 @@ inline void ZipStateTraverser::PreIncrement() { it = traverser_queue_.ordered_begin(); steps = equal_states_; while (steps > 0) { - for (transition_t &transition : (*it)->GetStates().traversal_state_payload.transitions) { + for (transition_t& transition : (*it)->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } // re-sort transitions @@ -370,16 +370,16 @@ inline ZipStateTraverser::ZipStateTraverser(const std::i } // 1st pass collect all weights per label - for (const auto &t : traversers) { - for (const transition_t &transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto& t : traversers) { + for (const transition_t& transition : t->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } } } // 2nd pass apply global weights - for (const auto &t : traversers) { - for (transition_t &transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto& t : traversers) { + for (transition_t& transition : t->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } // re-sort transitions @@ -400,7 +400,7 @@ inline ZipStateTraverser::ZipStateTraverser(const std::i } template <> -inline ZipStateTraverser::ZipStateTraverser(const std::vector &fsas, +inline ZipStateTraverser::ZipStateTraverser(const std::vector& fsas, const bool advance) { TRACE("construct (weighted state specialization)"); size_t order = 0; @@ -423,16 +423,16 @@ inline ZipStateTraverser::ZipStateTraverser(const std::v } // 1st pass collect all weights per label - for (const auto &t : traversers) { - for (const transition_t &transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto& t : traversers) { + for (const transition_t& transition : t->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } } } // 2nd pass apply global weights - for (const auto &t : traversers) { - for (transition_t &transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto& t : traversers) { + for (transition_t& transition : t->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } // re-sort transitions @@ -454,7 +454,7 @@ inline ZipStateTraverser::ZipStateTraverser(const std::v template <> inline ZipStateTraverser::ZipStateTraverser( - const std::vector> &fsa_start_state_pairs, const bool advance) { + const std::vector>& fsa_start_state_pairs, const bool advance) { size_t order = 0; if (fsa_start_state_pairs.size() < 2) { @@ -480,16 +480,16 @@ inline ZipStateTraverser::ZipStateTraverser( } } // 1st pass collect all weights per label - for (const auto &t : traversers) { - for (const transition_t &transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto& t : traversers) { + for (const transition_t& transition : t->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } } } // 2nd pass apply global weights - for (const auto &t : traversers) { - for (transition_t &transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto& t : traversers) { + for (transition_t& transition : t->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } TRACE("resort %ld", t->GetOrder()); From 7e9de88aef3cc3676d8e595c81de159ffc68bba8 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 17 Oct 2025 23:32:49 +0200 Subject: [PATCH 05/23] add content read permission --- .github/workflows/cpp-linter.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 352786f01..782577fce 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -7,6 +7,7 @@ jobs: cpp-linter: runs-on: ubuntu-latest permissions: + contents: read pull-requests: write steps: - name: install Linux deps From d4e4006d429e0c75cd8f7c4365c4b6bfb3bc0bbf Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 17 Oct 2025 23:42:34 +0200 Subject: [PATCH 06/23] no thread comments --- .github/workflows/cpp-linter.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 782577fce..bc368564f 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -28,5 +28,3 @@ jobs: extra-args: '-std=c++17' style: '' # disable clang-format checks. tidy-checks: '' # Use .clang-tidy config file. - # only 'update' a single comment in a pull request's thread. - thread-comments: ${{ github.event_name == 'pull_request' && 'update' }} From 95e5bac699100560fc294370eb4442859bd985f2 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 00:01:38 +0200 Subject: [PATCH 07/23] use database --- .github/workflows/cpp-linter.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index bc368564f..1c112c647 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -18,6 +18,10 @@ jobs: - uses: actions/checkout@v5 + - name: run cmake + run: | + cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON . + - name: C/C++ Linter uses: cpp-linter/cpp-linter-action@v2.16.5 id: linter @@ -25,6 +29,6 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: version: 21 - extra-args: '-std=c++17' + database: build style: '' # disable clang-format checks. tidy-checks: '' # Use .clang-tidy config file. From 992548d0da6ec2b9f69ccfbaa8f3ac37393cf2bf Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 00:13:37 +0200 Subject: [PATCH 08/23] add more files to see output --- keyvi/include/keyvi/dictionary/util/trace.h | 40 +++++++++--------- .../include/keyvi/dictionary/util/transform.h | 5 +-- .../keyvi/dictionary/util/utf8_utils.h | 41 +++++++------------ .../fsa/internal/minimization_hash_test.cpp | 4 +- .../fsa/zip_state_traverser_test.cpp | 2 +- keyvi/tests/keyvi/index/index_limits_test.cpp | 4 +- 6 files changed, 41 insertions(+), 55 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/util/trace.h b/keyvi/include/keyvi/dictionary/util/trace.h index 008b56432..665702784 100644 --- a/keyvi/include/keyvi/dictionary/util/trace.h +++ b/keyvi/include/keyvi/dictionary/util/trace.h @@ -22,15 +22,15 @@ * Author: hendrik */ -//The following is left intentionally without include guard -//so that tracing can be switched on and off on a per file basis. +// The following is left intentionally without include guard +// so that tracing can be switched on and off on a per file basis. #ifdef ENABLE_TRACING -# undef TRACE -# define TRACE ::keyvi::dictionary::util::trace::trace_it -# undef ENABLE_TRACING +#undef TRACE +#define TRACE ::keyvi::dictionary::util::trace::trace_it +#undef ENABLE_TRACING #else -# undef TRACE -# define TRACE(x,...) +#undef TRACE +#define TRACE(x, ...) #endif #ifndef TRACE_H_ @@ -45,17 +45,17 @@ namespace util { class trace final { public: - static void trace_it(const char* message, ...) { - va_list arguments; - va_start(arguments, message); - - fprintf(stderr, "* "); - vfprintf(stderr, message, arguments); - fprintf(stderr, "\n"); - } - }; - - } /* namespace util */ - } /* namespace dictionary */ - } /* namespace keyvi */ + static void trace_it(const char* message, ...) { + va_list arguments; + va_start(arguments, message); + + fprintf(stderr, "* "); + vfprintf(stderr, message, arguments); + fprintf(stderr, "\n"); + } +}; + +} /* namespace util */ +} /* namespace dictionary */ +} /* namespace keyvi */ #endif /* TRACE_H_ */ diff --git a/keyvi/include/keyvi/dictionary/util/transform.h b/keyvi/include/keyvi/dictionary/util/transform.h index 09cf95e1f..19cd5e76e 100644 --- a/keyvi/include/keyvi/dictionary/util/transform.h +++ b/keyvi/include/keyvi/dictionary/util/transform.h @@ -31,15 +31,14 @@ namespace keyvi { namespace dictionary { namespace util { -class Transform final{ +class Transform final { public: /** * Apply Bag of Words reordering for all but the last token * @param input * @return token with bow applied */ - static std::string BagOfWordsPartial(const std::string& input, size_t& number_of_tokens) - { + static std::string BagOfWordsPartial(const std::string& input, size_t& number_of_tokens) { std::vector strs; boost::split(strs, input, boost::is_any_of("\t ")); number_of_tokens = strs.size(); diff --git a/keyvi/include/keyvi/dictionary/util/utf8_utils.h b/keyvi/include/keyvi/dictionary/util/utf8_utils.h index 3925837a1..6662e0cc5 100644 --- a/keyvi/include/keyvi/dictionary/util/utf8_utils.h +++ b/keyvi/include/keyvi/dictionary/util/utf8_utils.h @@ -25,12 +25,11 @@ #ifndef UTF8_UTILS_H_ #define UTF8_UTILS_H_ - namespace keyvi { namespace dictionary { namespace util { -class Utf8Utils final{ +class Utf8Utils final { public: static bool IsLeadByte(char utf8_byte) { int intValue = utf8_byte & 0xFF; @@ -42,32 +41,22 @@ class Utf8Utils final{ return (intValue < 0x80) || (intValue >= 0xC0); } - static size_t GetCharLength(char utf8_lead_byte) - { - int intValue = utf8_lead_byte & 0xff; + static size_t GetCharLength(char utf8_lead_byte) { + int intValue = utf8_lead_byte & 0xff; - if (intValue < 0x80) - { - return 1; - } - else if (intValue < 0xc0) - { - std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); - } - else if (intValue < 0xe0) - { - return 2; - } - else if (intValue < 0xf0) - { - return 3; - } - else if (intValue < 0xf8) - { - return 4; - } + if (intValue < 0x80) { + return 1; + } else if (intValue < 0xc0) { + std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); + } else if (intValue < 0xe0) { + return 2; + } else if (intValue < 0xf0) { + return 3; + } else if (intValue < 0xf8) { + return 4; + } - throw std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); + throw std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); } }; diff --git a/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp b/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp index 24024de9c..006510ef1 100644 --- a/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp +++ b/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp @@ -35,7 +35,7 @@ namespace internal { BOOST_AUTO_TEST_SUITE(MinimizationHashTests) BOOST_AUTO_TEST_CASE(insert) { - MinimizationHash> *hash = new MinimizationHash>(); + MinimizationHash>* hash = new MinimizationHash>(); PackedState<> p1 = {10, 25, 2}; hash->Add(p1); PackedState<> p2 = {12, 25, 3}; @@ -54,7 +54,7 @@ BOOST_AUTO_TEST_CASE(insert) { } BOOST_AUTO_TEST_CASE(reset) { - MinimizationHash> *hash = new MinimizationHash>(); + MinimizationHash>* hash = new MinimizationHash>(); PackedState<> p1 = {10, 25, 2}; hash->Add(p1); PackedState<> p2 = {12, 25, 3}; diff --git a/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp b/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp index 3602adca0..0577ab0b4 100644 --- a/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp +++ b/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp @@ -510,7 +510,7 @@ BOOST_AUTO_TEST_CASE(basic) { BOOST_CHECK(!t); } -std::vector GetAllKeys(ZipStateTraverser> *zip_traverser) { +std::vector GetAllKeys(ZipStateTraverser>* zip_traverser) { std::vector label_stack; std::vector keys; diff --git a/keyvi/tests/keyvi/index/index_limits_test.cpp b/keyvi/tests/keyvi/index/index_limits_test.cpp index 141e82cb9..34e6acc15 100644 --- a/keyvi/tests/keyvi/index/index_limits_test.cpp +++ b/keyvi/tests/keyvi/index/index_limits_test.cpp @@ -46,9 +46,7 @@ inline std::string get_keyvimerger_bin() { } inline size_t limit_filedescriptors(size_t file_descriptor_limit) { - struct rlimit limit { - 0 - }; + struct rlimit limit{0}; getrlimit(RLIMIT_NOFILE, &limit); const size_t old_limit = limit.rlim_cur; From f2e9daf36b5182732bfb25616dba89eeba0bffcb Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 09:29:32 +0200 Subject: [PATCH 09/23] try again with thread comments --- .github/workflows/cpp-linter.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 1c112c647..8470c0953 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -32,3 +32,4 @@ jobs: database: build style: '' # disable clang-format checks. tidy-checks: '' # Use .clang-tidy config file. + thread-comments: ${{ github.event_name == 'pull_request' && 'update' }} From e59c3262c88d478517357031d1bca0e26fe05573 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 09:52:45 +0200 Subject: [PATCH 10/23] try step-summary --- .github/workflows/cpp-linter.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml index 8470c0953..a2b052939 100644 --- a/.github/workflows/cpp-linter.yml +++ b/.github/workflows/cpp-linter.yml @@ -32,4 +32,5 @@ jobs: database: build style: '' # disable clang-format checks. tidy-checks: '' # Use .clang-tidy config file. - thread-comments: ${{ github.event_name == 'pull_request' && 'update' }} + ignore-tidy: 'keyvi/3rdparty' + step-summary: true From 3349025dc5e47bd838f962c7d2ab1b6c94a84613 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 10:40:36 +0200 Subject: [PATCH 11/23] test clang tidy review as alternative --- .github/workflows/clang-tidy-review.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/clang-tidy-review.yml diff --git a/.github/workflows/clang-tidy-review.yml b/.github/workflows/clang-tidy-review.yml new file mode 100644 index 000000000..950cb03e4 --- /dev/null +++ b/.github/workflows/clang-tidy-review.yml @@ -0,0 +1,25 @@ +name: clang-tidy-review + +# You can be more specific, but it currently only works on pull requests +on: + pull_request: + paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake', '.github/workflows/clang*.yml'] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v5 + + # Optionally generate compile_commands.json + + - uses: ZedThree/clang-tidy-review@v0.21.0 + with: + split_workflow: true + build_dir: build + apt_packages: "libsnappy-dev libzzip-dev zlib1g-dev libboost-all-dev" + clang_tidy_checks: '' + cmake_command: "cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ." + + - uses: ZedThree/clang-tidy-review/upload@v0.21.0 From ff146d73011e40458a55cc383bf053e132563934 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 10:44:16 +0200 Subject: [PATCH 12/23] comma separation --- .github/workflows/clang-tidy-review.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/clang-tidy-review.yml b/.github/workflows/clang-tidy-review.yml index 950cb03e4..02a632e19 100644 --- a/.github/workflows/clang-tidy-review.yml +++ b/.github/workflows/clang-tidy-review.yml @@ -12,13 +12,11 @@ jobs: steps: - uses: actions/checkout@v5 - # Optionally generate compile_commands.json - - uses: ZedThree/clang-tidy-review@v0.21.0 with: split_workflow: true build_dir: build - apt_packages: "libsnappy-dev libzzip-dev zlib1g-dev libboost-all-dev" + apt_packages: "libsnappy-dev, libzzip-dev, zlib1g-dev, libboost-all-dev" clang_tidy_checks: '' cmake_command: "cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ." From a4c7bf7ed610ff912d90d37edf690719f33214b8 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 11:35:44 +0200 Subject: [PATCH 13/23] zstd --- .github/workflows/clang-tidy-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang-tidy-review.yml b/.github/workflows/clang-tidy-review.yml index 02a632e19..6507b1457 100644 --- a/.github/workflows/clang-tidy-review.yml +++ b/.github/workflows/clang-tidy-review.yml @@ -16,7 +16,7 @@ jobs: with: split_workflow: true build_dir: build - apt_packages: "libsnappy-dev, libzzip-dev, zlib1g-dev, libboost-all-dev" + apt_packages: "libsnappy-dev, libzzip-dev, zlib1g-dev, libboost-all-dev libzstd-dev" clang_tidy_checks: '' cmake_command: "cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ." From 7650d95e649f655ee5499bb8e95bb44c7e65023e Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 18 Oct 2025 12:00:30 +0200 Subject: [PATCH 14/23] comma --- .github/workflows/clang-tidy-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang-tidy-review.yml b/.github/workflows/clang-tidy-review.yml index 6507b1457..ce0d987ac 100644 --- a/.github/workflows/clang-tidy-review.yml +++ b/.github/workflows/clang-tidy-review.yml @@ -16,7 +16,7 @@ jobs: with: split_workflow: true build_dir: build - apt_packages: "libsnappy-dev, libzzip-dev, zlib1g-dev, libboost-all-dev libzstd-dev" + apt_packages: "libsnappy-dev, libzzip-dev, zlib1g-dev, libboost-all-dev, libzstd-dev" clang_tidy_checks: '' cmake_command: "cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ." From 723574da1a1f6faf52f0bb19e9d4b56157620aea Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Tue, 21 Oct 2025 05:27:26 +0200 Subject: [PATCH 15/23] update --- .github/workflows/clang-tidy-post.yml | 1 + .github/workflows/clang-tidy-review.yml | 5 ++++ .github/workflows/cpp-linter.yml | 36 ------------------------- 3 files changed, 6 insertions(+), 36 deletions(-) delete mode 100644 .github/workflows/cpp-linter.yml diff --git a/.github/workflows/clang-tidy-post.yml b/.github/workflows/clang-tidy-post.yml index 1cbece053..cd827ac4b 100644 --- a/.github/workflows/clang-tidy-post.yml +++ b/.github/workflows/clang-tidy-post.yml @@ -19,3 +19,4 @@ jobs: lgtm_comment_body: '' annotations: false max_comments: 10 + num_comments_as_exitcode: false diff --git a/.github/workflows/clang-tidy-review.yml b/.github/workflows/clang-tidy-review.yml index ce0d987ac..87734f1d0 100644 --- a/.github/workflows/clang-tidy-review.yml +++ b/.github/workflows/clang-tidy-review.yml @@ -13,6 +13,7 @@ jobs: - uses: actions/checkout@v5 - uses: ZedThree/clang-tidy-review@v0.21.0 + id: review with: split_workflow: true build_dir: build @@ -21,3 +22,7 @@ jobs: cmake_command: "cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ." - uses: ZedThree/clang-tidy-review/upload@v0.21.0 + + # If there are any comments, fail the check + - if: steps.review.outputs.total_comments > 0 + run: exit 1 diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml deleted file mode 100644 index a2b052939..000000000 --- a/.github/workflows/cpp-linter.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: cpp-linter -on: - pull_request: - paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake', '.github/workflows/cpp-linter.yml'] - -jobs: - cpp-linter: - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: write - steps: - - name: install Linux deps - if: runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install -y libsnappy-dev libzzip-dev zlib1g-dev libboost-all-dev - - - uses: actions/checkout@v5 - - - name: run cmake - run: | - cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON . - - - name: C/C++ Linter - uses: cpp-linter/cpp-linter-action@v2.16.5 - id: linter - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - version: 21 - database: build - style: '' # disable clang-format checks. - tidy-checks: '' # Use .clang-tidy config file. - ignore-tidy: 'keyvi/3rdparty' - step-summary: true From 0e60c824154e174615ff3e14d08fb2df6d08045c Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Tue, 21 Oct 2025 06:08:19 +0200 Subject: [PATCH 16/23] apply suggestions --- keyvi/include/keyvi/dictionary/fsa/automata.h | 8 ++++---- .../keyvi/dictionary/fsa/comparable_state_traverser.h | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index b61ce3eff..1fd2db980 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -406,11 +406,11 @@ class Automata final { return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm); } - std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } + [[nodiscard]] std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } - const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } + [[nodiscard]] const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } - const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); } + [[nodiscard]] const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); } private: dictionary_properties_t dictionary_properties_; @@ -472,7 +472,7 @@ class Automata final { friend class keyvi::dictionary::SecondaryKeyDictionary; - const dictionary_properties_t& GetDictionaryProperties() const { return dictionary_properties_; } + [[nodiscard]] const dictionary_properties_t& GetDictionaryProperties() const { return dictionary_properties_; } }; // shared pointer diff --git a/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h b/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h index 33cdc14fe..1873bf031 100644 --- a/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h +++ b/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h @@ -31,6 +31,7 @@ #include #include "keyvi/dictionary/fsa/automata.h" +#include "keyvi/dictionary/fsa/traversal/traversal_base.h" #include "keyvi/dictionary/fsa/traverser_types.h" // #define ENABLE_TRACING @@ -162,7 +163,7 @@ class ComparableStateTraverser final { label_t GetStateLabel() const { return state_traverser_.GetStateLabel(); } - const std::vector& GetStateLabels() const { return label_stack_; } + [[nodiscard]] const std::vector& GetStateLabels() const { return label_stack_; } size_t GetOrder() const { return order_; } @@ -190,7 +191,7 @@ class ComparableStateTraverser final { traversal::TraversalPayload& GetTraversalPayload() { return state_traverser_.GetTraversalPayload(); } - const traversal::TraversalPayload& GetTraversalPayload() const { + [[nodiscard]] const traversal::TraversalPayload& GetTraversalPayload() const { return state_traverser_.GetTraversalPayload(); } }; From c58b95b04c889fb7edb6fb056a17b4dc4a23b550 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Tue, 21 Oct 2025 07:05:20 +0200 Subject: [PATCH 17/23] revert some changes --- .../fsa/comparable_state_traverser.h | 39 ++++++------- .../fsa/internal/lru_generation_cache.h | 16 ++--- .../fsa/internal/minimization_hash.h | 12 ++-- .../keyvi/dictionary/fsa/state_traverser.h | 16 ++--- .../dictionary/fsa/zip_state_traverser.h | 58 +++++++++---------- keyvi/include/keyvi/dictionary/util/trace.h | 40 ++++++------- .../include/keyvi/dictionary/util/transform.h | 5 +- .../keyvi/dictionary/util/utf8_utils.h | 41 ++++++++----- .../fsa/internal/minimization_hash_test.cpp | 4 +- .../fsa/zip_state_traverser_test.cpp | 2 +- keyvi/tests/keyvi/index/index_limits_test.cpp | 4 +- 11 files changed, 125 insertions(+), 112 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h b/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h index 1873bf031..a9ac8b6fa 100644 --- a/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h +++ b/keyvi/include/keyvi/dictionary/fsa/comparable_state_traverser.h @@ -31,7 +31,6 @@ #include #include "keyvi/dictionary/fsa/automata.h" -#include "keyvi/dictionary/fsa/traversal/traversal_base.h" #include "keyvi/dictionary/fsa/traverser_types.h" // #define ENABLE_TRACING @@ -66,7 +65,7 @@ class ComparableStateTraverser final { using label_t = typename innerTraverserType::label_t; using transition_t = typename innerTraverserType::transition_t; - explicit ComparableStateTraverser(const innerTraverserType&& traverser, const bool advance = true, + explicit ComparableStateTraverser(const innerTraverserType &&traverser, const bool advance = true, const size_t order = 0) : state_traverser_(std::move(traverser)), order_(order) { if (advance) { @@ -86,7 +85,7 @@ class ComparableStateTraverser final { : ComparableStateTraverser(f, f->GetStartState(), advance, order) {} explicit ComparableStateTraverser(const automata_t f, const uint64_t start_state, - traversal::TraversalPayload&& payload, const bool advance = true, + traversal::TraversalPayload &&payload, const bool advance = true, const size_t order = 0) : state_traverser_(f, start_state, std::move(payload), false), order_(order) { if (advance) { @@ -95,13 +94,13 @@ class ComparableStateTraverser final { } ComparableStateTraverser() = delete; - ComparableStateTraverser& operator=(ComparableStateTraverser const&) = delete; - ComparableStateTraverser(const ComparableStateTraverser& that) = delete; + ComparableStateTraverser &operator=(ComparableStateTraverser const &) = delete; + ComparableStateTraverser(const ComparableStateTraverser &that) = delete; /** * Comparison of the state traverser for the purpose of ordering them */ - bool operator<(const ComparableStateTraverser& rhs) const { + bool operator<(const ComparableStateTraverser &rhs) const { int compare = std::memcmp(label_stack_.data(), rhs.label_stack_.data(), std::min(label_stack_.size(), rhs.label_stack_.size()) * sizeof(label_t)); if (compare != 0) { @@ -115,16 +114,16 @@ class ComparableStateTraverser final { return order_ > rhs.order_; } - bool operator>(const ComparableStateTraverser& rhs) const { return rhs.operator<(*this); } + bool operator>(const ComparableStateTraverser &rhs) const { return rhs.operator<(*this); } - bool operator<=(const ComparableStateTraverser& rhs) const { return !operator>(rhs); } + bool operator<=(const ComparableStateTraverser &rhs) const { return !operator>(rhs); } - bool operator>=(const ComparableStateTraverser& rhs) const { return !operator<(rhs); } + bool operator>=(const ComparableStateTraverser &rhs) const { return !operator<(rhs); } /** * Compare traverser with another one, _ignoring_ the order value */ - bool operator==(const ComparableStateTraverser& rhs) const { + bool operator==(const ComparableStateTraverser &rhs) const { if (label_stack_.size() != rhs.label_stack_.size()) { return false; } @@ -132,7 +131,7 @@ class ComparableStateTraverser final { return std::memcmp(label_stack_.data(), rhs.label_stack_.data(), label_stack_.size() * sizeof(label_t)) == 0; } - bool operator!=(const ComparableStateTraverser& rhs) const { return !operator==(rhs); } + bool operator!=(const ComparableStateTraverser &rhs) const { return !operator==(rhs); } operator bool() const { return state_traverser_; } @@ -163,7 +162,7 @@ class ComparableStateTraverser final { label_t GetStateLabel() const { return state_traverser_.GetStateLabel(); } - [[nodiscard]] const std::vector& GetStateLabels() const { return label_stack_; } + const std::vector &GetStateLabels() const { return label_stack_; } size_t GetOrder() const { return order_; } @@ -187,22 +186,22 @@ class ComparableStateTraverser final { template friend class matching::NearMatching; - traversal::TraversalState& GetStates() { return state_traverser_.GetStates(); } + traversal::TraversalState &GetStates() { return state_traverser_.GetStates(); } - traversal::TraversalPayload& GetTraversalPayload() { return state_traverser_.GetTraversalPayload(); } + traversal::TraversalPayload &GetTraversalPayload() { return state_traverser_.GetTraversalPayload(); } - [[nodiscard]] const traversal::TraversalPayload& GetTraversalPayload() const { + const traversal::TraversalPayload &GetTraversalPayload() const { return state_traverser_.GetTraversalPayload(); } }; -inline bool CompareWeights(const traversal::TraversalState& i, - const traversal::TraversalState& j) { +inline bool CompareWeights(const traversal::TraversalState &i, + const traversal::TraversalState &j) { return i.GetNextInnerWeight() == j.GetNextInnerWeight(); } template <> -inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser& rhs) const { +inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser &rhs) const { TRACE("operator< (weighted state specialization)"); TRACE("depth %ld %ld", state_traverser_.GetDepth(), rhs.state_traverser_.GetDepth()); @@ -232,7 +231,7 @@ inline bool ComparableStateTraverser::operator<(const Co } template <> -inline bool ComparableStateTraverser::operator==(const ComparableStateTraverser& rhs) const { +inline bool ComparableStateTraverser::operator==(const ComparableStateTraverser &rhs) const { if (label_stack_.size() != rhs.label_stack_.size()) { return false; } @@ -245,7 +244,7 @@ inline bool ComparableStateTraverser::operator==(const Compa } template <> -inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser& rhs) const { +inline bool ComparableStateTraverser::operator<(const ComparableStateTraverser &rhs) const { TRACE("operator< (near state specialization)"); if (GetTraversalPayload().exact != rhs.GetTraversalPayload().exact) { diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h b/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h index e62d31912..613051f3f 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/lru_generation_cache.h @@ -66,21 +66,21 @@ class LeastRecentlyUsedGenerationsCache final { ~LeastRecentlyUsedGenerationsCache() { delete current_generation_; - for (MinimizationHash* generation : generations_) { + for (MinimizationHash *generation : generations_) { delete generation; } } LeastRecentlyUsedGenerationsCache() = delete; - LeastRecentlyUsedGenerationsCache& operator=(LeastRecentlyUsedGenerationsCache const&) = delete; - LeastRecentlyUsedGenerationsCache(const LeastRecentlyUsedGenerationsCache& that) = delete; + LeastRecentlyUsedGenerationsCache &operator=(LeastRecentlyUsedGenerationsCache const &) = delete; + LeastRecentlyUsedGenerationsCache(const LeastRecentlyUsedGenerationsCache &that) = delete; /** Add this object. * @param key The key to add */ void Add(EntryT key) { if (current_generation_->Size() >= size_per_generation_) { - MinimizationHash* newGeneration = nullptr; + MinimizationHash *newGeneration = nullptr; if (generations_.size() + 1 == max_number_of_generations_) { // remove(free) the first generation newGeneration = generations_[0]; @@ -101,7 +101,7 @@ class LeastRecentlyUsedGenerationsCache final { } template - const EntryT Get(EqualityType& key) { // NOLINT + const EntryT Get(EqualityType &key) { // NOLINT EntryT state = current_generation_->Get(key); if (!state.IsEmpty()) { @@ -126,7 +126,7 @@ class LeastRecentlyUsedGenerationsCache final { */ void Clear() { current_generation_->Clear(); - for (MinimizationHash* generation : generations_) { + for (MinimizationHash *generation : generations_) { delete generation; } generations_.clear(); @@ -148,8 +148,8 @@ class LeastRecentlyUsedGenerationsCache final { private: size_t size_per_generation_; size_t max_number_of_generations_; - MinimizationHash* current_generation_; - std::vector*> generations_; + MinimizationHash *current_generation_; + std::vector *> generations_; }; } /* namespace internal */ diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h b/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h index 18886aa2a..26253a490 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/minimization_hash.h @@ -180,7 +180,7 @@ class MinimizationHash final { * @return the equal state or an empty value */ template - inline const T Get(EqualityType& key) const { // NOLINT + inline const T Get(EqualityType &key) const { // NOLINT size_t hash = key.GetHashcode() & 0x7fffffff; size_t bucket = hash % hash_size_; @@ -209,7 +209,7 @@ class MinimizationHash final { * @return the equal state or an empty value */ template - inline const T GetAndMove(EqualityType& key, MinimizationHash* other) { // NOLINT + inline const T GetAndMove(EqualityType &key, MinimizationHash *other) { // NOLINT size_t hash = key.GetHashcode() & 0x7fffffff; size_t bucket = hash % hash_size_; T entry = entries_[bucket]; @@ -325,10 +325,10 @@ class MinimizationHash final { size_t rehash_limit_ = 0; /// the actual data storage - T* entries_ = 0; + T *entries_ = 0; /// overflow data storage for colliding entries - T* overflow_entries_ = 0; + T *overflow_entries_ = 0; /// number of items in the data size_t count_ = 0; @@ -399,10 +399,10 @@ class MinimizationHash final { hash_size_ = hash_size_step_table_[hash_size_step_]; rehash_limit_ = static_cast(hash_size_ * load_factor_); - T* old_entries = entries_; + T *old_entries = entries_; entries_ = new T[hash_size_]; - T* old_overflow_entries = overflow_entries_; + T *old_overflow_entries = overflow_entries_; overflow_entries_size_ = std::min(hash_size_ >> 2, max_cookie_size_); overflow_entries_ = new T[overflow_entries_size_]; diff --git a/keyvi/include/keyvi/dictionary/fsa/state_traverser.h b/keyvi/include/keyvi/dictionary/fsa/state_traverser.h index 04016c60a..667f3ca82 100644 --- a/keyvi/include/keyvi/dictionary/fsa/state_traverser.h +++ b/keyvi/include/keyvi/dictionary/fsa/state_traverser.h @@ -55,7 +55,7 @@ class StateTraverser final { this->operator++(0); } - StateTraverser(automata_t f, const uint64_t start_state, traversal::TraversalPayload&& payload, + StateTraverser(automata_t f, const uint64_t start_state, traversal::TraversalPayload &&payload, const bool advance = true) : fsa_(f), current_weight_(0), current_label_(0), stack_(std::move(payload)) { current_state_ = start_state; @@ -81,10 +81,10 @@ class StateTraverser final { } StateTraverser() = delete; - StateTraverser& operator=(StateTraverser const&) = delete; - StateTraverser(const StateTraverser& that) = delete; + StateTraverser &operator=(StateTraverser const &) = delete; + StateTraverser(const StateTraverser &that) = delete; - StateTraverser(StateTraverser&& other) + StateTraverser(StateTraverser &&other) : fsa_(other.fsa_), current_state_(other.current_state_), current_weight_(other.current_weight_), @@ -182,13 +182,13 @@ class StateTraverser final { template friend class ComparableStateTraverser; - const traversal::TraversalStack& GetStack() const { return stack_; } + const traversal::TraversalStack &GetStack() const { return stack_; } - traversal::TraversalState& GetStates() { return stack_.GetStates(); } + traversal::TraversalState &GetStates() { return stack_.GetStates(); } - traversal::TraversalPayload& GetTraversalPayload() { return stack_.traversal_stack_payload; } + traversal::TraversalPayload &GetTraversalPayload() { return stack_.traversal_stack_payload; } - const traversal::TraversalPayload& GetTraversalPayload() const { return stack_.traversal_stack_payload; } + const traversal::TraversalPayload &GetTraversalPayload() const { return stack_.traversal_stack_payload; } }; /** diff --git a/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h b/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h index 406a2db4a..f787e4023 100644 --- a/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h +++ b/keyvi/include/keyvi/dictionary/fsa/zip_state_traverser.h @@ -66,7 +66,7 @@ class ZipStateTraverser final { using traverser_t = std::shared_ptr>; struct TraverserCompare { - bool operator()(const traverser_t& t1, const traverser_t& t2) const { return *t1 > *t2; } + bool operator()(const traverser_t &t1, const traverser_t &t2) const { return *t1 > *t2; } }; public: @@ -74,9 +74,9 @@ class ZipStateTraverser final { using transition_t = typename innerTraverserType::transition_t; using heap_t = boost::heap::skew_heap, boost::heap::mutable_>; - explicit ZipStateTraverser(const std::vector& fsas, const bool advance = true) { + explicit ZipStateTraverser(const std::vector &fsas, const bool advance = true) { size_t order = 0; - for (const automata_t& f : fsas) { + for (const automata_t &f : fsas) { traverser_t traverser = std::make_shared>(f, advance, order++); // the traverser could be exhausted after it has been advanced if (*traverser) { @@ -98,7 +98,7 @@ class ZipStateTraverser final { FillInValues(); } - explicit ZipStateTraverser(const std::vector>& fsa_start_state_pairs, + explicit ZipStateTraverser(const std::vector> &fsa_start_state_pairs, const bool advance = true) { size_t order = 0; for (auto f : fsa_start_state_pairs) { @@ -114,12 +114,12 @@ class ZipStateTraverser final { FillInValues(); } - explicit ZipStateTraverser(std::vector>>&& - fsa_start_state_payloads, + explicit ZipStateTraverser(std::vector>> + &&fsa_start_state_payloads, const bool advance = true) { size_t order = 0; - for (auto& f : fsa_start_state_payloads) { + for (auto &f : fsa_start_state_payloads) { if (std::get<1>(f) > 0) { traverser_t traverser = std::make_shared>( std::get<0>(f), std::get<1>(f), std::move(std::get<2>(f)), advance, order++); @@ -134,10 +134,10 @@ class ZipStateTraverser final { } ZipStateTraverser() = delete; - ZipStateTraverser& operator=(ZipStateTraverser const&) = delete; - ZipStateTraverser(const ZipStateTraverser& that) = delete; + ZipStateTraverser &operator=(ZipStateTraverser const &) = delete; + ZipStateTraverser(const ZipStateTraverser &that) = delete; - ZipStateTraverser(ZipStateTraverser&& other) + ZipStateTraverser(ZipStateTraverser &&other) : traverser_queue_(std::move(other.traverser_queue_)), final_(other.final_), depth_(other.depth_), @@ -212,7 +212,7 @@ class ZipStateTraverser final { label_t GetStateLabel() const { return state_label_; } - const std::vector& GetStateLabels() const { return traverser_queue_.top()->GetStateLabels(); } + const std::vector &GetStateLabels() const { return traverser_queue_.top()->GetStateLabels(); } /** * Set the minimum weight states must be greater or equal to. @@ -245,7 +245,7 @@ class ZipStateTraverser final { pruned = false; if (!traverser_queue_.empty()) { - const traverser_t& t = traverser_queue_.top(); + const traverser_t &t = traverser_queue_.top(); TRACE("take values from traverser %lu", t->GetOrder()); final_ = t->IsFinalState(); @@ -295,7 +295,7 @@ class ZipStateTraverser final { template friend class matching::NearMatching; - const traversal::TraversalPayload& GetTraversalPayload() const { + const traversal::TraversalPayload &GetTraversalPayload() const { return traverser_queue_.top()->GetTraversalPayload(); } }; @@ -322,7 +322,7 @@ inline void ZipStateTraverser::PreIncrement() { size_t steps = equal_states_; while (steps > 0) { - for (const transition_t& transition : (*it)->GetStates().traversal_state_payload.transitions) { + for (const transition_t &transition : (*it)->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } @@ -335,7 +335,7 @@ inline void ZipStateTraverser::PreIncrement() { it = traverser_queue_.ordered_begin(); steps = equal_states_; while (steps > 0) { - for (transition_t& transition : (*it)->GetStates().traversal_state_payload.transitions) { + for (transition_t &transition : (*it)->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } // re-sort transitions @@ -370,16 +370,16 @@ inline ZipStateTraverser::ZipStateTraverser(const std::i } // 1st pass collect all weights per label - for (const auto& t : traversers) { - for (const transition_t& transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto &t : traversers) { + for (const transition_t &transition : t->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } } } // 2nd pass apply global weights - for (const auto& t : traversers) { - for (transition_t& transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto &t : traversers) { + for (transition_t &transition : t->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } // re-sort transitions @@ -400,7 +400,7 @@ inline ZipStateTraverser::ZipStateTraverser(const std::i } template <> -inline ZipStateTraverser::ZipStateTraverser(const std::vector& fsas, +inline ZipStateTraverser::ZipStateTraverser(const std::vector &fsas, const bool advance) { TRACE("construct (weighted state specialization)"); size_t order = 0; @@ -423,16 +423,16 @@ inline ZipStateTraverser::ZipStateTraverser(const std::v } // 1st pass collect all weights per label - for (const auto& t : traversers) { - for (const transition_t& transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto &t : traversers) { + for (const transition_t &transition : t->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } } } // 2nd pass apply global weights - for (const auto& t : traversers) { - for (transition_t& transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto &t : traversers) { + for (transition_t &transition : t->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } // re-sort transitions @@ -454,7 +454,7 @@ inline ZipStateTraverser::ZipStateTraverser(const std::v template <> inline ZipStateTraverser::ZipStateTraverser( - const std::vector>& fsa_start_state_pairs, const bool advance) { + const std::vector> &fsa_start_state_pairs, const bool advance) { size_t order = 0; if (fsa_start_state_pairs.size() < 2) { @@ -480,16 +480,16 @@ inline ZipStateTraverser::ZipStateTraverser( } } // 1st pass collect all weights per label - for (const auto& t : traversers) { - for (const transition_t& transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto &t : traversers) { + for (const transition_t &transition : t->GetStates().traversal_state_payload.transitions) { if (global_weights.count(transition.label) == 0 || global_weights.at(transition.label) < transition.weight) { global_weights[transition.label] = transition.weight; } } } // 2nd pass apply global weights - for (const auto& t : traversers) { - for (transition_t& transition : t->GetStates().traversal_state_payload.transitions) { + for (const auto &t : traversers) { + for (transition_t &transition : t->GetStates().traversal_state_payload.transitions) { transition.weight = global_weights.at(transition.label); } TRACE("resort %ld", t->GetOrder()); diff --git a/keyvi/include/keyvi/dictionary/util/trace.h b/keyvi/include/keyvi/dictionary/util/trace.h index 665702784..008b56432 100644 --- a/keyvi/include/keyvi/dictionary/util/trace.h +++ b/keyvi/include/keyvi/dictionary/util/trace.h @@ -22,15 +22,15 @@ * Author: hendrik */ -// The following is left intentionally without include guard -// so that tracing can be switched on and off on a per file basis. +//The following is left intentionally without include guard +//so that tracing can be switched on and off on a per file basis. #ifdef ENABLE_TRACING -#undef TRACE -#define TRACE ::keyvi::dictionary::util::trace::trace_it -#undef ENABLE_TRACING +# undef TRACE +# define TRACE ::keyvi::dictionary::util::trace::trace_it +# undef ENABLE_TRACING #else -#undef TRACE -#define TRACE(x, ...) +# undef TRACE +# define TRACE(x,...) #endif #ifndef TRACE_H_ @@ -45,17 +45,17 @@ namespace util { class trace final { public: - static void trace_it(const char* message, ...) { - va_list arguments; - va_start(arguments, message); - - fprintf(stderr, "* "); - vfprintf(stderr, message, arguments); - fprintf(stderr, "\n"); - } -}; - -} /* namespace util */ -} /* namespace dictionary */ -} /* namespace keyvi */ + static void trace_it(const char* message, ...) { + va_list arguments; + va_start(arguments, message); + + fprintf(stderr, "* "); + vfprintf(stderr, message, arguments); + fprintf(stderr, "\n"); + } + }; + + } /* namespace util */ + } /* namespace dictionary */ + } /* namespace keyvi */ #endif /* TRACE_H_ */ diff --git a/keyvi/include/keyvi/dictionary/util/transform.h b/keyvi/include/keyvi/dictionary/util/transform.h index 19cd5e76e..09cf95e1f 100644 --- a/keyvi/include/keyvi/dictionary/util/transform.h +++ b/keyvi/include/keyvi/dictionary/util/transform.h @@ -31,14 +31,15 @@ namespace keyvi { namespace dictionary { namespace util { -class Transform final { +class Transform final{ public: /** * Apply Bag of Words reordering for all but the last token * @param input * @return token with bow applied */ - static std::string BagOfWordsPartial(const std::string& input, size_t& number_of_tokens) { + static std::string BagOfWordsPartial(const std::string& input, size_t& number_of_tokens) + { std::vector strs; boost::split(strs, input, boost::is_any_of("\t ")); number_of_tokens = strs.size(); diff --git a/keyvi/include/keyvi/dictionary/util/utf8_utils.h b/keyvi/include/keyvi/dictionary/util/utf8_utils.h index 6662e0cc5..3925837a1 100644 --- a/keyvi/include/keyvi/dictionary/util/utf8_utils.h +++ b/keyvi/include/keyvi/dictionary/util/utf8_utils.h @@ -25,11 +25,12 @@ #ifndef UTF8_UTILS_H_ #define UTF8_UTILS_H_ + namespace keyvi { namespace dictionary { namespace util { -class Utf8Utils final { +class Utf8Utils final{ public: static bool IsLeadByte(char utf8_byte) { int intValue = utf8_byte & 0xFF; @@ -41,22 +42,32 @@ class Utf8Utils final { return (intValue < 0x80) || (intValue >= 0xC0); } - static size_t GetCharLength(char utf8_lead_byte) { - int intValue = utf8_lead_byte & 0xff; + static size_t GetCharLength(char utf8_lead_byte) + { + int intValue = utf8_lead_byte & 0xff; - if (intValue < 0x80) { - return 1; - } else if (intValue < 0xc0) { - std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); - } else if (intValue < 0xe0) { - return 2; - } else if (intValue < 0xf0) { - return 3; - } else if (intValue < 0xf8) { - return 4; - } + if (intValue < 0x80) + { + return 1; + } + else if (intValue < 0xc0) + { + std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); + } + else if (intValue < 0xe0) + { + return 2; + } + else if (intValue < 0xf0) + { + return 3; + } + else if (intValue < 0xf8) + { + return 4; + } - throw std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); + throw std::invalid_argument("Illegal UTF-8 lead byte: " + std::to_string(intValue)); } }; diff --git a/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp b/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp index 006510ef1..24024de9c 100644 --- a/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp +++ b/keyvi/tests/keyvi/dictionary/fsa/internal/minimization_hash_test.cpp @@ -35,7 +35,7 @@ namespace internal { BOOST_AUTO_TEST_SUITE(MinimizationHashTests) BOOST_AUTO_TEST_CASE(insert) { - MinimizationHash>* hash = new MinimizationHash>(); + MinimizationHash> *hash = new MinimizationHash>(); PackedState<> p1 = {10, 25, 2}; hash->Add(p1); PackedState<> p2 = {12, 25, 3}; @@ -54,7 +54,7 @@ BOOST_AUTO_TEST_CASE(insert) { } BOOST_AUTO_TEST_CASE(reset) { - MinimizationHash>* hash = new MinimizationHash>(); + MinimizationHash> *hash = new MinimizationHash>(); PackedState<> p1 = {10, 25, 2}; hash->Add(p1); PackedState<> p2 = {12, 25, 3}; diff --git a/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp b/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp index 0577ab0b4..3602adca0 100644 --- a/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp +++ b/keyvi/tests/keyvi/dictionary/fsa/zip_state_traverser_test.cpp @@ -510,7 +510,7 @@ BOOST_AUTO_TEST_CASE(basic) { BOOST_CHECK(!t); } -std::vector GetAllKeys(ZipStateTraverser>* zip_traverser) { +std::vector GetAllKeys(ZipStateTraverser> *zip_traverser) { std::vector label_stack; std::vector keys; diff --git a/keyvi/tests/keyvi/index/index_limits_test.cpp b/keyvi/tests/keyvi/index/index_limits_test.cpp index 34e6acc15..141e82cb9 100644 --- a/keyvi/tests/keyvi/index/index_limits_test.cpp +++ b/keyvi/tests/keyvi/index/index_limits_test.cpp @@ -46,7 +46,9 @@ inline std::string get_keyvimerger_bin() { } inline size_t limit_filedescriptors(size_t file_descriptor_limit) { - struct rlimit limit{0}; + struct rlimit limit { + 0 + }; getrlimit(RLIMIT_NOFILE, &limit); const size_t old_limit = limit.rlim_cur; From 1363c0b34e5671d6b1af178e0ec376daa4b287cd Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Tue, 21 Oct 2025 07:49:58 +0200 Subject: [PATCH 18/23] Apply suggestion from @github-actions[bot] Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../keyvi/dictionary/fsa/internal/sparse_array_persistence.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h b/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h index d930be301..3365121ca 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/sparse_array_persistence.h @@ -200,7 +200,7 @@ class SparseArrayPersistence final { TRACE("Wrote Transitions, stream at %d", stream.tellp()); } - size_t GetChunkSizeExternalTransitions() const { return transitions_extern_->GetChunkSize(); } + [[nodiscard]] size_t GetChunkSizeExternalTransitions() const { return transitions_extern_->GetChunkSize(); } uint32_t GetVersion() const; From 17ae0d029b14867b4237e5807b3fc884380494a6 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Tue, 21 Oct 2025 07:50:11 +0200 Subject: [PATCH 19/23] Apply suggestion from @github-actions[bot] Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- keyvi/include/keyvi/dictionary/fsa/automata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index 1fd2db980..12feca7cd 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -410,7 +410,7 @@ class Automata final { [[nodiscard]] const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } - [[nodiscard]] const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); } + [[nodiscard]] uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); } private: dictionary_properties_t dictionary_properties_; From ec0090cd492964e238af601a038a10438cd22d5d Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 24 Oct 2025 08:42:29 +0200 Subject: [PATCH 20/23] add clang tidy config --- .github/workflows/clang-tidy-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang-tidy-review.yml b/.github/workflows/clang-tidy-review.yml index 87734f1d0..d5f58bc6f 100644 --- a/.github/workflows/clang-tidy-review.yml +++ b/.github/workflows/clang-tidy-review.yml @@ -3,7 +3,7 @@ name: clang-tidy-review # You can be more specific, but it currently only works on pull requests on: pull_request: - paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake', '.github/workflows/clang*.yml'] + paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake', '.github/workflows/clang*.yml', '**.clang-tidy'] jobs: build: From 10776744e4e3c0bb763b6f78de1145405a6821bf Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 24 Oct 2025 08:50:14 +0200 Subject: [PATCH 21/23] improve comments --- .github/workflows/clang-tidy-review.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/clang-tidy-review.yml b/.github/workflows/clang-tidy-review.yml index d5f58bc6f..9d5e28739 100644 --- a/.github/workflows/clang-tidy-review.yml +++ b/.github/workflows/clang-tidy-review.yml @@ -1,6 +1,7 @@ name: clang-tidy-review -# You can be more specific, but it currently only works on pull requests +# Runs clang-tidy on the changed files and uploads the result as artifact +# clang-tidy-post takes the artifact and posts the comments on: pull_request: paths: ['**.cpp', '**.h', '**.hpp', '**CMakeLists.txt', '**.cmake', '.github/workflows/clang*.yml', '**.clang-tidy'] From d78de95e7117378827051d904d12b6a3b6863be7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Oct 2025 08:25:09 +0000 Subject: [PATCH 22/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- LICENSE | 1 - README.md | 6 +- doc/RELEASE_PROCESS.md | 4 +- doc/algorithm/Construction-Basics.md | 79 +++++----- doc/algorithm/Extensibility.md | 6 +- doc/algorithm/Minimization.md | 20 +-- doc/algorithm/Persistence-Basics.md | 9 +- doc/algorithm/Scaling.md | 20 +-- .../bbuzz2016/keyvi-presentation.svg | 66 ++++---- .../keyvi-presentation-progscon2017.svg | 66 ++++---- .../keyvi-presentation-search-meetup-2018.svg | 66 ++++---- ...Building keyvi dictionaries with python.md | 9 +- doc/usage/Building keyvi dictionaries.md | 10 +- doc/usage/Crashcourse.md | 52 +++---- doc/usage/Keyvi Index with python.md | 2 +- doc/usage/Using pykeyvi in EMR.md | 6 +- .../fsa/comparable_state_traverser.h | 38 ++--- .../fsa/internal/lru_generation_cache.h | 16 +- .../fsa/internal/minimization_hash.h | 12 +- .../keyvi/dictionary/fsa/state_traverser.h | 16 +- .../dictionary/fsa/zip_state_traverser.h | 58 +++---- keyvi/include/keyvi/dictionary/util/trace.h | 40 ++--- .../include/keyvi/dictionary/util/transform.h | 5 +- .../keyvi/dictionary/util/utf8_utils.h | 41 ++--- .../fsa/internal/minimization_hash_test.cpp | 4 +- .../fsa/zip_state_traverser_test.cpp | 2 +- keyvi/tests/keyvi/index/index_limits_test.cpp | 4 +- python/LICENSE.txt | 1 - python/autowrap_includes/autowrap_tools.hpp | 40 ++--- python/build_macosx_wheels.sh | 2 +- .../completion/multiword_completion_tester.py | 9 +- .../completion/multiword_completion_writer.py | 17 ++- .../prefix_completion_fuzzy_tester.py | 8 +- .../completion/prefix_completion_tester.py | 8 +- python/examples/lookup/compile_json.py | 45 +++--- python/examples/lookup/text_lookup_tester.py | 6 +- python/examples/lookup/value_lookup_tester.py | 6 +- python/examples/normalization/normalize.py | 4 +- .../var_length_short_calculation_test.py | 10 +- .../dictionary/dictionary_leak_test.py | 18 ++- python/setup.py | 46 ++++-- .../addons/CompletionDictionaryCompiler.pyx | 2 +- .../src/addons/CompletionDictionaryMerger.pyx | 2 +- python/src/addons/Dictionary.pyx | 5 +- python/src/addons/IntDictionaryCompiler.pyx | 1 - python/src/addons/IntDictionaryMerger.pyx | 2 +- python/src/addons/JsonDictionaryCompiler.pyx | 3 +- python/src/addons/JsonDictionaryMerger.pyx | 2 +- python/src/addons/KeyOnlyDictionaryMerger.pyx | 2 +- python/src/addons/Match.pyx | 2 +- ...condaryKeyCompletionDictionaryCompiler.pyx | 1 - ...ondaryKeyFloatVectorDictionaryCompiler.pyx | 1 - .../SecondaryKeyIntDictionaryCompiler.pyx | 1 - .../SecondaryKeyJsonDictionaryCompiler.pyx | 1 - .../SecondaryKeyKeyOnlyDictionaryCompiler.pyx | 1 - .../SecondaryKeyStringDictionaryCompiler.pyx | 1 - python/src/addons/match_iterator.pyx | 2 +- python/src/extra/attributes_converter.h | 25 ++- python/src/pxds/dictionary.pxd | 8 +- python/src/pxds/dictionary_compiler.pxd | 2 +- python/src/pxds/dictionary_merger.pxd | 2 +- python/src/pxds/match.pxd | 1 - python/src/pxds/multi_word_completion.pxd | 2 - python/src/pxds/prefix_completion.pxd | 2 - python/src/pxds/secondary_key_dictionary.pxd | 4 +- python/src/py/keyvi/__init__.py | 11 +- python/src/py/keyvi/_pycore/keyvimerger.py | 34 +++-- python/src/py/keyvi/cli/cli.py | 143 ++++++++++-------- python/src/py/keyvi/compiler/__init__.py | 41 ++++- python/src/py/keyvi/completion/__init__.py | 4 +- python/src/py/keyvi/dictionary/__init__.py | 4 +- python/src/py/keyvi/index/__init__.py | 4 +- python/src/py/keyvi/util/__init__.py | 4 +- python/src/py/keyvi/vector/__init__.py | 4 +- .../forward_backward_completion_test.py | 22 ++- .../tests/completion/fuzzy_completion_test.py | 38 +++-- .../completion/multiword_completion_test.py | 75 +++++---- .../dictionary/dictionary_merger_test.py | 61 ++++---- python/tests/dictionary/dictionary_test.py | 5 +- .../floatvector_dictionary_test.py | 9 +- python/tests/dictionary/get_fuzzy_test.py | 13 +- .../dictionary/int_dictionary_merger_test.py | 52 +++---- python/tests/dictionary/iterators_test.py | 36 +++-- .../key_only_dictionary_merger_test.py | 46 ++---- python/tests/dictionary/loading_test.py | 49 +++--- python/tests/dictionary/near_test.py | 48 +++--- .../secondary_key_dictionary_test.py | 24 +-- .../string_dictionary_merger_test.py | 52 +++---- python/tests/dictionary/unicode_test.py | 10 +- python/tests/dictionary/zerobyte_test.py | 5 +- python/tests/dictionary_compiler_test.py | 26 ++-- python/tests/index/index_test.py | 52 +++---- python/tests/index/merger_binary_test.py | 7 +- python/tests/int/int_dictionary_test.py | 6 +- python/tests/statistics_test.py | 46 +++--- .../tests/utils/jump_consistent_hash_test.py | 5 +- python/tests/vector/basic_test.py | 24 +-- sphinx-docs/_static/custom.css | 2 +- sphinx-docs/conf_extra.py | 22 +-- sphinx-docs/cpp/dictionary_compiler.rst | 2 +- sphinx-docs/index.rst | 16 +- 102 files changed, 1020 insertions(+), 937 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7569be5b0..eaa17b244 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,7 +1,7 @@