From eb3200e7095c6fc073051dc09634e05fd1b64566 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Sun, 26 Jan 2025 20:50:44 -0800 Subject: [PATCH 1/3] code cleanup and alignment with Java --- theta/include/theta_sketch.hpp | 4 +- theta/include/theta_sketch_impl.hpp | 58 ++++++++++++++--------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp index ad6421e7..4aab4b92 100644 --- a/theta/include/theta_sketch.hpp +++ b/theta/include/theta_sketch.hpp @@ -609,9 +609,11 @@ class wrapped_compact_theta_sketch_alloc::const_iterator { uint32_t index_; uint64_t previous_; bool is_block_mode_; - uint8_t buf_i_; uint8_t offset_; uint64_t buffer_[8]; + + inline void unpack1(); + inline void unpack8(); }; } /* namespace datasketches */ diff --git a/theta/include/theta_sketch_impl.hpp b/theta/include/theta_sketch_impl.hpp index b6a5d7ee..8f7b1e8d 100644 --- a/theta/include/theta_sketch_impl.hpp +++ b/theta/include/theta_sketch_impl.hpp @@ -817,23 +817,15 @@ num_entries_(num_entries), index_(index), previous_(0), is_block_mode_(num_entries_ >= 8), -buf_i_(0), offset_(0) { if (entry_bits == 64) { // no compression ptr_ = reinterpret_cast(ptr) + index; } else if (index < num_entries) { if (is_block_mode_) { - unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); - ptr_ = reinterpret_cast(ptr_) + entry_bits_; - for (int i = 0; i < 8; ++i) { - buffer_[i] += previous_; - previous_ = buffer_[i]; - } + unpack8(); } else { - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } } @@ -844,35 +836,41 @@ auto wrapped_compact_theta_sketch_alloc::const_iterator::operator++() ptr_ = reinterpret_cast(ptr_) + 1; return *this; } - ++index_; - if (index_ < num_entries_) { + if (++index_ < num_entries_) { if (is_block_mode_) { - ++buf_i_; - if (buf_i_ == 8) { - buf_i_ = 0; - if (index_ + 8 < num_entries_) { - unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); - ptr_ = reinterpret_cast(ptr_) + entry_bits_; - for (int i = 0; i < 8; ++i) { - buffer_[i] += previous_; - previous_ = buffer_[i]; - } + if ((index_ & 7) == 0) { + if (num_entries_ - index_ >= 8) { + unpack8(); } else { is_block_mode_ = false; - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } } else { - offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast(ptr_), offset_); - buffer_[0] += previous_; - previous_ = buffer_[0]; + unpack1(); } } return *this; } +template +void wrapped_compact_theta_sketch_alloc::const_iterator::unpack1() { + const uint32_t i = index_ & 7; + offset_ = unpack_bits(buffer_[i], entry_bits_, reinterpret_cast(ptr_), offset_); + buffer_[i] += previous_; + previous_ = buffer_[i]; +} + +template +void wrapped_compact_theta_sketch_alloc::const_iterator::unpack8() { + unpack_bits_block8(buffer_, reinterpret_cast(ptr_), entry_bits_); + ptr_ = reinterpret_cast(ptr_) + entry_bits_; + for (int i = 0; i < 8; ++i) { + buffer_[i] += previous_; + previous_ = buffer_[i]; + } +} + template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator++(int) -> const_iterator { const_iterator tmp(*this); @@ -895,13 +893,13 @@ bool wrapped_compact_theta_sketch_alloc::const_iterator::operator==(c template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator*() const -> reference { if (entry_bits_ == 64) return *reinterpret_cast(ptr_); - return buffer_[buf_i_]; + return buffer_[index_ & 7]; } template auto wrapped_compact_theta_sketch_alloc::const_iterator::operator->() const -> pointer { if (entry_bits_ == 64) return reinterpret_cast(ptr_); - return buffer_ + buf_i_; + return buffer_ + (index_ & 7); } } /* namespace datasketches */ From f82217d472f0d122b848cd379b231e7ac8616cf2 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Sun, 26 Jan 2025 20:55:40 -0800 Subject: [PATCH 2/3] test equivalence of packing and unpacking single values and blocks --- theta/test/bit_packing_test.cpp | 50 +++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/theta/test/bit_packing_test.cpp b/theta/test/bit_packing_test.cpp index b39f8996..0e0cf015 100644 --- a/theta/test/bit_packing_test.cpp +++ b/theta/test/bit_packing_test.cpp @@ -80,4 +80,54 @@ TEST_CASE("pack unpack blocks") { } } +TEST_CASE("pack bits unpack blocks") { + uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + for (int m = 0; m < 10000; ++m) { + for (uint8_t bits = 1; bits <= 63; ++bits) { + const uint64_t mask = (1ULL << bits) - 1; + std::vector input(8, 0); + for (int i = 0; i < 8; ++i) { + input[i] = value & mask; + value += IGOLDEN64; + } + std::vector bytes(bits, 0); + uint8_t offset = 0; + uint8_t* ptr = bytes.data(); + for (int i = 0; i < 8; ++i) { + offset = pack_bits(input[i], bits, ptr, offset); + } + std::vector output(8, 0); + unpack_bits_block8(output.data(), bytes.data(), bits); + for (int i = 0; i < 8; ++i) { + REQUIRE(input[i] == output[i]); + } + } + } +} + +TEST_CASE("pack blocks unpack bits") { + uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + for (int m = 0; m < 10000; ++m) { + for (uint8_t bits = 1; bits <= 63; ++bits) { + const uint64_t mask = (1ULL << bits) - 1; + std::vector input(8, 0); + for (int i = 0; i < 8; ++i) { + input[i] = value & mask; + value += IGOLDEN64; + } + std::vector bytes(bits, 0); + pack_bits_block8(input.data(), bytes.data(), bits); + std::vector output(8, 0); + uint8_t offset = 0; + const uint8_t* cptr = bytes.data(); + for (int i = 0; i < 8; ++i) { + offset = unpack_bits(output[i], bits, cptr, offset); + } + for (int i = 0; i < 8; ++i) { + REQUIRE(input[i] == output[i]); + } + } + } +} + } /* namespace datasketches */ From dea8d481cab8461f981e4edb5ab292936a87abff Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Sun, 26 Jan 2025 21:02:34 -0800 Subject: [PATCH 3/3] different starting points for pseudo-random sequences for more coverage --- theta/test/bit_packing_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/theta/test/bit_packing_test.cpp b/theta/test/bit_packing_test.cpp index 0e0cf015..0094f9fd 100644 --- a/theta/test/bit_packing_test.cpp +++ b/theta/test/bit_packing_test.cpp @@ -81,7 +81,7 @@ TEST_CASE("pack unpack blocks") { } TEST_CASE("pack bits unpack blocks") { - uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + uint64_t value = 0; // arbitrary starting value for (int m = 0; m < 10000; ++m) { for (uint8_t bits = 1; bits <= 63; ++bits) { const uint64_t mask = (1ULL << bits) - 1; @@ -106,7 +106,7 @@ TEST_CASE("pack bits unpack blocks") { } TEST_CASE("pack blocks unpack bits") { - uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value + uint64_t value = 111; // arbitrary starting value for (int m = 0; m < 10000; ++m) { for (uint8_t bits = 1; bits <= 63; ++bits) { const uint64_t mask = (1ULL << bits) - 1;