diff --git a/crates/hash-sorted-map/LICENSE b/crates/hash-sorted-map/LICENSE new file mode 100644 index 0000000..163074d --- /dev/null +++ b/crates/hash-sorted-map/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 GitHub Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/hash-sorted-map/README.md b/crates/hash-sorted-map/README.md index d4ec3fa..9498008 100644 --- a/crates/hash-sorted-map/README.md +++ b/crates/hash-sorted-map/README.md @@ -37,8 +37,10 @@ keys, which means: together, keeping a single insert's data within 1–2 cache lines. - **Optimized growth** — during resize, elements are re-inserted without duplicate checking and copied via raw pointers. -- **Generic key/value/hasher** — supports any `K: Hash + Eq`, any - `S: BuildHasher`, and `Borrow`-based lookups. +- **Generic key/value/hasher** — keys need only `Eq` (`Ord` to sort). + Customize hashing with the single-method [`SortingHash`] trait; any + standard `S: BuildHasher` works out of the box via a blanket impl, and + `Borrow`-based lookups are supported. ## Benchmark results diff --git a/crates/hash-sorted-map/benchmarks/Cargo.toml b/crates/hash-sorted-map/benchmarks/Cargo.toml index 0fddba9..d0ae1f1 100644 --- a/crates/hash-sorted-map/benchmarks/Cargo.toml +++ b/crates/hash-sorted-map/benchmarks/Cargo.toml @@ -1,6 +1,7 @@ [package] name = "hash-sorted-map-benchmarks" edition = "2021" +publish = false [lib] path = "lib.rs" diff --git a/crates/hash-sorted-map/src/hash_sorted_map.rs b/crates/hash-sorted-map/src/hash_sorted_map.rs index 0cc37b4..6608349 100644 --- a/crates/hash-sorted-map/src/hash_sorted_map.rs +++ b/crates/hash-sorted-map/src/hash_sorted_map.rs @@ -1,6 +1,5 @@ use core::mem::MaybeUninit; use std::borrow::Borrow; -use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hash}; use std::marker::PhantomData; @@ -16,6 +15,53 @@ fn tag(hash: u64) -> u8 { (hash as u8) | 0x80 } +// ──────────────────────────────────────────────────────────────────────── +// SortingHash +// ──────────────────────────────────────────────────────────────────────── + +/// Maps a key to the 64-bit hash that determines its position in the map. +/// +/// The high bits select the primary group, so visiting groups in index order +/// yields entries in ascending hash order — the property [`HashSortedMap`] +/// relies on for sorted iteration and linear-time merging. The hash should +/// therefore be well distributed in its high bits. +/// +/// Every [`BuildHasher`] ([`RandomState`](std::collections::hash_map::RandomState), +/// `foldhash`, `ahash`, `fnv`, …) implements this trait automatically through a +/// blanket impl, so it can be used as a drop-in. For full control — including +/// keys that do not implement [`Hash`] — implement this single method directly +/// instead of the +/// streaming [`Hasher`](std::hash::Hasher) interface: +/// +/// ``` +/// use hash_sorted_map::{HashSortedMap, SortingHash}; +/// +/// #[derive(Default)] +/// struct Identity; +/// impl SortingHash for Identity { +/// fn hash(&self, &key: &u32) -> u64 { +/// (key as u64) | ((key as u64) << 32) +/// } +/// } +/// +/// let mut map = HashSortedMap::with_hasher(Identity); +/// map.insert(42u32, "answer"); +/// assert_eq!(map.get(&42), Some(&"answer")); +/// ``` +pub trait SortingHash { + /// Returns the hash of `key`. + fn hash(&self, key: &K) -> u64; +} + +/// Bridges the standard library's [`BuildHasher`] to [`SortingHash`], so any +/// existing hasher keeps working unchanged. +impl SortingHash for S { + #[inline] + fn hash(&self, key: &K) -> u64 { + self.hash_one(key) + } +} + // ──────────────────────────────────────────────────────────────────────── // HashSortedMap // ──────────────────────────────────────────────────────────────────────── @@ -23,37 +69,49 @@ fn tag(hash: u64) -> u8 { /// Insertion-only hash map with SIMD group scanning. /// /// Uses NEON on aarch64, SSE2 on x86_64, scalar fallback elsewhere. -/// Generic over key type `K`, value type `V`, and hash builder `S`. -pub struct HashSortedMap { +/// Generic over key type `K`, value type `V`, and hashing strategy `S` +/// (any [`SortingHash`](SortingHash), which every [`BuildHasher`] satisfies). +/// +/// `S` has no default: because the hasher determines the iteration order, there +/// is no meaningful "default" hasher for a sort-by-hash map. Construct one with +/// [`with_hasher`](Self::with_hasher), or — for a [`Default`] hasher `S` — with +/// [`new`](Self::new) / [`with_capacity`](Self::with_capacity) / [`Default`]. +pub struct HashSortedMap { pub(crate) groups: Box<[Group]>, pub(crate) num_groups: u32, pub(crate) n_bits: u32, pub(crate) len: usize, - hash_builder: S, + hasher: S, } -impl Default for HashSortedMap { +impl Default for HashSortedMap { fn default() -> Self { Self::new() } } -impl HashSortedMap { +impl HashSortedMap { + /// Creates an empty map using the default-constructed hasher `S`. pub fn new() -> Self { - Self::with_capacity_and_hasher(0, RandomState::new()) + Self::with_capacity_and_hasher(0, S::default()) } + /// Creates an empty map that can hold at least `capacity` entries without + /// growing, using the default-constructed hasher `S`. pub fn with_capacity(capacity: usize) -> Self { - Self::with_capacity_and_hasher(capacity, RandomState::new()) + Self::with_capacity_and_hasher(capacity, S::default()) } } impl HashSortedMap { - pub fn with_hasher(hash_builder: S) -> Self { - Self::with_capacity_and_hasher(0, hash_builder) + /// Creates an empty map that hashes keys with `hasher`. + pub fn with_hasher(hasher: S) -> Self { + Self::with_capacity_and_hasher(0, hasher) } - pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> Self { + /// Creates an empty map that hashes keys with `hasher` and can hold at + /// least `capacity` entries without growing. + pub fn with_capacity_and_hasher(capacity: usize, hasher: S) -> Self { let adjusted = (capacity as f64 / group_ops::MAX_FILL).ceil() as usize; let min_groups = (adjusted.div_ceil(GROUP_SIZE)).max(1).next_power_of_two(); let n_bits = min_groups.trailing_zeros().max(1); @@ -63,14 +121,16 @@ impl HashSortedMap { num_groups, n_bits, len: 0, - hash_builder, + hasher, } } + /// Returns the number of entries in the map. pub fn len(&self) -> usize { self.len } + /// Returns `true` if the map contains no entries. pub fn is_empty(&self) -> bool { self.len == 0 } @@ -89,7 +149,7 @@ impl HashSortedMap { } } -impl HashSortedMap { +impl> HashSortedMap { /// Sort all entries within each primary group chain by their hash value, /// breaking ties by key. /// @@ -137,9 +197,7 @@ impl HashSortedMap { for &cgi in &chain[..chain.len() - 1] { let g = &self.groups[cgi as usize]; for slot in 0..GROUP_SIZE { - let hash = self - .hash_builder - .hash_one(unsafe { g.keys[slot].assume_init_ref() }); + let hash = self.hasher.hash(unsafe { g.keys[slot].assume_init_ref() }); hashes.push(hash); } } @@ -149,9 +207,7 @@ impl HashSortedMap { if g.ctrl[slot] == CTRL_EMPTY { break; } - let hash = self - .hash_builder - .hash_one(unsafe { g.keys[slot].assume_init_ref() }); + let hash = self.hasher.hash(unsafe { g.keys[slot].assume_init_ref() }); hashes.push(hash); } @@ -204,18 +260,25 @@ impl HashSortedMap { } } -impl HashSortedMap { +impl> HashSortedMap { + /// Inserts a key/value pair, returning the previous value for `key` if it + /// was already present (otherwise `None`). pub fn insert(&mut self, key: K, value: V) -> Option { - let hash = self.hash_builder.hash_one(&key); + let hash = self.hasher.hash(&key); self.insert_hashed(hash, key, value) } + /// Returns a reference to the value for `key`, or `None` if it is absent. + /// + /// The key may be any borrowed form of `K`, as long as the borrowed value + /// hashes and compares equal to the owned key. pub fn get(&self, key: &Q) -> Option<&V> where K: Borrow, - Q: Hash + Eq + ?Sized, + Q: Eq + ?Sized, + S: SortingHash, { - let hash = self.hash_builder.hash_one(key); + let hash = self.hasher.hash(key); self.get_hashed(hash, key) } @@ -238,7 +301,7 @@ impl HashSortedMap { /// the resulting `VacantEntry` already knows where to write. #[inline] pub fn entry(&mut self, key: K) -> Entry<'_, K, V, S> { - let hash = self.hash_builder.hash_one(&key); + let hash = self.hasher.hash(&key); match self.find_or_insertion_slot(hash, &key) { FindResult::Found(ptr) => Entry::Occupied(OccupiedEntry { // SAFETY: pointer is valid for `'_` (bounded by `&mut self`). @@ -382,9 +445,7 @@ impl HashSortedMap { for group in &old_groups[..old_num_groups] { for i in 0..group_ops::count_occupied(&group.ctrl) { - let hash = self - .hash_builder - .hash_one(unsafe { group.keys[i].assume_init_ref() }); + let hash = self.hasher.hash(unsafe { group.keys[i].assume_init_ref() }); self.insert_for_grow(hash, group.keys[i].as_ptr(), group.values[i].as_ptr()); } } @@ -465,7 +526,9 @@ enum Insertion { /// View into a single entry in a [`HashSortedMap`], either occupied or vacant. pub enum Entry<'a, K, V, S> { + /// An occupied entry whose key already exists in the map. Occupied(OccupiedEntry<'a, V>), + /// A vacant entry whose key is absent from the map. Vacant(VacantEntry<'a, K, V, S>), } @@ -484,7 +547,7 @@ pub struct VacantEntry<'a, K, V, S> { insertion: Insertion, } -impl<'a, K: Hash + Eq, V, S: BuildHasher> Entry<'a, K, V, S> { +impl<'a, K: Eq, V, S: SortingHash> Entry<'a, K, V, S> { /// Insert `default` if vacant; return a mutable reference to the value either way. #[inline] pub fn or_insert(self, default: V) -> &'a mut V { @@ -545,7 +608,7 @@ impl<'a, V> OccupiedEntry<'a, V> { } } -impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> { +impl<'a, K: Eq, V, S: SortingHash> VacantEntry<'a, K, V, S> { /// Insert `value` and return a mutable reference to it. /// Writes directly to the slot pre-computed during `entry()`; only re-walks /// the chain on the rare grow path (where the pre-computed pointers become @@ -597,7 +660,7 @@ impl<'a, K: Hash + Eq, V, S: BuildHasher> VacantEntry<'a, K, V, S> { /// the new insertion slot. #[cold] #[inline(never)] -fn insert_after_grow( +fn insert_after_grow>( map: &mut HashSortedMap, key: K, value: V, @@ -621,13 +684,14 @@ impl Drop for HashSortedMap { #[cfg(test)] mod tests { + use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hasher}; use super::*; #[test] fn insert_and_get() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); map.insert(100, "hello"); map.insert(200, "world"); assert_eq!(map.get(&100), Some(&"hello")); @@ -638,7 +702,7 @@ mod tests { #[test] fn insert_overwrite() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); map.insert(42, "a"); assert_eq!(map.insert(42, "b"), Some("a")); assert_eq!(map.get(&42), Some(&"b")); @@ -647,7 +711,7 @@ mod tests { #[test] fn grow_preserves_entries() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); for i in 0..200u32 { map.insert(i, i * 10); } @@ -659,7 +723,7 @@ mod tests { #[test] fn many_entries() { - let mut map = HashSortedMap::with_capacity(2000); + let mut map = HashSortedMap::<_, _, RandomState>::with_capacity(2000); for i in 0..2000u32 { map.insert(i.wrapping_mul(2654435761), i); } @@ -671,7 +735,7 @@ mod tests { #[test] fn overflow_chain() { - let mut map = HashSortedMap::with_capacity(8); + let mut map = HashSortedMap::<_, _, RandomState>::with_capacity(8); for i in 0..20u32 { let key = i | 0xAB000000; map.insert(key, i); @@ -685,7 +749,7 @@ mod tests { #[test] fn grow_on_overflow_exhaustion() { - let mut map = HashSortedMap::with_capacity(1); + let mut map = HashSortedMap::<_, _, RandomState>::with_capacity(1); let old_n_bits = map.n_bits; for i in 0..100u32 { let key = i | 0xFF000000; @@ -701,7 +765,7 @@ mod tests { #[test] fn string_keys() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); map.insert("hello".to_string(), 1); map.insert("world".to_string(), 2); assert_eq!(map.get("hello"), Some(&1)); @@ -716,7 +780,8 @@ mod tests { #[test] fn get_or_default_basics() { - let mut map: HashSortedMap<&str, i32> = HashSortedMap::new(); + let mut map: HashSortedMap<&str, i32, RandomState> = + HashSortedMap::<_, _, RandomState>::new(); // Inserts default (0), then mutates. *map.get_or_default("a") += 5; *map.get_or_default("b") += 7; @@ -729,7 +794,8 @@ mod tests { #[test] fn get_or_insert_with_lazy() { - let mut map: HashSortedMap = HashSortedMap::new(); + let mut map: HashSortedMap = + HashSortedMap::<_, _, RandomState>::new(); let mut call_count = 0; let mut make = |s: &str| { call_count += 1; @@ -756,7 +822,8 @@ mod tests { #[test] fn get_or_default_survives_grow() { - let mut map: HashSortedMap = HashSortedMap::with_capacity(1); + let mut map: HashSortedMap = + HashSortedMap::<_, _, RandomState>::with_capacity(1); for i in 0..500u32 { *map.get_or_default(i) = i * 2; } @@ -769,7 +836,8 @@ mod tests { #[test] fn entry_or_default_counting() { // Classic counting workload via Entry API. - let mut map: HashSortedMap<&str, u32> = HashSortedMap::new(); + let mut map: HashSortedMap<&str, u32, RandomState> = + HashSortedMap::<_, _, RandomState>::new(); for word in ["a", "b", "a", "c", "b", "a"] { *map.entry(word).or_default() += 1; } @@ -781,7 +849,8 @@ mod tests { #[test] fn entry_or_insert_lazy() { - let mut map: HashSortedMap = HashSortedMap::new(); + let mut map: HashSortedMap = + HashSortedMap::<_, _, RandomState>::new(); let mut call_count = 0; let mut make = |s: &str| { call_count += 1; @@ -798,7 +867,8 @@ mod tests { #[test] fn entry_and_modify() { - let mut map: HashSortedMap = HashSortedMap::new(); + let mut map: HashSortedMap = + HashSortedMap::<_, _, RandomState>::new(); // Vacant: and_modify is a no-op, then or_insert(0) runs. *map.entry(7).and_modify(|v| *v *= 10).or_insert(1) += 100; assert_eq!(map.get(&7), Some(&101)); @@ -840,18 +910,45 @@ mod tests { } } + #[test] + fn custom_sorting_hash_without_hash_key() { + // A key type that intentionally does NOT implement `Hash`, proving the + // map only requires `SortingHash` (not `std::hash::Hash`) when a custom + // hasher is supplied. + #[derive(PartialEq, Eq)] + struct Key(u32); + + struct ByValue; + impl SortingHash for ByValue { + fn hash(&self, key: &Key) -> u64 { + (key.0 as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15) + } + } + + let mut map = HashSortedMap::with_hasher(ByValue); + for i in 0..200u32 { + assert_eq!(map.insert(Key(i), i), None); + } + assert_eq!(map.len(), 200); + for i in 0..200u32 { + assert_eq!(map.get(&Key(i)), Some(&i)); + } + assert_eq!(map.get(&Key(999)), None); + } + // ── sort_by_hash tests ────────────────────────────────────────────── #[test] fn sort_by_hash_empty() { - let mut map: HashSortedMap = HashSortedMap::new(); + let mut map: HashSortedMap = + HashSortedMap::<_, _, RandomState>::new(); map.sort_by_hash(); assert_eq!(map.len(), 0); } #[test] fn sort_by_hash_single() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); map.insert(42u32, "hello"); map.sort_by_hash(); assert_eq!(map.len(), 1); @@ -861,7 +958,7 @@ mod tests { #[test] fn sort_by_hash_preserves_entries() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); for i in 0..200u32 { map.insert(i, i * 10); } @@ -880,8 +977,6 @@ mod tests { #[test] fn sort_by_hash_produces_hash_order() { - use std::collections::hash_map::RandomState; - let hasher = RandomState::new(); let mut map = HashSortedMap::with_hasher(hasher.clone()); for i in 0..500u32 { @@ -925,8 +1020,6 @@ mod tests { #[test] fn sort_by_hash_with_strings() { - use std::collections::hash_map::RandomState; - let hasher = RandomState::new(); let mut map = HashSortedMap::with_hasher(hasher.clone()); for i in 0..100u32 { diff --git a/crates/hash-sorted-map/src/iter.rs b/crates/hash-sorted-map/src/iter.rs index e981bad..ffc7772 100644 --- a/crates/hash-sorted-map/src/iter.rs +++ b/crates/hash-sorted-map/src/iter.rs @@ -212,6 +212,7 @@ impl<'a, K, V, S> IntoIterator for &'a mut HashSortedMap { #[cfg(test)] mod tests { + use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hasher}; use super::*; @@ -238,13 +239,13 @@ mod tests { #[test] fn iter_empty() { - let map: HashSortedMap = HashSortedMap::new(); + let map: HashSortedMap = HashSortedMap::<_, _, RandomState>::new(); assert_eq!(map.iter().count(), 0); } #[test] fn iter_yields_all_entries() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); for i in 0..100u32 { map.insert(i, i * 10); } @@ -272,7 +273,7 @@ mod tests { #[test] fn iter_mut_mutates_values() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); for i in 0..20u32 { map.insert(i, i); } @@ -286,7 +287,7 @@ mod tests { #[test] fn into_iter_yields_all() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); for i in 0..100u32 { map.insert(i, i * 3); } @@ -300,7 +301,8 @@ mod tests { #[test] fn into_iter_partial_consume_then_drop() { - let mut map: HashSortedMap = HashSortedMap::new(); + let mut map: HashSortedMap = + HashSortedMap::<_, _, RandomState>::new(); for i in 0..50u32 { map.insert(format!("key-{i}"), format!("val-{i}")); } @@ -313,7 +315,7 @@ mod tests { #[test] fn into_iter_empty() { - let map: HashSortedMap = HashSortedMap::new(); + let map: HashSortedMap = HashSortedMap::<_, _, RandomState>::new(); assert_eq!(map.into_iter().count(), 0); } @@ -333,7 +335,7 @@ mod tests { #[test] fn into_iter_after_grow() { - let mut map = HashSortedMap::with_capacity(1); + let mut map = HashSortedMap::<_, _, RandomState>::with_capacity(1); for i in 0..500u32 { map.insert(i, i); } @@ -358,7 +360,7 @@ mod tests { let counter = Rc::new(Cell::new(0usize)); let n = 100; { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); for i in 0..n { map.insert(i, Tracked(counter.clone())); } @@ -372,7 +374,7 @@ mod tests { #[test] fn for_loop_ref() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); map.insert(1, "a"); map.insert(2, "b"); let mut count = 0; @@ -384,7 +386,7 @@ mod tests { #[test] fn for_loop_mut() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); map.insert(1u32, 10u32); map.insert(2, 20); for (_, v) in &mut map { @@ -396,7 +398,7 @@ mod tests { #[test] fn for_loop_owned() { - let mut map = HashSortedMap::new(); + let mut map = HashSortedMap::<_, _, RandomState>::new(); map.insert(1, 10); map.insert(2, 20); let mut sum = 0; diff --git a/crates/hash-sorted-map/src/lib.rs b/crates/hash-sorted-map/src/lib.rs index 3ff5461..7deb853 100644 --- a/crates/hash-sorted-map/src/lib.rs +++ b/crates/hash-sorted-map/src/lib.rs @@ -1,7 +1,35 @@ +//! A hash map whose groups are ordered by hash prefix, enabling efficient +//! sorted-order iteration and linear-time merging of two maps. +//! +//! [`HashSortedMap`] is a Swiss-table-inspired, insertion-only hash map. Its +//! groups are laid out by hash prefix, so visiting them in order yields entries +//! sorted by hashed key. This makes merging two maps a single linear scan and +//! lets serialization in hash-key order happen without an extra sort. +//! +//! Hashing is customized through the single-method [`SortingHash`] trait. Any +//! standard [`BuildHasher`](std::hash::BuildHasher) works out of the box via a +//! blanket implementation. +//! +//! ``` +//! use std::collections::hash_map::RandomState; +//! use hash_sorted_map::HashSortedMap; +//! +//! let mut map: HashSortedMap<_, _, RandomState> = HashSortedMap::new(); +//! map.insert("hello", 1); +//! map.insert("world", 2); +//! assert_eq!(map.get("hello"), Some(&1)); +//! +//! // Iterate in ascending hash order. +//! map.sort_by_hash(); +//! let entries: Vec<_> = map.iter().map(|(&k, &v)| (k, v)).collect(); +//! assert_eq!(entries.len(), 2); +//! ``` +#![warn(missing_docs)] + mod group; mod group_ops; mod hash_sorted_map; mod iter; -pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, VacantEntry}; +pub use hash_sorted_map::{Entry, HashSortedMap, OccupiedEntry, SortingHash, VacantEntry}; pub use iter::{IntoIter, Iter, IterMut};